Repository: eval-sys/mcpmark
Branch: main
Commit: adc5e6558f05
Files: 670
Total size: 3.1 MB
Directory structure:
mcpmark/
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── 1_bug_report.yml
│ │ ├── 2_feature_request.yml
│ │ └── config.yml
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── scripts/
│ │ └── pr-comment.js
│ └── workflows/
│ └── publish-docker-image.yml
├── .gitignore
├── CHANGELOG.md
├── Dockerfile
├── LICENSE
├── README.md
├── build-docker.sh
├── cspell.config.yaml
├── docs/
│ ├── contributing/
│ │ └── make-contribution.md
│ ├── datasets/
│ │ └── task.md
│ ├── installation_and_docker_usage.md
│ ├── introduction.md
│ ├── mcp/
│ │ ├── filesystem.md
│ │ ├── github.md
│ │ ├── notion.md
│ │ ├── playwright.md
│ │ └── postgres.md
│ └── quickstart.md
├── pipeline.py
├── pyproject.toml
├── run-benchmark.sh
├── run-task.sh
├── src/
│ ├── agents/
│ │ ├── __init__.py
│ │ ├── base_agent.py
│ │ ├── mcp/
│ │ │ ├── __init__.py
│ │ │ ├── http_server.py
│ │ │ └── stdio_server.py
│ │ ├── mcpmark_agent.py
│ │ ├── react_agent.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ └── token_usage.py
│ ├── aggregators/
│ │ ├── aggregate_results.py
│ │ ├── aggregate_specific_results.py
│ │ ├── aggregate_task_meta.py
│ │ └── pricing.py
│ ├── base/
│ │ ├── __init__.py
│ │ ├── login_helper.py
│ │ ├── state_manager.py
│ │ └── task_manager.py
│ ├── config/
│ │ ├── __init__.py
│ │ └── config_schema.py
│ ├── errors.py
│ ├── evaluator.py
│ ├── factory.py
│ ├── logger.py
│ ├── mcp_services/
│ │ ├── filesystem/
│ │ │ ├── __init__.py
│ │ │ ├── filesystem_login_helper.py
│ │ │ ├── filesystem_state_manager.py
│ │ │ └── filesystem_task_manager.py
│ │ ├── github/
│ │ │ ├── __init__.py
│ │ │ ├── github_login_helper.py
│ │ │ ├── github_state_manager.py
│ │ │ ├── github_task_manager.py
│ │ │ ├── repo_exporter.py
│ │ │ ├── repo_importer.py
│ │ │ └── token_pool.py
│ │ ├── insforge/
│ │ │ ├── __init__.py
│ │ │ ├── insforge_login_helper.py
│ │ │ ├── insforge_state_manager.py
│ │ │ └── insforge_task_manager.py
│ │ ├── notion/
│ │ │ ├── __init__.py
│ │ │ ├── notion_login_helper.py
│ │ │ ├── notion_state_manager.py
│ │ │ └── notion_task_manager.py
│ │ ├── playwright/
│ │ │ ├── __init__.py
│ │ │ ├── playwright_login_helper.py
│ │ │ ├── playwright_state_manager.py
│ │ │ └── playwright_task_manager.py
│ │ ├── playwright_webarena/
│ │ │ ├── playwright_login_helper.py
│ │ │ ├── playwright_state_manager.py
│ │ │ ├── playwright_task_manager.py
│ │ │ └── reddit_env_setup.md
│ │ ├── postgres/
│ │ │ ├── __init__.py
│ │ │ ├── postgres_login_helper.py
│ │ │ ├── postgres_state_manager.py
│ │ │ └── postgres_task_manager.py
│ │ └── supabase/
│ │ ├── __init__.py
│ │ ├── supabase_login_helper.py
│ │ ├── supabase_state_manager.py
│ │ └── supabase_task_manager.py
│ ├── model_config.py
│ ├── results_reporter.py
│ └── services.py
└── tasks/
├── __init__.py
├── filesystem/
│ ├── easy/
│ │ ├── .gitkeep
│ │ ├── file_context/
│ │ │ ├── file_splitting/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── pattern_matching/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── uppercase/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── file_property/
│ │ │ ├── largest_rename/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── txt_merging/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── folder_structure/
│ │ │ └── structure_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── legal_document/
│ │ │ └── file_reorganize/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── papers/
│ │ │ └── papers_counting/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── student_database/
│ │ ├── duplicate_name/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── recommender_name/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── standard/
│ ├── desktop/
│ │ ├── music_report/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── project_management/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── timeline_extraction/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── desktop_template/
│ │ ├── budget_computation/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── contact_information/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── file_arrangement/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── file_context/
│ │ ├── duplicates_searching/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── file_merging/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── file_splitting/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── pattern_matching/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── uppercase/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── file_property/
│ │ ├── size_classification/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── time_classification/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── folder_structure/
│ │ ├── structure_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── structure_mirror/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── legal_document/
│ │ ├── dispute_review/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── individual_comments/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── solution_tracing/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── papers/
│ │ ├── author_folders/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── find_math_paper/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── organize_legacy_papers/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── student_database/
│ │ ├── duplicate_name/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── english_talent/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── gradebased_score/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── threestudio/
│ │ ├── code_locating/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── output_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── requirements_completion/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── votenet/
│ ├── dataset_comparison/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── debugging/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── requirements_writing/
│ ├── description.md
│ ├── meta.json
│ └── verify.py
├── github/
│ ├── easy/
│ │ ├── build-your-own-x/
│ │ │ ├── close_commented_issues/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── record_recent_commits/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── claude-code/
│ │ │ ├── add_terminal_shortcuts_doc/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── thank_docker_pr_author/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── triage_missing_tool_result_issue/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── mcpmark-cicd/
│ │ │ ├── basic_ci_checks/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── issue_lint_guard/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── nightly_health_check/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── missing-semester/
│ │ ├── count_translations/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── find_ga_tracking_id/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── standard/
│ ├── build_your_own_x/
│ │ ├── find_commit_date/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── find_rag_commit/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── claude-code/
│ │ ├── automated_changelog_generation/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── claude_collaboration_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── critical_issue_hotfix_workflow/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── feature_commit_tracking/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── label_color_standardization/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── easyr1/
│ │ ├── advanced_branch_strategy/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── config_parameter_audit/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── performance_regression_investigation/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── qwen3_issue_management/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── harmony/
│ │ ├── fix_conflict/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── issue_pr_commit_workflow/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── issue_tagging_pr_closure/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── multi_branch_commit_aggregation/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── release_management_workflow/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── mcpmark-cicd/
│ │ ├── deployment_status_workflow/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── issue_management_workflow/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── linting_ci_workflow/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── pr_automation_workflow/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── missing-semester/
│ ├── assign_contributor_labels/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── find_legacy_name/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── find_salient_file/
│ ├── description.md
│ ├── meta.json
│ └── verify.py
├── notion/
│ ├── easy/
│ │ ├── .gitkeep
│ │ ├── computer_science_student_dashboard/
│ │ │ ├── simple__code_snippets_go/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── simple__study_session_tracker/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── it_trouble_shooting_hub/
│ │ │ └── simple__asset_retirement_migration/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── japan_travel_planner/
│ │ │ └── simple__remove_osaka_itinerary/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── online_resume/
│ │ │ └── simple__skills_development_tracker/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── python_roadmap/
│ │ │ └── simple__expert_level_lessons/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── self_assessment/
│ │ │ └── simple__faq_column_layout/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── standard_operating_procedure/
│ │ │ └── simple__section_organization/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── team_projects/
│ │ │ └── simple__swap_tasks/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── toronto_guide/
│ │ └── simple__change_color/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── standard/
│ ├── company_in_a_box/
│ │ ├── employee_onboarding/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── goals_restructure/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── quarterly_review_dashboard/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── computer_science_student_dashboard/
│ │ ├── code_snippets_go/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── courses_internships_relation/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── study_session_tracker/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── it_trouble_shooting_hub/
│ │ ├── asset_retirement_migration/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── security_audit_ticket/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── verification_expired_update/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── japan_travel_planner/
│ │ ├── daily_itinerary_overview/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── packing_progress_summary/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── remove_osaka_itinerary/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── restaurant_expenses_sync/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── online_resume/
│ │ ├── layout_adjustment/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── projects_section_update/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── skills_development_tracker/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── work_history_addition/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── python_roadmap/
│ │ ├── expert_level_lessons/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── learning_metrics_dashboard/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── self_assessment/
│ │ ├── faq_column_layout/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── hyperfocus_analysis_report/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── numbered_list_emojis/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── standard_operating_procedure/
│ │ ├── deployment_process_sop/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── section_organization/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── team_projects/
│ │ ├── priority_tasks_table/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── swap_tasks/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── toronto_guide/
│ ├── change_color/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── weekend_adventure_planner/
│ ├── description.md
│ ├── meta.json
│ └── verify.py
├── playwright/
│ ├── easy/
│ │ └── .gitkeep
│ └── standard/
│ ├── eval_web/
│ │ ├── cloudflare_turnstile_challenge/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── extraction_table/
│ │ ├── data.csv
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── web_search/
│ ├── birth_of_arvinxu/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── r1_arxiv/
│ ├── content.txt
│ ├── description.md
│ ├── meta.json
│ └── verify.py
├── playwright_webarena/
│ ├── easy/
│ │ ├── .gitkeep
│ │ ├── reddit/
│ │ │ ├── ai_data_analyst/
│ │ │ │ ├── description.md
│ │ │ │ ├── label.txt
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── llm_research_summary/
│ │ │ │ ├── description.md
│ │ │ │ ├── label.txt
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── movie_reviewer_analysis/
│ │ │ │ ├── description.md
│ │ │ │ ├── label.txt
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── nba_statistics_analysis/
│ │ │ │ ├── description.md
│ │ │ │ ├── label.txt
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── routine_tracker_forum/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── shopping_admin/
│ │ ├── fitness_promotion_strategy/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── ny_expansion_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── products_sales_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── sales_inventory_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── search_filtering_operations/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ └── standard/
│ ├── reddit/
│ │ ├── ai_data_analyst/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── budget_europe_travel/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── buyitforlife_research/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── llm_research_summary/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── movie_reviewer_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── nba_statistics_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── routine_tracker_forum/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── shopping/
│ │ ├── advanced_product_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── gaming_accessories_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── health_routine_optimization/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── holiday_baking_competition/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── multi_category_budget_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── printer_keyboard_search/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── running_shoes_purchase/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ └── shopping_admin/
│ ├── customer_segmentation_setup/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ ├── fitness_promotion_strategy/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ ├── marketing_customer_analysis/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ ├── ny_expansion_analysis/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ ├── products_sales_analysis/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ ├── sales_inventory_analysis/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ └── search_filtering_operations/
│ ├── description.md
│ ├── label.txt
│ ├── meta.json
│ └── verify.py
├── postgres/
│ ├── easy/
│ │ ├── .gitkeep
│ │ ├── chinook/
│ │ │ ├── customer_data_migration_basic/
│ │ │ │ ├── customer_data.pkl
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── update_employee_info/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── dvdrental/
│ │ │ └── create_payment_index/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── employees/
│ │ │ ├── department_summary_view/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── employee_gender_statistics/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── employee_projects_basic/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── hiring_year_summary/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── lego/
│ │ │ ├── basic_security_setup/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── fix_data_inconsistencies/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── sports/
│ │ └── create_performance_indexes/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── standard/
│ ├── chinook/
│ │ ├── customer_data_migration/
│ │ │ ├── customer_data.pkl
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── employee_hierarchy_management/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── sales_and_music_charts/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── dvdrental/
│ │ ├── customer_analysis_fix/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── customer_analytics_optimization/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── film_inventory_management/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── employees/
│ │ ├── employee_demographics_report/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── employee_performance_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── employee_project_tracking/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── employee_retention_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── executive_dashboard_automation/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── management_structure_analysis/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── lego/
│ │ ├── consistency_enforcement/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── database_security_policies/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── transactional_inventory_transfer/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── security/
│ │ ├── rls_business_access/
│ │ │ ├── description.md
│ │ │ ├── ground_truth.sql
│ │ │ ├── meta.json
│ │ │ ├── prepare_environment.py
│ │ │ └── verify.py
│ │ └── user_permission_audit/
│ │ ├── description.md
│ │ ├── ground_truth.sql
│ │ ├── meta.json
│ │ ├── prepare_environment.py
│ │ └── verify.py
│ ├── sports/
│ │ ├── baseball_player_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── participant_report_optimization/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── team_roster_management/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── vectors/
│ ├── dba_vector_analysis/
│ │ ├── description.md
│ │ ├── ground_truth.sql
│ │ ├── meta.json
│ │ ├── prepare_environment.py
│ │ └── verify.py
│ └── vectors_setup.py
└── utils/
├── __init__.py
├── notion_utils.py
└── postgres_utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .dockerignore
================================================
# Git
.git
.gitignore
# Python
__pycache__
*.pyc
*.pyo
*.pyd
.Python
*.egg
*.egg-info/
dist/
build/
.eggs/
*.so
# Virtual environments
venv/
env/
ENV/
.venv/
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
# Environment files (contain secrets)
.env
.mcp_env
notion_state.json
# Test and development files
.pytest_cache/
.coverage
htmlcov/
.tox/
.mypy_cache/
.ruff_cache/
tests/
test_environments/
# Results and logs
results/
*.log
logs/
# PostgreSQL data
.postgres/
# Playwright
playwright-report/
test-results/
# Documentation images
asset/
# Temporary files
*.tmp
tmp/
temp/
# Docker
Dockerfile
docker-compose.yml
.dockerignore
# Node modules (if any locally installed)
node_modules/
# Pixi lock file
pixi.lock
.pixi/
# GitHub state files
github_state/
github_template_repo/
# Backup directories
.mcpbench_backups/
================================================
FILE: .editorconfig
================================================
root = true
; Always use Unix-style newlines with a final newline in every file, and trim trailing whitespace
[*]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
; Python: PEP8 defines 4 spaces for indentation
[*.py]
indent_style = space
indent_size = 4
================================================
FILE: .gitattributes
================================================
# SCM syntax highlighting & preventing 3-way merges
pixi.lock merge=binary linguist-language=YAML linguist-generated=true
================================================
FILE: .github/ISSUE_TEMPLATE/1_bug_report.yml
================================================
name: '🐛 Bug Report'
description: 'Report a bug'
labels: ['unconfirm']
type: Bug
body:
- type: textarea
attributes:
label: '🐛 Bug Description'
description: A clear and concise description of the bug. If the above option is `Other`, please also explain in detail.
validations:
required: true
- type: textarea
attributes:
label: '📷 Reproduction Steps'
description: A clear and concise description of how to reproduce the issue.
- type: textarea
attributes:
label: '🚦 Expected Behavior'
description: A clear and concise description of what you expected to happen.
- type: textarea
attributes:
label: '📝 Additional Information'
description: If your problem needs further explanation, or if the issue you're seeing cannot be reproduced in a gist, please add more information here.
================================================
FILE: .github/ISSUE_TEMPLATE/2_feature_request.yml
================================================
name: '🌠 Feature Request'
description: 'Suggest an idea'
title: '[Request] '
type: Feature
body:
- type: textarea
attributes:
label: '🥰 Feature Description'
description: Please add a clear and concise description of the problem you are seeking to solve with this feature request.
validations:
required: true
- type: textarea
attributes:
label: '🧐 Proposed Solution'
description: Describe the solution you'd like in a clear and concise manner.
validations:
required: true
- type: textarea
attributes:
label: '📝 Additional Information'
description: Add any other context about the problem here.
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
contact_links:
- name: Questions and ideas
url: https://github.com/eval-sys/mcpmark/discussions/new/choose
about: Please post questions and ideas in discussions.
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
#### Change Type
- [ ] ✨ feat
- [ ] 🐛 fix
- [ ] ♻️ refactor
- [ ] 💄 style
- [ ] 👷 build
- [ ] ⚡️ perf
- [ ] 📝 docs
- [ ] 🔨 chore
#### Description of Change
#### Additional Information
================================================
FILE: .github/scripts/pr-comment.js
================================================
/**
* Generate or update PR comment with Docker build info
*/
module.exports = async ({ github, context, dockerMetaJson, image, version, dockerhubUrl, platforms }) => {
const COMMENT_IDENTIFIER = '<!-- mcpmark-docker-build-comment -->'; // assumed unique marker used to find and update this bot comment on later builds
const parseTags = () => {
try {
if (dockerMetaJson) {
const parsed = JSON.parse(dockerMetaJson);
if (Array.isArray(parsed.tags) && parsed.tags.length > 0) {
return parsed.tags;
}
}
} catch (e) {
// ignore parsing error, fallback below
}
if (image && version) {
return [`${image}:${version}`];
}
return [];
};
const generateCommentBody = () => {
const tags = parseTags();
const buildTime = new Date().toISOString();
// Use the first tag as the main version
const mainTag = tags.length > 0 ? tags[0] : `${image}:${version}`;
const tagVersion = mainTag.includes(':') ? mainTag.split(':')[1] : version;
return [
COMMENT_IDENTIFIER,
'',
'### 🐳 Docker Build Completed!',
`**Version**: \`${tagVersion || 'N/A'}\``,
`**Build Time**: \`${buildTime}\``,
'',
dockerhubUrl ? `🔗 View all tags on Docker Hub: ${dockerhubUrl}` : '',
'',
'### Pull Image',
'Download the Docker image to your local machine:',
'',
'```bash',
`docker pull ${mainTag}`,
'```',
'',
'### Run Eval',
'Execute evaluation tasks using the built image:',
'',
'```bash',
`DOCKER_IMAGE_VERSION=${tagVersion} ./run-task.sh --models gpt-4.1-mini --tasks file_context/uppercase`,
'```',
'',
'> [!IMPORTANT]',
'> This build is for testing and validation purposes.',
]
.filter(Boolean)
.join('\n');
};
const body = generateCommentBody();
// List comments on the PR
const { data: comments } = await github.rest.issues.listComments({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
});
const existing = comments.find((c) => c.body && c.body.includes(COMMENT_IDENTIFIER));
if (existing) {
await github.rest.issues.updateComment({
comment_id: existing.id,
owner: context.repo.owner,
repo: context.repo.repo,
body,
});
return { updated: true, id: existing.id };
}
const result = await github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body,
});
return { updated: false, id: result.data.id };
};
================================================
FILE: .github/workflows/publish-docker-image.yml
================================================
name: Publish Docker Image
on:
workflow_dispatch:
release:
types: [ published ]
pull_request:
types: [ synchronize, labeled, unlabeled ]
permissions:
contents: read
pull-requests: write
concurrency:
group: ${{ github.ref }}-${{ github.workflow }}
cancel-in-progress: true
env:
REGISTRY_IMAGE: evalsysorg/mcpmark
PR_TAG_PREFIX: pr-
jobs:
build:
if: |
(github.event_name == 'pull_request' &&
contains(github.event.pull_request.labels.*.name, 'Build Docker')) ||
github.event_name != 'pull_request'
strategy:
matrix:
include:
- platform: linux/amd64
os: ubuntu-latest
- platform: linux/arm64
os: ubuntu-24.04-arm
runs-on: ${{ matrix.os }}
name: Build ${{ matrix.platform }} Image
steps:
- name: Prepare
run: |
platform=${{ matrix.platform }}
echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
- name: Checkout base
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Generate PR metadata
if: github.event_name == 'pull_request'
id: pr_meta
run: |
branch_name="${{ github.head_ref }}"
sanitized_branch=$(echo "${branch_name}" | sed -E 's/[^a-zA-Z0-9_.-]+/-/g')
echo "pr_tag=${sanitized_branch}-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY_IMAGE }}
tags: |
type=raw,value=${{ env.PR_TAG_PREFIX }}${{ steps.pr_meta.outputs.pr_tag }},enable=${{ github.event_name == 'pull_request' }}
type=semver,pattern={{version}},enable=${{ github.event_name != 'pull_request' }}
type=raw,value=latest,enable=${{ github.event_name != 'pull_request' }}
- name: Docker login
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_REGISTRY_USER }}
password: ${{ secrets.DOCKER_REGISTRY_PASSWORD }}
- name: Get commit SHA
if: github.ref == 'refs/heads/main'
id: vars
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
- name: Build and export
id: build
uses: docker/build-push-action@v6
with:
platforms: ${{ matrix.platform }}
context: .
file: ./Dockerfile
labels: ${{ steps.meta.outputs.labels }}
build-args: |
SHA=${{ steps.vars.outputs.sha_short }}
outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true
- name: Export digest
run: |
rm -rf /tmp/digests
mkdir -p /tmp/digests
digest="${{ steps.build.outputs.digest }}"
touch "/tmp/digests/${digest#sha256:}"
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: digest-${{ env.PLATFORM_PAIR }}
path: /tmp/digests/*
if-no-files-found: error
retention-days: 1
merge:
name: Merge
needs: build
runs-on: ubuntu-latest
steps:
- name: Checkout base
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Download digests
uses: actions/download-artifact@v5
with:
path: /tmp/digests
pattern: digest-*
merge-multiple: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Generate PR metadata
if: github.event_name == 'pull_request'
id: pr_meta
run: |
branch_name="${{ github.head_ref }}"
sanitized_branch=$(echo "${branch_name}" | sed -E 's/[^a-zA-Z0-9_.-]+/-/g')
echo "pr_tag=${sanitized_branch}-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY_IMAGE }}
tags: |
type=raw,value=${{ env.PR_TAG_PREFIX }}${{ steps.pr_meta.outputs.pr_tag }},enable=${{ github.event_name == 'pull_request' }}
type=semver,pattern={{version}},enable=${{ github.event_name != 'pull_request' }}
type=raw,value=latest,enable=${{ github.event_name != 'pull_request' }}
- name: Docker login
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_REGISTRY_USER }}
password: ${{ secrets.DOCKER_REGISTRY_PASSWORD }}
- name: Create manifest list and push
working-directory: /tmp/digests
run: |
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
$(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *)
- name: Inspect image
run: |
docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }}
- name: Comment on PR with Docker build info
if: github.event_name == 'pull_request'
uses: actions/github-script@v7
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const prComment = require('${{ github.workspace }}/.github/scripts/pr-comment.js');
const result = await prComment({
github,
context,
dockerMetaJson: ${{ toJSON(steps.meta.outputs.json) }},
image: "${{ env.REGISTRY_IMAGE }}",
version: "${{ steps.meta.outputs.version }}",
dockerhubUrl: "https://hub.docker.com/r/${{ env.REGISTRY_IMAGE }}/tags",
platforms: "linux/amd64, linux/arm64",
});
core.info(`Status: ${result.updated ? 'Updated' : 'Created'}, ID: ${result.id}`);
================================================
FILE: .gitignore
================================================
logs
.claude
CLAUDE.md
.gemini
results
materials
scripts
!.github/scripts
.nfs*
.mcp_env
.idea
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
logs
logs/*
.DS_Store
notion-sdk-py/
github_state/*
# for playwright cookies
notion_state.json
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
# pixi environments
.pixi
*.egg-info
.postgres
# MCPMark backup directories
.mcpmark_backups/*
test_environments/
postgres_state
================================================
FILE: CHANGELOG.md
================================================
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## v1.2.0 - 2025-09-20
This version includes multiple important feature enhancements, particularly improvements in cost calculation, error handling, and Notion integration. Added per-model cost calculation, comprehensive aggregator functionality, and more robust error recovery mechanisms.
### ✨ Features
- **Add 1m parameter & improve log** (#198) - Added claude-1m-context option and enhanced logging functionality
- **Refine Notion parent resolution and duplicate recovery** (#197) - Improved Notion parent page resolution and duplicate content recovery mechanism
- **Comprehensive aggregator, enable push to new branch** (#185) - Implemented comprehensive aggregator functionality with support for pushing to new branches
- **Support price cost calculating per model** (#186) - Added per-model price cost calculation functionality
- **Improve agent end log** (#183) - Enhanced agent end logging
- **Improve litellm error handling** (#181) - Enhanced LiteLLM error handling mechanism
### ♻️ Refactoring
- **Use notion child block list to locate page** (#196) - Refactored page location logic to use Notion child block list approach
### 🐛 Bug Fixes
- **Fix verification in Notion task company_in_a_box/goals_restructure** (#194) - Fixed verification logic for specific Notion tasks
- **Improve claude error handling** (#195) - Improved error handling for Claude API interactions
- **Fix trailing slash issue for find_legacy_name** - Resolved trailing slash issues in find_legacy_name path handling
- **Recover when duplication lands on parent** (#189) - Fixed recovery mechanism when duplicate content affects parent pages
- **Correctly handle playwright parser** (#184) - Properly handle Playwright parser
- **Handle timeout error, add timeout error for resuming** (#182) - Handle timeout errors and add timeout error handling for resume operations
### 📝 Documentation
- **Better readme, notion language guide** (#190) - Improved README documentation and added comprehensive Notion language guide
### 🔨 Maintenance
- **Update price info** (#188) - Updated pricing information
- **Update desktop_template/file_arrangement/verify.py** (#187) - Maintenance updates to verification scripts
================================================
FILE: Dockerfile
================================================
# MCPMark Docker image with optimized layer caching
# Stage 1: Builder for Python dependencies only
FROM python:3.12-slim AS builder
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
g++ \
libpq-dev \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /build
# Copy project files needed for pip install
COPY pyproject.toml ./
COPY src/ ./src/
COPY tasks/ ./tasks/
# Install dependencies
RUN pip install --no-cache-dir --user .
# Stage 2: Final image with all runtime dependencies
FROM python:3.12-slim
# Layer 1: Core system dependencies (very stable, rarely changes)
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# Layer 2: PostgreSQL runtime and client tools (stable, only changes with postgres version)
RUN apt-get update && apt-get install -y --no-install-recommends \
libpq5 \
postgresql-client \
&& rm -rf /var/lib/apt/lists/*
# Layer 3: Git (stable)
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
&& rm -rf /var/lib/apt/lists/*
# Layer 4: Playwright system dependencies (changes with browser requirements)
RUN apt-get update && apt-get install -y --no-install-recommends \
libnss3 \
libnspr4 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libcups2 \
libdrm2 \
libxkbcommon0 \
libatspi2.0-0 \
libx11-6 \
libxcomposite1 \
libxdamage1 \
libxfixes3 \
libxrandr2 \
libgbm1 \
libxcb1 \
libpango-1.0-0 \
libcairo2 \
libasound2 \
&& rm -rf /var/lib/apt/lists/*
# Layer 5: Download tools and Node.js (changes with Node version)
RUN apt-get update && \
apt-get install -y --no-install-recommends curl wget unzip && \
curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
apt-get install -y --no-install-recommends nodejs && \
apt-get autoremove -y && \
rm -rf /var/lib/apt/lists/*
# Layer 6: pipx (rarely changes)
RUN pip install --no-cache-dir pipx && \
pipx ensurepath
# Layer 7: Copy Python packages from builder (changes with dependencies)
COPY --from=builder /root/.local /root/.local
# Layer 8: Playwright browsers (changes with browser versions)
RUN python3 -m playwright install chromium && \
npx -y playwright install chromium
# Layer 9: Install PostgreSQL MCP server (Python, used via `pipx run postgres-mcp`)
RUN pipx install postgres-mcp
# Set working directory
WORKDIR /app
# Layer 10: Create directory structure (rarely changes)
RUN mkdir -p /app/results
# Layer 11: Application code (changes frequently)
COPY . .
# Set environment
ENV PATH="/root/.local/bin:/root/.local/pipx/venvs/*/bin:${PATH}"
ENV PYTHONPATH="/app"
ENV PLAYWRIGHT_BROWSERS_PATH=/root/.cache/ms-playwright
ENV PIPX_HOME=/root/.local/pipx
ENV PIPX_BIN_DIR=/root/.local/bin
# Default command
CMD ["python3", "-m", "pipeline", "--help"]
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# MCPMark: Stress-Testing Comprehensive MCP Use
[Website](https://mcpmark.ai) · [Paper](https://arxiv.org/abs/2509.24002) · [Discord](https://discord.gg/HrKkJAxDnA) · [Docs](https://mcpmark.ai/docs) · [Trajectory Dataset](https://huggingface.co/datasets/Jakumetsu/mcpmark-trajectory-log)
An evaluation suite for agentic models in real MCP tool environments (Notion / GitHub / Filesystem / Postgres / Playwright).
MCPMark provides a reproducible, extensible benchmark for researchers and engineers: one-command tasks, isolated sandboxes, auto-resume for failures, unified metrics, and aggregated reports.
## News
- 📌 **21 Jan** — Pinned MCP server versions for reproducible benchmarks: GitHub MCP Server `v0.15.0` (switched to Docker for version control), Notion MCP Server `@1.9.1` (Notion released 2.0 but it has many bugs, not recommended). See [#246](https://github.com/eval-sys/mcpmark/pull/246).
- 🔥 **13 Dec** — Added auto-compaction support (`--compaction-token`) to summarize long conversations and avoid context overflow during evaluation ([#236](https://github.com/eval-sys/mcpmark/pull/236)).
- 🏅 **02 Dec** — Evaluated `gemini-3-pro-preview` (thinking: low): **Pass@1 50.6%** ± 2.3% — so close to `gpt-5-high` (51.6%)! Also `deepseek-v3.2-thinking` 36.8% and `deepseek-v3.2-chat` 29.7%
- 🔥 **02 Dec** — Obfuscate GitHub @mentions to prevent notification spam during evaluation ([#229](https://github.com/eval-sys/mcpmark/pull/229))
- 🏅 **01 Dec** — DeepSeek v3.2 uses MCPMark! Kudos on securing the best open-source model. [X Post](https://x.com/deepseek_ai/status/1995452650557763728) | [Technical Report](https://huggingface.co/deepseek-ai/DeepSeek-V3.2/resolve/main/assets/paper.pdf)
- 🔥 **17 Nov** — Added 50 easy tasks (10 per MCP server) for smaller open-source models ([#225](https://github.com/eval-sys/mcpmark/pull/225))
- 🤝 **31 Oct** — Community PR from insforge: better MCP servers achieve better results with fewer tokens! ([#214](https://github.com/eval-sys/mcpmark/pull/214))
- 🔥 **13 Oct** — Added ReAct agent support. PRs for new agent scaffolds welcome! ([#209](https://github.com/eval-sys/mcpmark/pull/209))
- 🏅 **10 Sep** — `qwen-3-coder-plus` is the best open-source model! Kudos to Qwen team. [X Post](https://x.com/Alibaba_Qwen/status/1965457023438651532)
---
## What you can do with MCPMark
- **Evaluate real tool usage** across multiple MCP services: `Notion`, `GitHub`, `Filesystem`, `Postgres`, `Playwright`.
- **Use ready-to-run tasks** covering practical workflows, each with strict automated verification.
- **Reliable and reproducible**: isolated environments that do not pollute your accounts/data; failed tasks auto-retry and resume.
- **Unified metrics and aggregation**: single/multi-run (pass@k, avg@k, etc.) with automated results aggregation.
- **Flexible deployment**: local or Docker; fully validated on macOS and Linux.
---
## Quickstart (5 minutes)
### 1) Clone the repository
```bash
git clone https://github.com/eval-sys/mcpmark.git
cd mcpmark
```
### 2) Set environment variables (create `.mcp_env` at repo root)
Only set what you need. Add service credentials when running tasks for that service.
```env
# Example: OpenAI
OPENAI_BASE_URL="https://api.openai.com/v1"
OPENAI_API_KEY="sk-..."
# Optional: Notion (only for Notion tasks)
SOURCE_NOTION_API_KEY="your-source-notion-api-key"
EVAL_NOTION_API_KEY="your-eval-notion-api-key"
EVAL_PARENT_PAGE_TITLE="MCPMark Eval Hub"
PLAYWRIGHT_BROWSER="chromium" # chromium | firefox
PLAYWRIGHT_HEADLESS="True"
# Optional: GitHub (only for GitHub tasks)
GITHUB_TOKENS="token1,token2" # token pooling for rate limits
GITHUB_EVAL_ORG="your-eval-org"
# Optional: Postgres (only for Postgres tasks)
POSTGRES_HOST="localhost"
POSTGRES_PORT="5432"
POSTGRES_USERNAME="postgres"
POSTGRES_PASSWORD="password"
```
See `docs/introduction.md` and the service guides below for more details.
### 3) Install and run a minimal example
Local (Recommended)
```bash
pip install -e .
# If you'll use browser-based tasks, install Playwright browsers first
playwright install
```
MCPMark defaults to the built-in orchestration agent (`MCPMarkAgent`). To experiment with the ReAct-style agent, pass `--agent react` to `pipeline.py` (other settings stay the same).
Docker
```bash
./build-docker.sh
```
Run a filesystem task (no external accounts required):
```bash
# Run once for a quick start; use gpt-5 or any model you configured
python -m pipeline \
  --mcp filesystem \
  --k 1 \
  --models gpt-5 \
  --tasks file_property/size_classification
# Add --task-suite easy to run the lightweight dataset (where available)
```
Results are saved to `./results/{exp_name}/{model}__{mcp}/run-*/...` for the standard suite and `./results/{exp_name}/{model}__{mcp}-easy/run-*/...` when you run `--task-suite easy` (e.g., `./results/test-run/gpt-5__filesystem/run-1/...` or `./results/test-run/gpt-5__github-easy/run-1/...`).
---
## Run your evaluations
### Task suites (standard vs easy)
- Each MCP service now stores tasks under `tasks/<mcp_service>/<task_suite>/<category_id>/<task_id>/`.
- `standard` (default) covers the full benchmark (127 tasks today).
- `easy` hosts 10 lightweight tasks per MCP, ideal for smoke tests and CI (GitHub’s are already available under `tasks/github/easy`).
- Switch suites with `--task-suite easy` (defaults to `--task-suite standard`).
### Single run (k=1)
```bash
# Run ALL tasks for a service
python -m pipeline --exp-name exp --mcp notion --tasks all --models MODEL --k 1
# Run a task group
python -m pipeline --exp-name exp --mcp notion --tasks online_resume --models MODEL --k 1
# Run a specific task
python -m pipeline --exp-name exp --mcp notion --tasks online_resume/daily_itinerary_overview --models MODEL --k 1
# Evaluate multiple models
python -m pipeline --exp-name exp --mcp notion --tasks all --models MODEL1,MODEL2,MODEL3 --k 1
```
### Multiple runs (k>1) for pass@k
```bash
# Run k=4 (the default) to compute stability metrics (requires --exp-name to aggregate final results)
python -m pipeline --exp-name exp --mcp notion --tasks all --models MODEL --k 4
# Aggregate results (pass@1 / pass@k / pass^k / avg@k)
python -m src.aggregators.aggregate_results --exp-name exp
```
### Run with Docker
```bash
# Run all tasks for a service
./run-task.sh --mcp notion --models MODEL --exp-name exp --tasks all
# Cross-service benchmark
./run-benchmark.sh --models MODEL --exp-name exp --docker
```
Please visit `docs/introduction.md` for choices of *MODEL*.
Tip: MCPMark supports **auto-resume**. When re-running, only unfinished tasks will execute. Failures matching our retryable patterns (see [RETRYABLE_PATTERNS](src/errors.py)) are retried automatically. Models may emit different error strings—if you encounter a new resumable error, please open a PR or issue.
Tip: MCPMark supports **auto-compaction**; pass `--compaction-token N` to enable automatic context summarization when prompt tokens reach `N` (use `999999999` to disable).
---
## Service setup and authentication
| Service | Setup summary | Docs |
|-------------|-----------------------------------------------------------------------------------------------------------------|---------------------------------------|
| Notion | Environment isolation (Source Hub / Eval Hub), integration creation and grants, browser login verification. | [Guide](docs/mcp/notion.md) |
| GitHub | Multi-account token pooling recommended; import pre-exported repo state if needed. | [Guide](docs/mcp/github.md) |
| Postgres | Start via Docker and import sample databases. | [Setup](docs/mcp/postgres.md) |
| Playwright | Install browsers before first run; defaults to `chromium`. | [Setup](docs/mcp/playwright.md) |
| Filesystem | Zero-configuration, run directly. | [Config](docs/mcp/filesystem.md) |
You can also follow [Quickstart](docs/quickstart.md) for the shortest end-to-end path.
### Important Notice: GitHub Repository Privacy
> **Please ensure your evaluation repositories are set to PRIVATE.**
GitHub state templates are now automatically downloaded from our CDN during evaluation — no manual download is required. However, because these templates contain issues and pull requests from real open-source repositories, the recreation process includes `@username` mentions of the original authors.
**We have received feedback from original GitHub authors who were inadvertently notified** when evaluation repositories were created as public. To be a responsible member of the open-source community, we urge all users to:
1. **Always keep evaluation repositories private** during the evaluation process.
2. **In the latest version**, we have added random suffixes to all `@username` mentions (e.g., `@user` becomes `@user_x7k2`) and implemented a safety check that prevents importing templates to public repositories.
3. **If you are using an older version of MCPMark**, please either:
- Pull the latest code immediately, or
- Manually ensure all GitHub evaluation repositories are set to private.
Thank you for helping us maintain a respectful relationship with the open-source community.
---
## Results and metrics
- Results are organized under `./results/{exp_name}/{model}__{mcp}/run-*/` (JSON + CSV per task).
- Generate a summary with:
```bash
# Basic usage
python -m src.aggregators.aggregate_results --exp-name exp
# For k-run experiments with single-run models
python -m src.aggregators.aggregate_results --exp-name exp --k 4 --single-run-models claude-opus-4-1
```
- Only models with complete results across all tasks and runs are included in the final summary.
- Includes multi-run metrics (pass@k, pass^k) for stability comparisons when k > 1.
---
## Model and Tasks
- **Model support**: MCPMark calls models via LiteLLM — see the LiteLLM docs: [`LiteLLM Doc`](https://docs.litellm.ai/docs/). For Anthropic (Claude) extended thinking mode (enabled via `--reasoning-effort`), we use Anthropic’s native API.
- See `docs/introduction.md` for details and configuration of supported models in MCPMark.
- To add a new model, edit `src/model_config.py`. Before adding, check LiteLLM supported models/providers. See [`LiteLLM Doc`](https://docs.litellm.ai/docs/).
- Task design principles are described in `docs/datasets/task.md`. Each task ships with an automated `verify.py` for objective, reproducible evaluation.
---
## Contributing
Contributions are welcome:
1. Add a new task under `tasks/<mcp_service>/<task_suite>/<category_id>/<task_id>/` with `meta.json`, `description.md` and `verify.py`.
2. Ensure local checks pass and open a PR.
3. See `docs/contributing/make-contribution.md`.
---
## Citation
If you find our work useful for your research, please consider citing:
```bibtex
@misc{wu2025mcpmark,
title={MCPMark: A Benchmark for Stress-Testing Realistic and Comprehensive MCP Use},
author={Zijian Wu and Xiangyan Liu and Xinyuan Zhang and Lingjun Chen and Fanqing Meng and Lingxiao Du and Yiran Zhao and Fanshi Zhang and Yaoqi Ye and Jiawei Wang and Zirui Wang and Jinjie Ni and Yufan Yang and Arvin Xu and Michael Qizhe Shieh},
year={2025},
eprint={2509.24002},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2509.24002},
}
```
## License
This project is licensed under the Apache License 2.0 — see `LICENSE`.
================================================
FILE: build-docker.sh
================================================
#!/bin/bash
# Build Docker image for MCPMark
set -e
# Color codes for output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${YELLOW}Building MCPMark Docker image locally...${NC}"
# Build the Docker image with the same tag as Docker Hub for local testing
docker build -t evalsysorg/mcpmark:latest . "$@"
# Check if build was successful
if [ $? -eq 0 ]; then
echo -e "${GREEN}✓ Docker image built successfully${NC}"
echo " Tag: evalsysorg/mcpmark:latest"
# Show image info
echo ""
echo "Image details:"
docker images evalsysorg/mcpmark:latest --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}\t{{.CreatedAt}}"
echo ""
echo "You can now run tasks using:"
echo " ./run-task.sh --mcp notion --models o3 --exp-name test --tasks all"
else
echo "Docker build failed!"
exit 1
fi
================================================
FILE: cspell.config.yaml
================================================
version: "0.2"
ignorePaths: []
dictionaryDefinitions: []
dictionaries: []
words:
- datname
- domcontentloaded
- modelcontextprotocol
- pgdumplib
- pixi
- pypi
- topbar
- usename
ignoreWords: []
import: []
================================================
FILE: docs/contributing/make-contribution.md
================================================
# Contributing
1. Fork the repository and create a feature branch.
2. Add new tasks under `tasks/<mcp_service>/<task_suite>/<category_id>/<task_id>/` with `meta.json`, `description.md`, and `verify.py`. Please refer to the [Task Page](../datasets/task.md) for detailed instructions.
3. Ensure all tests pass.
4. Submit a pull request — contributions are welcome!
================================================
FILE: docs/datasets/task.md
================================================
# Task
The tasks in MCPMark follow two major principles:
- The tasks are based on realistic digital environments that are also used by human programmers.
- The task outcome can be robustly verified by Python scripts.
Therefore, each MCPMark task consists of three files:
- `meta.json`
- `description.md`
- `verify.py`
Here, `meta.json` includes the meta information of the task; `description.md` describes the purpose and setting of the task, as well as the instructions for completing it; and `verify.py` checks whether the task is completed successfully.
For example, you can ask the model agent to create a file with a specific name and write specific content to it, which belongs to the category of operating on file context. The structure looks like
```
tasks
│
└───filesystem
│
└───standard # task_suite (also supports `easy`)
│
└───file_context # category_id
│
└───create_file_write
│ meta.json
│ description.md
│ verify.py
```
All tasks live under `tasks/<mcp_service>/<task_suite>/<category_id>/<task_id>/`. `filesystem` refers to the MCP service and `task_suite` captures the difficulty slice (`standard` benchmark vs `easy` smoke tests).
`meta.json` includes the meta information about the task, with the following keys:
- task_id: the id of the task.
- task_name: full name of the task.
- description: task description.
- category_id: the id of task category.
- category_name: the full name of the task category.
- author: the author of the task.
- difficulty: the task difficulty level.
- created_at: the timestamp of task creation.
- tags: a list of tags that describe the task.
- mcp: a list of MCP services it belongs to.
- metadata: other meta information.
Here `category_name` describes the shared feature or environment across different tasks (e.g. the GitHub repository or Notion page the task is built on). In this running example, `category_name` refers to `file_context`. A quick way to sanity-check these keys is sketched below.
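For convenience, here is a minimal sketch (not a script shipped with MCPMark) that checks a task's `meta.json` against the keys listed above before you submit it; the task path used in the usage line is the running example and purely illustrative.
```python
import json
from pathlib import Path

# Keys described above; purely an illustrative pre-submission check.
REQUIRED_KEYS = {
    "task_id", "task_name", "description", "category_id", "category_name",
    "author", "difficulty", "created_at", "tags", "mcp", "metadata",
}

def check_meta(task_dir: str) -> None:
    """Raise if the task's meta.json is missing any of the documented keys."""
    meta = json.loads(Path(task_dir, "meta.json").read_text())
    missing = REQUIRED_KEYS - meta.keys()
    if missing:
        raise ValueError(f"meta.json is missing keys: {sorted(missing)}")
    print(f"{meta['task_id']}: meta.json looks complete")

# Hypothetical usage with the running example's directory.
check_meta("tasks/filesystem/standard/file_context/create_file_write")
```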
`description.md` could include the following information
- Task name
- Create and Write File.
- Task description
- Use the filesystem MCP tools to create a new file and write content to it.
- Task Objectives
- Create a new file named `hello_world.txt` in the test directory.
- Write the following content to the file: `Hello, World!`
- Verify the file was created successfully
- Verification Criteria
- File `hello_world.txt` exists in the test directory
- File contains the expected content structure
- File includes "Hello, World!" on the first line
- Tips
- Use the `write_file` tool to create and write content to the file
- The test directory path will be provided in the task context
The entire content of `description.md` will be read by the model agent for completing the task.
Accordingly, `verify.py` implements the following checks (a sketch follows this list):
- Check whether the target directory exists. [](https://postimg.cc/4nnLrw3M)
- Check whether the target directory contains the file with target file name. [](https://postimg.cc/7fGRTX87)
- Check whether the target file contains the desired content `EXPECTED_PATTERNS = ["Hello, World!"]`. [](https://postimg.cc/w7ZSWZc0)
- If the outcome passes **all the above verification functionalities**, the task would be marked as successfully completed.
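For the running example above, a `verify.py` might look roughly like the following. This is a sketch only: it assumes the test directory is passed as the first command-line argument, and real tasks should mirror the conventions of the existing `verify.py` files in the repository.
```python
#!/usr/bin/env python3
"""Illustrative verify.py sketch for the create_file_write example."""
import sys
from pathlib import Path

EXPECTED_PATTERNS = ["Hello, World!"]

def main() -> int:
    # Assumption: the test directory is passed as the first CLI argument.
    test_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
    if not test_dir.is_dir():                             # 1. directory exists
        print(f"FAIL: directory {test_dir} not found")
        return 1
    target = test_dir / "hello_world.txt"
    if not target.is_file():                              # 2. file exists
        print(f"FAIL: {target} not found")
        return 1
    content = target.read_text()
    if not all(p in content for p in EXPECTED_PATTERNS):  # 3. content matches
        print("FAIL: expected content not found")
        return 1
    print("PASS: all checks succeeded")
    return 0

if __name__ == "__main__":
    sys.exit(main())
```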
================================================
FILE: docs/installation_and_docker_usage.md
================================================
# Installation and Docker Task Usage Guideline
## Overview
The MCPMark setup supports installation through either pip or MCPMark Docker (recommended) after cloning the code repository.
### Pip Installation
```bash
pip install -e .
```
The MCPMark Docker setup provides a simple way to run evaluation tasks in isolated containers. PostgreSQL is automatically handled when needed.
## 1. Quick Start
### 1.1 Docker Image
The official Docker image is automatically pulled from Docker Hub on first use.
The image is hosted at: https://hub.docker.com/r/evalsysorg/mcpmark
**Image Management:**
- The scripts automatically download the image when it's not found locally
- To manually update to the latest version:
```bash
docker pull evalsysorg/mcpmark:latest
```
- For local development/testing, you can build your own Docker image:
```bash
# Creates evalsysorg/mcpmark:latest locally
./build-docker.sh
```
## 2. Running MCP Experiments
### 2.1 Running Individual MCP Experiment
The `run-task.sh` script provides simplified Docker usage:
```bash
# Run filesystem tasks (filesystem is the default mcp service)
./run-task.sh --models MODEL_NAME --k K
# Run github/notion/postgres/playwright/playwright_webarena with specific task
./run-task.sh --mcp MCPSERVICE --models MODEL_NAME --exp-name EXPNAME --tasks TASK --k K
```
where *MODEL_NAME* refers to the model choice from the supported models (see [Introduction Page](./introduction.md) for more information), *EXPNAME* refers to the customized experiment name, *TASK* refers to a specific task or task group (see `tasks/<mcp_service>/<task_suite>/...` for more information), and *K* refers to the number of independent runs.
Additionally, the `run-benchmark.sh` script evaluates models across all MCP services:
```bash
# Run all services with Docker (recommended)
./run-benchmark.sh --models MODEL --exp-name EXPNAME --docker
# Run specific services
./run-benchmark.sh --models MODEL --exp-name EXPNAME --mcps MCPSERVICES --docker
# Run with parallel execution for faster results
./run-benchmark.sh --models MODEL --exp-name EXPNAME --docker --parallel
# Run locally without Docker
./run-benchmark.sh --models MODEL --exp-name EXPNAME --mcps MCPSERVICES
```
Here *MCPSERVICES* refers to a group of MCP services, separated by commas (e.g. *filesystem,postgres*).
The benchmark script:
- Runs all or selected MCP services automatically
- Supports progress tracking and timing
- Generates summary reports and logs
- Supports parallel service execution
- Continues running even if some services fail
- Automatically generates performance dashboards
### Manual Docker Commands
#### For Non-Postgres Services
Suppose Notion is the service:
```bash
# Build the image first
./build-docker.sh
# Run a task
docker run --rm \
-v $(pwd)/results:/app/results \
-v $(pwd)/.mcp_env:/app/.mcp_env:ro \
-v $(pwd)/notion_state.json:/app/notion_state.json:ro \
evalsysorg/mcpmark:latest \
python3 -m pipeline --mcp notion --models MODEL --exp-name EXPNAME --tasks TASK --k K
```
#### For Postgres Service
```bash
# The run-task.sh script handles postgres automatically, but if doing manually:
# Start postgres container
docker run -d \
--name mcp-postgres \
--network mcp-network \
-e POSTGRES_DATABASE=postgres \
-e POSTGRES_USER=postgres \
-e POSTGRES_PASSWORD=123456 \
ghcr.io/cloudnative-pg/postgresql:17-bookworm
# Run postgres task
docker run --rm \
--network mcp-network \
-e POSTGRES_HOST=mcp-postgres \
-v $(pwd)/results:/app/results \
-v $(pwd)/.mcp_env:/app/.mcp_env:ro \
evalsysorg/mcpmark:latest \
python3 -m pipeline --mcp postgres --models MODEL --exp-name EXPNAME --tasks TASK --k K
# Stop and remove postgres when done
docker stop mcp-postgres && docker rm mcp-postgres
```
## Script Usage
### Benchmark Runner (`run-benchmark.sh`)
```
./run-benchmark.sh --models MODELS --exp-name NAME [OPTIONS]
Required Options:
--models MODELS Comma-separated list of models to evaluate
--exp-name NAME Experiment name for organizing results
Optional Options:
--docker Run tasks in Docker containers (recommended)
--mcps SERVICES Comma-separated list of services to test
Default: filesystem,notion,github,postgres,playwright
--parallel Run services in parallel (experimental)
--timeout SECONDS Timeout per task in seconds (default: 3600)
```
### Individual Task Runner (`run-task.sh`)
```
./run-task.sh [--mcp SERVICE] [PIPELINE_ARGS]
Options:
--mcp SERVICE MCP service (notion|github|filesystem|playwright|postgres)
Default: filesystem
Environment Variables:
DOCKER_MEMORY_LIMIT Memory limit for container (default: 4g)
DOCKER_CPU_LIMIT CPU limit for container (default: 2)
DOCKER_IMAGE_VERSION Docker image tag to use (default: latest)
All other arguments are passed directly to the pipeline command.
Pipeline arguments (see python3 -m pipeline --help):
--mcp {notion,github,filesystem,playwright,postgres,playwright_webarena}
MCP service to use (default: filesystem)
--models MODELS Comma-separated list of models to evaluate (e.g., 'o3,k2,gpt-4.1')
--tasks TASKS Tasks to run: "all", a category name, or "category/task_name"
--exp-name EXP_NAME Experiment name; results are saved under results/<exp-name>/ (default: YYYY-MM-DD-HH-MM-SS)
--k K Number of evaluation runs for pass@k metrics (default: 4)
--timeout TIMEOUT Timeout in seconds for each task
--output-dir OUTPUT_DIR
Directory to save results
```
## Docker Benefits
1. **Efficiency**: Only starts necessary containers
2. **Isolation**: Each task runs in a fresh container
3. **Resource Management**: Automatic cleanup of containers and networks
4. **Smart Dependencies**: PostgreSQL only starts for postgres service
5. **Parallel Support**: Can run multiple services simultaneously for faster benchmarks
6. **Comprehensive Testing**: Benchmark script runs all services with one command
7. **Progress Tracking**: Colored output with timing and status information
8. **Automatic Reporting**: Generates summary reports and performance dashboards
## Common Troubleshooting
### Permission Issues
```bash
chmod +x run-task.sh
```
### Docker Build Issues
```bash
# Force rebuild with no cache
./run-task.sh --build --mcp MCPSERVICE --models MODEL_NAME --exp-name EXPNAME --tasks TASK
```
### PostgreSQL Connection Issues
```bash
# Check if postgres is running
docker ps | grep postgres
# View postgres logs
docker logs mcp-postgres-task
```
### Cleanup Stuck Resources
```bash
# Stop all containers
docker stop $(docker ps -q)
# Remove task network
docker network rm mcp-task-network
# Remove postgres data volume (careful!)
docker volume rm mcp-postgres-data
```
## Environment Variables
Create `.mcp_env` file with your credentials:
```env
# Service credentials
SOURCE_NOTION_API_KEY=your-key
EVAL_NOTION_API_KEY=your-key
GITHUB_TOKEN=your-token
POSTGRES_PASSWORD=your-password
# Model API keys
OPENAI_API_KEY=your-key
ANTHROPIC_API_KEY=your-key
# ... etc
```
Please refer to [Quick Start](./quickstart.md) for setting up API key for specific model.
## Docker Compose Files
- `docker-compose.yml` - Full stack with postgres (for development/testing)
## Notes
- Results are saved under `./results/<exp-name>/`.
- Each task runs in an ephemeral container.
- Docker image is shared across all tasks.
- PostgreSQL data persists in Docker volume.
================================================
FILE: docs/introduction.md
================================================
# MCPMark
MCPMark is a comprehensive suite for evaluating the agentic ability of frontier models.
MCPMark includes Model Context Protocol (MCP) services in the following environments:
- Notion
- GitHub
- Filesystem
- Postgres
- Playwright
- Playwright-WebArena
### General Procedure
MCPMark is designed to run agentic tasks in complex environments **safely**. Specifically, it sets up an isolated environment for the experiment, completes the task, and then destroys the environment without affecting existing user profiles or data.
### How to Use MCPMark
1. MCPMark Installation.
2. Authorize services (for GitHub and Notion).
3. Configure the environment variables in `.mcp_env`.
4. Run MCPMark experiment.
Please refer to [Quick Start](./quickstart.md) for details on how to properly start a sample filesystem experiment, and the [Task Page](./datasets/task.md) for task details. Please visit [Installation and Docker Usage](./installation_and_docker_usage.md) for information on the full MCPMark setup.
### Running MCPMark
MCPMark supports the following modes for running experiments (suppose the experiment is named new_exp, the models used are o3 and gpt-4.1, and the environment is Notion), with K repeated runs.
#### MCPMark in Pip Installation
```bash
# Evaluate ALL tasks
python -m pipeline --exp-name new_exp --mcp notion --tasks all --models o3 --k K
# Evaluate a single task group (online_resume)
python -m pipeline --exp-name new_exp --mcp notion --tasks online_resume --models o3 --k K
# Evaluate one specific task (task_1 in online_resume)
python -m pipeline --exp-name new_exp --mcp notion --tasks online_resume/task_1 --models o3 --k K
# Evaluate multiple models
python -m pipeline --exp-name new_exp --mcp notion --tasks all --models o3,gpt-4.1 --k K
```
#### MCPMark in Docker Installation
```bash
# Run all tasks for one service
./run-task.sh --mcp notion --models o3 --exp-name new_exp --tasks all
# Run comprehensive benchmark across all services
./run-benchmark.sh --models o3,gpt-4.1 --exp-name new_exp --docker
```
#### Experiment Auto-Resume
When an experiment is re-run, only unfinished tasks are executed. Tasks that previously failed due to pipeline errors (such as a State Duplication Error or an MCP Network Error) are also retried automatically.
### Results
The experiment results are written to `./results/` (JSON + CSV).
#### Result Aggregation (for K > 1)
MCPMark supports aggregated metrics of pass@1, pass@K, pass^K, and avg@K (an illustrative computation is sketched after the command below).
```bash
python -m src.aggregators.aggregate_results --exp-name new_exp
```
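For intuition only, under the common reading of these metrics (pass@K: at least one of the K runs succeeds; pass^K: all K runs succeed; avg@K: mean success rate over runs), they can be computed from per-task outcomes as sketched below. The definitions MCPMark actually reports are implemented in `src/aggregators/aggregate_results.py`, which should be treated as the source of truth.
```python
# Illustrative computation only; see src/aggregators/aggregate_results.py
# for the definitions MCPMark actually uses.
runs = {
    "online_resume/task_1": [True, False, True, True],    # K=4 outcomes per task
    "online_resume/task_2": [False, False, False, True],
}

n_tasks = len(runs)
pass_at_k = sum(any(r) for r in runs.values()) / n_tasks   # any of the K runs passed
pass_pow_k = sum(all(r) for r in runs.values()) / n_tasks  # all K runs passed
avg_at_k = sum(sum(r) / len(r) for r in runs.values()) / n_tasks  # mean success rate

print(f"pass@K={pass_at_k:.2f}  pass^K={pass_pow_k:.2f}  avg@K={avg_at_k:.2f}")
```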
### Model Support
MCPMark supports the following models with the corresponding providers (model codes in brackets).
#### OpenAI
- GPT-5 (gpt-5)
- o3 (o3)
#### Anthropic
- Claude-4.1-Opus (claude-4.1-opus)
- Claude-4-Sonnet (claude-4-sonnet)
#### Google
- Gemini-2.5-Pro (gemini-2.5-pro)
#### Grok
- Grok-4 (grok-4)
#### Deepseek
- DeepSeek-Chat (deepseek-chat)
#### Alibaba
- Qwen3-Coder (qwen-3-coder)
#### Kimi
- Kimi-K2 (k2)
### Want to contribute?
Visit the [Contributing Page](./contributing/make-contribution.md) to learn how to contribute to MCPMark.
================================================
FILE: docs/mcp/filesystem.md
================================================
# Filesystem
This guide walks you through preparing your filesystem environment for MCPMark.
## 1 · Configure Environment Variables
Set the `FILESYSTEM_TEST_ROOT` environment variable in your `.mcp_env` file:
```env
## Filesystem
FILESYSTEM_TEST_ROOT=./test_environments
```
**Recommended**: Use `FILESYSTEM_TEST_ROOT=./test_environments` (relative to project root)
---
## 2 · Automatic Test Environment Download
Our code automatically downloads test folders to your specified `FILESYSTEM_TEST_ROOT` directory when the pipeline starts running.
**Downloaded Structure**:
```
./test_environments/
├── desktop/ # Desktop environment
├── desktop_template/ # Template files for desktop
├── file_context/ # File content understanding tasks
├── file_property/ # File metadata and properties related tasks
├── folder_structure/ # Directory organization tasks
├── legal_document/ # Legal document processing
├── papers/ # Academic paper tasks
├── student_database/ # Database management tasks
├── threestudio/ # 3D Generation codebase
└── votenet/ # 3D Object Detection codebase
```
---
## 3 · Running Filesystem Tasks
**Basic Command**:
```bash
python -m pipeline --exp-name EXPNAME --mcp filesystem --tasks FILESYSTEMTASK --models MODEL --k K
```
**Docker Usage (Recommended)**
Docker is recommended to avoid library version conflicts:
```bash
# Build Docker image
./build-docker.sh
# Run with Docker
./run-task.sh --mcp filesystem --models MODEL --exp-name EXPNAME --tasks FILESYSTEMTASK --k K
```
Here *EXPNAME* refers to the customized experiment name, *FILESYSTEMTASK* refers to the filesystem task or task group selected (see [Task Page](../datasets/task.md) for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for supported models), and *K* refers to the number of independent runs.
---
## 4 · Troubleshooting
**Common Issues**:
- **Test Environment Not Found**: Ensure `FILESYSTEM_TEST_ROOT` is set correctly
- **Prerequisites**: Make sure your terminal has `wget` and `unzip` commands available
- **Recommended**: Use Docker to prevent library version conflicts
================================================
FILE: docs/mcp/github.md
================================================
# GitHub
This guide walks you through preparing your GitHub environment for MCPMark and authenticating the CLI tools with support for **token pooling** to mitigate rate limits.
## 1 · Prepare an Evaluation Organization in GitHub
1. **Create a free GitHub Organization**
- In GitHub, click your avatar → **Your organizations** → **New organization**.
- We recommend a name like `mcpmark-eval-xxx`. (Check if there is a conflict with other organization names.)
- This keeps all benchmark repositories isolated from your personal and work code.
- [](https://postimg.cc/k27xdXc4)
2. **Create Multiple GitHub Accounts (Recommended for Rate Limit Relief)**
To effectively distribute API load and avoid rate limiting, we recommend creating **2-4 additional GitHub accounts**:
- Create new GitHub accounts (e.g., `your-name-eval-1`, `your-name-eval-2`, etc.)
- **Important**: Add all these accounts as **Owners** to your evaluation organization
- This allows the token pooling system to distribute requests across multiple accounts
3. **Generate Fine-Grained Personal Access Tokens (PATs) for Each Account**
**Repeat the following process for each GitHub account (including your main account):**
- Navigate to *Settings → Developer settings → Personal access tokens → Fine-grained tokens*
- Click **Generate new token**, select the evaluation organization you created
- [](https://postimg.cc/Mv9yqJrm)
- Give the token a descriptive name (e.g., *MCPMark Eval Token - Account 1*)
- Under **Repository permissions** and **Organization permissions**, enable **All permissions** (read and write if applicable)
- [](https://postimg.cc/14HFrZP1)
- Copy the generated token and save it safely — you'll need all tokens for the next step
4. **Configure Token Pooling in `.mcp_env`**
In your project root, edit (or create) the `.mcp_env` file and add your tokens:
**For single token (Basic setup):**
```env
## GitHub - Single Token Configuration
GITHUB_TOKENS="your-single-token-here"
GITHUB_EVAL_ORG="your-eval-org-name"
```
**For multiple tokens (Recommended for handling rate limits):**
```env
## GitHub - Token Pooling Configuration
GITHUB_TOKENS="token1,token2,token3,token4"
GITHUB_EVAL_ORG="your-eval-org-name"
```
**Important Notes:**
- Replace `token1,token2,token3,token4` with your actual tokens (comma-separated, no spaces)
- **2-4 tokens** is recommended for optimal rate limit distribution
- All tokens must have **the same permissions** on the evaluation organization
- The system automatically rotates between tokens to distribute API load
---
## 2 · Download the Sample Repository State
We have pre-exported several popular open-source repositories along with curated Issues and PRs.
1. Download the archive from [Google Drive](https://drive.google.com/drive/folders/16bFDjdtqJYzYJlqKcjKBGomo8DwOhWcN?usp=drive_link).
2. Extract it so that the directory `./github_state/` appears in the project root:
```bash
mkdir -p github_state
unzip github_state.zip -d ./github_state
```
---
## 3 · Add New Repositories (Optional)
If you want to benchmark additional repositories:
1. Export the desired repository state:
```bash
python -m src.mcp_services.github.repo_exporter --source_repo_url owner/name --max-issues 20 --max-pulls 5
```
2. Open `src/mcp_services/github/github_state_manager.py` and add a new entry to `self.initial_state_mapping` pointing to the exported folder (an illustrative sketch follows this list).
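As a purely hypothetical illustration of such an entry (the key and value formats below are assumptions; copy the shape of the existing entries in `github_state_manager.py` rather than this sketch):
```python
from pathlib import Path

# Hypothetical shape of an initial_state_mapping entry; the real dictionary
# lives in src/mcp_services/github/github_state_manager.py and may use a
# different value format, so mirror the existing entries there.
initial_state_mapping = {
    "existing/repo": Path("./github_state/existing__repo"),
    "owner/name": Path("./github_state/owner__name"),  # folder exported in step 1
}

print(initial_state_mapping["owner/name"])
```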
---
## 4 · GitHub Rate Limits & Token Pooling Benefits
### Understanding Rate Limits
Fine-grained tokens are subject to GitHub API rate limits:
- **Read operations**: 5,000 requests per hour per token
- **General write operations**: 80 writes per minute and 500 writes per hour per token
- **Content creation (Issues, PRs, Comments)**: **500 requests per hour per token** (Secondary Rate Limit)
### How Token Pooling Helps
With **token pooling**, MCPMark automatically does the following (a rough sketch of the rotation idea follows this list):
- **Distributes requests** across multiple tokens to multiply your rate limits
- **Rotates tokens** for each task execution to balance load
- **Handles rate limit failures** by trying the next available token
- **Ensures consistency** between agent execution and verification
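The sketch below illustrates only the round-robin rotation idea; it is not the actual implementation in `src/mcp_services/github/token_pool.py`, which also handles rate-limit failures and failover.
```python
import itertools

class RoundRobinTokenPool:
    """Minimal illustration of rotating GitHub tokens to spread API load."""

    def __init__(self, tokens: list[str]):
        if not tokens:
            raise ValueError("at least one GitHub token is required")
        self._cycle = itertools.cycle(tokens)

    def next_token(self) -> str:
        # Hand out tokens in round-robin order for each task execution.
        return next(self._cycle)

# Tokens would normally be parsed from the comma-separated GITHUB_TOKENS variable.
pool = RoundRobinTokenPool("token1,token2,token3,token4".split(","))
print([pool.next_token() for _ in range(6)])
```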
### Example: Rate Limit Multiplication
**Read Operations:**
- **Single token**: 5,000 requests/hour
- **4 tokens**: ~20,000 requests/hour total capacity
**Content Creation (Critical for MCPMark):**
- **Single token**: 500 content creation requests/hour
- **4 tokens**: ~2,000 content creation requests/hour total capacity
- **Automatic failover**: If one token hits limits, others continue working
This dramatically improves evaluation performance, especially for large task batches or frequent testing cycles. **The content creation limit is often the bottleneck**, making token pooling essential for efficient evaluations.
### Repository Limits
MCPMark places a cap on the number of PRs and issues (≤ 50 in total) per repository to ensure reasonable evaluation times and to stay within rate limits.
## 5 · Running GitHub Tasks
1. Configure environment variables: make sure `GITHUB_TOKENS` and `GITHUB_EVAL_ORG` are properly set in `.mcp_env`.
2. For single task or task group, run
```bash
python -m pipeline --exp-name EXPNAME --mcp github --tasks GITHUBTASK --models MODEL --k K
```
Here *EXPNAME* refers to the customized experiment name, *GITHUBTASK* refers to the GitHub task or task group selected (see [Task Page](../datasets/task.md) for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for supported models), and *K* refers to the number of independent runs.
================================================
FILE: docs/mcp/notion.md
================================================
# Notion
This guide walks you through preparing your Notion environment for MCPMark and authenticating the CLI tools.
> Note: Set your Notion app and workspace interface language to English. We use Playwright for browser automation and our locator logic relies on raw English text in the UI. Non-English interfaces can cause element selection to fail.
## 1 · Set up Notion Environment
1. **Duplicate the MCPMark Source Pages**
Copy the template database and pages into your workspace from the public template following this tutorial:
[Duplicate MCPMark Source](https://painted-tennis-ebc.notion.site/MCPBench-Source-Hub-23181626b6d7805fb3a7d59c63033819).
2. **Set up the Source and Eval Hub for Environment Isolation**
- Prepare **two separate Notion pages**:
- **Source Hub**: Stores all the template databases/pages. Managed by `SOURCE_NOTION_API_KEY`.
- **Eval Hub**: Only contains the duplicated templates for the current evaluation. Managed by `EVAL_NOTION_API_KEY`.
- In Notion, create an **empty page** in your Eval Hub. The page name **must exactly match** the value you set for `EVAL_PARENT_PAGE_TITLE` in your environment variables (e.g., `MCPMark Eval Hub`).
- Name your **Source Hub** page to match `SOURCE_PARENT_PAGE_TITLE` (default: `MCPMark Source Hub`). This is where all initial-state templates live; we enumerate this page’s first-level children by exact title.
- In Notion's **Connections** settings:
- Bind the integration corresponding to `EVAL_NOTION_API_KEY` to the Eval Hub parent page you just created.
- Bind the integration corresponding to `SOURCE_NOTION_API_KEY` to your Source Hub (where the templates are stored).
3. **Create Notion Integrations & Grant Access**
a. Visit [Notion Integrations](https://www.notion.so/profile/integrations) and create **two internal integrations** (one for Source Hub, one for Eval Hub).
b. Copy the generated **Internal Integration Tokens** (these will be your `SOURCE_NOTION_API_KEY` and `EVAL_NOTION_API_KEY`).
c. Share the **Source Hub** with the Source integration, and the **Eval Hub parent page** with the Eval integration (*Full Access*).
[](https://postimg.cc/XXVGJD5H)
[](https://postimg.cc/NKrLShhM)
[](https://postimg.cc/CRDLJjDn)
[](https://postimg.cc/n9Cnm7pz)
[](https://postimg.cc/s1QFp35v)
---
## 2 · Authenticate with Notion
```bash
# First, install Playwright and the browser binaries
playwright install
# Then, run the Notion login helper with your preferred browser
python -m src.mcp_services.notion.notion_login_helper --browser {firefox|chromium}
```
The verification script will tell you which browser is working properly. The pipeline defaults to using **chromium**. Our pipeline has been **fully tested on macOS and Linux**.
## 3 · Running Notion Tasks
1. Configure environment variables: make sure the following service credentials are added in `.mcp_env`.
```env
## Notion
SOURCE_NOTION_API_KEY="your-source-notion-api-key" # For Source Hub (templates)
EVAL_NOTION_API_KEY="your-eval-notion-api-key" # For Eval Hub (active evaluation)
SOURCE_PARENT_PAGE_TITLE="MCPMark Source Hub" # Source hub page name (exact match)
EVAL_PARENT_PAGE_TITLE="MCPMark Eval Hub" # Must match the name of the empty page you created in Eval Hub
PLAYWRIGHT_BROWSER="chromium" # default to chromium, you can also choose firefox
PLAYWRIGHT_HEADLESS="True"
```
2. For single task or task group, run
```bash
python -m pipeline --exp-name EXPNAME --mcp notion --tasks NOTIONTASK --models MODEL --k K
```
Here *EXPNAME* refers to the customized experiment name, *NOTIONTASK* refers to the Notion task or task group selected (see [Task Page](../datasets/task.md) for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for supported models), and *K* refers to the number of independent runs.
================================================
FILE: docs/mcp/playwright.md
================================================
# Playwright
This guide walks you through setting up WebArena environments for Playwright MCP automated testing, including Shopping, Shopping Admin, and Reddit instances.
Section 1 is designed mainly for completing the Playwright-WebArena tasks.
## 1. Setup WebArena Environment (For Playwright-WebArena Tasks)
### 1.1 Download Docker Images
[WebArena](https://github.com/web-arena-x/webarena/tree/main/environment_docker) provides Docker images from multiple sources. Choose the fastest one for your network:
### Shopping Environment (Port 7770)
```bash
# Option 1: Google Drive (Recommended)
pip install gdown
gdown 1gxXalk9O0p9eu1YkIJcmZta1nvvyAJpA
# Option 2: Archive.org
wget https://archive.org/download/webarena-env-shopping-image/shopping_final_0712.tar
# Option 3: CMU Server
wget http://metis.lti.cs.cmu.edu/webarena-images/shopping_final_0712.tar
```
### Shopping Admin Environment (Port 7780)
```bash
# Option 1: Google Drive (Recommended)
gdown 1See0ZhJRw0WTTL9y8hFlgaduwPZ_nGfd
# Option 2: Archive.org
wget https://archive.org/download/webarena-env-shopping-admin-image/shopping_admin_final_0719.tar
# Option 3: CMU Server
wget http://metis.lti.cs.cmu.edu/webarena-images/shopping_admin_final_0719.tar
```
### Reddit Environment (Port 9999)
```bash
# Option 1: Google Drive (Recommended)
gdown 17Qpp1iu_mPqzgO_73Z9BnFjHrzmX9DGf
# Option 2: Archive.org
wget https://archive.org/download/webarena-env-forum-image/postmill-populated-exposed-withimg.tar
# Option 3: CMU Server
wget http://metis.lti.cs.cmu.edu/webarena-images/postmill-populated-exposed-withimg.tar
```
### 1.2 Deploy Environments
#### Shopping (E-commerce Site)
```bash
docker load --input shopping_final_0712.tar
# Start container
docker run --name shopping -p 7770:80 -d shopping_final_0712
# Wait for service initialization (2-3 minutes)
sleep 180
# Configure for local access
docker exec shopping /var/www/magento2/bin/magento setup:store-config:set --base-url="http://localhost:7770"
docker exec shopping mysql -u magentouser -pMyPassword magentodb -e "UPDATE core_config_data SET value='http://localhost:7770/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');"
docker exec shopping /var/www/magento2/bin/magento cache:flush
```
**Access**: `http://localhost:7770`
#### Shopping Admin (Management Panel)
```bash
docker load --input shopping_admin_final_0719.tar
# Start container
docker run --name shopping_admin -p 7780:80 -d shopping_admin_final_0719
# Wait for service initialization
sleep 120
# Configure for local access
docker exec shopping_admin /var/www/magento2/bin/magento setup:store-config:set --base-url="http://localhost:7780"
docker exec shopping_admin mysql -u magentouser -pMyPassword magentodb -e "UPDATE core_config_data SET value='http://localhost:7780/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');"
docker exec shopping_admin php /var/www/magento2/bin/magento config:set admin/security/password_is_forced 0
docker exec shopping_admin php /var/www/magento2/bin/magento config:set admin/security/password_lifetime 0
docker exec shopping_admin /var/www/magento2/bin/magento cache:flush
```
**Access**: `http://localhost:7780/admin`
**Admin Credentials**: `admin / admin1234`
#### Reddit (Forum)
```bash
docker load --input postmill-populated-exposed-withimg.tar
# Start container
docker run --name forum -p 9999:80 -d postmill-populated-exposed-withimg
# Wait for PostgreSQL initialization
sleep 120
# Verify service status
docker logs forum | grep "database system is ready"
curl -I http://localhost:9999
```
**Access**: `http://localhost:9999`
### 1.3 External Access Configuration
For cloud deployments (GCP, AWS, etc.), configure external access:
#### Configure Firewall (GCP Example)
```bash
# Shopping environment
gcloud compute firewall-rules create allow-shopping-7770 \
--allow tcp:7770 --source-ranges 0.0.0.0/0
# Shopping Admin
gcloud compute firewall-rules create allow-shopping-admin-7780 \
--allow tcp:7780 --source-ranges 0.0.0.0/0
# Reddit
gcloud compute firewall-rules create allow-reddit-9999 \
--allow tcp:9999 --source-ranges 0.0.0.0/0
```
#### Update Base URLs for External Access
```bash
# Get external IP
EXTERNAL_IP=$(curl -s ifconfig.me)
# Shopping
docker exec shopping /var/www/magento2/bin/magento setup:store-config:set --base-url="http://${EXTERNAL_IP}:7770"
docker exec shopping mysql -u magentouser -pMyPassword magentodb -e "UPDATE core_config_data SET value='http://${EXTERNAL_IP}:7770/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');"
docker exec shopping /var/www/magento2/bin/magento cache:flush
# Shopping Admin
docker exec shopping_admin /var/www/magento2/bin/magento setup:store-config:set --base-url="http://${EXTERNAL_IP}:7780"
docker exec shopping_admin mysql -u magentouser -pMyPassword magentodb -e "UPDATE core_config_data SET value='http://${EXTERNAL_IP}:7780/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');"
docker exec shopping_admin /var/www/magento2/bin/magento cache:flush
```
### 1.4 Alternative Access Methods (Not Verified)
#### Cloudflared Tunnel (Free & Persistent)
```bash
# Install cloudflared
wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
sudo mv cloudflared-linux-amd64 /usr/local/bin/cloudflared
sudo chmod +x /usr/local/bin/cloudflared
# Create tunnels
cloudflared tunnel --url http://localhost:7770 # Shopping
cloudflared tunnel --url http://localhost:7780 # Admin
cloudflared tunnel --url http://localhost:9999 # Reddit
```
#### ngrok (Quick Sharing)
```bash
# Install ngrok
wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz
tar xvzf ngrok-v3-stable-linux-amd64.tgz
sudo mv ngrok /usr/local/bin
# Create tunnel (choose port)
ngrok http 7770 # For Shopping
```
## 2. Running Playwright Tasks
1. Configure environment variables: make sure the following service credentials are added in `.mcp_env`.
```env
PLAYWRIGHT_BROWSER="chromium" # default to chromium, you can also choose firefox
PLAYWRIGHT_HEADLESS="True"
```
2. For single task or task group, run
```bash
python -m pipeline --exp-name EXPNAME --mcp MCP --tasks PLAYWRIGHTTASK --models MODEL --k K
```
Here *EXPNAME* refers to the customized experiment name, *MCP* refers to `playwright` or `playwright_webarena` depending on the task, *PLAYWRIGHTTASK* refers to the task or task group selected (see [Task Page](../datasets/task.md) for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for supported models), and *K* refers to the number of independent runs.
## 3. Troubleshooting
### Container Issues
```bash
# Check status
docker ps -a | grep -E "shopping|forum"
# View logs
docker logs [container_name] --tail 50
# Restart container
docker restart [container_name]
```
### Access Problems
- **First load is slow** (1-2 minutes for Magento) - this is normal
- **Ensure ports are available**: `netstat -tlnp | grep -E "7770|7780|9999"`
- **Clear cache after URL changes**: Required for Magento environments
### Reset Environment
```bash
# Stop and remove container
docker stop [container_name]
docker rm [container_name]
# Re-deploy (follow steps in Section 1.2)
```
## 4. Important Notes
- **Service startup time**: Allow 2-3 minutes for Magento, 1-2 minutes for Reddit
- **Memory requirements**: Ensure Docker has at least 4GB RAM allocated per container
- **URL configuration**: Must reconfigure base URLs after container restart for external access
- **Port assignments**:
- 7770: Shopping
- 7780: Shopping Admin
- 9999: Reddit
================================================
FILE: docs/mcp/postgres.md
================================================
# PostgreSQL
This guide walks you through preparing your PostgreSQL environment for MCPMark evaluation.
## 1. Setup PostgreSQL Environment
### 1.1 Start PostgreSQL with Docker
1. **Run PostgreSQL Container**
Start a PostgreSQL instance using Docker:
```bash
docker run -d \
--name mcpmark-postgres \
-e POSTGRES_PASSWORD=password \
-e POSTGRES_USER=postgres \
-p 5432:5432 \
pgvector/pgvector:0.8.0-pg17-bookworm
```
2. **Verify Container is Running**
```bash
docker ps | grep mcpmark-postgres
```
---
### 1.2 Import Sample Databases
1. **Download Database Backups**
Download the backup files and place them in `./postgres_state/` directory:
```bash
mkdir -p ./postgres_state
cd ./postgres_state
# Download all database backups
wget https://storage.mcpmark.ai/postgres/employees.backup
wget https://storage.mcpmark.ai/postgres/chinook.backup
wget https://storage.mcpmark.ai/postgres/dvdrental.backup
wget https://storage.mcpmark.ai/postgres/sports.backup
wget https://storage.mcpmark.ai/postgres/lego.backup
cd ..
```
2. **Create Databases and Restore from Backups**
> Make sure your Postgres client version matches the server's version (e.g., pg17).
```bash
# Set the password environment variable
export PGPASSWORD=password
# Create and restore each database
createdb -h localhost -U postgres employees
pg_restore -h localhost -U postgres -d employees -v ./postgres_state/employees.backup
createdb -h localhost -U postgres chinook
pg_restore -h localhost -U postgres -d chinook -v ./postgres_state/chinook.backup
createdb -h localhost -U postgres dvdrental
pg_restore -h localhost -U postgres -d dvdrental -v ./postgres_state/dvdrental.backup
createdb -h localhost -U postgres sports
pg_restore -h localhost -U postgres -d sports -v ./postgres_state/sports.backup
createdb -h localhost -U postgres lego
pg_restore -h localhost -U postgres -d lego -v ./postgres_state/lego.backup
```
3. **Verify Databases are Imported**
```bash
# List all databases
PGPASSWORD=password psql -h localhost -U postgres -c "\l"
```
---
## 2. Configure Environment Variables
Configure environment variables: make sure the following service credentials are added in `.mcp_env`:
```env
## PostgreSQL Configuration
POSTGRES_HOST="localhost"
POSTGRES_PORT="5432"
POSTGRES_USERNAME="postgres"
POSTGRES_PASSWORD="password"
```
## 3. Verify Connection
Verify the PostgreSQL setup is working correctly:
```bash
# Test connection using psql
PGPASSWORD=password psql -h localhost -U postgres -c "SELECT version();"
```
## 4. Common Operations
### Stop PostgreSQL Container
```bash
docker stop mcpmark-postgres
```
### Start PostgreSQL Container
```bash
docker start mcpmark-postgres
```
### Remove PostgreSQL Container (Clean Setup)
```bash
docker stop mcpmark-postgres
docker rm mcpmark-postgres
```
### Access PostgreSQL Shell
```bash
PGPASSWORD=password psql -h localhost -U postgres
```
## 5. Running Postgres Experiment
For single task or task group, run
```bash
python -m pipeline --exp-name EXPNAME --mcp postgres --tasks POSTGRESTASK --models MODEL --k K
```
Here *EXPNAME* refers to the customized experiment name, *POSTGRESTASK* refers to the Postgres task or task group selected (see `tasks/` for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for supported models), and *K* refers to the number of independent runs.
## 6. Troubleshooting
### Port Already in Use
If port 5432 is already in use, you can use a different port:
```bash
docker run -d \
--name mcpmark-postgres \
-e POSTGRES_PASSWORD=password \
-e POSTGRES_USER=postgres \
-p 5433:5432 \
pgvector/pgvector:0.8.0-pg17-bookworm
```
Remember to update `POSTGRES_PORT="5433"` in your `.mcp_env` file.
### Connection Refused
Ensure the Docker container is running and the port mapping is correct:
```bash
docker ps
docker logs mcpmark-postgres
```
================================================
FILE: docs/quickstart.md
================================================
# Quick Start
To quickly experience MCPMark, we recommend first preparing the environment and then running a sample filesystem task.
### 1. Clone MCPMark
```bash
git clone https://github.com/eval-sys/mcpmark.git
cd mcpmark
```
### 2. Setup Environment Variables
To set up model access via environment variables, edit the `.mcp_env` file in the `mcpmark/` root.
```env
# Model Providers (set only those you need)
## Google Gemini
GEMINI_BASE_URL="https://your-gemini-base-url.com/v1"
GEMINI_API_KEY="your-gemini-api-key"
## DeepSeek
DEEPSEEK_BASE_URL="https://your-deepseek-base-url.com/v1"
DEEPSEEK_API_KEY="your-deepseek-api-key"
## OpenAI
OPENAI_BASE_URL="https://your-openai-base-url.com/v1"
OPENAI_API_KEY="your-openai-api-key"
## Anthropic
ANTHROPIC_BASE_URL="https://your-anthropic-base-url.com/v1"
ANTHROPIC_API_KEY="your-anthropic-api-key"
## Moonshot
MOONSHOT_BASE_URL="https://your-moonshot-base-url.com/v1"
MOONSHOT_API_KEY="your-moonshot-api-key"
## xAI
XAI_BASE_URL="https://your-xai-base-url.com/v1"
XAI_API_KEY="your-xai-api-key"
```
### 3. Run Quick Example in MCPMark
Suppose you want to evaluate `gemini-2.5-flash` on the `size_classification` task in `file_property` (which categorizes files by their sizes) and name your experiment `test-run-1`. You can use the following command:
```bash
python -m pipeline \
  --exp-name test-run-1 \
  --mcp filesystem \
  --tasks file_property/size_classification \
  --models gemini-2.5-flash
```
Here is the expected output (verification may fail depending on the model you choose).
[](https://postimg.cc/Yj8nPZkQ)
The results are saved under `results/{exp_name}/{mcp}_{model}/{tasks}`. If `exp-name` is not specified, the default name is the timestamp of the experiment (specifying `exp-name` is useful for resuming experiments).
For other MCP services, please refer to the [Installation and Docker Usage Page](./installation_and_docker_usage.md) for detailed instructions.
================================================
FILE: pipeline.py
================================================
#!/usr/bin/env python3
"""
MCPMark Unified Evaluation Pipeline
===================================
This script provides an automated evaluation pipeline for testing Large Language Models (LLMs)
on various Model Context Protocol (MCP) services like Notion, GitHub, and PostgreSQL.
"""
import argparse
import sys
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
from src.logger import get_logger
from src.evaluator import MCPEvaluator
from src.agents import AGENT_REGISTRY
from src.factory import MCPServiceFactory
from src.model_config import ModelConfig
# Suppress httpcore/anyio cleanup exceptions that don't affect functionality.
# These "Exception ignored" messages are caused by MCP library's streamablehttp_client
# timing issues during cleanup, but don't impact actual task execution.
def _suppress_cleanup_exceptions(unraisable):
"""Suppress known cleanup exceptions from httpcore/anyio."""
msg = str(unraisable.exc_value)
if any(
pattern in msg
for pattern in [
"async generator ignored GeneratorExit",
"cancel scope in a different task",
"no running event loop",
]
):
return # Silently ignore
# Use default handler for other exceptions
sys.__unraisablehook__(unraisable)
sys.unraisablehook = _suppress_cleanup_exceptions
# Initialize logger
logger = get_logger(__name__)
def main():
"""Main entry point for the evaluation pipeline."""
parser = argparse.ArgumentParser(description="MCPMark Unified Evaluation Pipeline.")
supported_mcp_services = MCPServiceFactory.get_supported_mcp_services()
supported_models = ModelConfig.get_supported_models()
# Main configuration
parser.add_argument(
"--mcp",
default="filesystem",
choices=supported_mcp_services,
help="MCP service to use (default: filesystem)",
)
parser.add_argument(
"--models",
required=True,
help="Comma-separated list of models to evaluate (e.g., 'o3,k2,gpt-4.1')",
)
parser.add_argument(
"--agent",
default="mcpmark",
choices=sorted(AGENT_REGISTRY.keys()),
help="Agent implementation to use (default: mcpmark)",
)
parser.add_argument(
"--tasks",
default="all",
help='Tasks to run: (1). "all"; (2). "category"; or (3). "category/task".',
)
parser.add_argument(
"--task-suite",
default="standard",
choices=["standard", "easy"],
help="Task suite to run (default: standard). Use 'easy' to run the lightweight dataset.",
)
parser.add_argument(
"--exp-name",
default=None,
help="Experiment name; results are saved under results// (default: YYYY-MM-DD-HH-MM-SS)",
)
parser.add_argument(
"--k",
type=int,
default=4,
help="Number of evaluation runs (default: 1)",
)
# Execution configuration
parser.add_argument(
"--timeout",
type=int,
default=3600,
help="Timeout in seconds for agent execution",
)
parser.add_argument(
"--compaction-token",
type=int,
default=999_999_999,
help=(
"Auto-compact conversation when prompt tokens (from API usage) reach this limit. "
"Use 999999999 to disable compaction."
),
)
parser.add_argument(
"--reasoning-effort",
default="default",
choices=["default", "minimal", "low", "medium", "high"],
help="Reasoning effort level for supported models (default: None)",
)
# Output configuration
parser.add_argument(
"--output-dir",
type=Path,
default=Path("./results"),
help="Directory to save results",
)
# Load arguments and environment variables
args = parser.parse_args()
load_dotenv(dotenv_path=".mcp_env", override=False)
# Validate k parameter and exp-name requirement
if args.k > 1 and args.exp_name is None:
parser.error("--exp-name is required when k > 1")
# Generate default exp-name if not provided
if args.exp_name is None:
args.exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
# Parse models (no validation - allow unsupported models)
model_list = [m.strip() for m in args.models.split(",") if m.strip()]
if not model_list:
parser.error("No valid models provided")
# Log warning for unsupported models but don't error
unsupported_models = [m for m in model_list if m not in supported_models]
if unsupported_models:
logger.warning(
f"Using unsupported models: {', '.join(unsupported_models)}. Will use OPENAI_BASE_URL and OPENAI_API_KEY from environment."
)
logger.info("MCPMark Evaluation")
logger.info(
f"Experiment: {args.exp_name} | {len(model_list)} Model(s): {', '.join(model_list)}"
)
logger.info(f"Task suite: {args.task_suite}")
if args.k > 1:
logger.info(f"Running {args.k} evaluation runs for pass@k metrics")
# Run k evaluation runs
for run_idx in range(1, args.k + 1):
if args.k > 1:
logger.info(f"\n{'=' * 80}")
logger.info(f"Starting Run {run_idx}/{args.k}")
logger.info(f"{'=' * 80}\n")
# For k-runs, results/{exp}/{mcp}__{model}/run-N
run_exp_name = f"run-{run_idx}"
run_output_dir = args.output_dir / args.exp_name
else:
# For single run, still use run-1 under service_model
run_exp_name = "run-1"
run_output_dir = args.output_dir / args.exp_name
# Run evaluation for each model
for i, model in enumerate(model_list, 1):
logger.info(f"\n{'=' * 60}")
if args.k > 1:
logger.info(
f"Run {run_idx}/{args.k} | Model {i}/{len(model_list)}: {model}"
)
else:
logger.info(f"Starting evaluation {i}/{len(model_list)}: {model}")
logger.info(f"{'=' * 60}\n")
# Initialize and run the evaluation pipeline for this model
pipeline = MCPEvaluator(
mcp_service=args.mcp,
model=model,
timeout=args.timeout,
exp_name=run_exp_name,
output_dir=run_output_dir,
reasoning_effort=args.reasoning_effort,
agent_name=args.agent,
task_suite=args.task_suite,
compaction_token=args.compaction_token,
)
pipeline.run_evaluation(args.tasks)
logger.info(f"📁 Results: {pipeline.base_experiment_dir}")
logger.info(f"\n{'=' * 60}")
if args.k > 1:
logger.info(f"✓ All {args.k} runs completed for {len(model_list)} model(s)")
logger.info(
f"Run `python -m src.aggregators.aggregate_results --exp-name {args.exp_name}` to compute all metrics"
)
else:
logger.info(f"✓ All evaluations completed for {len(model_list)} model(s)")
logger.info(f"{'=' * 60}")
if __name__ == "__main__":
main()
================================================
FILE: pyproject.toml
================================================
[project]
authors = []
name = "MCPMark"
requires-python = ">= 3.11"
version = "0.0.1"
dependencies = [
"notion-client==2.4.0",
"playwright>=1.43.0",
"seaborn>=0.12.0",
"matplotlib>=3.7.0",
"numpy>=1.23.0",
"openai-agents>=0.2.3,<0.3",
"openai>=1.96.1",
"python-dotenv>=1.1.1,<2",
"ruff>=0.12.4,<0.13",
"psycopg2-binary>=2.9.10,<3",
"pyyaml>=6.0.2,<7",
"nest-asyncio>=1.6.0,<2",
"pixi",
"pipx>=1.7.1,<2",
"pgdumplib>=3.1.0,<4",
"litellm==1.80.0"
]
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]
[tool.pixi.workspace]
channels = ["conda-forge"]
platforms = [
"osx-arm64",
"linux-aarch64",
"linux-64",
"win-64",
"osx-64",
]
[tool.pixi.tasks]
fmt = "ruff"
[tool.ruff.format]
indent-style = "space"
line-ending = "auto"
[tool.hatch.build.targets.wheel]
packages = ["src", "tasks"]
================================================
FILE: run-benchmark.sh
================================================
#!/bin/bash
# MCPMark Full Benchmark Runner
# Runs all tasks across all MCP services for comprehensive model evaluation
set -e
# Default values
MODELS=""
EXP_NAME=""
USE_DOCKER=false
SERVICES="filesystem,notion,github,postgres,playwright"
PARALLEL=false
TIMEOUT=3600
K=4
# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Function to print colored output
print_status() {
echo -e "${BLUE}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $1"
}
print_success() {
echo -e "${GREEN}✓${NC} $1"
}
print_warning() {
echo -e "${YELLOW}⚠${NC} $1"
}
print_error() {
echo -e "${RED}✗${NC} $1"
}
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--models)
MODELS="$2"
shift 2
;;
--exp-name)
EXP_NAME="$2"
shift 2
;;
--docker)
USE_DOCKER=true
shift
;;
--mcps)
SERVICES="$2"
shift 2
;;
--parallel)
PARALLEL=true
shift
;;
--timeout)
TIMEOUT="$2"
shift 2
;;
--k)
K="$2"
shift 2
;;
--help)
cat << EOF
Usage: $0 --models MODELS --exp-name NAME [OPTIONS]
Run comprehensive benchmark across all MCP services.
Required Options:
--models MODELS Comma-separated list of models to evaluate
(e.g., "o3,gpt-4.1,claude-4-sonnet")
--exp-name NAME Experiment name for organizing results
Options:
--docker Run tasks in Docker containers (recommended)
--mcps SERVICES Comma-separated list of services to test
Default: filesystem,notion,github,postgres,playwright
--parallel Run services in parallel (experimental)
--timeout SECONDS Timeout per task in seconds (default: 3600)
--k RUNS Repeat runs per service for pass@k (default: 4)
Examples:
# Run all services with Docker
$0 --models o3,gpt-4.1 --exp-name benchmark-1 --docker
# Run specific services locally
$0 --models o3 --exp-name test-1 --mcps filesystem,postgres
# Run with parallel execution
$0 --models claude-4 --exp-name parallel-test --docker --parallel
EOF
exit 0
;;
*)
print_error "Unknown option: $1"
echo "Use --help for usage information"
exit 1
;;
esac
done
# Validate required arguments
if [ -z "$MODELS" ]; then
print_error "Error: --models is required"
exit 1
fi
if [ -z "$EXP_NAME" ]; then
print_error "Error: --exp-name is required"
exit 1
fi
# Check prerequisites
if [ "$USE_DOCKER" = true ]; then
if ! command -v docker &> /dev/null; then
print_error "Docker is not installed"
exit 1
fi
# Always use Docker Hub image
DOCKER_IMAGE="evalsysorg/mcpmark:latest"
# Check if Docker image exists locally, pull only if not found
if ! docker image inspect "$DOCKER_IMAGE" >/dev/null 2>&1; then
print_status "Docker image not found locally, pulling from Docker Hub..."
docker pull "$DOCKER_IMAGE" || {
print_error "Failed to pull Docker image from Docker Hub"
exit 1
}
else
print_status "Using local Docker image: $DOCKER_IMAGE"
fi
else
# Check Python installation
if ! command -v python3 &> /dev/null; then
print_error "Python 3 is not installed"
exit 1
fi
# Check if dependencies are installed
if ! python3 -c "import src.evaluator" 2>/dev/null; then
print_warning "Python dependencies not installed"
echo "Installing dependencies..."
pip install -e . || {
print_error "Failed to install dependencies"
exit 1
}
fi
fi
# Check .mcp_env file
if [ ! -f .mcp_env ]; then
print_warning ".mcp_env file not found. Some tasks may fail without API credentials."
echo "Create one from .mcp_env.example: cp .mcp_env.example .mcp_env"
fi
# Convert comma-separated services to array
IFS=',' read -ra SERVICE_ARRAY <<< "$SERVICES"
# Summary
echo ""
print_status "MCPMark Benchmark Configuration"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Models: $MODELS"
echo "Experiment: $EXP_NAME"
echo "Services: ${SERVICE_ARRAY[*]}"
echo "Docker: $USE_DOCKER"
echo "Parallel: $PARALLEL"
echo "Timeout: ${TIMEOUT}s per task"
echo "K-Runs: $K"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
# Create results directory
RESULTS_DIR="./results/${EXP_NAME}"
mkdir -p "$RESULTS_DIR"
# Log file for this run with timestamp and models
TIMESTAMP=$(date '+%Y%m%d_%H%M%S')
LOG_FILE="${RESULTS_DIR}/benchmark_${TIMESTAMP}.log"
echo "Benchmark started at $(date '+%Y-%m-%d %H:%M:%S')" > "$LOG_FILE"
echo "Models: $MODELS" >> "$LOG_FILE"
echo "Services: ${SERVICE_ARRAY[*]}" >> "$LOG_FILE"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" >> "$LOG_FILE"
# Function to run a single service
run_service() {
local service=$1
local start_time=$(date +%s)
local start_time_formatted=$(date '+%Y-%m-%d %H:%M:%S')
print_status "[$start_time_formatted] Starting $service tasks..."
if [ "$USE_DOCKER" = true ]; then
# Run with Docker
./run-task.sh --mcp "$service" \
--models "$MODELS" \
--exp-name "$EXP_NAME" \
--tasks all \
--timeout "$TIMEOUT" \
--k "$K" 2>&1 | tee -a "$LOG_FILE"
else
# Run locally
python3 -m pipeline \
--mcp "$service" \
--models "$MODELS" \
--exp-name "$EXP_NAME" \
--tasks all \
--timeout "$TIMEOUT" \
--k "$K" 2>&1 | tee -a "$LOG_FILE"
fi
local exit_code=$?
local end_time=$(date +%s)
local duration=$((end_time - start_time))
if [ $exit_code -eq 0 ]; then
print_success "$service completed in ${duration}s"
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $service: SUCCESS (${duration}s)" >> "${RESULTS_DIR}/summary.txt"
else
print_error "$service failed with exit code $exit_code"
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $service: FAILED (exit code $exit_code)" >> "${RESULTS_DIR}/summary.txt"
fi
return $exit_code
}
# Track overall results
TOTAL_SERVICES=${#SERVICE_ARRAY[@]}
COMPLETED_SERVICES=0
FAILED_SERVICES=0
# Main execution
BENCHMARK_START=$(date +%s)
if [ "$PARALLEL" = true ]; then
print_status "Running services in parallel..."
# Run all services in background
for service in "${SERVICE_ARRAY[@]}"; do
(
run_service "$service"
) &
pids+=($!)
done
# Wait for all background jobs and collect exit codes
for pid in "${pids[@]}"; do
# Use the assignment form so an increment from 0 does not trip `set -e`
if wait "$pid"; then
COMPLETED_SERVICES=$((COMPLETED_SERVICES + 1))
else
FAILED_SERVICES=$((FAILED_SERVICES + 1))
fi
done
else
print_status "Running services sequentially..."
for service in "${SERVICE_ARRAY[@]}"; do
if run_service "$service"; then
COMPLETED_SERVICES=$((COMPLETED_SERVICES + 1))
else
FAILED_SERVICES=$((FAILED_SERVICES + 1))
print_warning "Continuing despite failure in $service"
fi
done
fi
BENCHMARK_END=$(date +%s)
TOTAL_DURATION=$((BENCHMARK_END - BENCHMARK_START))
# Generate final summary
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
print_status "Benchmark Summary"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Completed at: $(date '+%Y-%m-%d %H:%M:%S')"
echo "Total Services: $TOTAL_SERVICES"
echo "Completed: $COMPLETED_SERVICES"
echo "Failed: $FAILED_SERVICES"
echo "Total Duration: ${TOTAL_DURATION}s ($(($TOTAL_DURATION / 60))m $(($TOTAL_DURATION % 60))s)"
echo "Results saved to: $RESULTS_DIR"
echo "Log file: $LOG_FILE"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
# Final status
if [ $FAILED_SERVICES -eq 0 ]; then
print_success "Benchmark completed successfully!"
exit 0
else
print_warning "Benchmark completed with $FAILED_SERVICES failed service(s)"
exit 1
fi
================================================
FILE: run-task.sh
================================================
#!/bin/bash
# MCPMark Task Runner
# Enable strict error handling
set -euo pipefail
# Default values
SERVICE="filesystem"
NETWORK_NAME="mcp-network"
POSTGRES_CONTAINER="mcp-postgres"
# Resource limits (can be overridden by environment variables)
DOCKER_MEMORY_LIMIT="${DOCKER_MEMORY_LIMIT:-4g}"
DOCKER_CPU_LIMIT="${DOCKER_CPU_LIMIT:-2}"
# Cleanup function
cleanup() {
if [ "${SERVICE:-}" = "postgres" ]; then
if docker ps --format '{{.Names}}' | grep -q "^${POSTGRES_CONTAINER}$"; then
echo "Cleaning up PostgreSQL container..."
docker stop "$POSTGRES_CONTAINER" >/dev/null 2>&1 || true
docker rm "$POSTGRES_CONTAINER" >/dev/null 2>&1 || true
fi
fi
}
# Set up cleanup on exit
trap cleanup EXIT
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--mcp) SERVICE="$2"; shift 2 ;;
--help)
cat << EOF
Usage: $0 [--mcp SERVICE] [PIPELINE_ARGS]
Run MCPMark tasks in Docker containers.
Options:
--mcp SERVICE MCP service (notion|github|filesystem|playwright|postgres|insforge)
Default: filesystem
Environment Variables:
DOCKER_MEMORY_LIMIT Memory limit for container (default: 4g)
DOCKER_CPU_LIMIT CPU limit for container (default: 2)
DOCKER_IMAGE_VERSION Docker image tag to use (default: latest)
All other arguments are passed directly to the pipeline.
Examples:
$0 --mcp notion --models o3 --exp-name test-1 --tasks all
$0 --mcp postgres --models gpt-4 --exp-name pg-test --tasks basic_queries
EOF
exit 0
;;
*) break ;; # Stop parsing, rest goes to pipeline
esac
done
# Docker image tag can be overridden by environment variable
DOCKER_IMAGE_REPO="evalsysorg/mcpmark"
DOCKER_IMAGE_VERSION="${DOCKER_IMAGE_VERSION:-latest}"
DOCKER_IMAGE="${DOCKER_IMAGE_REPO}:${DOCKER_IMAGE_VERSION}"
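# e.g. pin a specific image build for a run (tag is illustrative):
#   DOCKER_IMAGE_VERSION=v0.2 ./run-task.sh --mcp filesystem --models o3 --exp-name test-1 --tasks all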
# Check if Docker image exists locally, pull only if not found
if ! docker image inspect "$DOCKER_IMAGE" >/dev/null 2>&1; then
echo "Docker image not found locally, pulling from Docker Hub..."
docker pull "$DOCKER_IMAGE" || {
echo "Error: Failed to pull Docker image from Docker Hub"
echo "Please check your internet connection or Docker Hub access"
exit 1
}
else
echo "Using local Docker image: $DOCKER_IMAGE"
fi
# Check if .mcp_env exists (warn but don't fail)
if [ ! -f .mcp_env ]; then
echo "Warning: .mcp_env file not found. Some tasks may fail without API credentials."
fi
# Create network if doesn't exist
if ! docker network ls --format '{{.Name}}' | grep -q "^${NETWORK_NAME}$"; then
echo "Creating Docker network: $NETWORK_NAME"
docker network create "$NETWORK_NAME" || {
echo "Error: Failed to create Docker network"
exit 1
}
fi
# Service-specific configurations
if [ "$SERVICE" = "postgres" ]; then
# For postgres service, ensure PostgreSQL container is running
if ! docker ps --format '{{.Names}}' | grep -q "^${POSTGRES_CONTAINER}$"; then
echo "Starting PostgreSQL container..."
docker run -d \
--name "$POSTGRES_CONTAINER" \
--network "$NETWORK_NAME" \
-e POSTGRES_DATABASE=postgres \
-e POSTGRES_USER=postgres \
-e POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-password}" \
pgvector/pgvector:0.8.0-pg17-bookworm
echo "Waiting for PostgreSQL to be ready..."
for i in {1..10}; do
if docker exec "$POSTGRES_CONTAINER" pg_isready -U postgres >/dev/null 2>&1; then
echo "PostgreSQL is ready!"
break
fi
sleep 1
done
else
echo "PostgreSQL container already running"
fi
# Run task with network connection to postgres
docker run --rm \
--memory="$DOCKER_MEMORY_LIMIT" \
--cpus="$DOCKER_CPU_LIMIT" \
--network "$NETWORK_NAME" \
-e POSTGRES_HOST="$POSTGRES_CONTAINER" \
-e POSTGRES_PORT=5432 \
-e POSTGRES_USERNAME=postgres \
-e POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-password}" \
-e POSTGRES_DATABASE=postgres \
-v "$(pwd)/results:/app/results" \
-v "$(pwd)/postgres_state:/app/postgres_state" \
$([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \
"$DOCKER_IMAGE" \
python3 -m pipeline --mcp "$SERVICE" --k 1 "$@"
elif [ "$SERVICE" = "filesystem" ]; then
# For filesystem service, mount test_environments
docker run --rm \
--memory="$DOCKER_MEMORY_LIMIT" \
--cpus="$DOCKER_CPU_LIMIT" \
-v "$(pwd)/results:/app/results" \
-v "$(pwd)/test_environments:/app/test_environments" \
$([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \
"$DOCKER_IMAGE" \
python3 -m pipeline --mcp "$SERVICE" --k 1 "$@"
elif [ "$SERVICE" = "insforge" ]; then
# For Insforge service, use host network to access Insforge backend on host
docker run --rm \
--memory="$DOCKER_MEMORY_LIMIT" \
--cpus="$DOCKER_CPU_LIMIT" \
--add-host=host.docker.internal:host-gateway \
-v "$(pwd)/results:/app/results" \
$([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \
"$DOCKER_IMAGE" \
python3 -m pipeline --mcp "$SERVICE" --k 1 "$@"
else
# For other services (notion, github, playwright, etc.)
docker run --rm \
--memory="$DOCKER_MEMORY_LIMIT" \
--cpus="$DOCKER_CPU_LIMIT" \
-v "$(pwd)/results:/app/results" \
-v "$(pwd)/test_environments:/app/test_environments" \
$([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \
$([ -f notion_state.json ] && echo "-v $(pwd)/notion_state.json:/app/notion_state.json") \
"$DOCKER_IMAGE" \
python3 -m pipeline --mcp "$SERVICE" --k 1 "$@"
fi
echo "Task completed!"
================================================
FILE: src/agents/__init__.py
================================================
"""
MCPMark Agent Module
====================
Provides agent implementations and registry for MCPMark.
"""
from .base_agent import BaseMCPAgent
from .mcpmark_agent import MCPMarkAgent
from .react_agent import ReActAgent
AGENT_REGISTRY = {
"mcpmark": MCPMarkAgent,
"react": ReActAgent,
}
__all__ = ["BaseMCPAgent", "MCPMarkAgent", "ReActAgent", "AGENT_REGISTRY"]
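# Registry lookup sketch (constructor values below are illustrative, not defaults):
#   agent_cls = AGENT_REGISTRY["mcpmark"]
#   agent = agent_cls(
#       litellm_input_model_name="gpt-4.1",
#       api_key="sk-...",
#       base_url="",
#       mcp_service="filesystem",
#   )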
================================================
FILE: src/agents/base_agent.py
================================================
"""Shared base agent functionality for MCPMark agents."""
from __future__ import annotations
import asyncio
import copy
import json
import uuid
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Callable
from src.logger import get_logger
from .mcp import MCPStdioServer, MCPHttpServer
from .utils import TokenUsageTracker
logger = get_logger(__name__)
class BaseMCPAgent(ABC):
"""Base class with shared functionality for MCPMark agents."""
STDIO_SERVICES = [
"notion",
"filesystem",
"playwright",
"playwright_webarena",
"postgres",
"insforge",
"github",
]
HTTP_SERVICES = ["supabase"]
DEFAULT_TIMEOUT = 600
COMPACTION_DISABLED_TOKEN = 999_999_999
CLAUDE_THINKING_BUDGETS = {
"low": 1024,
"medium": 2048,
"high": 4096,
}
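# e.g. a Claude model run with reasoning_effort="medium" gets a 2048-token thinking budget;
# "default" keeps extended thinking disabled entirely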
def __init__(
self,
litellm_input_model_name: str,
api_key: str,
base_url: str,
mcp_service: str,
timeout: int = DEFAULT_TIMEOUT,
service_config: Optional[Dict[str, Any]] = None,
service_config_provider: Optional[Callable[[], Dict[str, Any]]] = None,
reasoning_effort: Optional[str] = "default",
compaction_token: int = COMPACTION_DISABLED_TOKEN,
):
self.litellm_input_model_name = litellm_input_model_name
self.api_key = api_key
self.base_url = base_url
self.mcp_service = mcp_service
self.timeout = timeout
self.service_config = service_config or {}
self._service_config_provider = service_config_provider
self.reasoning_effort = reasoning_effort or "default"
self.compaction_token = int(compaction_token)
self.is_claude = self._is_anthropic_model(litellm_input_model_name)
self.use_claude_thinking = self.is_claude and self.reasoning_effort != "default"
self.usage_tracker = TokenUsageTracker()
self.litellm_run_model_name = None
self._partial_messages: List[Dict[str, Any]] = []
self._partial_token_usage: Dict[str, int] = {}
self._partial_turn_count: int = 0
logger.debug(
"Initialized %s for service '%s' with model '%s'",
self.__class__.__name__,
self.mcp_service,
self.litellm_input_model_name,
)
# Warn if Gemini 3 model uses unsupported reasoning_effort value
if self._is_gemini_3_model() and self.reasoning_effort not in [
"default",
"low",
"high",
]:
logger.warning(
"Gemini 3 models only support reasoning_effort 'low' or 'high', "
"got '%s'. LiteLLM may map this to the nearest supported value.",
self.reasoning_effort,
)
def __repr__(self) -> str: # pragma: no cover - debug helper
return (
f"{self.__class__.__name__}(service='{self.mcp_service}', "
f"model='{self.litellm_input_model_name}')"
)
@abstractmethod
async def execute(
self,
instruction: str,
tool_call_log_file: Optional[str] = None,
) -> Dict[str, Any]:
"""Execute the agent logic and return execution metadata."""
def execute_sync(
self,
instruction: str,
tool_call_log_file: Optional[str] = None,
) -> Dict[str, Any]:
"""Synchronous wrapper for async execution."""
return asyncio.run(self.execute(instruction, tool_call_log_file))
def get_usage_stats(self) -> Dict[str, Any]:
"""Return aggregated usage statistics."""
return self.usage_tracker.get_stats()
def reset_usage_stats(self):
"""Clear usage statistics."""
self.usage_tracker.reset()
# ------------------------------------------------------------------
# Shared helpers
# ------------------------------------------------------------------
def _is_anthropic_model(self, model_name: str) -> bool:
return "claude" in model_name.lower()
def _get_claude_thinking_budget(self) -> Optional[int]:
if not self.use_claude_thinking:
return None
return self.CLAUDE_THINKING_BUDGETS.get(self.reasoning_effort, 2048)
def _refresh_service_config(self):
if not self._service_config_provider:
return
try:
latest_cfg = self._service_config_provider() or {}
self.service_config.update(latest_cfg)
except Exception as exc: # pragma: no cover - best effort refresh
logger.warning("Failed to refresh service config: %s", exc)
def _reset_progress(self):
self._partial_messages = []
self._partial_token_usage = {}
self._partial_turn_count = 0
def _update_progress(
self,
messages: List[Dict[str, Any]],
token_usage: Dict[str, Any],
turn_count: int,
):
try:
self._partial_messages = copy.deepcopy(messages)
self._partial_token_usage = dict(token_usage or {})
self._partial_turn_count = int(turn_count or 0)
except Exception: # pragma: no cover - defensive copy
pass
# ------------------------------------------------------------------
# MCP server management
# ------------------------------------------------------------------
async def _create_mcp_server(self) -> Any:
if self.mcp_service in self.STDIO_SERVICES:
return self._create_stdio_server()
if self.mcp_service in self.HTTP_SERVICES:
return self._create_http_server()
raise ValueError(f"Unsupported MCP service: {self.mcp_service}")
def _create_stdio_server(self) -> MCPStdioServer:
if self.mcp_service == "notion":
notion_key = self.service_config.get("notion_key")
if not notion_key:
raise ValueError("Notion API key required")
return MCPStdioServer(
command="npx",
args=["-y", "@notionhq/notion-mcp-server"],
env={
"OPENAPI_MCP_HEADERS": (
'{"Authorization": "Bearer ' + notion_key + '", '
'"Notion-Version": "2022-06-28"}'
)
},
)
if self.mcp_service == "filesystem":
test_directory = self.service_config.get("test_directory")
if not test_directory:
raise ValueError("Test directory required for filesystem service")
return MCPStdioServer(
command="npx",
args=[
"-y",
"@modelcontextprotocol/server-filesystem",
str(test_directory),
],
)
if self.mcp_service in ("playwright", "playwright_webarena"):
browser = self.service_config.get("browser", "chromium")
headless = self.service_config.get("headless", True)
viewport_width = self.service_config.get("viewport_width", 1280)
viewport_height = self.service_config.get("viewport_height", 720)
args = ["-y", "@playwright/mcp@latest"]
if headless:
args.append("--headless")
args.extend(
[
"--isolated",
"--no-sandbox",
"--browser",
browser,
"--viewport-size",
f"{viewport_width},{viewport_height}",
]
)
return MCPStdioServer(command="npx", args=args)
if self.mcp_service == "postgres":
host = self.service_config.get("host", "localhost")
port = self.service_config.get("port", 5432)
username = self.service_config.get("username")
password = self.service_config.get("password")
database = self.service_config.get(
"current_database"
) or self.service_config.get("database")
if not all([username, password, database]):
raise ValueError("PostgreSQL requires username, password, and database")
database_url = (
f"postgresql://{username}:{password}@{host}:{port}/{database}"
)
return MCPStdioServer(
command="pipx",
args=["run", "postgres-mcp", "--access-mode=unrestricted"],
env={"DATABASE_URI": database_url},
)
if self.mcp_service == "insforge":
api_key = self.service_config.get("api_key")
backend_url = self.service_config.get("backend_url")
if not all([api_key, backend_url]):
raise ValueError("Insforge requires api_key and backend_url")
return MCPStdioServer(
command="npx",
args=["-y", "@insforge/mcp@dev"],
env={
"INSFORGE_API_KEY": api_key,
"INSFORGE_BACKEND_URL": backend_url,
},
)
raise ValueError(f"Unsupported stdio service: {self.mcp_service}")
def _create_http_server(self) -> MCPHttpServer:
if self.mcp_service == "github":
github_token = self.service_config.get("github_token")
if not github_token:
raise ValueError("GitHub token required")
return MCPHttpServer(
url="https://api.githubcopilot.com/mcp/",
headers={
"Authorization": f"Bearer {github_token}",
"User-Agent": "MCPMark/1.0",
},
)
raise ValueError(f"Unsupported HTTP service: {self.mcp_service}")
# ------------------------------------------------------------------
# Message/Tool formatting helpers
# ------------------------------------------------------------------
def _compaction_enabled(self) -> bool:
return 0 < self.compaction_token < self.COMPACTION_DISABLED_TOKEN
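# e.g. compaction_token=200_000 (illustrative) enables compaction once the running prompt
# reaches ~200k tokens; the 999_999_999 sentinel (the default) keeps it disabled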
def _count_prompt_tokens_litellm(self, messages: List[Dict[str, Any]]) -> int:
try:
from litellm import token_counter
return int(
token_counter(model=self.litellm_input_model_name, messages=messages)
or 0
)
except Exception: # pragma: no cover - best effort
return 0
def _convert_to_sdk_format(
self, messages: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Convert chat-completion style messages into the flat SDK item format
(message / function_call / function_call_output) used when logging results."""
sdk_format: List[Dict[str, Any]] = []
function_call_map: Dict[str, str] = {}
for msg in messages:
role = msg.get("role")
if role == "user":
user_content = msg.get("content", "")
if isinstance(user_content, list):
tool_results = [
item
for item in user_content
if isinstance(item, dict) and item.get("type") == "tool_result"
]
if tool_results:
for tr in tool_results:
content_items = tr.get("content", [])
text_content = ""
for ci in content_items:
if isinstance(ci, dict) and ci.get("type") == "text":
text_content = ci.get("text", "")
break
sdk_format.append(
{
"call_id": tr.get("tool_use_id", ""),
"output": json.dumps(
{
"type": "text",
"text": text_content,
"annotations": None,
"meta": None,
}
),
"type": "function_call_output",
}
)
else:
text_parts = []
for item in user_content:
if isinstance(item, dict) and item.get("type") == "text":
text_parts.append(item.get("text", ""))
sdk_format.append(
{"content": "\n".join(text_parts), "role": "user"}
)
else:
sdk_format.append({"content": user_content, "role": "user"})
elif role == "assistant":
tool_calls = msg.get("tool_calls", [])
function_call = msg.get("function_call")
content = msg.get("content")
if isinstance(content, list):
text_parts = []
claude_tool_uses = []
for block in content:
if isinstance(block, dict):
if block.get("type") == "text":
text_parts.append(block.get("text", ""))
elif block.get("type") == "thinking":
thinking_text = block.get("thinking", "")
if thinking_text:
text_parts.append(
f"\n{thinking_text}\n"
)
elif block.get("type") == "tool_use":
claude_tool_uses.append(block)
content = "\n".join(text_parts)
if claude_tool_uses and not tool_calls:
tool_calls = []
for tu in claude_tool_uses:
tool_calls.append(
{
"id": tu.get("id"),
"function": {
"name": tu.get("name"),
"arguments": json.dumps(tu.get("input", {})),
},
}
)
if content:
sdk_format.append(
{
"id": "__fake_id__",
"content": [
{
"annotations": [],
"text": content,
"type": "output_text",
}
],
"role": "assistant",
"status": "completed",
"type": "message",
}
)
if tool_calls:
for tool_call in tool_calls:
call_id = tool_call.get("id", f"call_{uuid.uuid4().hex}")
func_name = tool_call.get("function", {}).get("name", "")
sdk_format.append(
{
"arguments": tool_call.get("function", {}).get(
"arguments", "{}"
),
"call_id": call_id,
"name": func_name,
"type": "function_call",
"id": "__fake_id__",
}
)
if function_call:
func_name = function_call.get("name", "")
call_id = f"call_{uuid.uuid4().hex}"
function_call_map[func_name] = call_id
sdk_format.append(
{
"arguments": function_call.get("arguments", "{}"),
"call_id": call_id,
"name": func_name,
"type": "function_call",
"id": "__fake_id__",
}
)
elif role == "tool":
sdk_format.append(
{
"call_id": msg.get("tool_call_id", ""),
"output": json.dumps(
{
"type": "text",
"text": msg.get("content", ""),
"annotations": None,
"meta": None,
}
),
"type": "function_call_output",
}
)
elif role == "function":
func_name = msg.get("name", "")
call_id = function_call_map.get(func_name, f"call_{uuid.uuid4().hex}")
sdk_format.append(
{
"call_id": call_id,
"output": json.dumps(
{
"type": "text",
"text": msg.get("content", ""),
"annotations": None,
"meta": None,
}
),
"type": "function_call_output",
}
)
return sdk_format
def _convert_to_anthropic_format(
self, tools: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Convert MCP tool definitions into Anthropic tool-use format."""
anthropic_tools = []
for tool in tools:
anthropic_tool = {
"name": tool.get("name"),
"description": tool.get("description", ""),
"input_schema": tool.get(
"inputSchema",
{"type": "object", "properties": {}, "required": []},
),
}
anthropic_tools.append(anthropic_tool)
return anthropic_tools
def _is_gemini_model(self) -> bool:
model_lower = self.litellm_input_model_name.lower()
return "gemini" in model_lower or "bison" in model_lower
def _is_gemini_3_model(self) -> bool:
"""Check if this is a Gemini 3 series model."""
model_lower = self.litellm_input_model_name.lower()
return "gemini-3" in model_lower or "gemini/gemini-3" in model_lower
def _simplify_schema_for_gemini(
self, schema: Optional[Dict[str, Any]]
) -> Dict[str, Any]:
if not isinstance(schema, dict):
return schema or {}
simplified: Dict[str, Any] = {}
for key, value in schema.items():
if key == "type" and isinstance(value, list):
simplified[key] = value[0] if value else "string"
elif key == "items" and isinstance(value, dict):
simplified[key] = self._simplify_schema_for_gemini(value)
elif key == "properties" and isinstance(value, dict):
simplified[key] = {
prop_key: self._simplify_schema_for_gemini(prop_val)
for prop_key, prop_val in value.items()
}
elif isinstance(value, dict):
simplified[key] = self._simplify_schema_for_gemini(value)
elif isinstance(value, list) and key not in ("required", "enum"):
simplified[key] = [
self._simplify_schema_for_gemini(item)
if isinstance(item, dict)
else item
for item in value
]
else:
simplified[key] = value
return simplified
def _convert_to_openai_format(
self, tools: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Convert MCP tool definitions into OpenAI function-calling format,
simplifying schemas for Gemini models."""
functions = []
is_gemini = self._is_gemini_model()
if is_gemini:
logger.debug(
"Detected Gemini model '%s' – simplifying tool schemas",
self.litellm_input_model_name,
)
for tool in tools:
input_schema = tool.get(
"inputSchema", {"type": "object", "properties": {}, "required": []}
)
if is_gemini:
simplified = self._simplify_schema_for_gemini(input_schema)
if simplified != input_schema:
input_schema = simplified
logger.debug("Simplified schema for tool '%s'", tool.get("name"))
functions.append(
{
"name": tool.get("name"),
"description": tool.get("description", ""),
"parameters": input_schema,
}
)
if is_gemini:
logger.info("Converted %d tools for Gemini compatibility", len(functions))
return functions
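# Conversion sketch for a hypothetical MCP tool definition:
#   {"name": "read_file", "description": "Read a file", "inputSchema": {"type": "object", ...}}
# becomes an OpenAI-style function:
#   {"name": "read_file", "description": "Read a file", "parameters": {"type": "object", ...}}
# while _convert_to_anthropic_format carries the same schema under "input_schema" instead.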
================================================
FILE: src/agents/mcp/__init__.py
================================================
"""
MCP (Model Context Protocol) Components
========================================
Minimal MCP server implementations for MCPMark.
"""
from .stdio_server import MCPStdioServer
from .http_server import MCPHttpServer
__all__ = ["MCPStdioServer", "MCPHttpServer"]
================================================
FILE: src/agents/mcp/http_server.py
================================================
"""
Minimal MCP HTTP Server Implementation
=======================================
Provides HTTP-based MCP server communication for services like GitHub.
"""
import asyncio
from contextlib import AsyncExitStack
from typing import Any, Dict, List, Optional
from mcp import ClientSession
from mcp.client.streamable_http import streamablehttp_client
class MCPHttpServer:
"""
HTTP-based MCP client using the official MCP Python SDK
(Streamable HTTP transport).
"""
def __init__(
self,
url: str,
headers: Optional[Dict[str, str]] = None,
timeout: int = 30,
):
self.url = url.rstrip("/")
self.headers = headers or {}
self.timeout = timeout
self._stack: Optional[AsyncExitStack] = None
self.session: Optional[ClientSession] = None
self._tools_cache: Optional[List[Dict[str, Any]]] = None
async def __aenter__(self):
await self.start()
return self
async def __aexit__(self, exc_type, exc, tb):
await self.stop()
async def start(self):
"""Open Streamable HTTP transport and initialize MCP session."""
self._stack = AsyncExitStack()
read_stream, write_stream, _ = await self._stack.enter_async_context(
streamablehttp_client(self.url, headers=self.headers)
)
self.session = await self._stack.enter_async_context(ClientSession(read_stream, write_stream))
await asyncio.wait_for(self.session.initialize(), timeout=self.timeout)
async def stop(self):
"""Close the session/transport cleanly."""
if self._stack:
await self._stack.aclose()
self._stack = None
self.session = None
self._tools_cache = None
async def list_tools(self) -> List[Dict[str, Any]]:
"""Return tool definitions (cached)."""
if self._tools_cache is not None:
return self._tools_cache
if not self.session:
raise RuntimeError("MCP HTTP client not started")
resp = await asyncio.wait_for(self.session.list_tools(), timeout=self.timeout)
self._tools_cache = [t.model_dump() for t in resp.tools]
return self._tools_cache
async def call_tool(self, name: str, arguments: Dict[str, Any]) -> Any:
"""Invoke a remote tool and return the structured result."""
if not self.session:
raise RuntimeError("MCP HTTP client not started")
result = await asyncio.wait_for(self.session.call_tool(name, arguments), timeout=self.timeout)
return result.model_dump()
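# Usage sketch (URL, header, and tool name are illustrative):
#   async with MCPHttpServer("https://example.com/mcp", headers={"Authorization": "Bearer ..."}) as server:
#       tools = await server.list_tools()
#       result = await server.call_tool("search_issues", {"query": "is:open"})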
================================================
FILE: src/agents/mcp/stdio_server.py
================================================
"""
Minimal MCP Stdio Server Implementation
========================================
Provides stdio-based MCP server communication for services like
Notion, Filesystem, Playwright, and Postgres.
"""
import asyncio
import os
from contextlib import AsyncExitStack
from typing import Any, Dict, List, Optional
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
class MCPStdioServer:
"""Lightweight wrapper around the official MCP Python SDK."""
def __init__(self, command: str, args: List[str], env: Optional[Dict[str, str]] = None, timeout: int = 120):
self.params = StdioServerParameters(command=command, args=args, env={**os.environ, **(env or {})})
self.timeout = timeout
self._stack: Optional[AsyncExitStack] = None
self._streams = None
self.session: Optional[ClientSession] = None
async def __aenter__(self):
self._stack = AsyncExitStack()
read, write = await self._stack.enter_async_context(stdio_client(self.params))
self.session = await self._stack.enter_async_context(ClientSession(read, write))
await asyncio.wait_for(self.session.initialize(), timeout=self.timeout)
return self
async def __aexit__(self, exc_type, exc, tb):
if self._stack:
await self._stack.aclose()
self._stack = None
self.session = None
async def list_tools(self) -> List[Dict[str, Any]]:
resp = await asyncio.wait_for(self.session.list_tools(), timeout=self.timeout)
return [t.model_dump() for t in resp.tools]
async def call_tool(self, name: str, arguments: Dict[str, Any]) -> Any:
result = await asyncio.wait_for(self.session.call_tool(name, arguments), timeout=self.timeout)
return result.model_dump()  # Same as above: convert the result to a plain dict
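# Usage sketch (directory and tool name are illustrative):
#   async with MCPStdioServer("npx", ["-y", "@modelcontextprotocol/server-filesystem", "/tmp/work"]) as server:
#       tools = await server.list_tools()
#       result = await server.call_tool("list_directory", {"path": "/tmp/work"})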
================================================
FILE: src/agents/mcpmark_agent.py
================================================
"""
MCPMark Agent Implementation
============================
Unified agent using LiteLLM for all model interactions with minimal MCP support.
"""
import asyncio
import json
import time
from typing import Any, Dict, List, Optional, Callable
from pydantic import AnyUrl
import httpx
import litellm
import nest_asyncio
from src.logger import get_logger
from .base_agent import BaseMCPAgent
from .mcp import MCPStdioServer, MCPHttpServer
# Apply nested asyncio support
nest_asyncio.apply()
# Configure LiteLLM
litellm.suppress_debug_info = True
logger = get_logger(__name__)
# To fix the "Object of type AnyUrl is not JSON serializable" error in the find_file_contents function.
class CustomJSONEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, AnyUrl):
return str(obj)
return super().default(obj)
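# e.g. json.dumps({"uri": AnyUrl("https://example.com/")}, cls=CustomJSONEncoder)
# serializes the AnyUrl as a plain string instead of raising TypeError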
class MCPMarkAgent(BaseMCPAgent):
"""
Unified agent for LLM and MCP server management using LiteLLM.
- Anthropic models: Native MCP support via extra_body
- Other models: Manual MCP server management with function calling
"""
MAX_TURNS = 100
SYSTEM_PROMPT = (
"You are a helpful agent that uses tools iteratively to complete the user's task, "
'and when finished, provides the final answer or simply states "Task completed" without further tool calls.'
)
COMPACTION_PROMPT = (
"You are performing a CONTEXT CHECKPOINT COMPACTION.\n"
"Summarize the conversation so far for another model to continue.\n\n"
"Include:\n"
"- Current progress and key decisions made\n"
"- Important context, constraints, or user preferences\n"
"- What remains to be done (clear next steps)\n"
"- Any critical data, examples, or references needed to continue\n\n"
"Be concise and structured. Do NOT call tools."
)
DEFAULT_TIMEOUT = BaseMCPAgent.DEFAULT_TIMEOUT
def __init__(
self,
litellm_input_model_name: str,
api_key: str,
base_url: str,
mcp_service: str,
timeout: int = DEFAULT_TIMEOUT,
service_config: Optional[Dict[str, Any]] = None,
service_config_provider: Optional[Callable[[], Dict[str, Any]]] = None,
reasoning_effort: Optional[str] = "default",
compaction_token: int = BaseMCPAgent.COMPACTION_DISABLED_TOKEN,
):
super().__init__(
litellm_input_model_name=litellm_input_model_name,
api_key=api_key,
base_url=base_url,
mcp_service=mcp_service,
timeout=timeout,
service_config=service_config,
service_config_provider=service_config_provider,
reasoning_effort=reasoning_effort,
compaction_token=compaction_token,
)
logger.debug(
"Initialized MCPMarkAgent for '%s' with model '%s' (Claude: %s, Thinking: %s, Reasoning: %s)",
mcp_service,
litellm_input_model_name,
self.is_claude,
self.use_claude_thinking,
reasoning_effort,
)
# ==================== Public Interface Methods ====================
async def execute(
self, instruction: str, tool_call_log_file: Optional[str] = None
) -> Dict[str, Any]:
"""
Execute instruction with the agent.
Args:
instruction: The instruction/prompt to execute
tool_call_log_file: Optional path to log tool calls
Returns:
Dictionary containing execution results
"""
start_time = time.time()
try:
# Reset partial progress for this run
self._reset_progress()
# Refresh service configuration
self._refresh_service_config()
# Execute with timeout control
async def _execute_with_strategy():
if self.use_claude_thinking:
# Claude with thinking -> native Anthropic API with tools
return await self._execute_claude_native_with_tools(
instruction, tool_call_log_file
)
else:
# All other cases -> LiteLLM with tools
return await self._execute_litellm_with_tools(
instruction, tool_call_log_file
)
# Apply timeout to the entire execution
result = await asyncio.wait_for(
_execute_with_strategy(), timeout=self.timeout
)
execution_time = time.time() - start_time
# Update usage statistics
self.usage_tracker.update(
success=result["success"],
token_usage=result.get("token_usage", {}),
turn_count=result.get("turn_count", 0),
execution_time=execution_time,
)
result["execution_time"] = execution_time
return result
except Exception as e:
execution_time = time.time() - start_time
if isinstance(e, asyncio.TimeoutError):
error_msg = f"Execution timed out after {self.timeout} seconds"
logger.error(error_msg)
else:
error_msg = f"Agent execution failed: {e}"
logger.error(error_msg, exc_info=True)
self.usage_tracker.update(
success=False,
token_usage=self._partial_token_usage or {},
turn_count=self._partial_turn_count or 0,
execution_time=execution_time,
)
if self._partial_messages:
if not self.is_claude:
final_msg = self._convert_to_sdk_format(self._partial_messages)
else:
final_msg = self._partial_messages
else:
final_msg = []
return {
"success": False,
"output": final_msg,
"token_usage": self._partial_token_usage or {},
"turn_count": self._partial_turn_count or 0,
"execution_time": execution_time,
"error": error_msg,
"litellm_run_model_name": self.litellm_run_model_name,
}
def execute_sync(
self, instruction: str, tool_call_log_file: Optional[str] = None
) -> Dict[str, Any]:
"""
Synchronous wrapper for execute method.
"""
return asyncio.run(self.execute(instruction, tool_call_log_file))
def get_usage_stats(self) -> Dict[str, Any]:
"""Get usage statistics."""
return self.usage_tracker.get_stats()
def reset_usage_stats(self):
"""Reset usage statistics."""
self.usage_tracker.reset()
# ==================== Claude Native API Execution Path ====================
async def _execute_claude_native_with_tools(
self, instruction: str, tool_call_log_file: Optional[str] = None
) -> Dict[str, Any]:
"""
Execute Claude with thinking using native Anthropic API.
Creates MCP server, gets tools, and executes with thinking.
"""
logger.debug("Using Claude native API with thinking")
thinking_budget = self._get_claude_thinking_budget()
# Create and start MCP server
mcp_server = await self._create_mcp_server()
async with mcp_server:
# Get available tools
tools = await mcp_server.list_tools()
# Convert MCP tools to Anthropic format
anthropic_tools = self._convert_to_anthropic_format(tools)
# Execute with function calling loop
return await self._execute_anthropic_native_tool_loop(
instruction,
anthropic_tools,
mcp_server,
thinking_budget,
tool_call_log_file,
)
async def _call_claude_native_api(
self,
messages: List[Dict],
thinking_budget: int,
tools: Optional[List[Dict]] = None,
mcp_servers: Optional[List[Dict]] = None,
system: Optional[str] = None,
) -> tuple[Optional[Dict[str, Any]], Optional[Any]]:
"""
Call Claude's native API directly using httpx.
Args:
messages: Conversation messages
thinking_budget: Token budget for thinking
tools: Tool definitions for function calling
mcp_servers: MCP server configurations
system: System prompt
Returns:
Tuple of (response JSON dict, error); the error slot is None on success.
"""
# Get API base and headers
import os
api_base = os.getenv("ANTHROPIC_API_BASE", "https://api.anthropic.com")
headers = {
"x-api-key": self.api_key,
"anthropic-version": "2023-06-01",
"content-type": "application/json",
"anthropic-beta": "context-1m-2025-08-07", # by default
}
# Build payload
max_tokens = max(thinking_budget + 4096, 4096)
payload = {
"model": self.litellm_input_model_name.replace("anthropic/", ""),
"max_tokens": max_tokens,
"messages": messages,
}
# Add thinking configuration
if thinking_budget:
payload["thinking"] = {"type": "enabled", "budget_tokens": thinking_budget}
# Add tools if provided
if tools:
payload["tools"] = tools
payload["tool_choice"] = {"type": "auto"}
# Add MCP servers if provided
if mcp_servers:
headers["anthropic-beta"] = "mcp-client-2025-04-04"
payload["mcp_servers"] = mcp_servers
# Add system prompt if provided
if system:
payload["system"] = system
# Make the API call
async with httpx.AsyncClient() as client:
try:
response = await client.post(
f"{api_base}/v1/messages",
headers=headers,
json=payload,
timeout=self.timeout,
)
response.raise_for_status()
return response.json(), None
except httpx.HTTPStatusError as e:
return None, e.response.text
except Exception as e:
return None, e
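# Request-body sketch built above (values illustrative; thinking/tools/system keys
# only appear when provided):
#   {"model": "claude-sonnet-4-...", "max_tokens": 8192, "messages": [...],
#    "thinking": {"type": "enabled", "budget_tokens": 4096},
#    "tools": [...], "tool_choice": {"type": "auto"}, "system": "..."}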
async def _count_claude_input_tokens(
self,
messages: List[Dict[str, Any]],
tools: Optional[List[Dict]] = None,
system: Optional[str] = None,
) -> int:
import os
api_base = os.getenv("ANTHROPIC_API_BASE", "https://api.anthropic.com")
headers = {
"x-api-key": self.api_key,
"anthropic-version": "2023-06-01",
"content-type": "application/json",
}
payload: Dict[str, Any] = {
"model": self.litellm_input_model_name.replace("anthropic/", ""),
"messages": messages,
}
if tools:
payload["tools"] = tools
if system:
payload["system"] = system
async with httpx.AsyncClient() as client:
response = await client.post(
f"{api_base}/v1/messages/count_tokens",
headers=headers,
json=payload,
timeout=self.timeout,
)
response.raise_for_status()
data = response.json() or {}
return int(data.get("input_tokens", 0) or 0)
def _extract_litellm_text(self, response: Any) -> str:
try:
choices = getattr(response, "choices", None) or []
if not choices:
return ""
msg = getattr(choices[0], "message", None)
if msg is not None:
return str(getattr(msg, "content", "") or "")
return str(getattr(choices[0], "text", "") or "")
except Exception: # pragma: no cover - best effort
return ""
def _extract_anthropic_text(self, response_json: Dict[str, Any]) -> str:
pieces: List[str] = []
for block in response_json.get("content", []) or []:
if isinstance(block, dict) and block.get("type") == "text":
text = block.get("text")
if text:
pieces.append(str(text))
return "\n".join(pieces).strip()
def _merge_usage(self, total_tokens: Dict[str, int], usage: Dict[str, Any]) -> None:
try:
input_tokens = int(usage.get("input_tokens", 0) or 0)
output_tokens = int(usage.get("output_tokens", 0) or 0)
total_tokens_count = int(
usage.get("total_tokens", 0) or (input_tokens + output_tokens)
)
total_tokens["input_tokens"] += input_tokens
total_tokens["output_tokens"] += output_tokens
total_tokens["total_tokens"] += total_tokens_count
except Exception: # pragma: no cover - best effort
return
async def _maybe_compact_litellm_messages(
self,
messages: List[Dict[str, Any]],
total_tokens: Dict[str, int],
tool_call_log_file: Optional[str],
current_prompt_tokens: int,
) -> List[Dict[str, Any]]:
if not self._compaction_enabled():
return messages
if current_prompt_tokens < self.compaction_token:
return messages
logger.info(
f"| [compaction] Triggered at prompt tokens: {current_prompt_tokens:,}"
)
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(
f"| [compaction] Triggered at prompt tokens: {current_prompt_tokens:,}\n"
)
except Exception:
pass
compact_messages = [
{"role": "system", "content": self.COMPACTION_PROMPT},
{"role": "user", "content": json.dumps(messages, ensure_ascii=False)},
]
completion_kwargs = {
"model": self.litellm_input_model_name,
"messages": compact_messages,
"api_key": self.api_key,
}
if self.base_url:
completion_kwargs["base_url"] = self.base_url
response = await litellm.acompletion(**completion_kwargs)
usage = getattr(response, "usage", None)
if usage:
input_tokens = (
getattr(usage, "prompt_tokens", None)
or getattr(usage, "input_tokens", None)
or 0
)
output_tokens = (
getattr(usage, "completion_tokens", None)
or getattr(usage, "output_tokens", None)
or 0
)
total_tokens_count = getattr(usage, "total_tokens", None)
if total_tokens_count is None:
total_tokens_count = input_tokens + output_tokens
total_tokens["input_tokens"] += int(input_tokens or 0)
total_tokens["output_tokens"] += int(output_tokens or 0)
total_tokens["total_tokens"] += int(total_tokens_count or 0)
summary = self._extract_litellm_text(response).strip() or "(no summary)"
system_msg = (
messages[0]
if messages
else {"role": "system", "content": self.SYSTEM_PROMPT}
)
first_user = (
messages[1] if len(messages) > 1 else {"role": "user", "content": ""}
)
return [
system_msg,
first_user,
{
"role": "user",
"content": f"Context summary (auto-compacted due to token limit):\n{summary}",
},
]
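# After compaction the running conversation collapses to three messages: the original
# system prompt, the original user instruction, and one user message carrying the summary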
async def _maybe_compact_anthropic_messages(
self,
messages: List[Dict[str, Any]],
total_tokens: Dict[str, int],
thinking_budget: int,
tool_call_log_file: Optional[str],
current_input_tokens: int,
) -> List[Dict[str, Any]]:
if not self._compaction_enabled():
return messages
if current_input_tokens < self.compaction_token:
return messages
logger.info(
f"| [compaction] Triggered at input tokens: {current_input_tokens:,}"
)
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(
f"| [compaction] Triggered at input tokens: {current_input_tokens:,}\n"
)
except Exception:
pass
compact_messages = [
{"role": "user", "content": self.COMPACTION_PROMPT},
{"role": "user", "content": json.dumps(messages, ensure_ascii=False)},
]
response, error_msg = await self._call_claude_native_api(
messages=compact_messages,
thinking_budget=thinking_budget,
tools=None,
system=None,
)
if error_msg or not response:
logger.warning(f"| [compaction] Failed: {error_msg}")
return messages
usage = response.get("usage", {}) or {}
input_tokens = usage.get("input_tokens", 0) or 0
output_tokens = usage.get("output_tokens", 0) or 0
total_tokens["input_tokens"] += int(input_tokens)
total_tokens["output_tokens"] += int(output_tokens)
total_tokens["total_tokens"] += int(input_tokens + output_tokens)
summary = self._extract_anthropic_text(response) or "(no summary)"
first_user = messages[0] if messages else {"role": "user", "content": ""}
return [
first_user,
{
"role": "user",
"content": f"Context summary (auto-compacted due to token limit):\n{summary}",
},
]
async def _execute_anthropic_native_tool_loop(
self,
instruction: str,
tools: List[Dict],
mcp_server: Any,
thinking_budget: int,
tool_call_log_file: Optional[str] = None,
) -> Dict[str, Any]:
"""
Execute Claude thinking loop with function calling.
Handles thinking blocks, tool calls, and message formatting.
"""
messages = [{"role": "user", "content": instruction}]
total_tokens = {
"input_tokens": 0,
"output_tokens": 0,
"total_tokens": 0,
"reasoning_tokens": 0,
}
turn_count = 0
max_turns = self.MAX_TURNS
hit_turn_limit = False
ended_normally = False
system_text = self.SYSTEM_PROMPT
# Record initial state
self._update_progress(messages, total_tokens, turn_count)
for _ in range(max_turns):
turn_count += 1
current_input_tokens = 0
if self._compaction_enabled():
try:
current_input_tokens = await self._count_claude_input_tokens(
messages=messages,
tools=tools,
system=system_text,
)
except Exception as exc: # noqa: BLE001
logger.debug("Claude token counting failed: %s", exc)
messages = await self._maybe_compact_anthropic_messages(
messages=messages,
total_tokens=total_tokens,
thinking_budget=thinking_budget,
tool_call_log_file=tool_call_log_file,
current_input_tokens=current_input_tokens,
)
self._update_progress(messages, total_tokens, turn_count)
# Call Claude native API
response, error_msg = await self._call_claude_native_api(
messages=messages,
thinking_budget=thinking_budget,
tools=tools,
system=system_text,
)
if error_msg:
break
if turn_count == 1:
# Only read the response once we know the call succeeded
self.litellm_run_model_name = response["model"].split("/")[-1]
# Update token usage
if "usage" in response:
usage = response["usage"]
input_tokens = usage.get("input_tokens", 0)
output_tokens = usage.get("output_tokens", 0)
# Anthropic usage has no total field, so total = input + output
total_tokens_count = output_tokens + input_tokens
total_tokens["input_tokens"] += input_tokens
total_tokens["output_tokens"] += output_tokens
total_tokens["total_tokens"] += total_tokens_count
## TODO: add reasoning tokens for claude
# Extract blocks from response
blocks = response.get("content", [])
tool_uses = [b for b in blocks if b.get("type") == "tool_use"]
thinking_blocks = [b for b in blocks if b.get("type") == "thinking"]
text_blocks = [b for b in blocks if b.get("type") == "text"]
# Log text output
for tb in text_blocks:
if tb.get("text") and tool_call_log_file:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"{tb['text']}\n")
if tb.get("text"):
for line in tb["text"].splitlines():
logger.info(f"| {line}")
# Build assistant message with all blocks
assistant_content = []
# Add thinking blocks
for tb in thinking_blocks:
assistant_content.append(
{
"type": "thinking",
"thinking": tb.get("thinking", ""),
"signature": tb.get("signature", ""),
}
)
# Add text blocks
for tb in text_blocks:
if tb.get("text"):
assistant_content.append({"type": "text", "text": tb["text"]})
# Add tool_use blocks
for tu in tool_uses:
assistant_content.append(
{
"type": "tool_use",
"id": tu.get("id"),
"name": tu.get("name"),
"input": tu.get("input", {}),
}
)
messages.append({"role": "assistant", "content": assistant_content})
# Update partial progress after assistant response
self._update_progress(messages, total_tokens, turn_count)
# If no tool calls, we're done
if not tool_uses:
ended_normally = True
break
# Execute tools and add results
tool_results = []
for tu in tool_uses:
name = tu.get("name")
inputs = tu.get("input", {})
# Log tool call
args_str = json.dumps(inputs, separators=(",", ": "))
display_args = (
args_str[:140] + "..." if len(args_str) > 140 else args_str
)
logger.info(f"| \033[1m{name}\033[0m \033[2;37m{display_args}\033[0m")
if tool_call_log_file:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"| {name} {args_str}\n")
# Execute tool
try:
result = await asyncio.wait_for(
mcp_server.call_tool(name, inputs), timeout=60
)
tool_results.append(
{
"type": "tool_result",
"tool_use_id": tu["id"],
"content": [
{
"type": "text",
"text": json.dumps(result, cls=CustomJSONEncoder),
}
],
}
)
except Exception as e:
logger.error(f"Tool call failed: {e}")
tool_results.append(
{
"type": "tool_result",
"tool_use_id": tu["id"],
"content": [{"type": "text", "text": f"Error: {str(e)}"}],
}
)
messages.append({"role": "user", "content": tool_results})
# Update partial progress after tool results
self._update_progress(messages, total_tokens, turn_count)
# Detect if we exited due to hitting the turn limit
if not ended_normally:
if turn_count >= max_turns:
hit_turn_limit = True
logger.warning(
f"| Max turns ({max_turns}) exceeded; returning failure with partial output."
)
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"| Max turns ({max_turns}) exceeded\n")
except Exception:
pass
elif error_msg:
logger.warning(f"| {error_msg}\n")
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"| {error_msg}\n")
except Exception:
pass
# Display final token usage
if total_tokens["total_tokens"] > 0:
log_msg = (
f"|\n| Token usage: Total: {total_tokens['total_tokens']:,} | "
f"Input: {total_tokens['input_tokens']:,} | "
f"Output: {total_tokens['output_tokens']:,}"
)
if total_tokens.get("reasoning_tokens", 0) > 0:
log_msg += f" | Reasoning: {total_tokens['reasoning_tokens']:,}"
logger.info(log_msg)
logger.info(f"| Turns: {turn_count}")
# Convert messages to SDK format
sdk_format_messages = self._convert_to_sdk_format(messages)
if hit_turn_limit:
return {
"success": False,
"output": sdk_format_messages,
"token_usage": total_tokens,
"turn_count": turn_count,
"error": f"Max turns ({max_turns}) exceeded",
"litellm_run_model_name": self.litellm_run_model_name,
}
if error_msg:
return {
"success": False,
"output": sdk_format_messages,
"token_usage": total_tokens,
"turn_count": turn_count,
"error": error_msg,
"litellm_run_model_name": self.litellm_run_model_name,
}
return {
"success": True,
"output": sdk_format_messages,
"token_usage": total_tokens,
"turn_count": turn_count,
"error": None,
"litellm_run_model_name": self.litellm_run_model_name,
}
# ==================== LiteLLM Execution Path ====================
async def _execute_litellm_with_tools(
self, instruction: str, tool_call_log_file: Optional[str] = None
) -> Dict[str, Any]:
"""
Execute with manual MCP server management.
Used for all non-Anthropic models and Anthropic models with STDIO services.
"""
logger.debug("Using manual MCP execution with function calling loop")
# Create and start MCP server
mcp_server = await self._create_mcp_server()
try:
async with mcp_server:
# Get available tools
tools = await mcp_server.list_tools()
# Convert MCP tools to OpenAI function format
functions = self._convert_to_openai_format(tools)
# Execute with function calling loop
return await self._execute_litellm_tool_loop(
instruction, functions, mcp_server, tool_call_log_file
)
except Exception as e:
logger.error(f"Manual MCP execution failed: {e}")
raise
async def _execute_litellm_tool_loop(
self,
instruction: str,
functions: List[Dict],
mcp_server: Any,
tool_call_log_file: Optional[str] = None,
) -> Dict[str, Any]:
"""Execute function calling loop with LiteLLM."""
messages = [
{"role": "system", "content": self.SYSTEM_PROMPT},
{"role": "user", "content": instruction},
]
total_tokens = {
"input_tokens": 0,
"output_tokens": 0,
"total_tokens": 0,
"reasoning_tokens": 0,
}
turn_count = 0
max_turns = self.MAX_TURNS # Limit turns to prevent infinite loops
consecutive_failures = 0
max_consecutive_failures = 3
hit_turn_limit = False
ended_normally = False
# Convert functions to tools format for newer models
tools = (
[{"type": "function", "function": func} for func in functions]
if functions
else None
)
if tool_call_log_file and tools:
max_name_length = (
max(len(tool.get("function", {}).get("name", "")) for tool in tools)
if tools
else 15
)
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write("===== Available Tools =====\n")
for tool in tools:
function_info = tool.get("function", {})
tool_name = function_info.get("name", "N/A")
description = function_info.get("description", "N/A")
f.write(
f"- ToolName: {tool_name:<{max_name_length}} Description: {description}\n"
)
f.write("\n===== Execution Logs =====\n")
# Record initial state
self._update_progress(messages, total_tokens, turn_count)
try:
while turn_count < max_turns:
current_prompt_tokens = 0
if self._compaction_enabled():
current_prompt_tokens = self._count_prompt_tokens_litellm(messages)
messages = await self._maybe_compact_litellm_messages(
messages=messages,
total_tokens=total_tokens,
tool_call_log_file=tool_call_log_file,
current_prompt_tokens=current_prompt_tokens,
)
self._update_progress(messages, total_tokens, turn_count)
# Build completion kwargs
completion_kwargs = {
"model": self.litellm_input_model_name,
"messages": messages,
"api_key": self.api_key,
}
# Always use tools format if available - LiteLLM will handle conversion
if tools:
completion_kwargs["tools"] = tools
completion_kwargs["tool_choice"] = "auto"
# Add reasoning_effort and base_url if specified
if self.reasoning_effort != "default":
completion_kwargs["reasoning_effort"] = self.reasoning_effort
if self.base_url:
completion_kwargs["base_url"] = self.base_url
try:
# Call LiteLLM with timeout for individual call
response = await asyncio.wait_for(
litellm.acompletion(**completion_kwargs),
timeout=self.timeout / 2, # Use half of total timeout
)
consecutive_failures = 0 # Reset failure counter on success
except asyncio.TimeoutError:
logger.warning(f"| ✗ LLM call timed out on turn {turn_count + 1}")
consecutive_failures += 1
if consecutive_failures >= max_consecutive_failures:
raise Exception(
f"Too many consecutive failures ({consecutive_failures})"
)
await asyncio.sleep(8**consecutive_failures) # Exponential backoff
continue
except Exception as e:
logger.error(f"| ✗ LLM call failed on turn {turn_count + 1}: {e}")
consecutive_failures += 1
if consecutive_failures >= max_consecutive_failures:
raise
if "ContextWindowExceededError" in str(e):
# Best-effort fallback: compact and retry once.
messages = await self._maybe_compact_litellm_messages(
messages=messages,
total_tokens=total_tokens,
tool_call_log_file=tool_call_log_file,
current_prompt_tokens=self.compaction_token,
)
self._update_progress(messages, total_tokens, turn_count)
continue
elif "RateLimitError" in str(e):
await asyncio.sleep(12**consecutive_failures)
else:
await asyncio.sleep(2**consecutive_failures)
continue
# Extract actual model name from response (first turn only)
if turn_count == 0 and hasattr(response, "model") and response.model:
self.litellm_run_model_name = response.model.split("/")[-1]
# Update token usage including reasoning tokens
if hasattr(response, "usage") and response.usage:
input_tokens = response.usage.prompt_tokens or 0
total_tokens_count = response.usage.total_tokens or 0
# Calculate output tokens as total - input for consistency
output_tokens = (
total_tokens_count - input_tokens
if total_tokens_count > 0
else (response.usage.completion_tokens or 0)
)
total_tokens["input_tokens"] += input_tokens
total_tokens["output_tokens"] += output_tokens
total_tokens["total_tokens"] += total_tokens_count
# Extract reasoning tokens if available
if hasattr(response.usage, "completion_tokens_details"):
details = response.usage.completion_tokens_details
if hasattr(details, "reasoning_tokens"):
total_tokens["reasoning_tokens"] += (
details.reasoning_tokens or 0
)
# Get response message
choices = response.choices
if len(choices):
message = choices[0].message
# deeply dump the message to ensure we capture all fields
message_dict = (
message.model_dump()
if hasattr(message, "model_dump")
else dict(message)
)
# Explicitly preserve function_call if present (even if tool_calls exists),
# as it may contain provider-specific metadata (e.g. Gemini thought_signature)
if hasattr(message, "function_call") and message.function_call:
# Ensure it's in the dict if model_dump missed it or it was excluded
if (
"function_call" not in message_dict
or not message_dict["function_call"]
):
fc = message.function_call
message_dict["function_call"] = (
fc.model_dump() if hasattr(fc, "model_dump") else fc
)
# Log assistant's text content if present
if hasattr(message, "content") and message.content:
# Display the content with line prefix
for line in message.content.splitlines():
logger.info(f"| {line}")
# Also log to file if specified
if tool_call_log_file:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"{message.content}\n")
# Check for tool calls (newer format)
if hasattr(message, "tool_calls") and message.tool_calls:
messages.append(message_dict)
turn_count += 1
# Update progress after assistant with tool calls
self._update_progress(messages, total_tokens, turn_count)
# Process tool calls
for tool_call in message.tool_calls:
func_name = tool_call.function.name
func_args = json.loads(tool_call.function.arguments)
try:
result = await asyncio.wait_for(
mcp_server.call_tool(func_name, func_args), timeout=60
)
messages.append(
{
"role": "tool",
"tool_call_id": tool_call.id,
"content": json.dumps(
result, cls=CustomJSONEncoder
),
}
)
except asyncio.TimeoutError:
error_msg = (
f"Tool call '{func_name}' timed out after 60 seconds"
)
logger.error(error_msg)
messages.append(
{
"role": "tool",
"tool_call_id": tool_call.id,
"content": f"Error: {error_msg}",
}
)
except Exception as e:
logger.error(f"Tool call failed: {e}")
messages.append(
{
"role": "tool",
"tool_call_id": tool_call.id,
"content": f"Error: {str(e)}",
}
)
# Format arguments for display (truncate if too long)
args_str = json.dumps(func_args, separators=(",", ": "))
display_arguments = (
args_str[:140] + "..." if len(args_str) > 140 else args_str
)
# Log with ANSI color codes (bold tool name, dim gray arguments)
logger.info(
f"| \033[1m{func_name}\033[0m \033[2;37m{display_arguments}\033[0m"
)
if tool_call_log_file:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"| {func_name} {args_str}\n")
# Update progress after tool results appended
self._update_progress(messages, total_tokens, turn_count)
continue
else:
# Log end reason
if not choices:
logger.info(
"|\n|\n| Task ended with no messages generated by the model."
)
elif choices[0].finish_reason == "stop":
logger.info(
"|\n|\n| Task ended with the finish reason from messages being 'stop'."
)
# No tool/function call, add message and we're done
messages.append(message_dict)
turn_count += 1
# Update progress before exiting
self._update_progress(messages, total_tokens, turn_count)
ended_normally = True
break
except Exception as loop_error:
# On any error, return partial conversation, token usage, and turn count
logger.error(f"Manual MCP loop failed: {loop_error}", exc_info=True)
sdk_format_messages = self._convert_to_sdk_format(messages)
return {
"success": False,
"output": sdk_format_messages,
"token_usage": total_tokens,
"turn_count": turn_count,
"error": str(loop_error),
"litellm_run_model_name": self.litellm_run_model_name,
}
# Detect if we exited due to hitting the turn limit
if (not ended_normally) and (turn_count >= max_turns):
hit_turn_limit = True
logger.warning(
f"| Max turns ({max_turns}) exceeded); returning failure with partial output."
)
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"| Max turns ({max_turns}) exceeded\n")
except Exception:
pass
# Display final token usage
if total_tokens["total_tokens"] > 0:
log_msg = (
f"| Token usage: Total: {total_tokens['total_tokens']:,} | "
f"Input: {total_tokens['input_tokens']:,} | "
f"Output: {total_tokens['output_tokens']:,}"
)
if total_tokens.get("reasoning_tokens", 0) > 0:
log_msg += f" | Reasoning: {total_tokens['reasoning_tokens']:,}"
logger.info(log_msg)
logger.info(f"| Turns: {turn_count}")
# Convert messages to SDK format for backward compatibility
sdk_format_messages = self._convert_to_sdk_format(messages)
return {
"success": not hit_turn_limit,
"output": sdk_format_messages,
"token_usage": total_tokens,
"turn_count": turn_count,
"error": (f"Max turns ({max_turns}) exceeded" if hit_turn_limit else None),
"litellm_run_model_name": self.litellm_run_model_name,
}
# ==================== MCP Server Management ====================
async def _create_mcp_server(self) -> Any:
"""Create and return an MCP server instance."""
if self.mcp_service in self.STDIO_SERVICES:
return self._create_stdio_server()
elif self.mcp_service in self.HTTP_SERVICES:
return self._create_http_server()
else:
raise ValueError(f"Unsupported MCP service: {self.mcp_service}")
def _create_stdio_server(self) -> MCPStdioServer:
"""Create stdio-based MCP server."""
if self.mcp_service == "notion":
notion_key = self.service_config.get("notion_key")
if not notion_key:
raise ValueError("Notion API key required")
return MCPStdioServer(
command="npx",
args=["-y", "@notionhq/notion-mcp-server@1.9.1"],
env={
"OPENAPI_MCP_HEADERS": (
'{"Authorization": "Bearer ' + notion_key + '", '
'"Notion-Version": "2022-06-28"}'
)
},
)
elif self.mcp_service == "filesystem":
test_directory = self.service_config.get("test_directory")
if not test_directory:
raise ValueError("Test directory required for filesystem service")
return MCPStdioServer(
command="npx",
args=[
"-y",
"@modelcontextprotocol/server-filesystem",
str(test_directory),
],
)
elif self.mcp_service in ["playwright", "playwright_webarena"]:
browser = self.service_config.get("browser", "chromium")
headless = self.service_config.get("headless", True)
viewport_width = self.service_config.get("viewport_width", 1280)
viewport_height = self.service_config.get("viewport_height", 720)
args = ["-y", "@playwright/mcp@latest"]
if headless:
args.append("--headless")
args.extend(
[
"--isolated",
"--no-sandbox",
"--browser",
browser,
"--viewport-size",
f"{viewport_width},{viewport_height}",
]
)
return MCPStdioServer(command="npx", args=args)
elif self.mcp_service == "postgres":
host = self.service_config.get("host", "localhost")
port = self.service_config.get("port", 5432)
username = self.service_config.get("username")
password = self.service_config.get("password")
database = self.service_config.get(
"current_database"
) or self.service_config.get("database")
if not all([username, password, database]):
raise ValueError("PostgreSQL requires username, password, and database")
database_url = (
f"postgresql://{username}:{password}@{host}:{port}/{database}"
)
return MCPStdioServer(
command="pipx",
args=["run", "postgres-mcp", "--access-mode=unrestricted"],
env={"DATABASE_URI": database_url},
)
elif self.mcp_service == "insforge":
api_key = self.service_config.get("api_key")
backend_url = self.service_config.get("backend_url")
if not all([api_key, backend_url]):
raise ValueError("Insforge requires api_key and backend_url")
return MCPStdioServer(
command="npx",
args=["-y", "@insforge/mcp@dev"],
env={
"INSFORGE_API_KEY": api_key,
"INSFORGE_BACKEND_URL": backend_url,
},
)
elif self.mcp_service == "github":
github_token = self.service_config.get("github_token")
if not github_token:
raise ValueError("GitHub token required")
return MCPStdioServer(
command="docker",
args=[
"run", "-i", "--rm",
"-e", "GITHUB_PERSONAL_ACCESS_TOKEN",
"ghcr.io/github/github-mcp-server:v0.15.0",
],
env={"GITHUB_PERSONAL_ACCESS_TOKEN": github_token},
)
else:
raise ValueError(f"Unsupported stdio service: {self.mcp_service}")
def _create_http_server(self) -> MCPHttpServer:
"""Create HTTP-based MCP server."""
if self.mcp_service == "supabase":
# Use built-in MCP server from Supabase CLI
api_url = self.service_config.get("api_url", "http://localhost:54321")
api_key = self.service_config.get("api_key", "")
if not api_key:
raise ValueError(
"Supabase requires api_key (use secret key from 'supabase status')"
)
# Supabase CLI exposes MCP at /mcp endpoint
mcp_url = f"{api_url}/mcp"
return MCPHttpServer(
url=mcp_url,
headers={
"apikey": api_key,
"Authorization": f"Bearer {api_key}",
},
)
else:
raise ValueError(f"Unsupported HTTP service: {self.mcp_service}")
================================================
FILE: src/agents/react_agent.py
================================================
"""ReAct agent implementation for the MCPMark pipeline."""
from __future__ import annotations
import asyncio
import json
import time
from typing import Any, Dict, List, Optional, Callable
import litellm
from src.logger import get_logger
from .base_agent import BaseMCPAgent
logger = get_logger(__name__)
class ReActAgent(BaseMCPAgent):
"""ReAct-style agent that reuses MCPMark infrastructure."""
DEFAULT_SYSTEM_PROMPT = (
"You are a careful ReAct (reasoning and acting) agent. "
"At each step you must decide whether to call a tool or provide a final response. "
"Only use the tools that are listed for you. When you finish, respond with either the final answer "
"or the phrase \"Task completed.\" if no further detail is required. "
"Every reply must be valid JSON without code fences."
)
COMPACTION_PROMPT = (
"You are performing a CONTEXT CHECKPOINT COMPACTION.\n"
"Summarize the conversation so far for another model to continue.\n\n"
"Include:\n"
"- Current progress and key decisions made\n"
"- Important context, constraints, or user preferences\n"
"- What remains to be done (clear next steps)\n"
"- Any critical data, examples, or references needed to continue\n\n"
"Be concise and structured. Do NOT call tools."
)
def __init__(
self,
litellm_input_model_name: str,
api_key: str,
base_url: str,
mcp_service: str,
timeout: int = BaseMCPAgent.DEFAULT_TIMEOUT,
service_config: Optional[Dict[str, Any]] = None,
service_config_provider: Optional[Callable[[], Dict[str, Any]]] = None,
reasoning_effort: Optional[str] = "default",
max_iterations: int = 100,
system_prompt: Optional[str] = None,
compaction_token: int = BaseMCPAgent.COMPACTION_DISABLED_TOKEN,
):
super().__init__(
litellm_input_model_name=litellm_input_model_name,
api_key=api_key,
base_url=base_url,
mcp_service=mcp_service,
timeout=timeout,
service_config=service_config,
service_config_provider=service_config_provider,
reasoning_effort=reasoning_effort,
compaction_token=compaction_token,
)
self.max_iterations = max_iterations
self.react_system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
async def execute(
self,
instruction: str,
tool_call_log_file: Optional[str] = None,
) -> Dict[str, Any]:
start_time = time.time()
try:
self._reset_progress()
self._refresh_service_config()
async def _run_react():
return await self._execute_react_loop(instruction, tool_call_log_file)
result = await asyncio.wait_for(_run_react(), timeout=self.timeout)
execution_time = time.time() - start_time
self.usage_tracker.update(
success=result.get("success", False),
token_usage=result.get("token_usage", {}),
turn_count=result.get("turn_count", 0),
execution_time=execution_time,
)
result["execution_time"] = execution_time
return result
except Exception as exc: # noqa: BLE001
execution_time = time.time() - start_time
if isinstance(exc, asyncio.TimeoutError):
error_msg = f"Execution timed out after {self.timeout} seconds"
logger.error(error_msg)
else:
error_msg = f"ReAct agent execution failed: {exc}"
logger.error(error_msg, exc_info=True)
self.usage_tracker.update(
success=False,
token_usage=self._partial_token_usage or {},
turn_count=self._partial_turn_count or 0,
execution_time=execution_time,
)
if self._partial_messages:
final_msg = self._convert_to_sdk_format(self._partial_messages)
else:
final_msg = []
return {
"success": False,
"output": final_msg,
"token_usage": self._partial_token_usage or {},
"turn_count": self._partial_turn_count or 0,
"execution_time": execution_time,
"error": error_msg,
"litellm_run_model_name": self.litellm_run_model_name,
}
async def _execute_react_loop(
self,
instruction: str,
tool_call_log_file: Optional[str],
) -> Dict[str, Any]:
system_message = {"role": "system", "content": self.react_system_prompt}
total_tokens = {
"input_tokens": 0,
"output_tokens": 0,
"total_tokens": 0,
"reasoning_tokens": 0,
}
turn_count = 0
success = False
final_error: Optional[str] = None
mcp_server = await self._create_mcp_server()
async with mcp_server:
tools = await mcp_server.list_tools()
tool_map = {tool.get("name"): tool for tool in tools}
tools_description = self._render_tools_description(tools)
task_message = {
"role": "user",
"content": self._build_task_prompt(
instruction=instruction,
tools_description=tools_description,
),
}
messages: List[Dict[str, Any]] = [system_message, task_message]
self._update_progress(messages, total_tokens, turn_count)
for step in range(1, self.max_iterations + 1):
current_prompt_tokens = 0
if self._compaction_enabled():
current_prompt_tokens = self._count_prompt_tokens_litellm(messages)
if self._compaction_enabled() and current_prompt_tokens >= self.compaction_token:
logger.info(
f"| [compaction] Triggered at prompt tokens: {current_prompt_tokens:,}"
)
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as log_file:
log_file.write(
f"| [compaction] Triggered at prompt tokens: {current_prompt_tokens:,}\n"
)
except Exception: # noqa: BLE001
pass
compact_messages = [
{"role": "system", "content": self.COMPACTION_PROMPT},
{"role": "user", "content": json.dumps(messages, ensure_ascii=False)},
]
compact_kwargs = {
"model": self.litellm_input_model_name,
"messages": compact_messages,
"api_key": self.api_key,
}
if self.base_url:
compact_kwargs["base_url"] = self.base_url
compact_response = await litellm.acompletion(**compact_kwargs)
usage = getattr(compact_response, "usage", None)
if usage:
prompt_tokens = (
getattr(usage, "prompt_tokens", None)
or getattr(usage, "input_tokens", None)
or 0
)
completion_tokens = (
getattr(usage, "completion_tokens", None)
or getattr(usage, "output_tokens", None)
or 0
)
total_tokens_count = getattr(usage, "total_tokens", None)
if total_tokens_count is None:
total_tokens_count = prompt_tokens + completion_tokens
total_tokens["input_tokens"] += int(prompt_tokens or 0)
total_tokens["output_tokens"] += int(completion_tokens or 0)
total_tokens["total_tokens"] += int(total_tokens_count or 0)
summary = ""
try:
summary = compact_response.choices[0].message.content or ""
except Exception: # noqa: BLE001
summary = ""
summary = summary.strip() or "(no summary)"
messages = [
system_message,
task_message,
{
"role": "user",
"content": (
"Context summary (auto-compacted due to token limit):\n"
f"{summary}"
),
},
]
self._update_progress(messages, total_tokens, turn_count)
completion_kwargs = {
"model": self.litellm_input_model_name,
"messages": messages,
"api_key": self.api_key,
}
if self.base_url:
completion_kwargs["base_url"] = self.base_url
if self.reasoning_effort != "default":
completion_kwargs["reasoning_effort"] = self.reasoning_effort
try:
response = await asyncio.wait_for(
litellm.acompletion(**completion_kwargs),
timeout=self.timeout / 2,
)
except asyncio.TimeoutError:
final_error = f"LLM call timed out on step {step}"
logger.error(final_error)
break
except Exception as exc: # noqa: BLE001
final_error = f"LLM call failed on step {step}: {exc}"
logger.error(final_error)
if "ContextWindowExceededError" in str(exc):
continue
break
if turn_count == 0 and getattr(response, "model", None):
self.litellm_run_model_name = response.model.split("/")[-1]
usage = getattr(response, "usage", None)
if usage:
prompt_tokens = (
getattr(usage, "prompt_tokens", None)
or getattr(usage, "input_tokens", None)
or 0
)
completion_tokens = (
getattr(usage, "completion_tokens", None)
or getattr(usage, "output_tokens", None)
or 0
)
total_tokens_count = getattr(usage, "total_tokens", None)
if total_tokens_count is None:
total_tokens_count = prompt_tokens + completion_tokens
total_tokens["input_tokens"] += prompt_tokens
total_tokens["output_tokens"] += completion_tokens
total_tokens["total_tokens"] += total_tokens_count
# Extract reasoning tokens if available
if hasattr(response.usage, 'completion_tokens_details'):
details = response.usage.completion_tokens_details
if hasattr(details, 'reasoning_tokens'):
total_tokens["reasoning_tokens"] += details.reasoning_tokens or 0
choice = response.choices[0]
message_obj = getattr(choice, "message", None)
if message_obj is None and isinstance(choice, dict):
message_obj = choice.get("message")
if message_obj is None:
content_raw = getattr(choice, "text", "")
else:
content_raw = message_obj.get("content", "")
assistant_text = self._normalize_content(content_raw)
assistant_message = {"role": "assistant", "content": assistant_text}
messages.append(assistant_message)
turn_count += 1
self._update_progress(messages, total_tokens, turn_count)
parsed = self._parse_react_response(assistant_text)
if not parsed or "thought" not in parsed:
warning = (
"The previous response was not valid JSON following the required schema. "
"Please respond again using the JSON formats provided."
)
messages.append({"role": "user", "content": warning})
self._update_progress(messages, total_tokens, turn_count)
final_error = "Model produced an invalid response format."
continue
thought = parsed.get("thought", "")
action = parsed.get("action")
answer = parsed.get("answer")
result = parsed.get("result")
logger.info(f"|\n| \033[1;3mThought\033[0m: {str(thought)}")
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as log_file:
log_file.write(f"| {str(thought)}\n")
except Exception: # noqa: BLE001
pass
if action is not None:
func_name = action.get("tool")
arguments = action.get("arguments", {}) or {}
args_str = json.dumps(arguments, separators=(",", ": "))
display_arguments = args_str[:140] + "..." if len(args_str) > 140 else args_str
logger.info(f"| \033[1;3mAction\033[0m: \033[1m{func_name}\033[0m \033[2;37m{display_arguments}\033[0m")
if answer is not None:
success = True
break
if action is not None and isinstance(action, dict):
tool_name = action.get("tool")
arguments = action.get("arguments", {}) or {}
if tool_name not in tool_map:
observation = (
f"Invalid tool '{tool_name}'. Available tools: "
f"{', '.join(tool_map)}"
)
else:
try:
tool_response = await asyncio.wait_for(
mcp_server.call_tool(tool_name, arguments),
timeout=60,
)
observation = self._tool_result_to_text(tool_response)
except asyncio.TimeoutError:
observation = f"Tool '{tool_name}' timed out"
except Exception as tool_exc: # noqa: BLE001
observation = f"Tool '{tool_name}' failed: {tool_exc}"
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as log_file:
log_file.write(f"| {tool_name} {json.dumps(arguments, ensure_ascii=False)}\n")
except Exception: # noqa: BLE001
pass
observation_message = {
"role": "user",
"content": (
f"Observation:\n{observation}\n"
"Please continue reasoning and reply using the required JSON format."
),
}
messages.append(observation_message)
self._update_progress(messages, total_tokens, turn_count)
continue
if result is not None:
observation_message = {
"role": "user",
"content": (
f"Observation:\n{result}\n"
"Please continue reasoning and reply using the required JSON format."
),
}
messages.append(observation_message)
self._update_progress(messages, total_tokens, turn_count)
continue
# Unexpected structure: ask model to restate properly
messages.append(
{
"role": "user",
"content": (
"The previous reply did not include an action, result, or answer. "
"Please respond again using the JSON formats provided."
),
}
)
self._update_progress(messages, total_tokens, turn_count)
if not success and final_error is None:
final_error = (
f"Max iterations ({self.max_iterations}) reached without a final answer."
)
if total_tokens["total_tokens"] > 0:
log_msg = (
f"|\n|\n| Token usage: Total: {total_tokens['total_tokens']:,} | "
f"Input: {total_tokens['input_tokens']:,} | "
f"Output: {total_tokens['output_tokens']:,}"
)
if total_tokens.get("reasoning_tokens", 0) > 0:
log_msg += f" | Reasoning: {total_tokens['reasoning_tokens']:,}"
logger.info(log_msg)
logger.info(f"| Turns: {turn_count}")
sdk_messages = self._convert_to_sdk_format(messages)
return {
"success": success,
"output": sdk_messages,
"token_usage": total_tokens,
"turn_count": turn_count,
"error": None if success else final_error,
"litellm_run_model_name": self.litellm_run_model_name,
}
def _build_task_prompt(
self,
instruction: str,
tools_description: str,
) -> str:
return (
f"Task:\n{instruction}\n\n"
f"Available MCP tools:\n{tools_description}\n\n"
"Respond using the JSON formats below.\n\n"
"If you need to use a tool:\n"
"{\n"
' "thought": "Reasoning for the next action",\n'
' "action": {\n'
' "tool": "tool-name",\n'
' "arguments": {\n'
' "parameter": value\n'
" }\n"
" }\n"
"}\n\n"
"If you can provide the final answer:\n"
"{\n"
' "thought": "Reasoning that justifies the answer",\n'
' "answer": "Either the final solution or \'Task completed.\' when no more detail is required"\n'
"}\n\n"
"Remember: omitting the action object ends the task, so only do this when finished."
)
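# Illustrative replies the prompt above asks for (the tool name "list_directory" is
# hypothetical, used only for illustration):
#   {"thought": "Inspect the workspace first",
#    "action": {"tool": "list_directory", "arguments": {"path": "."}}}
#   {"thought": "All requested changes are done", "answer": "Task completed."}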
def _render_tools_description(self, tools: List[Dict[str, Any]]) -> str:
descriptions = []
for tool in tools:
name = tool.get("name", "unknown")
description = tool.get("description", "No description provided.")
input_schema = tool.get("inputSchema", {}) or {}
properties = input_schema.get("properties", {}) or {}
required = set(input_schema.get("required", []) or [])
arg_lines = []
for prop_name, prop_details in properties.items():
details = json.dumps(prop_details, ensure_ascii=False, indent=2)
suffix = " (required)" if prop_name in required else ""
arg_lines.append(f"- {prop_name}{suffix}: {details}")
if arg_lines:
arguments_text = "\n".join(arg_lines)
else:
arguments_text = "(no arguments)"
descriptions.append(
f"Tool: {name}\nDescription: {description}\nArguments:\n{arguments_text}"
)
return "\n\n".join(descriptions) if descriptions else "(no tools available)"
def _normalize_content(self, content: Any) -> str:
if isinstance(content, str):
return content
if isinstance(content, list):
parts = []
for block in content:
if isinstance(block, dict):
if block.get("type") == "text":
parts.append(block.get("text", ""))
elif "text" in block:
parts.append(str(block.get("text")))
else:
parts.append(str(block))
return "\n".join(part for part in parts if part)
return json.dumps(content, ensure_ascii=False)
def _parse_react_response(self, payload: str) -> Dict[str, Any]:
candidate = payload.strip().strip("`").strip()
if candidate.lower().startswith("json"):
candidate = candidate[4:].lstrip()
try:
return json.loads(candidate)
except json.JSONDecodeError:
return {}
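# Example: a fully fenced reply such as ```json {"thought": "...", "answer": "done"} ```
# is tolerated; strip("`") removes the leading/trailing fences, the "json" tag is then
# dropped, and the remaining text is passed to json.loads.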
def _tool_result_to_text(self, result: Any) -> str:
if result is None:
return ""
if isinstance(result, str):
return result
try:
return json.dumps(result, ensure_ascii=False)
except TypeError:
return str(result)
================================================
FILE: src/agents/utils/__init__.py
================================================
"""
Utility functions for MCPMark Agent
====================================
"""
from .token_usage import TokenUsageTracker
__all__ = ["TokenUsageTracker"]
================================================
FILE: src/agents/utils/token_usage.py
================================================
"""
Token Usage Tracking Utilities
===============================
"""
from typing import Dict, Any
class TokenUsageTracker:
"""Track token usage across agent executions."""
def __init__(self):
"""Initialize token usage tracker."""
self.reset()
def reset(self):
"""Reset all usage statistics."""
self._stats = {
"total_input_tokens": 0,
"total_output_tokens": 0,
"total_tokens": 0,
"total_turns": 0,
"total_execution_time": 0.0,
"successful_executions": 0,
"failed_executions": 0,
}
def update(self, success: bool, token_usage: Dict[str, int],
turn_count: int, execution_time: float):
"""
Update usage statistics.
Args:
success: Whether execution was successful
token_usage: Token usage dict with input_tokens, output_tokens, total_tokens
turn_count: Number of conversation turns
execution_time: Execution time in seconds
"""
if success:
self._stats["successful_executions"] += 1
else:
self._stats["failed_executions"] += 1
self._stats["total_input_tokens"] += token_usage.get("input_tokens", 0)
self._stats["total_output_tokens"] += token_usage.get("output_tokens", 0)
self._stats["total_tokens"] += token_usage.get("total_tokens", 0)
self._stats["total_turns"] += turn_count
self._stats["total_execution_time"] += execution_time
def get_stats(self) -> Dict[str, Any]:
"""
Get usage statistics with calculated averages.
Returns:
Dictionary containing usage statistics
"""
stats = self._stats.copy()
# Calculate averages
total_executions = stats["successful_executions"] + stats["failed_executions"]
if total_executions > 0:
stats["avg_input_tokens"] = stats["total_input_tokens"] / total_executions
stats["avg_output_tokens"] = stats["total_output_tokens"] / total_executions
stats["avg_total_tokens"] = stats["total_tokens"] / total_executions
stats["avg_turns"] = stats["total_turns"] / total_executions
stats["avg_execution_time"] = stats["total_execution_time"] / total_executions
stats["success_rate"] = (stats["successful_executions"] / total_executions * 100)
else:
stats.update({
"avg_input_tokens": 0.0,
"avg_output_tokens": 0.0,
"avg_total_tokens": 0.0,
"avg_turns": 0.0,
"avg_execution_time": 0.0,
"success_rate": 0.0,
})
return stats
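# Illustrative usage sketch (not part of the original module; the numbers are hypothetical):
#   tracker = TokenUsageTracker()
#   tracker.update(success=True,
#                  token_usage={"input_tokens": 1200, "output_tokens": 300, "total_tokens": 1500},
#                  turn_count=4, execution_time=12.5)
#   stats = tracker.get_stats()
#   stats["success_rate"]  -> 100.0; stats["avg_turns"] -> 4.0; stats["avg_total_tokens"] -> 1500.0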
================================================
FILE: src/aggregators/aggregate_results.py
================================================
#!/usr/bin/env python3
"""
Simplified MCPMark Results Aggregator
Aggregates evaluation results and generates summary with pass@k metrics.
"""
import json
import os
import argparse
import subprocess
import shutil
import tempfile
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Any, Tuple, Optional
from datetime import datetime
import sys
sys.path.append(str(Path(__file__).parent.parent.parent))
from src.errors import is_retryable_error
from src.aggregators.pricing import compute_cost_usd
# Supported difficulty splits in ./tasks/<mcp_service>/<task_set>/
SUPPORTED_TASK_SETS = {"standard", "easy"}
def discover_tasks(task_set: str = "standard") -> Dict[str, List[str]]:
"""Discover all tasks from ./tasks directory filtered by task set."""
tasks_dir = Path("./tasks")
all_tasks = {}
# Handle each MCP service
# Note: playwright and playwright_webarena both map to "playwright" MCP
service_mappings = {
"filesystem": ["filesystem"],
"github": ["github"],
"notion": ["notion"],
"playwright": ["playwright", "playwright_webarena"], # Both count as playwright
"postgres": ["postgres"], # supabase and insforge are variants with same tasks, don't merge
}
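# Illustrative layouts this loop handles (task ids become "<category>__<task>"):
#   tasks/<service>/<task_set>/<category>/<task>/   (partitioned, e.g. task_set = "easy")
#   tasks/<service>/<category>/<task>/              (legacy layout without task sets)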
for mcp_service, task_dirs in service_mappings.items():
tasks: List[str] = []
for task_dir_name in task_dirs:
service_path = tasks_dir / task_dir_name
if not service_path.exists():
continue
selected_root = service_path / task_set
# Detect if this service has partitioned task sets (e.g. standard/easy)
has_partitioned_layout = any(
child.is_dir() and child.name in SUPPORTED_TASK_SETS
for child in service_path.iterdir()
)
if selected_root.exists():
search_roots = [selected_root]
elif has_partitioned_layout:
# Requested task set missing for this service; skip it for this run
print(f" ⚠️ No '{task_set}' tasks found under {service_path}")
search_roots = []
else:
# Legacy layout without task sets – fall back to original structure
search_roots = [service_path]
for root in search_roots:
for category_dir in root.iterdir():
if not category_dir.is_dir() or category_dir.name.startswith("__"):
continue
for task_dir in category_dir.iterdir():
if task_dir.is_dir() and not task_dir.name.startswith("__"):
tasks.append(f"{category_dir.name}__{task_dir.name}")
all_tasks[mcp_service] = sorted(tasks)
return all_tasks
def collect_results(exp_dir: Path, k: int) -> Dict[str, Dict[str, Any]]:
"""Collect all results from experiment directory."""
results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
# Current layout: results/<exp-name>/<model>__<service>/run-N/<category>__<task>/
# Some pipelines include task-set suffix in service dir (e.g., "filesystem-easy").
# Normalize such names back to canonical service keys used by tasks/ (filesystem, github, notion, playwright, postgres).
def normalize_service_name(name: str) -> str:
# Strip known task-set suffixes like "-easy" or "-standard"
if name.endswith("-easy") or name.endswith("-standard"):
base = name.rsplit("-", 1)[0]
else:
base = name
# Map variant names to canonical service
if base == "playwright_webarena":
return "playwright"
return base
for model_service_dir in exp_dir.iterdir():
if not model_service_dir.is_dir() or "__" not in model_service_dir.name:
continue
model, service = model_service_dir.name.split("__", 1)
# Normalize service names (strip task-set suffixes, map variants to canonical keys)
service = normalize_service_name(service)
if service in ["supabase", "insforge"]:
service = "postgres"
for run_idx in range(1, k + 1):
run_dir = model_service_dir / f"run-{run_idx}"
if not run_dir.exists():
continue
for task_dir in run_dir.iterdir():
if not task_dir.is_dir() or "__" not in task_dir.name:
continue
meta_path = task_dir / "meta.json"
if meta_path.exists():
with open(meta_path) as f:
meta = json.load(f)
task_name = task_dir.name
results[model][service][f"run-{run_idx}"][task_name] = meta
return results
def check_completeness_and_validity(
results: Dict, all_tasks: Dict, k: int, single_run_models: List[str]
) -> Tuple[Dict, Dict, Dict]:
"""Check completeness and validity of results."""
complete_models = {}
incomplete_models = {}
invalid_models = {}
for model, model_results in results.items():
is_single_run = any(srm in model for srm in single_run_models)
required_runs = 1 if is_single_run else k
missing_info = []
invalid_info = []
# Check each service
for service, service_tasks in all_tasks.items():
if service not in model_results:
missing_info.append(f"Missing entire service: {service}")
continue
service_results = model_results[service]
# Check runs
for run_idx in range(1, required_runs + 1):
run_name = f"run-{run_idx}"
if run_name not in service_results:
missing_info.append(f"Missing {run_name} for {service}")
continue
run_results = service_results[run_name]
# Check tasks
missing_tasks = []
invalid_tasks = []
for task in service_tasks:
if task not in run_results:
missing_tasks.append(task)
else:
# Check for retryable errors only if the task did not succeed
meta = run_results[task]
success = bool(meta.get("execution_result", {}).get("success", False))
error_msg = meta.get("execution_result", {}).get("error_message", "")
if (not success) and error_msg and is_retryable_error(error_msg):
invalid_tasks.append(f"{task}: {error_msg[:50]}...")
if missing_tasks:
missing_info.append(f"{service}/{run_name}: missing {len(missing_tasks)} tasks")
if invalid_tasks:
invalid_info.extend([f"{service}/{run_name}/{t}" for t in invalid_tasks])
if missing_info:
incomplete_models[model] = missing_info
elif invalid_info:
invalid_models[model] = invalid_info
else:
complete_models[model] = model_results
return complete_models, incomplete_models, invalid_models
def calculate_metrics(complete_models: Dict, all_tasks: Dict, k: int, single_run_models: List[str]) -> Dict:
"""Calculate rich metrics (totals, averages, per-run aggregates, pass@k) for complete models."""
summary = {
"generated_at": datetime.now().isoformat(),
"k": k,
"overall": {},
}
# Initialize per-service sections mirroring overall structure
for service in all_tasks.keys():
summary[service] = {}
# Helper to safely extract token usage numbers
def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
tu = meta.get("token_usage", {}) or {}
input_tokens = int(tu.get("input_tokens", 0) or 0)
output_tokens = int(tu.get("output_tokens", 0) or 0)
total_tokens = int(tu.get("total_tokens", input_tokens + output_tokens) or (input_tokens + output_tokens))
return input_tokens, output_tokens, total_tokens
for model, model_results in complete_models.items():
is_single_run = any(srm in model for srm in single_run_models)
runs_count = 1 if is_single_run else k
total_tasks = sum(len(tasks) for tasks in all_tasks.values())
# Aggregates across all services and runs
total_agent_execution_time = 0.0
total_input_tokens = 0
total_output_tokens = 0
total_tokens = 0
total_turns = 0
# For optional fields
actual_model_name: Optional[str] = None
# If cost info is not present in metas, leave as None
per_run_cost: Optional[float] = None
# Model-level flags (to be inferred from meta.json)
is_open_source_model: Optional[bool] = None
is_reasoning_model: Optional[bool] = None
# For pass@1 per-run statistics across all services
pass1_rates_per_run_overall: List[float] = []
# For pass@k and pass^k across all services
pass_k_task_success_any = 0
pass_power_k_task_success_all = 0
# Precompute successes per task across runs for overall
# Also accumulate totals for tokens/time/turns
for run_idx in range(1, runs_count + 1):
run_name = f"run-{run_idx}"
successes_this_run = 0
for service, service_tasks in all_tasks.items():
# service-level aggregates for this model (will compute fully below)
for task in service_tasks:
meta = (
model_results
.get(service, {})
.get(run_name, {})
.get(task)
)
# In complete_models, meta should exist; still guard
if not meta:
continue
success = bool(meta.get("execution_result", {}).get("success", False))
if success:
successes_this_run += 1
# totals accumulation
total_agent_execution_time += float(meta.get("agent_execution_time", 0.0) or 0.0)
in_tok, out_tok, ttl_tok = get_token_counts(meta)
total_input_tokens += in_tok
total_output_tokens += out_tok
total_tokens += ttl_tok
total_turns += int(meta.get("turn_count", 0) or 0)
# capture actual model name if present
if actual_model_name is None:
actual_model_name = meta.get("actual_model_name") or None
# capture cost if present in any meta as per-run cost token (rare)
if per_run_cost is None:
# A few possible fields people use; if none present, stays None
possible_cost = meta.get("per_run_cost") or meta.get("run_cost") or meta.get("cost")
if isinstance(possible_cost, (int, float)):
per_run_cost = float(possible_cost)
# capture model flags if present
if is_open_source_model is None and "is_open_source_model" in meta:
is_open_source_model = bool(meta.get("is_open_source_model"))
if is_reasoning_model is None and "is_reasoning_model" in meta:
is_reasoning_model = bool(meta.get("is_reasoning_model"))
pass1_rates_per_run_overall.append(round(successes_this_run / total_tasks, 6))
# Compute pass@k and pass^k across tasks (overall)
if not is_single_run:
for service, service_tasks in all_tasks.items():
for task in service_tasks:
successes = []
for run_idx in range(1, runs_count + 1):
run_name = f"run-{run_idx}"
meta = (
model_results
.get(service, {})
.get(run_name, {})
.get(task)
)
success = bool(meta.get("execution_result", {}).get("success", False)) if meta else False
successes.append(success)
if any(successes):
pass_k_task_success_any += 1
if all(successes):
pass_power_k_task_success_all += 1
# Build overall metrics entry
denom = total_tasks * runs_count if total_tasks > 0 else 1
avg_agent_execution_time = total_agent_execution_time / denom
avg_input_tokens = total_input_tokens / denom
avg_output_tokens = total_output_tokens / denom
avg_total_tokens = total_tokens / denom
avg_turns = total_turns / denom
# pass@1 stats across runs
if pass1_rates_per_run_overall:
avg_pass1 = sum(pass1_rates_per_run_overall) / len(pass1_rates_per_run_overall)
mean = avg_pass1
variance = (
sum((r - mean) ** 2 for r in pass1_rates_per_run_overall) / len(pass1_rates_per_run_overall)
)
std_pass1 = variance ** 0.5
else:
avg_pass1 = 0.0
std_pass1 = 0.0
# Compute per-run tokens and cost
per_run_input_tokens = total_input_tokens / runs_count if runs_count else 0
per_run_output_tokens = total_output_tokens / runs_count if runs_count else 0
model_for_pricing = actual_model_name or model
computed_per_run_cost = compute_cost_usd(model_for_pricing, per_run_input_tokens, per_run_output_tokens)
overall_metrics = {
"total_tasks": total_tasks,
"total_agent_execution_time": total_agent_execution_time,
"total_input_tokens": total_input_tokens,
"total_output_tokens": total_output_tokens,
"total_tokens": total_tokens,
"total_turns": total_turns,
"avg_agent_execution_time": round(avg_agent_execution_time, 4),
"avg_input_tokens": round(avg_input_tokens, 4),
"avg_output_tokens": round(avg_output_tokens, 4),
"avg_total_tokens": round(avg_total_tokens, 4),
"avg_turns": round(avg_turns, 4),
"per_run_input_tokens": per_run_input_tokens,
"per_run_output_tokens": per_run_output_tokens,
"per_run_cost": computed_per_run_cost if computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None),
"actual_model_name": actual_model_name or "",
"is_open_source_model": (is_open_source_model if is_open_source_model is not None else False),
"is_reasoning_model": (is_reasoning_model if is_reasoning_model is not None else False),
"pass@1": {
"avg": round(avg_pass1, 4),
"std": round(std_pass1, 4),
},
}
if not is_single_run:
overall_metrics[f"pass@{k}"] = round(pass_k_task_success_any / total_tasks, 4)
overall_metrics[f"pass^{k}"] = round(pass_power_k_task_success_all / total_tasks, 4)
summary["overall"][model] = overall_metrics
# Per-service detailed metrics mirroring overall
for service, service_tasks in all_tasks.items():
service_total_tasks = len(service_tasks)
if service_total_tasks == 0:
continue
s_total_agent_execution_time = 0.0
s_total_input_tokens = 0
s_total_output_tokens = 0
s_total_tokens = 0
s_total_turns = 0
# per-run pass@1 for this service
s_pass1_rates_per_run: List[float] = []
# pass@k for this service
s_pass_k_task_success_any = 0
s_pass_power_k_task_success_all = 0
for run_idx in range(1, runs_count + 1):
run_name = f"run-{run_idx}"
s_successes_this_run = 0
for task in service_tasks:
meta = (
model_results
.get(service, {})
.get(run_name, {})
.get(task)
)
if not meta:
continue
success = bool(meta.get("execution_result", {}).get("success", False))
if success:
s_successes_this_run += 1
s_total_agent_execution_time += float(meta.get("agent_execution_time", 0.0) or 0.0)
in_tok, out_tok, ttl_tok = get_token_counts(meta)
s_total_input_tokens += in_tok
s_total_output_tokens += out_tok
s_total_tokens += ttl_tok
s_total_turns += int(meta.get("turn_count", 0) or 0)
s_pass1_rates_per_run.append(round(s_successes_this_run / service_total_tasks, 6))
if not is_single_run:
for task in service_tasks:
successes = []
for run_idx in range(1, runs_count + 1):
run_name = f"run-{run_idx}"
meta = (
model_results
.get(service, {})
.get(run_name, {})
.get(task)
)
success = bool(meta.get("execution_result", {}).get("success", False)) if meta else False
successes.append(success)
if any(successes):
s_pass_k_task_success_any += 1
if all(successes):
s_pass_power_k_task_success_all += 1
s_denom = service_total_tasks * runs_count if service_total_tasks > 0 else 1
s_avg_agent_execution_time = s_total_agent_execution_time / s_denom
s_avg_input_tokens = s_total_input_tokens / s_denom
s_avg_output_tokens = s_total_output_tokens / s_denom
s_avg_total_tokens = s_total_tokens / s_denom
s_avg_turns = s_total_turns / s_denom
if s_pass1_rates_per_run:
s_mean = sum(s_pass1_rates_per_run) / len(s_pass1_rates_per_run)
s_var = sum((r - s_mean) ** 2 for r in s_pass1_rates_per_run) / len(s_pass1_rates_per_run)
s_std = s_var ** 0.5
else:
s_mean = 0.0
s_std = 0.0
# Compute per-run tokens and cost for this service
s_per_run_input_tokens = s_total_input_tokens / runs_count if runs_count else 0
s_per_run_output_tokens = s_total_output_tokens / runs_count if runs_count else 0
s_computed_per_run_cost = compute_cost_usd(model_for_pricing, s_per_run_input_tokens, s_per_run_output_tokens)
service_metrics = {
"total_tasks": service_total_tasks,
"total_agent_execution_time": s_total_agent_execution_time,
"total_input_tokens": s_total_input_tokens,
"total_output_tokens": s_total_output_tokens,
"total_tokens": s_total_tokens,
"total_turns": s_total_turns,
"avg_agent_execution_time": round(s_avg_agent_execution_time, 4),
"avg_input_tokens": round(s_avg_input_tokens, 4),
"avg_output_tokens": round(s_avg_output_tokens, 4),
"avg_total_tokens": round(s_avg_total_tokens, 4),
"avg_turns": round(s_avg_turns, 4),
"per_run_input_tokens": s_per_run_input_tokens,
"per_run_output_tokens": s_per_run_output_tokens,
"per_run_cost": s_computed_per_run_cost if s_computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None),
"actual_model_name": actual_model_name or "",
"is_open_source_model": (is_open_source_model if is_open_source_model is not None else False),
"is_reasoning_model": (is_reasoning_model if is_reasoning_model is not None else False),
"pass@1": {
"avg": round(s_mean, 4),
"std": round(s_std, 4),
},
}
if not is_single_run:
service_metrics[f"pass@{k}"] = round(s_pass_k_task_success_any / service_total_tasks, 4)
service_metrics[f"pass^{k}"] = round(s_pass_power_k_task_success_all / service_total_tasks, 4)
summary[service][model] = service_metrics
return summary
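# Abbreviated shape of the summary returned above (illustrative):
# {
#   "generated_at": "...", "k": 4,
#   "overall":    {"<model>": {"total_tasks": ..., "pass@1": {"avg": ..., "std": ...}, "pass@4": ..., "pass^4": ..., ...}},
#   "filesystem": {"<model>": {...}}, "github": {...}, ...
# }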
def generate_model_results(exp_dir: Path, complete_models: Dict, all_tasks: Dict):
"""Generate model_results directory."""
model_results_dir = exp_dir / "model_results"
if model_results_dir.exists():
shutil.rmtree(model_results_dir)
model_results_dir.mkdir()
for model, model_data in complete_models.items():
model_dir = model_results_dir / model
model_dir.mkdir()
# Create a file for each task
for service, service_tasks in all_tasks.items():
if service not in model_data:
continue
for task in service_tasks:
task_data = {
"model": model,
"service": service,
"task": task,
"runs": {}
}
# Collect data from all runs
for run_name, run_data in model_data[service].items():
if task in run_data:
meta = run_data[task]
task_data["runs"][run_name] = {
"success": meta.get("execution_result", {}).get("success", False),
"error_message": meta.get("execution_result", {}).get("error_message"),
"execution_time": meta.get("agent_execution_time", 0),
"token_usage": meta.get("token_usage", {}),
"turn_count": meta.get("turn_count", 0)
}
# Save task file
task_file = model_dir / f"{task}.json"
with open(task_file, "w") as f:
json.dump(task_data, f, indent=2)
def generate_task_results(exp_dir: Path, complete_models: Dict, all_tasks: Dict):
"""Generate task_results directory."""
task_results_dir = exp_dir / "task_results"
if task_results_dir.exists():
shutil.rmtree(task_results_dir)
task_results_dir.mkdir()
# For each task, collect results across all models
for service, service_tasks in all_tasks.items():
for task in service_tasks:
task_data = {
"task": task,
"service": service,
"models": {}
}
for model, model_data in complete_models.items():
if service not in model_data:
continue
model_task_data = {"runs": []}
for run_name, run_data in model_data[service].items():
if task in run_data:
meta = run_data[task]
agent_time = float(meta.get("agent_execution_time", 0.0) or 0.0)
token_usage = meta.get("token_usage", {}) or {}
turn_count = int(meta.get("turn_count", 0) or 0)
success = bool(meta.get("execution_result", {}).get("success", False))
model_task_data["runs"].append({
"run": run_name,
"success": success,
"execution_time": agent_time,
"agent_execution_time": agent_time,
"token_usage": token_usage,
"turn_count": turn_count,
})
if model_task_data["runs"]:
# Compute per-model summary across runs for this task
runs_list = model_task_data["runs"]
runs_count = len(runs_list)
successful_runs = sum(1 for r in runs_list if r.get("success"))
# Averages
total_agent_time = sum(float(r.get("agent_execution_time", r.get("execution_time", 0.0)) or 0.0) for r in runs_list)
avg_agent_time = round(total_agent_time / runs_count, 2)
def _tok(r, key):
tu = r.get("token_usage") or {}
return int(tu.get(key, 0) or 0)
total_input_tokens = 0
total_output_tokens = 0
total_total_tokens = 0
for r in runs_list:
in_tok = _tok(r, "input_tokens")
out_tok = _tok(r, "output_tokens")
ttl_tok = int((r.get("token_usage") or {}).get("total_tokens", in_tok + out_tok) or (in_tok + out_tok))
total_input_tokens += in_tok
total_output_tokens += out_tok
total_total_tokens += ttl_tok
avg_input_tokens = round(total_input_tokens / runs_count, 1)
avg_output_tokens = round(total_output_tokens / runs_count, 1)
avg_total_tokens = round(total_total_tokens / runs_count, 1)
total_turns = sum(int(r.get("turn_count", 0) or 0) for r in runs_list)
avg_turn_count = round(total_turns / runs_count, 2)
summary_obj = {
"total_runs": runs_count,
"successful_runs": successful_runs,
"avg_agent_execution_time": avg_agent_time,
"avg_input_tokens": avg_input_tokens,
"avg_output_tokens": avg_output_tokens,
"avg_total_tokens": avg_total_tokens,
"avg_turn_count": avg_turn_count,
}
# Include pass@k and pass^k only for multi-run models
if runs_count > 1:
summary_obj[f"pass@{runs_count}"] = 1.0 if successful_runs > 0 else 0.0
summary_obj[f"pass^{runs_count}"] = 1.0 if successful_runs == runs_count else 0.0
model_task_data["summary"] = summary_obj
task_data["models"][model] = model_task_data
# Save task file
task_file = task_results_dir / f"{task}.json"
with open(task_file, "w") as f:
json.dump(task_data, f, indent=2)
def generate_readme(exp_name: str, summary: Dict, k: int) -> str:
"""Generate README.md content with six tables: overall + 5 MCP services.
Each table includes Total Tasks, Pass@1 (avg ± std), Pass@k/Pass^k (if k > 1), Per-Run Cost (USD), and Avg Agent Time (s).
"""
def get_pass1_avg_std(metrics: Dict[str, Any]) -> Tuple[float, float]:
p1 = metrics.get("pass@1")
if isinstance(p1, dict):
return float(p1.get("avg", 0.0) or 0.0), float(p1.get("std", 0.0) or 0.0)
# Back-compat if older summaries exist
return float(p1 or 0.0), 0.0
def render_section(title: str, section_data: Dict[str, Any]) -> List[str]:
lines_sec: List[str] = [
f"## {title}",
"",
]
header = "| Model | Total Tasks | Pass@1 (avg ± std) |"
sep = "|-------|-------------|--------------------|"
# include pass@k headers if present (k>1)
include_k = k > 1
if include_k:
header += f" Pass@{k} | Pass^{k} |"
sep += "----------|----------|"
# Add Per-Run Cost (USD) and Avg Agent Time (s) at the end
header += " Per-Run Cost (USD) |"
sep += "---------------------|"
header += " Avg Agent Time (s) |"
sep += "--------------------|"
lines_sec.append(header)
lines_sec.append(sep)
# Sort by Pass@1 avg
sorted_items = sorted(
section_data.items(),
key=lambda x: get_pass1_avg_std(x[1])[0],
reverse=True
)
for model, metrics in sorted_items:
pass1_avg, pass1_std = get_pass1_avg_std(metrics)
avg_time = float(metrics.get("avg_agent_execution_time", 0.0) or 0.0)
# Format per-run cost (up to 2 decimal places, trim trailing zeros)
cost_val = metrics.get("per_run_cost")
if isinstance(cost_val, (int, float)):
rounded_cost = round(float(cost_val), 2)
formatted_cost = f"{rounded_cost:.2f}".rstrip('0').rstrip('.')
cost_str = f"${formatted_cost}"
else:
cost_str = "/"
row = (
f"| {model} | {metrics.get('total_tasks', 0)} | "
f"{pass1_avg * 100:.1f}% ± {pass1_std * 100:.1f}% |"
)
if include_k:
if f"pass@{k}" in metrics and f"pass^{k}" in metrics:
row += f" {metrics[f'pass@{k}'] * 100:.1f}% | {metrics[f'pass^{k}'] * 100:.1f}% |"
else:
# Single-run models do not have pass@k or pass^k; show placeholders
row += " / | / |"
# Append cost and avg agent time at the end
row += f" {cost_str} |"
row += f" {avg_time:.1f} |"
lines_sec.append(row)
lines_sec.append("")
return lines_sec
lines: List[str] = [
f"# {exp_name} - Evaluation Results",
"",
f"Generated: {summary['generated_at']}",
]
task_set = summary.get("task_set")
if task_set:
lines.append(f"Task set: {task_set}")
lines.append("")
# Overall table
lines.extend(render_section("Overall Performance", summary.get("overall", {})))
# Service tables: infer service keys from summary
reserved = {"overall", "generated_at", "k", "experiment_name", "task_set"}
service_keys = [key for key in summary.keys() if key not in reserved]
# Keep stable order
for service in sorted(service_keys):
title = f"{service.capitalize()} Performance"
lines.extend(render_section(title, summary.get(service, {})))
return "\n".join(lines)
def push_to_github(exp_dir: Path, exp_name: str, branch: Optional[str] = None):
"""Push results to GitHub repository."""
try:
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
print("📥 Cloning experiments repository...")
subprocess.run([
"git", "clone",
"git@github.com:eval-sys/mcpmark-experiments.git",
str(temp_path)
], check=True, capture_output=True)
# Copy files
for item in ["summary.json", "README.md", "model_results", "task_results"]:
src = exp_dir / item
if src.exists():
dst = temp_path / item
if src.is_dir():
if dst.exists():
shutil.rmtree(dst)
shutil.copytree(src, dst)
else:
shutil.copy2(src, dst)
print(f" 📄 {item}")
# Git operations
os.chdir(temp_path)
# If a branch is specified, create/checkout it before staging changes. Otherwise, ensure main.
if branch:
try:
subprocess.run(["git", "fetch", "origin"], check=True)
except subprocess.CalledProcessError:
# Non-fatal if fetch fails in some environments
pass
subprocess.run(["git", "checkout", "-B", branch], check=True)
print(f" 🌿 Using branch '{branch}'")
else:
# Default to main branch
try:
subprocess.run(["git", "fetch", "origin"], check=True)
except subprocess.CalledProcessError:
pass
# Prefer main; if it doesn't exist locally, create tracking from origin/main
result = subprocess.run(["git", "rev-parse", "--verify", "main"], capture_output=True)
if result.returncode != 0:
# Try to checkout origin/main
try:
subprocess.run(["git", "checkout", "-B", "main", "origin/main"], check=True)
except subprocess.CalledProcessError:
# Fallback: create main if no origin/main
subprocess.run(["git", "checkout", "-B", "main"], check=True)
else:
subprocess.run(["git", "checkout", "main"], check=True)
subprocess.run(["git", "add", "."], check=True)
# Check for changes
result = subprocess.run(
["git", "diff", "--staged", "--name-only"],
capture_output=True, text=True
)
if not result.stdout.strip():
print("✅ No changes to push")
return True
# Commit and push
subprocess.run([
"git", "commit", "-m", f"Update results for {exp_name}"
], check=True)
if branch:
subprocess.run(["git", "push", "--set-upstream", "origin", branch], check=True)
else:
subprocess.run(["git", "push", "--set-upstream", "origin", "main"], check=True)
print("✅ Successfully pushed to GitHub")
return True
except subprocess.CalledProcessError as e:
print(f"❌ Git operation failed: {e}")
return False
def print_validation_report(complete: Dict, incomplete: Dict, invalid: Dict, all_tasks: Dict, k: int, single_run_models: List[str], raw_results: Dict):
"""Print structured validation report with summary table."""
# Combine all models
all_models = {}
for model in complete:
all_models[model] = {"status": "complete", "data": complete[model]}
for model in incomplete:
all_models[model] = {"status": "incomplete", "issues": incomplete[model]}
for model in invalid:
all_models[model] = {"status": "invalid", "issues": invalid[model]}
# Calculate expected counts
total_expected_tasks = sum(len(tasks) for tasks in all_tasks.values())
# Summary table
print("\n" + "=" * 100)
print("COMPLETENESS SUMMARY TABLE")
print("=" * 100)
print()
print(f"{'Model':<30} {'Expected':<12} {'Actual':<12} {'Missing':<12} {'Status':<30}")
print("-" * 100)
sorted_models = sorted(all_models.keys())
for model_name in sorted_models:
model_info = all_models[model_name]
# Determine expected runs and tasks
is_single_run = any(srm in model_name for srm in single_run_models)
expected_runs = 1 if is_single_run else k
expected_total = total_expected_tasks * expected_runs
if model_info["status"] == "complete":
# Count actual tasks from complete model data
actual_total = 0
for service, service_data in model_info["data"].items():
for run_name, run_data in service_data.items():
actual_total += len(run_data)
missing = 0
status = "✅ Complete"
else:
# For incomplete/invalid models, count from raw results
actual_total = 0
if model_name in raw_results:
for service, service_data in raw_results[model_name].items():
for run_name, run_data in service_data.items():
actual_total += len(run_data)
missing = expected_total - actual_total
if model_info["status"] == "incomplete":
# Find which services have issues
problem_services = set()
for issue in model_info["issues"]:
if "Missing entire service:" in issue:
service = issue.split(": ")[1]
problem_services.add(service)
elif "/" in issue:
service = issue.split("/")[0]
problem_services.add(service)
elif "Missing run" in issue:
service = issue.split(" for ")[1]
problem_services.add(service)
if problem_services:
services_str = ", ".join(sorted(problem_services))
status = f"❌ Incomplete ({services_str})"
else:
status = "❌ Incomplete"
else: # invalid
status = "⚠️ Invalid (retryable errors)"
# Format the row
print(f"{model_name:<30} {expected_total:<12} {actual_total:<12} {missing:<12} {status:<30}")
print()
# Overall statistics
complete_count = len(complete)
incomplete_count = len(incomplete)
invalid_count = len(invalid)
total_models = complete_count + incomplete_count + invalid_count
print("=" * 100)
print("OVERALL STATISTICS")
print("=" * 100)
print(f"Total models analyzed: {total_models}")
print(f"Complete models: {complete_count}")
print(f"Incomplete models: {incomplete_count}")
print(f"Invalid models (with retryable errors): {invalid_count}")
print(f"Total tasks per MCP: {total_expected_tasks}")
print(f"Expected runs (k): {k}")
if not complete:
print("\n❌ No models have complete and valid results!")
else:
print(f"\n✅ {complete_count} model(s) ready for aggregation: {', '.join(sorted(complete.keys()))}")
def main():
# Extra parser for push-related options
push_parent = argparse.ArgumentParser(add_help=False)
push_parent.add_argument(
"--branch",
type=str,
help="If provided with --push, push to this new branch"
)
parser = argparse.ArgumentParser(
description="Simplified MCPMark results aggregator",
parents=[push_parent],
)
parser.add_argument("--exp-name", required=True, help="Experiment name")
parser.add_argument("--k", type=int, default=4, help="Number of runs (default: 4)")
parser.add_argument(
"--single-run-models",
type=str,
help="Comma-separated list of models that only need run-1"
)
parser.add_argument(
"--task-set",
choices=sorted(SUPPORTED_TASK_SETS),
default="standard",
help="Which task subset to aggregate (default: standard)"
)
parser.add_argument("--push", action="store_true", help="Push to GitHub (default to main)")
args = parser.parse_args()
# Parse single-run models
single_run_models = []
if args.single_run_models:
single_run_models = [m.strip() for m in args.single_run_models.split(",")]
print(f"📌 Single-run models: {', '.join(single_run_models)}")
# Setup paths
exp_dir = Path("./results") / args.exp_name
if not exp_dir.exists():
print(f"❌ Experiment directory {exp_dir} does not exist")
return 1
print(f"🔄 Processing experiment: {args.exp_name}")
# Discover all tasks
print(f"📋 Discovering tasks (task set: {args.task_set})...")
all_tasks = discover_tasks(args.task_set)
total_tasks = sum(len(tasks) for tasks in all_tasks.values())
print(f" Found {total_tasks} tasks across {len(all_tasks)} services")
print("📥 Collecting results...")
results = collect_results(exp_dir, args.k)
print(f" Found results for {len(results)} models")
# Check completeness and validity
print("✓ Checking completeness and validity...")
complete_models, incomplete_models, invalid_models = check_completeness_and_validity(
results, all_tasks, args.k, single_run_models
)
# Print validation report with summary table
print_validation_report(complete_models, incomplete_models, invalid_models,
all_tasks, args.k, single_run_models, results)
# Determine which models to include in output (strict: only complete models)
models_for_output = dict(complete_models)
if not models_for_output:
return 1
# Calculate metrics
print("\n📊 Calculating metrics...")
summary = calculate_metrics(models_for_output, all_tasks, args.k, single_run_models)
summary["experiment_name"] = args.exp_name
summary["task_set"] = args.task_set
# Save summary
summary_path = exp_dir / "summary.json"
with open(summary_path, "w") as f:
json.dump(summary, f, indent=2)
print(f" 📄 Saved summary.json")
# Generate model_results
print("📁 Generating model_results...")
generate_model_results(exp_dir, models_for_output, all_tasks)
print(f" Created {len(models_for_output)} model directories")
# Generate task_results
print("📁 Generating task_results...")
generate_task_results(exp_dir, models_for_output, all_tasks)
print(f" Created {total_tasks} task files")
# Generate README
readme_content = generate_readme(args.exp_name, summary, args.k)
readme_path = exp_dir / "README.md"
with open(readme_path, "w") as f:
f.write(readme_content)
print(" 📄 Generated README.md")
# Push to GitHub if requested
if args.push:
print("\n🚀 Pushing to GitHub...")
push_to_github(exp_dir, args.exp_name, branch=args.branch)
print(f"\n🎉 Successfully processed {args.exp_name}")
return 0
if __name__ == "__main__":
exit(main())
================================================
FILE: src/aggregators/aggregate_specific_results.py
================================================
#!/usr/bin/env python3
"""
Simple Results Aggregator - Aggregate specific result directories
Usage: python -m src.aggregators.aggregate_specific_results --result-dir results/exp/model__service --k 4
"""
import json
import argparse
from pathlib import Path
from collections import defaultdict
from typing import Dict, Any, Tuple, List
from datetime import datetime
import sys
sys.path.append(str(Path(__file__).parent.parent.parent))
from src.aggregators.pricing import compute_cost_usd
def collect_results_from_dir(result_dir: Path, k: int) -> Dict[str, Any]:
"""Collect all results from a specific result directory."""
results = {}
for run_idx in range(1, k + 1):
run_dir = result_dir / f"run-{run_idx}"
if not run_dir.exists():
print(f"⚠️ Warning: {run_dir} does not exist, skipping")
continue
run_results = {}
for task_dir in run_dir.iterdir():
if not task_dir.is_dir():
continue
meta_path = task_dir / "meta.json"
if meta_path.exists():
with open(meta_path) as f:
meta = json.load(f)
run_results[task_dir.name] = meta
results[f"run-{run_idx}"] = run_results
return results
def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
"""Extract token counts from meta."""
tu = meta.get("token_usage", {}) or {}
input_tokens = int(tu.get("input_tokens", 0) or 0)
output_tokens = int(tu.get("output_tokens", 0) or 0)
total_tokens = int(tu.get("total_tokens", input_tokens + output_tokens) or (input_tokens + output_tokens))
return input_tokens, output_tokens, total_tokens
def calculate_metrics(results: Dict, k: int, model_name: str) -> Dict:
"""Calculate metrics from results."""
# Get all unique task names
all_tasks = set()
for run_name, run_data in results.items():
all_tasks.update(run_data.keys())
all_tasks = sorted(all_tasks)
total_tasks = len(all_tasks)
actual_runs = len(results)
print(f"\n📊 Analysis:")
print(f" Total unique tasks: {total_tasks}")
print(f" Runs found: {actual_runs} (expected: {k})")
# Aggregates
total_agent_execution_time = 0.0
total_input_tokens = 0
total_output_tokens = 0
total_tokens = 0
total_turns = 0
actual_model_name = None
# Per-run pass@1
pass1_rates_per_run = []
# For pass@k
pass_k_task_success_any = 0
pass_power_k_task_success_all = 0
for run_idx in range(1, actual_runs + 1):
run_name = f"run-{run_idx}"
successes_this_run = 0
for task in all_tasks:
meta = results.get(run_name, {}).get(task)
if not meta:
continue
success = bool(meta.get("execution_result", {}).get("success", False))
if success:
successes_this_run += 1
total_agent_execution_time += float(meta.get("agent_execution_time", 0.0) or 0.0)
in_tok, out_tok, ttl_tok = get_token_counts(meta)
total_input_tokens += in_tok
total_output_tokens += out_tok
total_tokens += ttl_tok
total_turns += int(meta.get("turn_count", 0) or 0)
if actual_model_name is None:
actual_model_name = meta.get("actual_model_name") or None
pass1_rate = successes_this_run / total_tasks if total_tasks > 0 else 0
pass1_rates_per_run.append(pass1_rate)
print(f" Run {run_idx}: {successes_this_run}/{total_tasks} = {pass1_rate*100:.1f}%")
# Calculate pass@k
for task in all_tasks:
successes = []
for run_idx in range(1, actual_runs + 1):
run_name = f"run-{run_idx}"
meta = results.get(run_name, {}).get(task)
success = bool(meta.get("execution_result", {}).get("success", False)) if meta else False
successes.append(success)
if any(successes):
pass_k_task_success_any += 1
if all(successes):
pass_power_k_task_success_all += 1
# Averages
denom = total_tasks * actual_runs if total_tasks > 0 else 1
avg_agent_execution_time = total_agent_execution_time / denom
avg_input_tokens = total_input_tokens / denom
avg_output_tokens = total_output_tokens / denom
avg_total_tokens = total_tokens / denom
avg_turns = total_turns / denom
# Pass@1 stats
if pass1_rates_per_run:
avg_pass1 = sum(pass1_rates_per_run) / len(pass1_rates_per_run)
mean = avg_pass1
variance = sum((r - mean) ** 2 for r in pass1_rates_per_run) / len(pass1_rates_per_run)
std_pass1 = variance ** 0.5
else:
avg_pass1 = 0.0
std_pass1 = 0.0
# Cost calculation
per_run_input_tokens = total_input_tokens / actual_runs if actual_runs else 0
per_run_output_tokens = total_output_tokens / actual_runs if actual_runs else 0
model_for_pricing = actual_model_name or model_name
per_run_cost = compute_cost_usd(model_for_pricing, per_run_input_tokens, per_run_output_tokens)
summary = {
"generated_at": datetime.now().isoformat(),
"model": model_name,
"actual_model_name": actual_model_name or model_name,
"runs": actual_runs,
"total_tasks": total_tasks,
"total_agent_execution_time": round(total_agent_execution_time, 2),
"total_input_tokens": total_input_tokens,
"total_output_tokens": total_output_tokens,
"total_tokens": total_tokens,
"total_turns": total_turns,
"avg_agent_execution_time": round(avg_agent_execution_time, 4),
"avg_input_tokens": round(avg_input_tokens, 2),
"avg_output_tokens": round(avg_output_tokens, 2),
"avg_total_tokens": round(avg_total_tokens, 2),
"avg_turns": round(avg_turns, 2),
"per_run_input_tokens": round(per_run_input_tokens, 2),
"per_run_output_tokens": round(per_run_output_tokens, 2),
"per_run_cost": round(per_run_cost, 4) if per_run_cost else None,
"pass@1": {
"avg": round(avg_pass1, 4),
"std": round(std_pass1, 4),
"per_run": [round(r, 4) for r in pass1_rates_per_run]
},
}
if actual_runs > 1:
summary[f"pass@{actual_runs}"] = round(pass_k_task_success_any / total_tasks, 4)
summary[f"pass^{actual_runs}"] = round(pass_power_k_task_success_all / total_tasks, 4)
return summary
def main():
parser = argparse.ArgumentParser(description="Simple results aggregator for specific directories")
parser.add_argument("--result-dir", required=True, help="Path to result directory (e.g., results/exp/model__service)")
parser.add_argument("--k", type=int, default=4, help="Number of runs (default: 4)")
parser.add_argument("--output", help="Output JSON file path (default: /summary.json)")
args = parser.parse_args()
result_dir = Path(args.result_dir)
if not result_dir.exists():
print(f"❌ Result directory {result_dir} does not exist")
return 1
# Extract model name from directory name
model_name = result_dir.name.replace("__", "-")
print(f"🔄 Processing: {result_dir}")
print(f"📋 Model: {model_name}")
# Collect results
results = collect_results_from_dir(result_dir, args.k)
if not results:
print("❌ No results found")
return 1
# Calculate metrics
summary = calculate_metrics(results, args.k, model_name)
# Save summary
output_path = Path(args.output) if args.output else result_dir / "summary.json"
with open(output_path, "w") as f:
json.dump(summary, f, indent=2)
print(f"\n✅ Summary saved to: {output_path}")
print(f"\n📈 Results:")
print(f" Pass@1: {summary['pass@1']['avg']*100:.1f}% ± {summary['pass@1']['std']*100:.1f}%")
if f"pass@{args.k}" in summary:
print(f" Pass@{args.k}: {summary[f'pass@{args.k}']*100:.1f}%")
print(f" Pass^{args.k}: {summary[f'pass^{args.k}']*100:.1f}%")
print(f" Per-run cost: ${summary['per_run_cost']:.4f}" if summary['per_run_cost'] else " Per-run cost: N/A")
print(f" Avg agent time: {summary['avg_agent_execution_time']:.2f}s")
print(f" Avg turns: {summary['avg_turns']:.2f}")
print(f"\n📊 Token Usage:")
avg_tokens_per_run = summary['total_tokens'] / summary['runs'] if summary['runs'] > 0 else 0
print(f" Avg tokens per run: {avg_tokens_per_run:,.0f}")
print(f" Avg tokens per turn: {summary['avg_total_tokens'] / summary['avg_turns']:.0f}" if summary['avg_turns'] > 0 else " Avg tokens per turn: N/A")
print(f" Total tokens (all runs): {summary['total_tokens']:,}")
print(f" Total turns (all runs): {summary['total_turns']:,}")
return 0
if __name__ == "__main__":
exit(main())
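# Illustrative note (not part of the original module): the aggregator expects one
# meta.json per task per run under the given result directory, e.g.
#
#   results/exp/model__service/
#       run-1/<category>__<task>/meta.json
#       run-2/<category>__<task>/meta.json
#       ...
#
# With k runs, pass@k counts a task as solved if any run succeeded and pass^k
# only if all runs succeeded. For example, with 2 tasks and 2 runs where task A
# passes only in run-1 and task B passes in both, pass@2 = 2/2, pass^2 = 1/2,
# and the per-run pass@1 rates are 2/2 and 1/2.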
================================================
FILE: src/aggregators/aggregate_task_meta.py
================================================
#!/usr/bin/env python3
"""
Task Meta Aggregator for MCPMark
Aggregates all meta.json files from the tasks directory into a single JSON file.
"""
import json
import os
import argparse
import subprocess
import shutil
from pathlib import Path
from typing import Dict, List, Any, Set
def find_all_meta_files(tasks_root: Path = Path("tasks")) -> List[Path]:
"""Find all meta.json files in the tasks directory"""
meta_files = []
for root, dirs, files in os.walk(tasks_root):
if "meta.json" in files:
meta_files.append(Path(root) / "meta.json")
return meta_files
def parse_meta_file(meta_path: Path) -> Dict[str, Any]:
"""Parse a single meta.json file"""
try:
with open(meta_path, "r", encoding="utf-8") as f:
return json.load(f)
except Exception as e:
print(f"Error parsing {meta_path}: {e}")
return {}
def aggregate_task_meta(meta_files: List[Path]) -> Dict[str, Any]:
"""Aggregate all meta.json files into the required structure"""
all_data = []
categories_dict = {} # Use dict to track unique categories
all_tags_set = set() # Set to collect all unique tags
for meta_path in meta_files:
meta_data = parse_meta_file(meta_path)
if meta_data:
# Exclude model_results field from aggregated data
filtered_data = {k: v for k, v in meta_data.items() if k != "model_results"}
all_data.append(filtered_data)
# Collect categories using category_id and category_name
if "category_id" in filtered_data and "category_name" in filtered_data:
category_id = filtered_data["category_id"]
category_name = filtered_data["category_name"]
# Use category_id as the key to ensure uniqueness
categories_dict[category_id] = {
"id": category_id,
"name": category_name,
}
# Collect all unique tags
if "tags" in filtered_data and isinstance(filtered_data["tags"], list):
all_tags_set.update(filtered_data["tags"])
# Convert categories dict to sorted list
categories_list = sorted(categories_dict.values(), key=lambda x: x["id"])
# Convert tags set to sorted list
all_tags_list = sorted(all_tags_set)
return {
"data": all_data,
"count": len(all_data),
"categories": categories_list,
"tags": all_tags_list,
}
def create_individual_task_files(meta_files: List[Path]) -> List[Dict[str, Any]]:
"""Create individual task JSON files with instruction and verify content"""
task_files = []
for meta_path in meta_files:
meta_data = parse_meta_file(meta_path)
if not meta_data or "task_id" not in meta_data:
continue
# Get the task directory
task_dir = meta_path.parent
# Read description.md if exists
description_path = task_dir / "description.md"
instruction_content = ""
if description_path.exists():
try:
with open(description_path, "r", encoding="utf-8") as f:
instruction_content = f.read()
except Exception as e:
print(f"Warning: Could not read {description_path}: {e}")
# Read verify.py if exists
verify_path = task_dir / "verify.py"
verify_content = ""
if verify_path.exists():
try:
with open(verify_path, "r", encoding="utf-8") as f:
verify_content = f.read()
except Exception as e:
print(f"Warning: Could not read {verify_path}: {e}")
# Create combined task data, excluding model_results
task_data = {
k: v for k, v in meta_data.items() if k != "model_results"
}
task_data["instruction"] = instruction_content
task_data["verify"] = verify_content
task_files.append({"filename": f"{meta_data['task_id']}.json", "data": task_data})
return task_files
def push_to_file(
output_file: Path,
data: Dict[str, Any],
task_files: List[Dict[str, Any]] = None,
push_to_repo: bool = False,
) -> bool:
"""Save the aggregated data to file and optionally push to repo"""
try:
# Create parent directory if it doesn't exist
output_file.parent.mkdir(parents=True, exist_ok=True)
# Write the aggregated data
with open(output_file, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, ensure_ascii=False)
print(f"✅ Task meta data saved to: {output_file}")
print(f"📊 Summary:")
print(f" - Total tasks with meta.json: {data['count']}")
print(f" - Categories: {len(data['categories'])}")
print(f" - Unique tags: {len(data['tags'])}")
if push_to_repo:
return push_to_experiments_repo(output_file, task_files)
return True
except Exception as e:
print(f"❌ Error saving file: {e}")
return False
def push_to_experiments_repo(
file_path: Path, task_files: List[Dict[str, Any]] = None
) -> bool:
"""Push the task meta file and individual task files to eval-sys/mcpmark-experiments repo"""
if not file_path.exists():
print("⚠️ File does not exist")
return False
repo_url = "https://github.com/eval-sys/mcpmark-experiments.git"
    temp_dir = Path("./temp_experiments_repo")
    # Capture the original working directory up front so the finally block can
    # always restore it, even if cloning fails before we chdir into the repo.
    original_dir = os.getcwd()
    try:
print(f"\n🔄 Preparing to push task meta to experiments repo...")
# Clean up any existing temp directory
if temp_dir.exists():
shutil.rmtree(temp_dir)
# Clone the repo
print("📥 Cloning experiments repo...")
subprocess.run(
["git", "clone", repo_url, str(temp_dir)], check=True, capture_output=True
)
# Copy the main task_meta.json file
target_path = temp_dir / "task_meta.json"
print(f"📁 Copying task meta file: task_meta.json")
shutil.copy2(file_path, target_path)
# Create tasks directory and copy individual task files
if task_files:
tasks_dir = temp_dir / "tasks"
tasks_dir.mkdir(exist_ok=True)
print(f"📁 Creating individual task files in ./tasks directory...")
for task_file in task_files:
task_file_path = tasks_dir / task_file["filename"]
with open(task_file_path, "w", encoding="utf-8") as f:
json.dump(task_file["data"], f, indent=2, ensure_ascii=False)
print(f" - Created {len(task_files)} individual task files")
# Change to repo directory for git operations
os.chdir(temp_dir)
# Add all changes
subprocess.run(["git", "add", "."], check=True)
# Check if there are changes to commit
result = subprocess.run(
["git", "status", "--porcelain"], capture_output=True, text=True
)
if not result.stdout.strip():
print("✅ No changes to push (files are up to date)")
return True
# Commit changes
commit_msg = "Update task meta data and individual task files"
subprocess.run(["git", "commit", "-m", commit_msg], check=True)
# Push changes
print("🚀 Pushing to remote repository...")
subprocess.run(["git", "push"], check=True)
print("✅ Successfully pushed task meta and individual task files to repo!")
return True
except subprocess.CalledProcessError as e:
print(f"❌ Git operation failed: {e}")
return False
except Exception as e:
print(f"❌ Error pushing to repo: {e}")
return False
finally:
# Change back to original directory
os.chdir(original_dir)
# Clean up temp directory
if temp_dir.exists():
shutil.rmtree(temp_dir)
def main():
parser = argparse.ArgumentParser(description="Aggregate all task meta.json files")
parser.add_argument(
"--output",
type=str,
default="task_meta.json",
help="Output file path (default: task_meta.json)",
)
parser.add_argument(
"--push",
action="store_true",
help="Push results to eval-sys/mcpmark-experiments repo",
)
args = parser.parse_args()
print("🔍 Searching for meta.json files in tasks directory...")
# Find all meta.json files
meta_files = find_all_meta_files()
if not meta_files:
print("❌ No meta.json files found in tasks directory")
return 1
print(f"📁 Found {len(meta_files)} meta.json files")
# Aggregate the data
print("🔄 Aggregating task meta data...")
aggregated_data = aggregate_task_meta(meta_files)
# Create individual task files if pushing to repo
task_files = None
if args.push:
print("🔄 Creating individual task files...")
task_files = create_individual_task_files(meta_files)
print(f"📝 Prepared {len(task_files)} individual task files")
# Save to file
output_path = Path(args.output)
success = push_to_file(output_path, aggregated_data, task_files, args.push)
if not success:
return 1
if args.push:
print(
f"🚀 Task meta data and individual task files pushed to eval-sys/mcpmark-experiments repo"
)
return 0
if __name__ == "__main__":
exit(main())
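# Illustrative sketch of the aggregated output (not part of the original module).
# Given the meta.json fields used above, task_meta.json roughly has the shape:
#
#   {
#     "data": [ { ...meta.json contents minus "model_results"... }, ... ],
#     "count": <number of tasks>,
#     "categories": [ {"id": "<category_id>", "name": "<category_name>"}, ... ],
#     "tags": ["<tag>", ...]
#   }
#
# With --push, each task is additionally written to tasks/<task_id>.json with
# "instruction" (description.md) and "verify" (verify.py) fields added.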
================================================
FILE: src/aggregators/pricing.py
================================================
"""
Pricing utilities for computing per-run cost from token usage.
All prices are specified per 1,000,000 tokens (M tokens) in USD.
"""
from __future__ import annotations
from typing import Dict, Optional
# Price map keyed by canonical model name (lowercased)
# Values are dicts with per-M token prices for input and output tokens
MODEL_PRICES_PER_M: Dict[str, Dict[str, float]] = {
# Use exact actual_model_name keys (lowercased) provided by the user
# Anthropic
"claude-opus-4-1-20250805": {"input": 15.0, "output": 75.0},
"claude-opus-4-5-20251101": {"input": 5.0, "output": 25.0},
"claude-sonnet-4-20250514": {"input": 3.0, "output": 15.0},
"claude-sonnet-4-5-20250929": {"input": 3.0, "output": 15.0},
# DeepSeek
"deepseek-v3.1-non-think": {"input": 0.56, "output": 1.68},
"deepseek-v3.2-chat": {"input": 0.27, "output": 0.40},
"deepseek-v3.2-reasoner": {"input": 0.27, "output": 0.40},
"deepseek-v3.1-terminus-thinking": {"input": 0.21, "output": 0.79},
"deepseek-v3.1-terminus": {"input": 0.21, "output": 0.79},
# Google Gemini
"gemini-2.5-pro": {"input": 2.5, "output": 15.0},
"gemini-2.5-flash": {"input": 0.3, "output": 2.5},
"gemini-3-pro": {"input": 2.0, "output": 12.0},
# Z.AI
"glm-4.5": {"input": 0.33, "output": 1.32},
# OpenAI
"gpt-5-2025-08-07": {"input": 1.25, "output": 10.0},
"gpt-5.2-2025-12-11": {"input": 1.75, "output": 14.0},
"gpt-5-mini-2025-08-07": {"input": 0.25, "output": 2.0},
"gpt-5-nano-2025-08-07": {"input": 0.05, "output": 0.4},
"gpt-4.1-2025-04-14": {"input": 2.0, "output": 8.0},
"gpt-4.1-mini-2025-04-14": {"input": 0.4, "output": 1.6},
"gpt-4.1-nano-2025-04-14": {"input": 0.1, "output": 0.4},
"o3-2025-04-16": {"input": 2.0, "output": 8.0},
"o4-mini-2025-04-16": {"input": 1.1, "output": 4.4},
"gpt-oss-120b": {"input": 0.072, "output": 0.28},
# Qwen
"qwen3-coder-480b-a35b-instruct": {"input": 0.2, "output": 0.8},
"qwen3-max-preview": {"input": 1.2, "output": 6},
# Xai
"grok-4-0709": {"input": 3.0, "output": 15.0},
"grok-code-fast-1": {"input": 0.2, "output": 1.5},
"grok-4-fast": {"input": 0.2, "output": 0.5},
# Moonshot
"kimi-k2-0711-preview": {"input": 0.6, "output": 2.5},
"kimi-k2-0905-preview": {"input": 0.6, "output": 2.5},
}
def normalize_model_name(model_name: str) -> str:
"""Normalize model name for pricing lookup.
    Strips surrounding whitespace and lowercases.
"""
return (model_name or "").strip().lower()
def get_price_per_m(model_name: str) -> Optional[Dict[str, float]]:
"""Return per-M token prices for given model, or None if unknown."""
key = normalize_model_name(model_name)
return MODEL_PRICES_PER_M.get(key)
def compute_cost_usd(model_name: str, input_tokens: float, output_tokens: float) -> Optional[float]:
"""Compute cost in USD given token usage and model pricing.
Prices are per 1,000,000 tokens. If pricing unknown, returns None.
"""
prices = get_price_per_m(model_name)
if not prices:
return None
input_cost = (input_tokens / 1_000_000.0) * prices["input"]
output_cost = (output_tokens / 1_000_000.0) * prices["output"]
return float(round(input_cost + output_cost, 6))
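# Illustrative usage (a sketch, not part of the original module). Prices are per
# 1M tokens, so 120k input and 30k output tokens on "gpt-5-mini-2025-08-07"
# ($0.25 / $2.0 per M) cost about 0.12 * 0.25 + 0.03 * 2.0 = 0.09 USD:
#
#   cost = compute_cost_usd("gpt-5-mini-2025-08-07", 120_000, 30_000)   # ~0.09
#   unknown = compute_cost_usd("some-unlisted-model", 1_000, 1_000)     # None (hypothetical name)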
================================================
FILE: src/base/__init__.py
================================================
================================================
FILE: src/base/login_helper.py
================================================
from abc import ABC, abstractmethod
class BaseLoginHelper(ABC):
"""Abstract base class for login helpers."""
def __init__(self):
pass
@abstractmethod
def login(self, **kwargs):
pass
================================================
FILE: src/base/state_manager.py
================================================
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from src.logger import get_logger
from .task_manager import BaseTask
# Initialize logger
logger = get_logger(__name__)
@dataclass
class InitialStateInfo:
"""Information about created initial state for a task."""
state_id: str
state_url: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None
class BaseStateManager(ABC):
"""
Simplified abstract base class for state management in MCP services.
This class provides essential functionality for initial state creation and cleanup
while allowing service-specific implementations through template methods.
"""
def __init__(self, service_name: str):
self.service_name = service_name
# Simple resource tracking for cleanup
self.tracked_resources: List[Dict[str, Any]] = []
# Note: Initialization is now handled in service-specific constructors
def set_up(self, task: BaseTask) -> bool:
"""Set up initial state for a specific task.
Args:
task: The task for which to set up the initial state
Returns:
True if setup successful, False otherwise
"""
try:
logger.info(
f"| Setting up initial state for {self.service_name} task: {task.name}"
)
# Create initial state
initial_state_info = self._create_initial_state(task)
if not initial_state_info:
logger.error(f"| Failed to create initial state for {task.name}")
return False
# Store initial state info in task
self._store_initial_state_info(task, initial_state_info)
logger.info(f"| ✓ Initial state setup completed for {task.name}")
return True
except Exception as e:
logger.error(f"| Setup failed for {task.name}: {e}")
return False
def clean_up(self, task: BaseTask = None) -> bool:
"""Clean up resources with common patterns and service-specific hooks.
Args:
task: Optional task to clean up specific resources for
Returns:
True if cleanup successful, False otherwise
"""
try:
cleanup_success = True
# Task-specific cleanup
if task:
logger.info(
f"| ○ Cleaning up initial state for {self.service_name} task: {task.name}"
)
if not self._cleanup_task_initial_state(task):
cleanup_success = False
# Clean up all tracked resources
if not self._cleanup_tracked_resources():
cleanup_success = False
if cleanup_success:
logger.info(f"| ✓ Cleanup completed for {self.service_name}")
else:
logger.warning(
f"| Cleanup completed with some failures for {self.service_name}"
)
return cleanup_success
except Exception as e:
logger.error(f"Cleanup failed for {self.service_name}: {e}")
return False
def track_resource(
self,
resource_type: str,
identifier: str,
metadata: Optional[Dict[str, Any]] = None,
) -> None:
"""Track a resource for later cleanup.
Args:
resource_type: Type of resource (e.g., 'repository', 'page')
identifier: Unique identifier for the resource
metadata: Additional metadata about the resource
"""
resource = {
"type": resource_type,
"id": identifier,
"created_at": time.time(),
"metadata": metadata or {},
}
self.tracked_resources.append(resource)
logger.debug(f"Tracked {resource_type} resource: {identifier}")
def get_service_config_for_agent(self) -> dict:
"""
Get service-specific configuration for agent execution.
This method should be overridden by service implementations that need
to provide additional configuration to the agent.
Returns:
Dictionary containing configuration needed by the agent/MCP server
"""
return {}
def set_verification_environment(self, messages_path: str = None) -> None:
"""
Set environment variables needed for verification scripts.
Args:
messages_path: Optional path to messages.json file for verification
This method can be overridden by service implementations that need
to set specific environment variables for their verification scripts.
The default implementation sets MCP_MESSAGES if provided.
"""
import os
if messages_path:
os.environ["MCP_MESSAGES"] = str(messages_path)
def _cleanup_tracked_resources(self) -> bool:
"""Clean up all tracked resources."""
cleanup_success = True
for resource in self.tracked_resources:
try:
if not self._cleanup_single_resource(resource):
cleanup_success = False
except Exception as e:
logger.error(f"Failed to cleanup resource {resource}: {e}")
cleanup_success = False
# Clear resources after cleanup attempt
self.tracked_resources.clear()
return cleanup_success
# =========================================================================
# Abstract methods for service-specific behavior (simplified)
# =========================================================================
# Note: Service-specific initialization is now handled in constructors
@abstractmethod
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
"""Create initial state for a task (e.g., duplicate page, fork repo).
Args:
task: Task for which to create initial state
Returns:
InitialStateInfo object or None if creation failed
"""
pass
@abstractmethod
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
"""Store initial state information in the task object.
Args:
task: Task object to update
state_info: Initial state information to store
"""
pass
@abstractmethod
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up initial state for a specific task.
Args:
task: Task whose initial state should be cleaned up
Returns:
True if cleanup successful, False otherwise
"""
pass
@abstractmethod
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single tracked resource.
Args:
resource: Resource dictionary with type, id, and metadata
Returns:
True if cleanup successful, False otherwise
"""
pass
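# Minimal sketch of a concrete state manager (illustrative only, not part of the
# original module; the real implementations live under src/mcp_services/*).
# A hypothetical subclass only needs the four hooks above, e.g.:
#
#   class DummyStateManager(BaseStateManager):
#       def _create_initial_state(self, task):
#           return InitialStateInfo(state_id=f"dummy-{task.name}")
#       def _store_initial_state_info(self, task, state_info):
#           task.state_id = state_info.state_id
#       def _cleanup_task_initial_state(self, task):
#           return True
#       def _cleanup_single_resource(self, resource):
#           return True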
================================================
FILE: src/base/task_manager.py
================================================
#!/usr/bin/env python3
"""
Enhanced Base Task Manager with Common Task Discovery Logic
===========================================================
This module provides an improved base class for task managers that consolidates
common task discovery patterns while maintaining flexibility for service-specific needs.
"""
import json
import subprocess
import sys
from abc import ABC
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.logger import get_logger
from src.results_reporter import TaskResult
logger = get_logger(__name__)
@dataclass
class BaseTask:
"""Base class for evaluation tasks."""
task_instruction_path: Path
task_verification_path: Path
service: str
category_id: str # From meta.json if available, otherwise directory name
task_id: str # From meta.json if available, otherwise directory name
@property
def name(self) -> str:
"""Return the task name using '__' separator format: 'category_id__task_id'."""
return f"{self.category_id}__{self.task_id}"
def get_task_instruction(self) -> str:
"""Return the full text content of the task instruction file."""
if not self.task_instruction_path.exists():
raise FileNotFoundError(
f"Task instruction file not found: {self.task_instruction_path}"
)
return self.task_instruction_path.read_text(encoding="utf-8")
class BaseTaskManager(ABC):
"""Enhanced base class for service-specific task managers with common discovery logic."""
def __init__(
self,
tasks_root: Path,
mcp_service: str = None,
task_class: type = None,
task_organization: str = None,
task_suite: str | None = "standard",
):
"""Initialize the base task manager.
Args:
tasks_root: Root directory containing all tasks
mcp_service: MCP service name (e.g., 'notion', 'github', 'filesystem')
task_class: Custom task class to use (defaults to BaseTask)
task_organization: 'file' or 'directory' based task organization
task_suite: Logical task suite (e.g., 'standard', 'easy')
"""
self.tasks_root = tasks_root
self.mcp_service = mcp_service or self.__class__.__name__.lower().replace(
"taskmanager", ""
)
self.task_class = task_class or BaseTask
self.task_organization = task_organization
self.task_suite = task_suite
self._tasks_cache = None
# =========================================================================
# Common Task Discovery Implementation
# =========================================================================
def discover_all_tasks(self) -> List[BaseTask]:
"""Discover all available tasks for this service (common implementation)."""
if self._tasks_cache is not None:
return self._tasks_cache
tasks = []
service_dir = self.tasks_root / (
self.mcp_service or self._get_service_directory_name()
)
if self.task_suite:
service_dir = service_dir / self.task_suite
if not service_dir.exists():
logger.warning(
f"{self.mcp_service.title()} tasks directory does not exist: {service_dir}"
)
return tasks
# Scan categories
for category_dir in service_dir.iterdir():
if not self._is_valid_category_dir(category_dir):
continue
category_id = category_dir.name
logger.info("Discovering tasks in category: %s", category_id)
# Find tasks using service-specific logic
task_files = self._find_task_files(category_dir)
for task_files_info in task_files:
task = self._create_task_from_files(category_id, task_files_info)
if task:
tasks.append(task)
logger.debug("Found task: %s", task.name)
# Sort and cache
# Sort by category_id and a stringified task_id to handle both numeric IDs and slugs uniformly
self._tasks_cache = sorted(tasks, key=lambda t: (t.category_id, str(t.task_id)))
logger.info(
"Discovered %d %s tasks across all categories (suite=%s)",
len(self._tasks_cache),
self.mcp_service.title(),
self.task_suite or "default",
)
return self._tasks_cache
def get_categories(self) -> List[str]:
"""Get a list of all task categories (common implementation)."""
tasks = self.discover_all_tasks()
return sorted(list(set(task.category_id for task in tasks)))
def filter_tasks(self, task_filter: str) -> List[BaseTask]:
"""Filter tasks based on category or specific task pattern (common implementation)."""
all_tasks = self.discover_all_tasks()
if not task_filter or task_filter.lower() == "all":
return all_tasks
# Check if it's a category filter
categories = self.get_categories()
if task_filter in categories:
return [task for task in all_tasks if task.category_id == task_filter]
# Check for specific task pattern (category_id/task_id)
if "/" in task_filter:
try:
category, task_part = task_filter.split("/", 1)
# First try to match by task_id (could be numeric or string)
for task in all_tasks:
if task.category_id == category:
# Check if task_id matches (as string or as specific pattern)
if str(task.task_id) == task_part:
return [task]
except (ValueError, IndexError):
pass
# Fallback: check for partial matches in task names or categories
filtered_tasks = []
for task in all_tasks:
if (
task_filter in task.category_id
or task_filter in task.name
or task_filter == str(task.task_id)
):
filtered_tasks.append(task)
return filtered_tasks
# =========================================================================
# Common Helper Methods
# =========================================================================
def get_task_instruction(self, task: BaseTask) -> str:
"""Get formatted task instruction (template method)."""
base_instruction = self._read_task_instruction(task)
return self._format_task_instruction(base_instruction)
def execute_task(self, task: BaseTask, agent_result: Dict[str, Any]) -> TaskResult:
"""Execute task verification (template method)."""
logger.info(f"| Verifying task ({self.mcp_service.title()}): {task.name}")
# Track agent success separately
agent_success = agent_result.get("success", False)
agent_error = None
verification_success = False
verification_error = None
verification_output = None
# Handle agent failure (but still continue to verification)
if not agent_success:
agent_error = agent_result.get("error", "Agent execution failed")
# Standardize MCP network errors
agent_error = self._standardize_error_message(agent_error)
logger.error(f"| ✗ Agent execution failed for task")
logger.error(f"| ⚠️ Error: {agent_error}")
logger.info(f"| - Proceeding with verification despite agent failure")
try:
# Always run verification regardless of agent success
verify_result = self.run_verification(task)
# Process verification results
verification_success = verify_result.returncode == 0
verification_output = verify_result.stdout
# Log verification output
if verification_output:
print(verification_output)
# Capture verification error if failed
if not verification_success:
verification_error = verify_result.stderr if verify_result.stderr else "Verification failed with no error message"
if verification_success:
logger.info(f"| Verification Result: \033[92m✓ PASSED\033[0m")
else:
logger.error(f"| Verification Result: \033[91m✗ FAILED\033[0m")
if verification_error:
logger.error(f"| Verification Error: {verification_error}")
return TaskResult(
task_name=task.name,
success=verification_success,
error_message=agent_error, # Agent execution error
verification_error=verification_error, # Verification error
verification_output=verification_output, # Verification output
model_output=agent_result.get("output", ""),
category_id=task.category_id,
task_id=task.task_id,
token_usage=agent_result.get("token_usage", {}),
turn_count=agent_result.get("turn_count", -1),
)
except Exception as e:
logger.error(f"| Task verification failed: {e}", exc_info=True)
return TaskResult(
task_name=task.name,
success=False,
error_message=agent_error, # Keep agent error if any
verification_error=str(e), # Verification exception
verification_output=None,
category_id=task.category_id,
task_id=task.task_id,
model_output=agent_result.get("output", ""),
token_usage=agent_result.get("token_usage", {}),
turn_count=agent_result.get("turn_count", 0),
)
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
"""Run the verification script for a task (can be overridden).
Default implementation runs the verification command.
Services can override this to add environment variables or custom logic.
"""
return subprocess.run(
self._get_verification_command(task),
capture_output=True, # Capture stdout and stderr for logging
text=True,
timeout=300,
)
# =========================================================================
# Abstract Methods - Minimal Set Required
# =========================================================================
def _get_service_directory_name(self) -> str:
"""Return the service directory name (e.g., 'notion', 'github').
Default implementation uses the service parameter if provided.
"""
if self.mcp_service:
return self.mcp_service
raise NotImplementedError(
"Must provide service parameter or implement _get_service_directory_name"
)
def _get_task_organization(self) -> str:
"""Return task organization type: 'directory' or 'file'.
- 'directory': Tasks organized as task_X/description.md (Notion)
- 'file': Tasks organized as task_X.md (GitHub, Filesystem)
Default implementation uses the task_organization parameter if provided.
"""
if self.task_organization:
return self.task_organization
raise NotImplementedError(
"Must provide task_organization parameter or implement _get_task_organization"
)
# Note: _create_task_instance is no longer needed - use task_class parameter instead
# =========================================================================
# Hook Methods with Smart Defaults
# =========================================================================
def _is_valid_category_dir(self, category_dir: Path) -> bool:
"""Check if a directory is a valid category directory."""
return (
category_dir.is_dir()
and not category_dir.name.startswith(".")
and category_dir.name != "utils"
and category_dir.name != "__pycache__"
)
def _find_task_files(self, category_dir: Path) -> List[Dict[str, Any]]:
"""Find task files in a category directory (smart default implementation).
Automatically handles both directory-based and file-based organization.
"""
task_files: List[Dict[str, Any]] = []
for task_dir in category_dir.iterdir():
# Skip anything that is not a directory or is hidden
if not task_dir.is_dir() or task_dir.name.startswith("."):
continue
description_path = task_dir / "description.md"
verify_path = task_dir / "verify.py"
# We consider a directory a valid task only if the two mandatory files exist
if not (description_path.exists() and verify_path.exists()):
logger.warning(
"Skipping %s – missing description.md or verify.py", task_dir
)
continue
task_files.append(
{
"task_id": task_dir.name,
"instruction_path": description_path,
"verification_path": verify_path,
}
)
return task_files
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> Optional[BaseTask]:
"""Create a task from file information with meta.json support."""
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
# Default to directory names
task_id = task_files_info["task_id"]
final_category_id = category_id
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return self.task_class(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service=self.mcp_service,
category_id=final_category_id,
task_id=task_id,
)
def _read_task_instruction(self, task: BaseTask) -> str:
"""Read and return the task instruction content."""
return task.get_task_instruction()
def _format_task_instruction(self, base_instruction: str) -> str:
"""Format task instruction with Notion-specific additions."""
return (
base_instruction
+ "\n\nNote: Based on your understanding, solve the task all at once by yourself, don't ask for my opinions on anything."
)
def _get_verification_command(self, task: BaseTask) -> List[str]:
"""Get the command to run task verification (default implementation)."""
return [sys.executable, str(task.task_verification_path)]
def _standardize_error_message(self, error_message: str) -> str:
"""Standardize error messages for consistent reporting."""
from src.errors import standardize_error_message
return standardize_error_message(error_message, mcp_service=self.mcp_service)
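# Illustrative note (not part of the original module): with the default discovery
# logic above, a task is any directory that contains both description.md and
# verify.py, e.g.
#
#   tasks/<service>/<suite>/<category>/<task>/
#       description.md   # instruction handed to the agent
#       verify.py        # run with sys.executable; exit code 0 means the task passed
#       meta.json        # optional; may override category_id / task_id
#
# and its canonical name is "<category_id>__<task_id>".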
================================================
FILE: src/config/__init__.py
================================================
================================================
FILE: src/config/config_schema.py
================================================
#!/usr/bin/env python3
"""
Centralized Configuration Schema for MCPMark
=============================================
This module provides a unified configuration system with validation,
type safety, and support for multiple configuration sources.
"""
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional
import yaml
from dotenv import load_dotenv
from src.logger import get_logger
logger = get_logger(__name__)
# Lazy import to avoid circular dependencies
def get_service_definition(service_name: str) -> dict:
from src.services import get_service_definition as _get_service_def
return _get_service_def(service_name)
@dataclass
class ConfigValue:
"""Represents a configuration value with metadata."""
key: str
value: Any
source: str # 'env', 'file', 'default'
required: bool = True
description: str = ""
validator: Optional[callable] = None
def validate(self) -> bool:
"""Validate the configuration value."""
if self.required and self.value is None:
raise ValueError(f"Required configuration '{self.key}' is missing")
if self.validator and self.value is not None:
if not self.validator(self.value):
raise ValueError(f"Invalid value for '{self.key}': {self.value}")
return True
class ConfigSchema(ABC):
"""Abstract base class for service configuration schemas."""
def __init__(self, service_name: str):
self.service_name = service_name
self._values: Dict[str, ConfigValue] = {}
self._load_dotenv()
self._define_schema()
self._load_values()
self._validate()
@abstractmethod
def _define_schema(self) -> None:
"""Define the configuration schema for this service."""
pass
def _load_dotenv(self) -> None:
"""Load environment variables from .mcp_env file."""
load_dotenv(dotenv_path=".mcp_env", override=False)
def _add_config(
self,
key: str,
env_var: Optional[str] = None,
default: Any = None,
required: bool = True,
description: str = "",
validator: Optional[callable] = None,
transform: Optional[callable] = None,
) -> None:
"""Add a configuration value to the schema."""
# Try to get value from environment first
value = None
source = "default"
if env_var:
env_value = os.getenv(env_var)
if env_value is not None:
value = transform(env_value) if transform else env_value
source = "env"
# Use default if no environment value
if value is None and default is not None:
value = default
source = "default"
self._values[key] = ConfigValue(
key=key,
value=value,
source=source,
required=required,
description=description,
validator=validator,
)
def _load_values(self) -> None:
"""Load configuration values from file if available."""
config_file = Path(f"config/{self.service_name}.yaml")
if config_file.exists():
with open(config_file) as f:
file_config = yaml.safe_load(f)
for key, value in file_config.items():
if key in self._values and self._values[key].value is None:
self._values[key].value = value
self._values[key].source = "file"
def _validate(self) -> None:
"""Validate all configuration values."""
for config_value in self._values.values():
config_value.validate()
def get(self, key: str, default: Any = None) -> Any:
"""Get a configuration value."""
if key in self._values:
return self._values[key].value
return default
def get_all(self) -> Dict[str, Any]:
"""Get all configuration values as a dictionary."""
return {k: v.value for k, v in self._values.items()}
def get_debug_info(self) -> Dict[str, Dict[str, Any]]:
"""Get detailed configuration information for debugging."""
return {
k: {
"value": v.value,
"source": v.source,
"required": v.required,
"description": v.description,
}
for k, v in self._values.items()
}
class GenericConfigSchema(ConfigSchema):
"""Generic configuration schema that reads from service definitions."""
def __init__(self, service_name: str):
# Get service definition before calling parent init
self.service_definition = get_service_definition(service_name)
super().__init__(service_name)
def _define_schema(self) -> None:
"""Define schema from service definition."""
config_schema = self.service_definition.get("config_schema", {})
for key, config in config_schema.items():
# Handle transform strings
transform = None
transform_str = config.get("transform")
if transform_str == "bool":
transform = lambda x: x.lower() in ["true", "1", "yes"]
elif transform_str == "int":
transform = int
elif transform_str == "path":
transform = lambda x: Path(x) if x else None
elif transform_str == "list":
transform = lambda x: [t.strip() for t in x.split(",")] if x else []
# Handle validator strings
validator = None
validator_str = config.get("validator")
if validator_str == "port":
validator = lambda x: 1 <= x <= 65535
elif validator_str and validator_str.startswith("in:"):
valid_values = validator_str[3:].split(",")
validator = lambda x, values=valid_values: x in values
self._add_config(
key=key,
env_var=config.get("env_var"),
default=config.get("default"),
required=config.get("required", True),
description=config.get("description", ""),
validator=validator,
transform=transform,
)
# Configuration Registry
class ConfigRegistry:
"""Central registry for all service configurations."""
_instances: Dict[str, ConfigSchema] = {}
@classmethod
def get_config(cls, service_name: str) -> ConfigSchema:
"""Get or create configuration for a service."""
if service_name not in cls._instances:
cls._instances[service_name] = GenericConfigSchema(service_name)
return cls._instances[service_name]
@classmethod
def validate_all(cls) -> Dict[str, bool]:
"""Validate all registered configurations."""
from src.services import get_supported_mcp_services
results = {}
for service_name in get_supported_mcp_services():
try:
cls.get_config(service_name)
results[service_name] = True
except Exception as e:
logger.error(f"Configuration validation failed for {service_name}: {e}")
results[service_name] = False
return results
@classmethod
def export_template(cls, service_name: str, output_path: Path) -> None:
"""Export a configuration template for a service."""
config = cls.get_config(service_name)
template = {"service": service_name, "configuration": {}}
for key, config_value in config._values.items():
template["configuration"][key] = {
"value": config_value.value
if config_value.source == "default"
else None,
"description": config_value.description,
"required": config_value.required,
"env_var": f"${{{key.upper()}}}",
}
with open(output_path, "w") as f:
yaml.dump(template, f, default_flow_style=False, sort_keys=False)
# Utility Functions
def get_service_config(service_name: str) -> Dict[str, Any]:
"""Get service configuration as a dictionary."""
return ConfigRegistry.get_config(service_name).get_all()
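# Illustrative sketch (not part of the original module): a hypothetical entry in a
# service definition's "config_schema", as consumed by GenericConfigSchema above.
#
#   "config_schema": {
#       "host": {"env_var": "POSTGRES_HOST", "default": "localhost",
#                "required": True, "description": "Database host"},
#       "port": {"env_var": "POSTGRES_PORT", "transform": "int",
#                "validator": "port", "default": 5432},
#   }
#
# Values come from the environment variable first, falling back to the default;
# config/<service>.yaml only fills keys that are still unset. Downstream code can
# read the result via get_service_config("postgres") as a plain dict.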
================================================
FILE: src/errors.py
================================================
#!/usr/bin/env python3
"""
Simple Error Handling for MCPMark
==================================
Provides basic error standardization and retry logic.
"""
from typing import Optional
"""Retryable error detection via minimal substring matching (lower-case)."""
# Keep this list short and generic; aim to catch API/infrastructure issues only.
RETRYABLE_PATTERNS = {
"ratelimit", # e.g., RateLimitError, too many requests
# "connection", # connection refused/reset/error
"agent execution failed",
"unavailable", # service unavailable
# "execution timed out", # timeout
"internal server error", # 500s
"network error", # generic network issue
"quota", # budget/quota exceeded
# "llm provider not provided", # litellm error
# pipeline infra signals
"account balance",
"mcp network error",
"state duplication error",
"thought_signature",
"overloaded."
}
def is_retryable_error(error: str) -> bool:
"""Return True if the error string contains any retryable pattern."""
error_lower = str(error or "").lower()
return any(pattern in error_lower for pattern in RETRYABLE_PATTERNS)
def standardize_error_message(error: str, mcp_service: Optional[str] = None) -> str:
"""Standardize error messages for consistent reporting."""
error_str = str(error).strip()
# Common standardizations
if "timeout" in error_str.lower():
base_msg = "Operation timed out"
elif (
"connection refused" in error_str.lower() or "econnrefused" in error_str.lower()
):
base_msg = "Connection refused"
elif "not found" in error_str.lower():
base_msg = "Resource not found"
elif "already exists" in error_str.lower():
base_msg = "Resource already exists"
else:
# Return original message if no standardization applies
return error_str
# Add MCP service prefix if provided
if mcp_service:
return f"{mcp_service.title()} {base_msg}"
return base_msg
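# Illustrative behaviour (not part of the original module):
#
#   is_retryable_error("RateLimitError: too many requests")      # True  (matches "ratelimit")
#   is_retryable_error("verification failed: wrong page title")  # False
#   standardize_error_message("connection refused by host", "notion")
#   # -> "Notion Connection refused"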
================================================
FILE: src/evaluator.py
================================================
import time
import json
import shutil
from datetime import datetime
from pathlib import Path
from typing import List, Optional
from src.logger import get_logger
from src.factory import MCPServiceFactory
from src.model_config import ModelConfig
from src.results_reporter import EvaluationReport, ResultsReporter, TaskResult
from src.errors import is_retryable_error
from src.agents import AGENT_REGISTRY
# Initialize logger
logger = get_logger(__name__)
class MCPEvaluator:
def __init__(
self,
mcp_service: str,
model: str,
timeout: int = 300,
exp_name: str = "test-run",
output_dir: Path = None,
reasoning_effort: str = "default",
agent_name: str = "mcpmark",
task_suite: str = "standard",
compaction_token: int = 0,
):
# Main configuration
self.mcp_service = mcp_service
self.timeout = timeout
self.agent_name = (agent_name or "mcpmark").lower()
self.task_suite = (task_suite or "standard").lower()
if self.agent_name not in AGENT_REGISTRY:
raise ValueError(f"Unsupported agent '{agent_name}'. Available: {sorted(AGENT_REGISTRY)}")
# Initialize model configuration
self.reasoning_effort = reasoning_effort
self.model_name = model
model_config = ModelConfig(self.model_name)
self.api_key = model_config.api_key
self.base_url = model_config.base_url
self.litellm_input_model_name = model_config.litellm_input_model_name
# Track the actual model name from LiteLLM responses
self.litellm_run_model_name = None
# Initialize managers using the factory pattern (simplified)
self.task_manager = MCPServiceFactory.create_task_manager(
mcp_service, task_suite=self.task_suite
)
self.state_manager = MCPServiceFactory.create_state_manager(mcp_service)
# Obtain static service configuration from state manager (e.g., notion_key)
self.service_config = self.state_manager.get_service_config_for_agent()
# Initialize agent for LLM and MCP server management. The agent will
# automatically refresh its service configuration from the state
# manager before each execution, so per-task manual updates are no
# longer needed.
agent_cls = AGENT_REGISTRY[self.agent_name]
self.agent = agent_cls(
litellm_input_model_name=self.litellm_input_model_name,
api_key=self.api_key,
base_url=self.base_url,
mcp_service=mcp_service,
timeout=timeout,
service_config=self.service_config,
service_config_provider=self.state_manager.get_service_config_for_agent,
reasoning_effort=self.reasoning_effort,
compaction_token=compaction_token,
)
# Initialize results reporter
self.results_reporter = ResultsReporter()
# Output directory handling
if self.reasoning_effort != "default":
model_slug = self.model_name.replace(".", "-") + "-" + self.reasoning_effort
else:
model_slug = self.model_name.replace(".", "-")
service_for_dir = "playwright" if mcp_service == "playwright_webarena" else mcp_service
suite_suffix = "" if self.task_suite in ("standard", "", None) else f"-{self.task_suite}"
service_dir_name = f"{service_for_dir}{suite_suffix}"
self.base_experiment_dir = output_dir / f"{model_slug}__{service_dir_name}" / exp_name
self.base_experiment_dir.mkdir(parents=True, exist_ok=True)
def _format_duration(self, seconds: float) -> str:
"""Format duration: <1s as ms, otherwise seconds."""
return f"{(seconds * 1000):.2f}ms" if seconds < 1 else f"{seconds:.2f}s"
def _get_task_output_dir(self, task) -> Path:
"""Return the directory path for storing this task's reports using '__' separator."""
# Use category_id and task_id with '__' separator
category_id = task.category_id if task.category_id else "uncategorized"
task_id = str(task.task_id)
return self.base_experiment_dir / f"{category_id}__{task_id}"
# ------------------------------------------------------------------
# Resuming helpers
# ------------------------------------------------------------------
def _load_latest_task_result(self, task) -> Optional[TaskResult]:
"""Return the most recent TaskResult for *task* if it has been run before."""
task_dir = self._get_task_output_dir(task)
if not task_dir.exists():
return None
meta_path = task_dir / "meta.json"
if not meta_path.exists():
return None
try:
with meta_path.open("r", encoding="utf-8") as f:
meta_data = json.load(f)
return TaskResult(
task_name=meta_data["task_name"],
success=meta_data["execution_result"]["success"],
error_message=meta_data["execution_result"].get("error_message"),
verification_error=meta_data["execution_result"].get("verification_error"),
verification_output=meta_data["execution_result"].get("verification_output"),
category_id=task.category_id,
task_id=task.task_id,
model_output=None,
token_usage=meta_data.get("token_usage", {}),
turn_count=meta_data.get("turn_count"),
agent_execution_time=meta_data.get("agent_execution_time", 0.0),
task_execution_time=meta_data.get("task_execution_time", 0.0),
)
except Exception as exc:
logger.warning("Failed to load existing result for %s: %s", task.name, exc)
return None
def _gather_all_task_results(self) -> List[TaskResult]:
"""Scan *all* task sub-directories and collect the latest TaskResult from each."""
results: list[TaskResult] = []
if not self.base_experiment_dir.exists():
return results
for task_dir in self.base_experiment_dir.iterdir():
if not task_dir.is_dir():
continue
meta_path = task_dir / "meta.json"
if not meta_path.exists():
continue
try:
with meta_path.open("r", encoding="utf-8") as f:
meta_data = json.load(f)
category_id, task_id = task_dir.name.split("__", 1)
result = TaskResult(
task_name=meta_data["task_name"],
success=meta_data["execution_result"]["success"],
error_message=meta_data["execution_result"].get("error_message"),
verification_error=meta_data["execution_result"].get("verification_error"),
verification_output=meta_data["execution_result"].get("verification_output"),
category_id=category_id,
task_id=task_id,
model_output=None,
token_usage=meta_data.get("token_usage", {}),
turn_count=meta_data.get("turn_count"),
agent_execution_time=meta_data.get("agent_execution_time", 0.0),
task_execution_time=meta_data.get("task_execution_time", 0.0),
)
results.append(result)
except Exception as exc:
logger.warning(
"Failed to parse existing report in %s: %s", task_dir, exc
)
return results
def _run_single_task(self, task) -> TaskResult:
"""
Runs a single task, including setup, agent execution, verification, and cleanup.
"""
# Track overall task start time
task_start_time = time.time()
# ------------------------------------------------------------------
# Stage 1: Set up the initial state for the task
# ------------------------------------------------------------------
setup_start_time = time.time()
logger.info(
"\n┌─ Stage 1: Setup ─────────────────────────────────────────────────────"
)
setup_success = self.state_manager.set_up(task)
setup_time = time.time() - setup_start_time
if not setup_success:
logger.error(f"| State setup failed for task: {task.name}")
task_total_time = time.time() - task_start_time
return TaskResult(
task_name=task.name,
success=False,
error_message="State Duplication Error",
verification_error=None,
verification_output=None,
category_id=task.category_id,
task_id=task.task_id,
agent_execution_time=0.0,
task_execution_time=task_total_time,
)
display_time = self._format_duration(setup_time)
logger.info(f"└─ Completed in {display_time}\n")
# ------------------------------------------------------------------
# Stage 2: Execute the task using the agent
# ------------------------------------------------------------------
logger.info(
"┌─ Stage 2: Execute ───────────────────────────────────────────────────"
)
agent_execution_start_time = time.time()
# Get task instruction from task manager
task_instruction = self.task_manager.get_task_instruction(task)
# Prepare task_output_dir and tool call log file
task_output_dir = self._get_task_output_dir(task)
task_output_dir.mkdir(parents=True, exist_ok=True)
execution_log_path = task_output_dir / "execution.log"
# Remove existing execution.log to ensure clean start
if execution_log_path.exists():
execution_log_path.unlink()
# Execute with agent
agent_result = self.agent.execute_sync(
task_instruction, str(execution_log_path)
)
agent_execution_time = time.time() - agent_execution_start_time
# Extract actual model name from LiteLLM response
if agent_result.get("litellm_run_model_name"):
self.litellm_run_model_name = agent_result["litellm_run_model_name"]
# Write messages.json to task_output_dir
messages_path = task_output_dir / "messages.json"
self.results_reporter.save_messages_json(
agent_result.get("output", []), messages_path
)
# Set service-specific environment variables for verification scripts
self.state_manager.set_verification_environment(str(messages_path))
logger.info(f"└─ Completed in {self._format_duration(agent_execution_time)}\n")
# ------------------------------------------------------------------
# Stage 3: Verify
# ------------------------------------------------------------------
logger.info(
"┌─ Stage 3: Verify ────────────────────────────────────────────────────"
)
verify_start_time = time.time()
try:
result = self.task_manager.execute_task(task, agent_result)
finally:
# Clean up environment variables
import os
os.environ.pop("MCP_MESSAGES", None)
os.environ.pop("MCP_GITHUB_TOKEN", None)
verify_time = time.time() - verify_start_time
logger.info(f"└─ Completed in {self._format_duration(verify_time)}\n")
# ------------------------------------------------------------------
# Stage 4: Clean up
# ------------------------------------------------------------------
logger.info(
"┌─ Stage 4: Cleanup ───────────────────────────────────────────────────"
)
cleanup_start_time = time.time()
self.state_manager.clean_up(task)
cleanup_time = time.time() - cleanup_start_time
logger.info(f"└─ Completed in {self._format_duration(cleanup_time)}\n")
# Calculate total task execution time
task_total_time = time.time() - task_start_time
# Add timing information to the result
result.agent_execution_time = agent_execution_time
result.task_execution_time = task_total_time
return result
def run_evaluation(self, task_filter: str) -> EvaluationReport:
"""
Runs the full evaluation for the specified tasks.
"""
tasks = self.task_manager.filter_tasks(task_filter)
results = []
for task in tasks:
# --------------------------------------------------------------
# Resume check
# --------------------------------------------------------------
existing_result = self._load_latest_task_result(task)
# Decide whether to skip or retry this task
retry_due_to_error = (
existing_result is not None
and not existing_result.success
and is_retryable_error(existing_result.error_message)
)
if existing_result and not retry_due_to_error:
# Existing result is either successful or failed with a non-retryable error – skip.
logger.info(
"↩️ Skipping already-completed task (resume): %s", task.name
)
results.append(existing_result)
continue
if retry_due_to_error:
# Clean previous artifacts so that new results fully replace them.
task_output_dir = self._get_task_output_dir(task)
if task_output_dir.exists():
shutil.rmtree(task_output_dir)
logger.info(
"🔄 Retrying task due to pipeline error (%s): %s",
existing_result.error_message,
task.name,
)
# --------------------------------------------------------------
# Execute new task
# --------------------------------------------------------------
task_start = time.time()
task_result = self._run_single_task(task)
task_end = time.time()
results.append(task_result)
# Prepare directory & save
task_output_dir = self._get_task_output_dir(task)
task_output_dir.mkdir(parents=True, exist_ok=True)
# Save messages.json (conversation trajectory)
messages_path = task_output_dir / "messages.json"
            if not messages_path.exists():  # skip if already written
messages = (
task_result.model_output
if getattr(task_result, "model_output", None)
else []
)
self.results_reporter.save_messages_json(messages, messages_path)
# Save meta.json (all other metadata)
meta_path = task_output_dir / "meta.json"
model_config = {
"mcp_service": self.mcp_service,
"model_name": self.model_name,
"litellm_run_model_name": self.litellm_run_model_name,
"reasoning_effort": self.reasoning_effort,
"timeout": self.timeout,
"agent_name": self.agent_name,
}
self.results_reporter.save_meta_json(
task_result,
model_config,
datetime.fromtimestamp(task_start),
datetime.fromtimestamp(task_end),
meta_path,
)
# --------------------------------------------------------------
# Aggregate results – combine current `results` with any previously
# saved TaskResults that ALSO match the current task_filter.
# --------------------------------------------------------------
# Helper: determine if a TaskResult matches the filter string
def _matches_filter(tr: TaskResult, flt: str) -> bool:
if flt.lower() == "all":
return True
if "/" in flt:
# specific task (category_id/task_id)
category_id, task_id = flt.split("/", 1)
return tr.category_id == category_id and str(tr.task_id) == task_id
# category level
return tr.category_id == flt
# Pull existing reports from disk and merge
existing_results = [
r
for r in self._gather_all_task_results()
if _matches_filter(r, task_filter)
]
# Merge, giving preference to fresh `results` (avoids duplicates)
merged: dict[str, TaskResult] = {r.task_name: r for r in existing_results}
merged.update({r.task_name: r for r in results}) # overwrite with latest run
final_results = list(merged.values())
aggregated_report = EvaluationReport(
model_name=self.model_name,
model_config={
"mcp_service": self.mcp_service,
"model_name": self.model_name,
"litellm_run_model_name": self.litellm_run_model_name,
"reasoning_effort": self.reasoning_effort,
"timeout": self.timeout,
"agent_name": self.agent_name,
},
total_tasks=len(final_results),
successful_tasks=sum(1 for r in final_results if r.success),
failed_tasks=sum(1 for r in final_results if not r.success),
task_results=final_results,
tasks_filter=task_filter,
)
# Save model-level summary
summary_path = self.base_experiment_dir / "summary.json"
self.results_reporter.save_model_summary(aggregated_report, summary_path)
logger.info(
"\n============================================================"
"\nResults Summary"
"\n============================================================"
)
logger.info(
f"✓ Tasks passed: {aggregated_report.successful_tasks}/{aggregated_report.total_tasks} ({aggregated_report.success_rate:.1f}%)"
)
logger.info(f"⏱ Total time: {aggregated_report.total_task_execution_time:.1f}s")
return aggregated_report
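# Filter grammar sketch (ids below are illustrative, taken from the bundled
# filesystem tasks; `pipeline` stands for an already-constructed evaluation
# pipeline instance):
#
#     pipeline.run_evaluation("all")                         # every task
#     pipeline.run_evaluation("file_property")               # one category
#     pipeline.run_evaluation("file_property/txt_merging")   # one specific task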
================================================
FILE: src/factory.py
================================================
#!/usr/bin/env python3
"""
MCP Service Factory for MCPMark
=================================
This module provides a simplified factory pattern for creating service-specific managers
with centralized configuration management.
Features:
- Dynamic service loading from definitions
- Centralized configuration
- Simplified service registration
"""
import importlib
from dataclasses import dataclass
from typing import Dict, Type
from src.base.login_helper import BaseLoginHelper
from src.base.state_manager import BaseStateManager
from src.base.task_manager import BaseTaskManager
from src.config.config_schema import ConfigRegistry
from src.services import get_service_definition, get_supported_mcp_services
@dataclass
class ServiceComponents:
"""All components required for an MCP service."""
task_manager_class: Type[BaseTaskManager]
state_manager_class: Type[BaseStateManager]
login_helper_class: Type[BaseLoginHelper]
config_mapping: Dict[str, Dict[str, str]]
def import_class(module_path: str):
"""Dynamically import a class from module path string."""
if not module_path:
return None
module_name, class_name = module_path.rsplit(".", 1)
module = importlib.import_module(module_name)
return getattr(module, class_name)
def apply_config_mapping(config: dict, mapping: dict) -> dict:
"""Apply config mapping to transform config keys to constructor params."""
if not mapping:
return {}
result = {}
for param_name, config_key in mapping.items():
if config_key in config:
result[param_name] = config[config_key]
return result
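# Worked example (keys are illustrative): with mapping
# {"test_root": "filesystem_test_root"} and config
# {"filesystem_test_root": "./test_environments"}, the result is
# {"test_root": "./test_environments"}; mapping entries whose config key is
# absent from `config` are simply skipped.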
class ServiceRegistry:
"""Central registry that loads MCP services from definitions."""
# Cache for loaded components
_components_cache: Dict[str, ServiceComponents] = {}
@classmethod
def get_components(cls, service_name: str) -> ServiceComponents:
"""Get MCP service components from definition."""
if service_name in cls._components_cache:
return cls._components_cache[service_name]
definition = get_service_definition(service_name)
# Import classes dynamically
components = ServiceComponents(
task_manager_class=import_class(definition["components"]["task_manager"]),
state_manager_class=import_class(definition["components"]["state_manager"]),
login_helper_class=import_class(definition["components"]["login_helper"]),
config_mapping=definition.get("config_mapping", {}),
)
cls._components_cache[service_name] = components
return components
class GenericServiceFactory:
"""Generic factory that works with any MCP service."""
def __init__(self, components: ServiceComponents, service_name: str):
self.components = components
self.service_name = service_name
def create_task_manager(self, **kwargs) -> BaseTaskManager:
"""Create task manager instance."""
return self.components.task_manager_class(**kwargs)
def create_state_manager(self, config) -> BaseStateManager:
"""Create state manager with config mapping."""
mapping = self.components.config_mapping.get("state_manager", {})
# Handle both dict and config schema objects
config_dict = config.get_all() if hasattr(config, "get_all") else config
kwargs = apply_config_mapping(config_dict, mapping)
return self.components.state_manager_class(**kwargs)
def create_login_helper(self, config) -> BaseLoginHelper:
"""Create login helper with config mapping."""
mapping = self.components.config_mapping.get("login_helper", {})
# Handle both dict and config schema objects
config_dict = config.get_all() if hasattr(config, "get_all") else config
kwargs = apply_config_mapping(config_dict, mapping)
# Special handling for GitHub login helper - it needs a single token
if self.service_name == "github" and "token" in kwargs:
tokens_list = kwargs["token"]
if isinstance(tokens_list, list) and tokens_list:
kwargs["token"] = tokens_list[0] # Use first token for login helper
return self.components.login_helper_class(**kwargs)
class MCPServiceFactory:
"""Main factory interface."""
@classmethod
def create_service_config(cls, service_name: str):
"""Create MCP service configuration (backward compatible)."""
config = ConfigRegistry.get_config(service_name)
# Create a backward-compatible ServiceConfig-like object
class ServiceConfigCompat:
def __init__(self, service_name: str, config_dict: dict):
self.service_name = service_name
self.config = config_dict
self.api_key = config_dict.get("api_key")
return ServiceConfigCompat(service_name, config.get_all())
@classmethod
def create_task_manager(cls, service_name: str, **kwargs) -> BaseTaskManager:
"""Create task manager for the specified MCP service."""
components = ServiceRegistry.get_components(service_name)
return components.task_manager_class(**kwargs)
@classmethod
def create_state_manager(cls, service_name: str, **kwargs) -> BaseStateManager:
"""Create state manager for the specified MCP service."""
components = ServiceRegistry.get_components(service_name)
config = ConfigRegistry.get_config(service_name).get_all()
# Use provided kwargs or apply config mapping
if not kwargs:
mapping = components.config_mapping.get("state_manager", {})
kwargs = apply_config_mapping(config, mapping)
return components.state_manager_class(**kwargs)
@classmethod
def create_login_helper(cls, service_name: str, **kwargs) -> BaseLoginHelper:
"""Create login helper for the specified MCP service."""
components = ServiceRegistry.get_components(service_name)
config = ConfigRegistry.get_config(service_name).get_all()
# Use provided kwargs or apply config mapping
if not kwargs:
mapping = components.config_mapping.get("login_helper", {})
kwargs = apply_config_mapping(config, mapping)
# Special handling for GitHub login helper - it needs a single token
if service_name == "github" and "token" in kwargs:
tokens_list = kwargs["token"]
if isinstance(tokens_list, list) and tokens_list:
kwargs["token"] = tokens_list[0] # Use first token for login helper
return components.login_helper_class(**kwargs)
@classmethod
def get_supported_mcp_services(cls) -> list:
"""Get list of supported MCP services."""
return get_supported_mcp_services()
@classmethod
def get_config_info(cls, service_name: str) -> dict:
"""Get detailed configuration information for debugging."""
config = ConfigRegistry.get_config(service_name)
return config.get_debug_info()
@classmethod
def export_config_template(cls, service_name: str, output_path: str) -> None:
"""Export a configuration template for an MCP service."""
from pathlib import Path
ConfigRegistry.export_template(service_name, Path(output_path))
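# A minimal usage sketch of the factory, assuming the filesystem service
# definition and its environment-based configuration are already in place.
if __name__ == "__main__":  # illustrative only
    print("Supported services:", MCPServiceFactory.get_supported_mcp_services())
    task_manager = MCPServiceFactory.create_task_manager("filesystem")
    state_manager = MCPServiceFactory.create_state_manager("filesystem")
    print("Created:", type(task_manager).__name__, type(state_manager).__name__)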
================================================
FILE: src/logger.py
================================================
#!/usr/bin/env python3
"""Logger configuration for MCPMark."""
import logging
import sys
def get_logger(name: str) -> logging.Logger:
"""Get a configured logger instance."""
logger = logging.getLogger(name)
if not logger.handlers:
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter("%(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
return logger
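# Usage sketch: each module obtains a message-only stdout logger this way.
if __name__ == "__main__":  # illustrative only
    get_logger(__name__).info("| ✓ logger configured")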
================================================
FILE: src/mcp_services/filesystem/__init__.py
================================================
"""
Filesystem MCP Service for MCPMark
===================================
This module provides filesystem-specific MCP server integration for MCPMark evaluation.
Uses the official filesystem MCP server for local file operations.
"""
from .filesystem_login_helper import FilesystemLoginHelper
from .filesystem_state_manager import FilesystemStateManager
from .filesystem_task_manager import FilesystemTaskManager, FilesystemTask
__all__ = [
"FilesystemLoginHelper",
"FilesystemStateManager",
"FilesystemTaskManager",
"FilesystemTask",
]
================================================
FILE: src/mcp_services/filesystem/filesystem_login_helper.py
================================================
"""
Filesystem Login Helper for MCPMark
====================================
This module provides a minimal login helper for the filesystem MCP service.
Since filesystem operations don't require authentication, this is a simple
pass-through implementation that satisfies the interface requirements.
"""
from pathlib import Path
from typing import Optional
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class FilesystemLoginHelper(BaseLoginHelper):
"""
Login helper for filesystem MCP service.
The filesystem MCP server doesn't require authentication, so this
implementation simply returns success for all login operations.
"""
def __init__(self, state_path: Optional[Path] = None):
"""
Initialize the filesystem login helper.
Args:
state_path: Path to save state (not used for filesystem)
"""
super().__init__()
self.state_path = (
state_path or Path.home() / ".mcpmark" / "filesystem_state.json"
)
logger.info("Initialized FilesystemLoginHelper (no auth required)")
def login(self, **kwargs) -> bool:
"""
Perform login operation.
Since filesystem doesn't require authentication, this always returns True.
Returns:
bool: Always True for filesystem service
"""
logger.info("Filesystem service does not require authentication")
return True
def is_authenticated(self) -> bool:
"""
Check if authenticated.
Returns:
bool: Always True for filesystem service
"""
return True
def get_credentials(self) -> dict:
"""
Get credentials for the service.
Returns:
dict: Empty dict as no credentials needed
"""
return {}
================================================
FILE: src/mcp_services/filesystem/filesystem_state_manager.py
================================================
"""
Filesystem State Manager for MCPMark
=====================================
This module handles filesystem state management for consistent task evaluation.
It manages test directories, file creation/cleanup, and environment isolation.
"""
import os
import shutil
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.base.state_manager import BaseStateManager
from src.base.task_manager import BaseTask
from src.logger import get_logger
logger = get_logger(__name__)
class FilesystemStateManager(BaseStateManager):
"""
Manages filesystem state for task evaluation.
This includes creating isolated test directories, tracking created resources,
and cleaning up after task completion.
"""
def _get_project_root(self) -> Path:
"""Find project root by looking for marker files."""
current = Path(__file__).resolve()
# Look for project root markers
for parent in current.parents:
if (parent / "pyproject.toml").exists() or (parent / "pipeline.py").exists():
return parent
# Fallback to old method if markers not found
return Path(__file__).parent / "../../../"
def __init__(self, test_root: Optional[Path] = None, cleanup_on_exit: bool = False):
"""
Initialize filesystem state manager.
Args:
test_root: Root directory for test operations (from FILESYSTEM_TEST_ROOT env var)
cleanup_on_exit: Whether to clean up test directories after tasks (default False for persistent environment)
"""
super().__init__(service_name="filesystem")
# Use provided test root or default to persistent test environment
if test_root:
self.test_root = Path(test_root)
else:
# Default to persistent test environment
project_root = self._get_project_root()
self.test_root = (project_root / "test_environments/desktop").resolve()
self.cleanup_on_exit = cleanup_on_exit
self.current_task_dir: Optional[Path] = None
self.created_resources: List[Path] = []
# Backup and restore functionality
self.backup_dir: Optional[Path] = None
self.backup_enabled = (
True # Enable backup/restore by default for task isolation
)
logger.info(
f"Initialized FilesystemStateManager with persistent test environment: {self.test_root}"
)
def initialize(self, **kwargs) -> bool:
"""
Initialize the filesystem environment.
Ensures the persistent test environment exists and is accessible.
Returns:
bool: True if initialization successful
"""
try:
# Ensure test environment directory exists
if not self.test_root.exists():
logger.error(f"Persistent test environment not found: {self.test_root}")
logger.error(
"Please ensure test_environments/desktop/ exists in the repository"
)
return False
logger.info(f"Using persistent test environment: {self.test_root}")
# Verify we can write to the directory
test_file = self.test_root / ".mcpbench_test"
test_file.write_text("test")
test_file.unlink()
return True
except Exception as e:
logger.error(f"Failed to initialize filesystem environment: {e}")
return False
def set_up(self, task: BaseTask) -> bool:
"""
Set up filesystem environment for a specific task.
Creates a backup of the current environment, then uses the backup
as the working directory to keep the original unchanged.
Args:
task: The task for which to set up the state
Returns:
bool: True if setup successful
"""
try:
# Dynamically set test root based on task category
self._set_dynamic_test_root(task)
# Create backup of current test environment before task execution
if self.backup_enabled:
if not self._create_backup(task):
logger.error(f"Failed to create backup for task {task.name}")
return False
# Use the backup directory as the working directory instead of the original
self.current_task_dir = (
self.backup_dir
) # Use backup directory for operations
logger.info(
f"| ✓ Using the backup environment for operations"
)
# Store the test directory path in the task object for use by task manager
if hasattr(task, "__dict__"):
task.test_directory = str(self.current_task_dir)
# Set environment variable for verification scripts and MCP server
os.environ["FILESYSTEM_TEST_DIR"] = str(self.current_task_dir)
return True
except Exception as e:
logger.error(f"Failed to set up filesystem state for {task.name}: {e}")
return False
def _set_dynamic_test_root(self, task: BaseTask) -> None:
"""
Dynamically set the test root directory based on the task category.
Args:
task: The task for which to set the test root
"""
# Get the base test environments directory from environment variable
base_test_root = os.getenv("FILESYSTEM_TEST_ROOT")
if not base_test_root:
# Fallback to default path
project_root = self._get_project_root()
base_test_root = str(project_root / "test_environments")
base_test_path = Path(base_test_root)
# If task has a category_id, append it to the base path
if task.category_id:
self.test_root = base_test_path / task.category_id
# Store the current task category for URL selection
self._current_task_category = task.category_id
logger.info(f"| ✓ Setting test root to category-specific directory: {self.test_root}")
else:
# Use the base test environments directory
self.test_root = base_test_path
# For base directory, use 'desktop' as default category
self._current_task_category = 'desktop'
logger.info(f"| Setting test root to base directory: {self.test_root}")
# Ensure the directory exists by downloading and extracting if needed
if not self.test_root.exists():
logger.warning(f"| Test directory does not exist: {self.test_root}")
if not self._download_and_extract_test_environment():
logger.error(f"Failed to download and extract test environment for: {self.test_root}")
raise RuntimeError(f"Test environment not available: {self.test_root}")
logger.info(f"| Downloaded and extracted test environment: {self.test_root}")
def clean_up(self, task: Optional[BaseTask] = None, **kwargs) -> bool:
"""
Clean up filesystem resources created during task execution.
Since we operate on the backup directory, we just need to clean up the backup.
Args:
task: The task to clean up after (optional)
**kwargs: Additional cleanup options
Returns:
bool: True if cleanup successful
"""
try:
cleanup_success = True
# Clean up the backup directory since we operated on it
if self.backup_enabled and self.backup_dir and self.backup_dir.exists():
try:
shutil.rmtree(self.backup_dir)
logger.info(
f"| ✓ Cleaned up backup directory for task {task.name if task else 'unknown'}"
)
self.backup_dir = None
except Exception as e:
logger.error(f"Failed to clean up backup directory: {e}")
cleanup_success = False
else:
logger.info("No backup directory to clean up")
# Clear the resources list
self.created_resources.clear()
return cleanup_success
except Exception as e:
logger.error(f"Filesystem cleanup failed: {e}")
return False
def get_test_directory(self) -> Optional[Path]:
"""
Get the current test directory path.
Returns:
Path to the current test directory, or None if not set up
"""
return self.current_task_dir
def get_service_config_for_agent(self) -> dict:
"""
Get service-specific configuration for agent execution.
Returns:
Dictionary containing configuration needed by the agent/MCP server
"""
service_config = {}
# Add test directory if available
if self.current_task_dir:
service_config["test_directory"] = str(self.current_task_dir)
return service_config
def track_resource(self, resource_path: Path):
"""
Track a resource for cleanup.
Args:
resource_path: Path to the resource to track
"""
if resource_path not in self.created_resources:
self.created_resources.append(resource_path)
logger.debug(f"Tracking resource for cleanup: {resource_path}")
def reset_test_environment(self) -> bool:
"""
Reset the test environment to its original state.
This method can be used for development/debugging purposes.
In normal operation, the persistent environment is maintained.
Returns:
bool: True if reset successful
"""
try:
# Remove any sorting directories that might have been created
sorting_dirs = ["has_test", "no_test", "organized", "backup"]
for dir_name in sorting_dirs:
dir_path = self.test_root / dir_name
if dir_path.exists():
shutil.rmtree(dir_path)
logger.info(f"Removed sorting directory: {dir_path}")
# Remove any temporary files that might have been created
temp_files = ["hello_world.txt", "new_file.txt", "temp.txt"]
for file_name in temp_files:
file_path = self.test_root / file_name
if file_path.exists():
file_path.unlink()
logger.info(f"Removed temporary file: {file_path}")
logger.info("Test environment reset completed")
return True
except Exception as e:
logger.error(f"Test environment reset failed: {e}")
return False
# =========================================================================
# Backup and Restore Methods for Task Isolation
# =========================================================================
def _create_backup(self, task: BaseTask) -> bool:
"""
Create a complete backup of the test environment before task execution.
Args:
task: The task for which to create backup
Returns:
bool: True if backup successful
"""
try:
# Create backup directory with task-specific name
project_root = self._get_project_root()
backup_root = (project_root / ".mcpmark_backups").resolve()
backup_root.mkdir(exist_ok=True)
task_id = f"{task.service}_{task.category_id}_{task.task_id}"
self.backup_dir = backup_root / f"backup_{task_id}_{os.getpid()}"
# Remove existing backup if it exists
if self.backup_dir.exists():
shutil.rmtree(self.backup_dir)
# Create fresh backup by copying entire test environment
shutil.copytree(self.test_root, self.backup_dir)
logger.info(f"| ✓ Created backup for task {task.name}: {self.backup_dir}")
return True
except Exception as e:
logger.error(f"Failed to create backup for task {task.name}: {e}")
return False
def _restore_from_backup(self, task: Optional[BaseTask] = None) -> bool:
"""
Restore the test environment from backup.
Args:
task: The task to restore after (optional, for logging)
Returns:
bool: True if restore successful
"""
try:
if not self.backup_dir or not self.backup_dir.exists():
logger.error("No backup directory available for restore")
return False
# Remove current test environment
if self.test_root.exists():
shutil.rmtree(self.test_root)
# Restore from backup
shutil.copytree(self.backup_dir, self.test_root)
# Clean up backup directory
shutil.rmtree(self.backup_dir)
self.backup_dir = None
task_name = task.name if task else "unknown"
logger.info(
f"✅ Restored test environment from backup after task {task_name}"
)
return True
except Exception as e:
task_name = task.name if task else "unknown"
logger.error(f"Failed to restore from backup after task {task_name}: {e}")
return False
# =========================================================================
# Abstract Method Implementations Required by BaseStateManager
# =========================================================================
def _create_initial_state(self, task: BaseTask) -> Optional[Dict[str, Any]]:
"""Create initial state for a task.
For filesystem, this is handled in set_up() method by creating task directories.
Returns the task directory path as state info.
"""
if self.current_task_dir and self.current_task_dir.exists():
return {"task_directory": str(self.current_task_dir)}
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: Dict[str, Any]
) -> None:
"""Store initial state information in the task object.
For filesystem, we store the test directory path.
"""
if state_info and "task_directory" in state_info:
if hasattr(task, "__dict__"):
task.test_directory = state_info["task_directory"]
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up initial state for a specific task.
For filesystem, this means removing the task directory.
"""
if hasattr(task, "test_directory") and task.test_directory:
task_dir = Path(task.test_directory)
if task_dir.exists():
try:
shutil.rmtree(task_dir)
logger.info(f"Cleaned up task directory: {task_dir}")
return True
except Exception as e:
logger.error(f"Failed to clean up task directory: {e}")
return False
return True
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single tracked resource.
For filesystem, resources are paths to files/directories.
"""
if "path" in resource:
resource_path = Path(resource["path"])
if resource_path.exists():
try:
if resource_path.is_dir():
shutil.rmtree(resource_path)
else:
resource_path.unlink()
logger.info(f"Cleaned up resource: {resource_path}")
return True
except Exception as e:
logger.error(f"Failed to clean up {resource_path}: {e}")
return False
return True
def _download_and_extract_test_environment(self) -> bool:
"""
Download and extract test environment using wget and unzip commands.
This approach preserves original file timestamps and is simpler than Python zipfile.
Returns:
bool: True if download and extraction successful
"""
try:
import subprocess
import sys
# Define URL mapping for different test environment categories
url_mapping = {
'desktop': 'https://storage.mcpmark.ai/filesystem/desktop.zip',
'file_context': 'https://storage.mcpmark.ai/filesystem/file_context.zip',
'file_property': 'https://storage.mcpmark.ai/filesystem/file_property.zip',
'folder_structure': 'https://storage.mcpmark.ai/filesystem/folder_structure.zip',
'papers': 'https://storage.mcpmark.ai/filesystem/papers.zip',
'student_database': 'https://storage.mcpmark.ai/filesystem/student_database.zip',
'threestudio': 'https://storage.mcpmark.ai/filesystem/threestudio.zip',
'votenet': 'https://storage.mcpmark.ai/filesystem/votenet.zip',
'legal_document': 'https://storage.mcpmark.ai/filesystem/legal_document.zip',
'desktop_template': 'https://storage.mcpmark.ai/filesystem/desktop_template.zip'
}
# Get the category from the current task context
category = getattr(self, '_current_task_category', None)
if not category:
logger.error("| No task category available for URL selection")
return False
# Select the appropriate URL based on category
if category in url_mapping:
test_env_url = url_mapping[category]
logger.info(f"| ○ Selected URL for category '{category}': {test_env_url}")
else:
logger.error(f"| No URL mapping found for category: {category}")
return False
# Allow override via environment variable
test_env_url = os.getenv('TEST_ENVIRONMENT_URL', test_env_url)
logger.info(f"| ○ Downloading test environment from: {test_env_url}")
# Create a temporary directory for the download
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
zip_path = temp_path / "test_environment.zip"
# Step 1: Download using wget
logger.info("| ○ Downloading test environment zip file...")
try:
                    # Use wget if available, otherwise fall back to curl
                    # (the command sequence is identical on every platform)
                    try:
                        result = subprocess.run(
                            ["wget", "-O", str(zip_path), test_env_url],
                            capture_output=True, text=True, check=True
                        )
                    except (subprocess.CalledProcessError, FileNotFoundError):
                        # Fall back to curl
                        result = subprocess.run(
                            ["curl", "-L", "-o", str(zip_path), test_env_url],
                            capture_output=True, text=True, check=True
                        )
logger.info("| ✓ Download completed successfully")
except Exception as e:
logger.error(f"| Download failed: {e}")
return False
# Step 2: Extract using unzip
logger.info("| ○ Extracting test environment...")
try:
# Extract to parent directory to maintain expected structure
result = subprocess.run(
["unzip", "-o", str(zip_path), "-d", str(self.test_root.parent)],
capture_output=True, text=True, check=True
)
logger.info("| ✓ Extraction completed successfully")
except Exception as e:
logger.error(f"| Extraction failed: {e}")
return False
# Step 3: Remove __MACOSX folder if it exists
logger.info("| ○ Cleaning up macOS metadata...")
macosx_path = self.test_root.parent / "__MACOSX"
if macosx_path.exists():
try:
shutil.rmtree(macosx_path)
logger.info("| ✓ Removed __MACOSX folder")
except Exception as e:
logger.warning(f"| Failed to remove __MACOSX folder: {e}")
# Verify the extracted directory exists
if not self.test_root.exists():
logger.error(f"| Extracted directory not found at expected path: {self.test_root}")
return False
logger.info(f"| ✓ Successfully downloaded and extracted test environment to: {self.test_root}")
return True
except Exception as e:
logger.error(f"| Failed to download and extract test environment: {e}")
return False
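# A minimal lifecycle sketch, assuming the persistent test environment has been
# extracted under test_environments/; a real `task` would come from
# FilesystemTaskManager.filter_tasks() rather than being built by hand.
if __name__ == "__main__":  # illustrative only
    manager = FilesystemStateManager()
    if manager.initialize():
        print("Persistent test root:", manager.test_root)
        # For a real task: manager.set_up(task) copies the category directory into
        # a backup, points FILESYSTEM_TEST_DIR at that copy, and
        # manager.clean_up(task) removes the copy afterwards.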
================================================
FILE: src/mcp_services/filesystem/filesystem_task_manager.py
================================================
"""
Simplified Filesystem Task Manager using Enhanced Base Class
============================================================
This module implements a simplified filesystem task manager
built on the enhanced base task manager.
"""
import os
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Dict, Any
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class FilesystemTask(BaseTask):
"""Filesystem-specific task with additional fields."""
test_directory: Optional[str] = None
expected_files: Optional[List[str]] = None
expected_directories: Optional[List[str]] = None
class FilesystemTaskManager(BaseTaskManager):
"""Simplified filesystem task manager using enhanced base class."""
def __init__(self, tasks_root: Path = None, task_suite: str = "standard"):
"""Initialize filesystem task manager."""
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
super().__init__(
tasks_root,
mcp_service="filesystem",
task_class=FilesystemTask,
task_organization="directory",
task_suite=task_suite,
)
# Override only what's needed for filesystem-specific behavior
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> BaseTask:
"""Instantiate a `BaseTask` from the dictionary returned by `_find_task_files`."""
import json
# Support arbitrary task names, not just task_n format
task_name = task_files_info["task_id"]
# Use task_name as default task_id
task_id = task_name
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return self.task_class(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="filesystem",
category_id=final_category_id,
task_id=task_id,
)
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
"""Run verification with filesystem-specific environment."""
env = os.environ.copy()
# Pass test directory to verification script
# Priority: task.test_directory (set by state manager) > environment variable
test_dir = None
if hasattr(task, "test_directory") and task.test_directory:
test_dir = task.test_directory
else:
test_dir = os.getenv("FILESYSTEM_TEST_DIR")
if test_dir:
env["FILESYSTEM_TEST_DIR"] = test_dir
logger.debug(f"Setting FILESYSTEM_TEST_DIR to: {test_dir}")
return subprocess.run(
self._get_verification_command(task),
capture_output=True,
text=True,
timeout=300,
env=env,
)
def filter_tasks(self, task_filter: str) -> List[BaseTask]:
"""Filter tasks based on category or specific task pattern with support for arbitrary task names."""
all_tasks = self.discover_all_tasks()
if not task_filter or task_filter.lower() == "all":
return all_tasks
# Check if it's a category filter
categories = self.get_categories()
if task_filter in categories:
return [task for task in all_tasks if task.category_id == task_filter]
# Check for specific task pattern (category_id/task_X or category_id/arbitrary_name)
if "/" in task_filter:
try:
category_id, task_id = task_filter.split("/", 1)
# Direct string matching for task_id
for task in all_tasks:
if task.category_id == category_id and str(task.task_id) == task_id:
return [task]
except (ValueError, IndexError):
pass
# Fallback: check for partial matches in task names or categories
filtered_tasks = []
for task in all_tasks:
if (
task_filter in task.category_id
or task_filter in task.name
or task_filter == str(task.task_id)
):
filtered_tasks.append(task)
return filtered_tasks
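# A minimal filter sketch; the category/task ids below come from the bundled
# tasks tree and are illustrative only.
if __name__ == "__main__":  # illustrative only
    manager = FilesystemTaskManager()
    print(len(manager.filter_tasks("all")))                        # every filesystem task
    print(len(manager.filter_tasks("file_property")))              # a single category
    print(len(manager.filter_tasks("file_property/txt_merging")))  # one specific task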
================================================
FILE: src/mcp_services/github/__init__.py
================================================
"""
GitHub MCP Service for MCPMark
===============================
This module provides GitHub-specific MCP server integration for MCPMark evaluation.
Uses GitHub's official remote MCP server for streamable HTTP/SSE communication.
Includes an initial-state-based environment replication mechanism.
"""
from .github_login_helper import GitHubLoginHelper
from .github_task_manager import GitHubTaskManager, GitHubTask
from .github_state_manager import GitHubStateManager
__all__ = ["GitHubLoginHelper", "GitHubTaskManager", "GitHubTask", "GitHubStateManager"]
================================================
FILE: src/mcp_services/github/github_login_helper.py
================================================
"""
GitHub Login Helper for MCPMark
================================
This module provides GitHub token authentication and validation utilities.
Unlike browser-based services, GitHub uses token-based authentication.
"""
import json
import requests
from pathlib import Path
from typing import Optional, Dict, Any
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class GitHubLoginHelper(BaseLoginHelper):
"""
Utility helper for GitHub token authentication and validation.
"""
def __init__(
self,
token: Optional[str] = None,
state_path: Optional[Path] = None,
) -> None:
"""
Initialize the GitHub login helper.
Args:
token: GitHub Personal Access Token
state_path: Path to save authentication state
"""
self.token = token
self.state_path = state_path or Path.home() / ".mcpmark" / "github_auth.json"
# Ensure state directory exists
self.state_path.parent.mkdir(parents=True, exist_ok=True)
def login_and_save_state(self, **kwargs) -> bool:
"""
Validate GitHub token and save authentication state.
Returns:
bool: True if authentication successful, False otherwise
"""
if not self.token:
logger.error("No GitHub token provided")
return False
try:
# Validate token by making an authenticated request
session = requests.Session()
session.headers.update(
{
"Authorization": f"Bearer {self.token}",
"Accept": "application/vnd.github.v3+json",
"X-GitHub-Api-Version": "2022-11-28",
"User-Agent": "MCPMark/1.0",
}
)
# Get user information
response = session.get("https://api.github.com/user")
if response.status_code != 200:
logger.error(
f"GitHub authentication failed: {response.status_code} {response.text}"
)
return False
user_info = response.json()
logger.info(
f"GitHub authentication successful for user: {user_info['login']}"
)
# Get token scopes
token_scopes = self._get_token_scopes(session)
# Save authentication state
auth_state = {
"user": user_info,
"token_scopes": token_scopes,
"authenticated_at": self._get_current_timestamp(),
}
self._save_auth_state(auth_state)
# Verify required permissions
if not self._verify_required_permissions(token_scopes):
logger.warning("GitHub token may not have all required permissions")
return False
return True
except Exception as e:
logger.error(f"GitHub authentication error: {e}")
return False
def _get_token_scopes(self, session: requests.Session) -> list:
"""Get the scopes available to the current token."""
try:
response = session.get("https://api.github.com/user")
scopes_header = response.headers.get("X-OAuth-Scopes", "")
if scopes_header:
return [
scope.strip() for scope in scopes_header.split(",") if scope.strip()
]
return []
except Exception as e:
logger.warning(f"Could not determine token scopes: {e}")
return []
def _verify_required_permissions(self, scopes: list) -> bool:
"""
Verify that the token has the minimum required permissions.
For MCPMark GitHub tasks, we typically need:
- repo (for repository access)
- read:user (for user information)
"""
required_scopes = ["repo"] # Minimum requirement
recommended_scopes = ["repo", "read:user", "read:org"]
has_required = all(scope in scopes for scope in required_scopes)
if not has_required:
logger.error(
f"Token missing required scopes. Required: {required_scopes}, Available: {scopes}"
)
return False
has_recommended = all(scope in scopes for scope in recommended_scopes)
if not has_recommended:
logger.warning(
f"Token missing some recommended scopes. Recommended: {recommended_scopes}, Available: {scopes}"
)
return True
def _save_auth_state(self, auth_state: Dict[str, Any]):
"""Save authentication state to local file."""
try:
with open(self.state_path, "w") as f:
json.dump(auth_state, f, indent=2, default=str)
# Set restrictive permissions (user read/write only)
self.state_path.chmod(0o600)
logger.info(f"Authentication state saved to: {self.state_path}")
except Exception as e:
logger.error(f"Failed to save authentication state: {e}")
def _get_current_timestamp(self) -> str:
"""Get current timestamp in ISO format."""
from datetime import datetime
return datetime.utcnow().isoformat() + "Z"
def get_saved_auth_state(self) -> Optional[Dict[str, Any]]:
"""Load and return saved authentication state."""
try:
if self.state_path.exists():
with open(self.state_path, "r") as f:
return json.load(f)
except Exception as e:
logger.error(f"Failed to load authentication state: {e}")
return None
def is_token_valid(self) -> bool:
"""Check if the current token is still valid."""
if not self.token:
return False
try:
session = requests.Session()
session.headers.update(
{
"Authorization": f"Bearer {self.token}",
"Accept": "application/vnd.github.v3+json",
}
)
response = session.get("https://api.github.com/user")
return response.status_code == 200
except Exception:
return False
def get_rate_limit_info(self) -> Dict[str, Any]:
"""Get current rate limit information for the token."""
if not self.token:
return {}
try:
session = requests.Session()
session.headers.update(
{
"Authorization": f"Bearer {self.token}",
"Accept": "application/vnd.github.v3+json",
}
)
response = session.get("https://api.github.com/rate_limit")
if response.status_code == 200:
return response.json()
except Exception as e:
logger.warning(f"Failed to get rate limit info: {e}")
return {}
def test_repository_access(self, owner: str, repo: str) -> bool:
"""Test if the token has access to a specific repository."""
if not self.token:
return False
try:
session = requests.Session()
session.headers.update(
{
"Authorization": f"Bearer {self.token}",
"Accept": "application/vnd.github.v3+json",
}
)
response = session.get(f"https://api.github.com/repos/{owner}/{repo}")
return response.status_code == 200
except Exception:
return False
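# A minimal validation sketch; the GITHUB_TOKEN environment variable used here
# is only an example of where a personal access token might come from.
if __name__ == "__main__":  # illustrative only
    import os
    helper = GitHubLoginHelper(token=os.getenv("GITHUB_TOKEN"))
    if helper.login_and_save_state():
        print("Token valid:", helper.is_token_valid())
        print("Rate limit:", helper.get_rate_limit_info().get("rate", {}))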
================================================
FILE: src/mcp_services/github/github_state_manager.py
================================================
"""
GitHub State Manager for MCPMark
=================================
This module handles GitHub repository state management for consistent task evaluation.
Manages test repositories, branches, and cleanup after evaluation.
"""
import requests
from typing import Optional, List, Union
from pathlib import Path
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
from src.mcp_services.github.token_pool import GitHubTokenPool
logger = get_logger(__name__)
class GitHubStateManager(BaseStateManager):
"""
Manages GitHub repository state for task evaluation.
"""
def __init__(
self,
github_token: Union[str, List[str]],
# Name of the evaluation organisation / user where temporary test repositories are created
eval_org: str = "mcpmark-eval",
# Local directory that stores *exported* repository templates (produced by repo_exporter.py)
templates_root: str = "./github_state",
):
"""
Initialize GitHub state manager.
Args:
github_token: GitHub Personal Access Token(s). Can be a single token string or a list of tokens for round-robin usage.
eval_org: Organisation / user used to host **ephemeral evaluation repositories**.
"""
super().__init__(service_name="github")
# Track repos created via template import so we can delete them afterwards
self._repos_to_cleanup: list[tuple[str, str]] = [] # (owner, repo_name)
# Initialize token pool
if isinstance(github_token, str):
# Single token - create pool with one token
self.token_pool = GitHubTokenPool([github_token])
self.github_token = github_token # Keep for backward compatibility
else:
# Multiple tokens - use token pool
self.token_pool = GitHubTokenPool(github_token)
self.github_token = (
self.token_pool.get_current_token()
) # For backward compatibility
# Store evaluation context (consistent naming)
self.eval_org = eval_org # evaluation organisation / user
# Local path that contains exported repository templates
self.templates_root = Path(templates_root).expanduser().resolve()
# Set up HTTP session for GitHub API
self.session = requests.Session()
# Note: We'll update the Authorization header before each request
self.session.headers.update(
{
"Accept": "application/vnd.github.v3+json",
"X-GitHub-Api-Version": "2022-11-28",
"User-Agent": "MCPMark/1.0",
}
)
# Validate GitHub configuration during initialization
try:
# Set initial token for validation
self._update_session_token()
response = self.session.get("https://api.github.com/user")
if response.status_code != 200:
raise ValueError(
f"Invalid GitHub token: {response.status_code} {response.text}"
)
user_info = response.json()
logger.info(f"GitHub authenticated as: {user_info['login']}")
logger.info(f"Using token pool with {self.token_pool.pool_size} token(s)")
# Check if evaluation organisation exists (optional)
if self.eval_org:
org_response = self.session.get(
f"https://api.github.com/orgs/{self.eval_org}"
)
if org_response.status_code == 200:
logger.info(f"Using evaluation organisation: {self.eval_org}")
else:
logger.warning(
f"Evaluation organisation {self.eval_org} not accessible, using user account"
)
# Fall back to user account
self.eval_org = user_info["login"]
logger.info("GitHub state manager initialized successfully")
except Exception as e:
raise RuntimeError(f"GitHub initialization failed: {e}")
# Initial state mapping - categories to initial state repositories
self.initial_state_mapping = {
"build_your_own_x": "codecrafters-io-build-your-own-x",
"missing-semester": "missing-semester-missing-semester",
"mcpmark-cicd": "zjwu0522-mcpmark-cicd",
"harmony": "openai-harmony",
"claude-code": "anthropics-claude-code",
"easyr1": "hiyouga-EasyR1",
}
# CDN URL mapping for downloading GitHub templates
self.github_template_url_mapping = {
"codecrafters-io-build-your-own-x": "https://storage.mcpmark.ai/github/codecrafters-io-build-your-own-x.zip",
"missing-semester-missing-semester": "https://storage.mcpmark.ai/github/missing-semester-missing-semester.zip",
"zjwu0522-mcpmark-cicd": "https://storage.mcpmark.ai/github/zjwu0522-mcpmark-cicd.zip",
"openai-harmony": "https://storage.mcpmark.ai/github/openai-harmony.zip",
"anthropics-claude-code": "https://storage.mcpmark.ai/github/anthropics-claude-code.zip",
"hiyouga-EasyR1": "https://storage.mcpmark.ai/github/hiyouga-EasyR1.zip",
}
# =========================================================================
# Core Template Methods (Required by BaseStateManager)
# =========================================================================
# ---------------------------------------------------------------------
# Internal helper – template importer (replicates repo_importer logic)
# ---------------------------------------------------------------------
def _import_template_repo(
self, template_dir: Path, owner: str, private: bool = True
) -> str:
"""Import repository from local template directory to GitHub (simplified)."""
import json
import subprocess
import time
# ------------------------------------------------------------------
# Helper functions (stripped-down versions of repo_importer utilities)
# ------------------------------------------------------------------
def _list_refs(repo_dir: str) -> list[str]:
result = subprocess.run(
["git", "-C", repo_dir, "for-each-ref", "--format=%(refname)"],
check=True,
capture_output=True,
text=True,
)
return result.stdout.strip().splitlines()
def _push_repo(
repo_path: Path, repo_owner: str, repo_name: str, required_refs: list[str]
):
"""Push repo to GitHub: try mirror, else per-ref."""
token = self.github_token
dst_url = f"https://x-access-token:{token}@github.com/{repo_owner}/{repo_name}.git"
try:
subprocess.run(
["git", "-C", str(repo_path), "push", "--mirror", dst_url],
check=True,
capture_output=True,
)
return
except subprocess.CalledProcessError as err:
logger.warning(
"| [push] Mirror push failed – falling back: %s",
err.stderr.decode(errors="ignore"),
)
refs = required_refs or _list_refs(str(repo_path))
for ref in refs:
for attempt in range(3):
try:
subprocess.run(
[
"git",
"-C",
str(repo_path),
"push",
dst_url,
f"{ref}:{ref}",
],
check=True,
capture_output=True,
)
break
except subprocess.CalledProcessError as ref_err:
if attempt == 2:
raise RuntimeError(
f"Failed to push ref {ref}: {ref_err.stderr}"
) from ref_err
time.sleep(2 * (attempt + 1))
# ------------------------------------------------------------------
# Phase 0 – read template metadata
# ------------------------------------------------------------------
meta = json.loads((template_dir / "meta.json").read_text())
repo_name: str = meta["repo"]
pr_head_refs = meta.get("pr_head_refs", [])
default_branch = meta.get("default_branch", "main")
pulls_data = json.loads((template_dir / "pulls.json").read_text())
fork_branches = [
pr["local_branch"]
for pr in pulls_data
if pr.get("is_from_fork") and "local_branch" in pr
]
needed_refs = (
[f"refs/heads/{default_branch}"]
+ [f"refs/heads/{h}" for h in pr_head_refs]
+ [f"refs/heads/{b}" for b in fork_branches]
)
# ------------------------------------------------------------------
# Phase 1 – create empty repo under owner
# ------------------------------------------------------------------
create_payload = {
"name": repo_name,
"description": f"Restored template repo {repo_name}",
"private": private,
"auto_init": False,
"has_issues": True,
"has_projects": True,
"has_wiki": False,
"default_branch": default_branch, # Set the correct default branch
}
auth_user = self._get_authenticated_user()
create_url = (
"https://api.github.com/user/repos"
if owner == auth_user
else f"https://api.github.com/orgs/{owner}/repos"
)
resp = self._request_with_retry("POST", create_url, json=create_payload)
if resp.status_code == 422 and "name already exists" in resp.text:
# Attempt to delete and recreate
self._delete_repository(owner, repo_name)
resp = self._request_with_retry("POST", create_url, json=create_payload)
if resp.status_code not in (200, 201):
raise RuntimeError(f"Failed to create repo: {resp.status_code} {resp.text}")
html_url = resp.json()["html_url"]
logger.info("| [import] Target repository created: %s", html_url)
# Safety check: Prevent importing to public repositories
# Public repos would send @ mention notifications to real users, causing spam
if not private:
error_msg = (
"ERROR: Cannot import template to a public repository.\n\n"
"Reason: The template contains @ mentions of real GitHub users from the original\n"
"repository. Importing to a public repository would send notifications to these\n"
"users, which is disruptive and inappropriate.\n\n"
"Solution: Set private=True when calling _import_template_repo()."
)
logger.error(error_msg)
# Clean up the created repo before raising
self._delete_repository(owner, repo_name)
raise RuntimeError(error_msg)
# Immediately disable GitHub Actions for ALL repositories to prevent any accidental triggers
# We'll re-enable it later only for mcpmark-cicd
logger.info(
"| [import] Disabling GitHub Actions immediately after repo creation..."
)
self._disable_github_actions(owner, repo_name)
# ------------------------------------------------------------------
# Phase 2 – push git history
# ------------------------------------------------------------------
repo_path = template_dir / "repo"
logger.info("| [import] Pushing git history …")
_push_repo(repo_path, owner, repo_name, needed_refs)
# Remove .github directory after pushing with a new commit
import shutil
github_dir = repo_path / ".github"
if github_dir.exists():
logger.info("| [import] Removing .github directory after push …")
shutil.rmtree(github_dir)
# Commit the deletion
subprocess.run(
["git", "-C", str(repo_path), "add", "-A"],
check=True,
capture_output=True,
)
subprocess.run(
[
"git",
"-C",
str(repo_path),
"commit",
"-m",
"Remove .github directory",
],
capture_output=True,
)
# Push the new commit
token = self.github_token
dst_url = (
f"https://x-access-token:{token}@github.com/{owner}/{repo_name}.git"
)
subprocess.run(
["git", "-C", str(repo_path), "push", dst_url],
check=True,
capture_output=True,
)
# ------------------------------------------------------------------
# Phase 3 – recreate issues & PRs
# ------------------------------------------------------------------
def _create_comment(issue_number: int, body: str):
self._request_with_retry(
"POST",
f"https://api.github.com/repos/{owner}/{repo_name}/issues/{issue_number}/comments",
json={"body": body},
)
def _create_issue(item: dict) -> Optional[int]:
data = {
"title": item["title"],
"body": self._obfuscate_mentions(item.get("body", "")),
"labels": item.get("labels", []),
}
r = self._request_with_retry(
"POST",
f"https://api.github.com/repos/{owner}/{repo_name}/issues",
json=data,
)
if r.status_code not in (200, 201):
return None
new_no = r.json()["number"]
if item.get("state") == "closed":
self._request_with_retry(
"PATCH",
f"https://api.github.com/repos/{owner}/{repo_name}/issues/{new_no}",
json={"state": "closed"},
)
return new_no
def _create_pull(pr_itm: dict) -> Optional[int]:
body = self._obfuscate_mentions(pr_itm.get("body", ""))
if pr_itm.get("is_from_fork", False):
fork_note = f"\n\n---\n_This PR was originally from a fork: **{pr_itm.get('fork_owner')}/{pr_itm.get('fork_repo')}** (branch: `{pr_itm['head']}`)_"
body = body + fork_note if body else fork_note[2:]
payload = {
"title": pr_itm["title"],
"body": body,
"head": pr_itm.get("local_branch", pr_itm["head"]),
"base": pr_itm["base"],
}
r = self._request_with_retry(
"POST",
f"https://api.github.com/repos/{owner}/{repo_name}/pulls",
json=payload,
)
if r.status_code not in (200, 201):
return None
return r.json()["number"]
# Issues
issues_data = json.loads((template_dir / "issues.json").read_text())
created_issues = 0
logger.info("| [phase] Re-creating issues …")
for itm in issues_data:
new_no = _create_issue(itm)
if new_no:
created_issues += 1
for c in itm.get("comments", []):
_create_comment(
new_no,
self._obfuscate_mentions(
f"*Original author: @{c['user']}*\n\n{c['body']}"
),
)
logger.info(
"| [phase] Created %d out of %d issues", created_issues, len(issues_data)
)
# Pull requests
logger.info("| [phase] Re-creating pull requests …")
created_prs = 0
skipped_prs = 0
for pr in pulls_data:
new_pr_no = _create_pull(pr)
if new_pr_no:
created_prs += 1
for c in pr.get("comments", []):
_create_comment(
new_pr_no,
self._obfuscate_mentions(
f"*Original author: @{c['user']}*\n\n{c['body']}"
),
)
for rc in pr.get("review_comments", []):
_create_comment(
new_pr_no,
self._obfuscate_mentions(
f"*Original author: @{rc['user']}* (review)\n\n{rc['body']}"
),
)
else:
skipped_prs += 1
logger.info(
"| [phase] Created %d PRs, skipped %d PRs", created_prs, skipped_prs
)
# Re-enable GitHub Actions ONLY for mcpmark-cicd repository
# All other repos remain disabled (as set immediately after creation)
if "mcpmark-cicd" in template_dir.name:
logger.info("| [import] Re-enabling GitHub Actions for CI/CD repository…")
self._enable_github_actions(owner, repo_name)
# Disable notifications to prevent email spam
logger.info("| [import] Disabling repository notifications …")
self._disable_repository_notifications(owner, repo_name)
logger.info("| [import] Repository import complete: %s", html_url)
return html_url
# ---------------------------------------------------------------------
# Public – create initial state using local template import
# ---------------------------------------------------------------------
def _create_initial_state(self, task: "BaseTask") -> Optional[InitialStateInfo]:
"""
Set up GitHub environment for a specific task.
This may involve:
1. Creating/forking test repositories
2. Setting up branches
3. Creating issues or PRs if needed
"""
try:
logger.info(f"| Setting up GitHub state for task: {task.name}")
template_name = self.select_initial_state_for_task(task.category_id)
if template_name is None:
raise RuntimeError(
f"No template configured for task category: {task.category_id}"
)
template_dir = (self.templates_root / template_name).resolve()
if not template_dir.exists():
logger.warning(
"| Template directory %s not found locally, attempting to download from CDN",
template_dir,
)
if not self._download_and_extract_github_template(template_name):
logger.error(
"| Failed to download template %s from CDN", template_name
)
return None
logger.info("| Template %s downloaded successfully", template_name)
logger.info(f"| Importing repository template from {template_dir} …")
owner = self.eval_org if self.eval_org else self._get_authenticated_user()
if "mcpmark-cicd" in template_name:
repo_url = self._import_template_repo(template_dir, owner, False)
else:
repo_url = self._import_template_repo(template_dir, owner, True)
# Record for cleanup later
repo_name = repo_url.rstrip("/").split("/")[-1]
self._repos_to_cleanup.append((owner, repo_name))
# Build InitialStateInfo
return InitialStateInfo(
state_id=f"{owner}/{repo_name}",
state_url=repo_url,
metadata={
"owner": owner,
"repo_name": repo_name,
"category": task.category_id,
"task_id": task.task_id,
},
)
except Exception as e:
logger.error(f"| GitHub setup failed for {task.name}: {e}")
return None
# ---------------------------------------------------------------------
# BaseStateManager required hooks
# ---------------------------------------------------------------------
def _store_initial_state_info(self, task, state_info: InitialStateInfo) -> None: # type: ignore[override]
if hasattr(task, "repository_url"):
task.repository_url = state_info.state_url
def _cleanup_task_initial_state(self, task) -> bool: # type: ignore[override]
"""No-op – cleanup is handled by self.clean_up which deletes imported repos."""
return True
def _cleanup_single_resource(self, resource) -> bool: # type: ignore[override]
"""No-op – we don't use BaseStateManager's tracked_resources anymore."""
return True
# ---------------------------------------------------------------------
def clean_up(self, task=None, **kwargs) -> bool:
"""Delete repositories that were imported for tasks."""
success = True
for owner, repo_name in self._repos_to_cleanup:
try:
self._delete_repository(owner, repo_name)
logger.info("| Deleted repository: %s/%s", owner, repo_name)
except Exception as err:
logger.error(
"| Failed to delete repository %s/%s: %s", owner, repo_name, err
)
success = False
self._repos_to_cleanup.clear()
return success
# =========================================================================
# Repository Creation and Setup Operations
# =========================================================================
def _delete_repository(self, owner: str, repo_name: str):
"""Delete a repository (use with caution)."""
delete_url = f"https://api.github.com/repos/{owner}/{repo_name}"
response = self.session.delete(delete_url)
if response.status_code not in [200, 204]:
logger.warning(
f"| Failed to delete repository {owner}/{repo_name}: {response.text}"
)
raise Exception(
f"| Failed to delete repository {owner}/{repo_name}: {response.status_code} {response.text}"
)
else:
logger.info(f"| Successfully deleted repository {owner}/{repo_name}")
def _obfuscate_mentions(self, text: str) -> str:
"""
Obfuscate @ mentions to prevent notifications to real users.
Replaces @username with @username_XXXX (random suffix) to ensure the mentioned
user does not exist on GitHub. This prevents notification spam when importing
templates that contain @ mentions from original repositories.
Args:
text: The text content that may contain @ mentions
Returns:
Text with obfuscated @ mentions
"""
import re
import random
import string
if not text:
return text
        # Pattern matches @username (GitHub usernames: alphanumeric, hyphens, max 39 chars)
        # Negative lookbehind keeps e-mail addresses and already-suffixed handles untouched.
        pattern = r"(?<![\w@])@([A-Za-z\d](?:[A-Za-z\d-]{0,37}[A-Za-z\d])?)"

        def _add_suffix(match):
            # Append a random 4-character suffix so the handle no longer resolves to a real user.
            suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=4))
            return f"@{match.group(1)}_{suffix}"

        return re.sub(pattern, _add_suffix, text)

    def _get_authenticated_user(self) -> str:
"""Return cached authenticated username or fetch once from GitHub."""
if hasattr(self, "_auth_user") and self._auth_user:
return self._auth_user
response = self.session.get("https://api.github.com/user")
if response.status_code == 200:
self._auth_user = response.json()["login"]
return self._auth_user
return None
# ---------------------------------------------------------------------
# Token management helpers
# ---------------------------------------------------------------------
def _update_session_token(self):
"""Update the session Authorization header with the current token."""
current_token = self.token_pool.get_current_token()
self.session.headers.update({"Authorization": f"Bearer {current_token}"})
# Update backward compatibility attribute
self.github_token = current_token
def _rotate_token(self):
"""Rotate to the next token in the pool and update session."""
next_token = self.token_pool.get_next_token()
self.session.headers.update({"Authorization": f"Bearer {next_token}"})
# Update backward compatibility attribute
self.github_token = next_token
logger.debug(f"| Rotated to next token in pool")
# ---------------------------------------------------------------------
# Generic request helper with rate-limit (403) retry handling
# ---------------------------------------------------------------------
def _request_with_retry(
self,
method: str,
url: str,
*,
max_retries: int = 2,
sleep_seconds: int = 120,
**kwargs,
):
"""Send a GitHub API request with basic rate-limit handling and token rotation.
If a request receives HTTP 403 (rate limit):
1. First try rotating to the next token in the pool
2. If still rate limited, sleep and retry
3. After max_retries are exhausted, raise RuntimeError
"""
import time # local import to avoid adding global dependency
attempt = 0
tokens_tried = 0
while True:
# Ensure we have the current token set
self._update_session_token()
resp = self.session.request(method, url, **kwargs)
# Successful or non-rate-limited response – return immediately
if resp.status_code != 403:
return resp
# 403 – very likely rate-limited
# First, try rotating tokens if we have multiple
if (
self.token_pool.pool_size > 1
and tokens_tried < self.token_pool.pool_size
):
logger.warning(
"| GitHub API rate limit encountered. Rotating to next token (tried %d/%d tokens)",
tokens_tried + 1,
self.token_pool.pool_size,
)
self._rotate_token()
tokens_tried += 1
continue
# All tokens exhausted or single token, resort to sleep/retry
if attempt >= max_retries:
raise RuntimeError(
f"GitHub API rate limited after {attempt + 1} attempts with {self.token_pool.pool_size} token(s): {resp.status_code} {resp.text}"
)
logger.warning(
"| All tokens rate limited (attempt %d/%d). Sleeping %d seconds before retrying …",
attempt + 1,
max_retries + 1,
sleep_seconds,
)
time.sleep(sleep_seconds)
attempt += 1
tokens_tried = 0 # Reset token counter for next attempt
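    # Illustrative call site (hypothetical URL) – a minimal sketch of how the helper
    # above is meant to be used; extra kwargs are forwarded to requests.Session.request:
    #
    #     resp = self._request_with_retry(
    #         "GET", "https://api.github.com/repos/octocat/Hello-World", timeout=30
    #     )
    #     if resp.status_code == 200:
    #         data = resp.json()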
# =========================================================================
# Initial State Selection and Repository Creation
# =========================================================================
# Initial state for each task category is resolved via self.initial_state_mapping
def select_initial_state_for_task(self, task_category: str) -> Optional[str]:
"""Resolve template name for a task category with light normalization."""
if not task_category:
return None
candidate_keys = []
candidate_keys.append(task_category)
# Allow users to swap between hyphen/underscore naming conventions.
hyphen_to_underscore = task_category.replace("-", "_")
if hyphen_to_underscore not in candidate_keys:
candidate_keys.append(hyphen_to_underscore)
underscore_to_hyphen = task_category.replace("_", "-")
if underscore_to_hyphen not in candidate_keys:
candidate_keys.append(underscore_to_hyphen)
for key in candidate_keys:
template = self.initial_state_mapping.get(key)
if template:
if key != task_category:
logger.debug(
"| Resolved GitHub template for %s via alias %s -> %s",
task_category,
key,
template,
)
return template
return None
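    # Illustrative resolution (hypothetical mapping) – hyphen/underscore aliases fall
    # back to whichever key is actually present in self.initial_state_mapping:
    #
    #     self.initial_state_mapping = {"code_review": "octocat-Hello-World"}
    #     self.select_initial_state_for_task("code-review")  # -> "octocat-Hello-World"
    #     self.select_initial_state_for_task("unknown")      # -> None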
def extract_repo_info_from_url(self, repo_url: str) -> tuple[str, str]:
"""Extract owner and repo name from GitHub URL."""
try:
from urllib.parse import urlparse
# Support https://github.com/owner/repo format
if "github.com" in repo_url:
path = urlparse(repo_url).path.strip("/")
parts = path.split("/")
if len(parts) >= 2:
return parts[0], parts[1]
raise ValueError(f"Invalid GitHub URL format: {repo_url}")
except Exception as e:
logger.error(f"| Failed to extract repo info from URL {repo_url}: {e}")
raise
def get_service_config_for_agent(self) -> dict:
"""
Get service-specific configuration for agent execution.
        Returns the current token from the pool so the agent/MCP server uses the
        same credentials as the state manager.
Returns:
Dictionary containing configuration needed by the agent/MCP server
"""
service_config = {}
# Add GitHub token if available
if self.github_token:
service_config["github_token"] = self.github_token
return service_config
def set_verification_environment(self, messages_path: str = None) -> None:
"""
Set GitHub-specific environment variables for verification scripts.
This ensures verification scripts use the same token as the current
agent execution, maintaining consistency across the evaluation flow.
Args:
messages_path: Optional path to messages.json file for verification
"""
import os
# Set common MCP_MESSAGES if provided
if messages_path:
os.environ["MCP_MESSAGES"] = str(messages_path)
# Set GitHub-specific token
current_token = self.token_pool.get_current_token()
os.environ["MCP_GITHUB_TOKEN"] = current_token
logger.info("| Set MCP_GITHUB_TOKEN for verification scripts")
def _enable_github_actions(self, owner: str, repo_name: str):
"""Enable GitHub Actions for the repository using REST API."""
try:
# Enable GitHub Actions
url = (
f"https://api.github.com/repos/{owner}/{repo_name}/actions/permissions"
)
response = self.session.put(
url, json={"enabled": True, "allowed_actions": "all"}
)
if response.status_code in [200, 204]:
logger.info(
"| Successfully enabled GitHub Actions for %s/%s", owner, repo_name
)
else:
logger.warning(
"| Failed to enable GitHub Actions: %s %s",
response.status_code,
response.text,
)
except Exception as e:
logger.error("| Failed to enable GitHub Actions: %s", e)
def _disable_github_actions(self, owner: str, repo_name: str):
"""Disable GitHub Actions for the repository using REST API."""
try:
# Disable GitHub Actions
url = (
f"https://api.github.com/repos/{owner}/{repo_name}/actions/permissions"
)
response = self.session.put(url, json={"enabled": False})
if response.status_code in [200, 204]:
logger.info(
"| Successfully disabled GitHub Actions for %s/%s", owner, repo_name
)
else:
logger.warning(
"| Failed to disable GitHub Actions: %s %s",
response.status_code,
response.text,
)
except Exception as e:
logger.error("| Failed to disable GitHub Actions: %s", e)
def _disable_repository_notifications(self, owner: str, repo_name: str):
"""Disable repository notifications to prevent email spam."""
try:
# Set repository notification subscription to ignore
url = f"https://api.github.com/repos/{owner}/{repo_name}/subscription"
response = self.session.put(
url, json={"subscribed": False, "ignored": True}
)
if response.status_code in [200, 201]:
logger.info(
"| Successfully disabled notifications for %s/%s", owner, repo_name
)
elif response.status_code == 403:
# This is expected if the token doesn't have notifications scope
logger.debug(
"| Cannot disable notifications for %s/%s (token lacks notifications scope - this is OK)",
owner,
repo_name,
)
else:
logger.warning(
"| Failed to disable repository notifications: %s %s",
response.status_code,
response.text,
)
except Exception as e:
logger.error("| Failed to disable repository notifications: %s", e)
def _download_and_extract_github_template(self, template_name: str) -> bool:
"""
Download and extract GitHub template from CDN using wget and unzip commands.
This approach preserves original file timestamps and is simpler than Python zipfile.
Args:
template_name: Name of the template to download (e.g., "anthropics-claude-code")
Returns:
bool: True if download and extraction successful
"""
try:
import subprocess
import sys
import tempfile
import shutil
import os
# Get the URL from mapping
if template_name not in self.github_template_url_mapping:
logger.error(f"| No URL mapping found for template: {template_name}")
return False
template_url = self.github_template_url_mapping[template_name]
# Allow override via environment variable
template_url = os.getenv("GITHUB_TEMPLATE_URL", template_url)
logger.info(f"| ○ Downloading GitHub template from: {template_url}")
# Create a temporary directory for the download
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
zip_path = temp_path / "github_template.zip"
# Step 1: Download using wget/curl
logger.info("| ○ Downloading GitHub template zip file...")
try:
# Use wget if available, otherwise fall back to curl
if sys.platform == "win32":
# Windows: try wget, fall back to curl
try:
result = subprocess.run(
["wget", "-O", str(zip_path), template_url],
capture_output=True,
text=True,
check=True,
)
except (subprocess.CalledProcessError, FileNotFoundError):
# Fall back to curl
result = subprocess.run(
["curl", "-L", "-o", str(zip_path), template_url],
capture_output=True,
text=True,
check=True,
)
else:
# Unix-like systems: try wget, fall back to curl
try:
result = subprocess.run(
["wget", "-O", str(zip_path), template_url],
capture_output=True,
text=True,
check=True,
)
except (subprocess.CalledProcessError, FileNotFoundError):
# Fall back to curl
result = subprocess.run(
["curl", "-L", "-o", str(zip_path), template_url],
capture_output=True,
text=True,
check=True,
)
logger.info("| ✓ Download completed successfully")
except Exception as e:
logger.error(f"| Download failed: {e}")
return False
# Step 2: Extract using unzip
logger.info("| ○ Extracting GitHub template...")
try:
# Extract to templates root directory
result = subprocess.run(
["unzip", "-o", str(zip_path), "-d", str(self.templates_root)],
capture_output=True,
text=True,
check=True,
)
logger.info("| ✓ Extraction completed successfully")
except Exception as e:
logger.error(f"| Extraction failed: {e}")
return False
# Step 3: Remove __MACOSX folder if it exists
macosx_path = self.templates_root / "__MACOSX"
if macosx_path.exists():
logger.info("| ○ Cleaning up macOS metadata...")
try:
shutil.rmtree(macosx_path)
logger.info("| ✓ Removed __MACOSX folder")
except Exception as e:
logger.warning(f"| Failed to remove __MACOSX folder: {e}")
# Verify the extracted template directory exists
template_path = self.templates_root / template_name
if not template_path.exists():
logger.error(
f"| Extracted template directory not found at expected path: {template_path}"
)
return False
logger.info(
f"| ✓ Successfully downloaded and extracted GitHub template to: {template_path}"
)
return True
except Exception as e:
logger.error(f"| Failed to download and extract GitHub template: {e}")
return False
================================================
FILE: src/mcp_services/github/github_task_manager.py
================================================
"""
GitHub Task Manager for MCPMark Evaluation Pipeline
====================================================
This module provides utilities for discovering, filtering, and managing
GitHub-based evaluation tasks.
The task manager is responsible for:
- Task discovery and filtering
- Task verification and result processing
- Task-specific logic (NOT LLM execution)
"""
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class GitHubTask(BaseTask):
"""Represents a single evaluation task for GitHub service."""
# GitHub-specific fields
repository_url: Optional[str] = None
branch_name: Optional[str] = None
pr_number: Optional[int] = None
issue_number: Optional[int] = None
expected_actions: Optional[List[str]] = None # Expected GitHub actions to verify
# Directory-based task slug (e.g., "update_readme")
task_name: str = ""
# No need to override name property, inherited from BaseTask
class GitHubTaskManager(BaseTaskManager):
"""Manages task discovery, filtering, and verification for GitHub-based MCPMark evaluation."""
def __init__(self, tasks_root: Path = None, task_suite: str = "standard"):
"""Initialize GitHub task manager.
Args:
            tasks_root: Path to the tasks directory
            task_suite: Name of the task suite to load (defaults to "standard")
        """
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
# Call parent constructor
super().__init__(
tasks_root,
mcp_service="github",
task_class=GitHubTask,
task_organization="file",
task_suite=task_suite,
) # GitHub uses file-based tasks
# =========================================================================
# Service-specific implementations
# =========================================================================
# No custom task discovery methods needed; relying entirely on BaseTaskManager defaults.
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> Optional[GitHubTask]:
"""Instantiate a GitHubTask from the dictionary yielded by _find_task_files."""
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return GitHubTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="github",
category_id=final_category_id,
task_id=task_id,
task_name=task_files_info["task_id"],
)
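    # Illustrative meta.json (hypothetical values) – only "category_id" and "task_id"
    # are read here; any other keys in the file are ignored by this method:
    #
    #     {
    #         "category_id": "issue_management",
    #         "task_id": "label_triage"
    #     }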
def _get_verification_command(self, task: GitHubTask) -> List[str]:
"""Get the verification command for GitHub tasks."""
return [sys.executable, str(task.task_verification_path)]
def get_task_instruction(self, task: GitHubTask) -> str:
"""Return task instruction prefixed with repository context.
Adds an English prefix to every GitHub task instruction so that the
agent knows **exactly** which repository to operate on, following the
pattern requested by the user:
Please execute the following task in my repository {owner}/{repo_name}:
If the repository URL has not yet been injected into the ``task`` (for
example when the state manager has not run), we fall back to a more
generic prefix without owner/repo placeholder.
"""
# Read the original task description first
base_instruction = task.get_task_instruction()
# Derive the owner/repo pair from the repository URL if available
prefix: str
if task.repository_url:
# Example URL: https://github.com/owner/repo_name.git (or without .git)
url_parts = task.repository_url.rstrip("/").replace(".git", "").split("/")
if len(url_parts) >= 2:
owner, repo_name = url_parts[-2], url_parts[-1]
prefix = f"Please execute the following task in my repository {owner}/{repo_name}:"
else:
prefix = "Please execute the following task:"
else:
prefix = "Please execute the following task:"
# Compose instruction with prefix
instruction_with_prefix = f"{prefix}\n\n{base_instruction.strip()}"
# Apply the common formatting suffix from base class
return self._format_task_instruction(instruction_with_prefix)
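    # Illustrative result (hypothetical repository URL): with
    # task.repository_url == "https://github.com/mcpmark-eval/Hello-World", the returned
    # instruction starts with:
    #
    #     Please execute the following task in my repository mcpmark-eval/Hello-World:
    #
    # followed by a blank line and the original task description.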
================================================
FILE: src/mcp_services/github/repo_exporter.py
================================================
"""
repo_exporter.py – Export public GitHub repository *and* open Issues/PRs
=====================================================================
Workflow
--------
1. Clone the public repository into a local working copy at
   ``${out_dir}/${owner}-${repo}/repo`` (full clone, all branches; no bare mirror).
2. Fetch Issues & Pull Requests (open PRs, newest first) via the GitHub REST
   API (no auth needed for public repos, but a token can be provided to raise
   the rate limit) and serialise them as JSON under the same folder:
   • ``issues.json`` – list[Issue]
   • ``pulls.json``  – list[PullRequest]
   • ``meta.json``   – {"owner", "repo", "default_branch", "pr_head_refs"}
Usage (CLI)
-----------
$ python -m src.mcp_services.github.repo_exporter \
    --source_repo_url https://github.com/octocat/Hello-World \
    --out-dir ./github_state
A token is read from the ``GITHUB_TOKEN`` environment variable (e.g. via
``.mcp_env``) to avoid the 60-req/h anonymous limit.
"""
from __future__ import annotations
import json
import logging
import os
from dotenv import load_dotenv
import subprocess
from pathlib import Path
from tempfile import mkdtemp
from typing import Optional
from urllib.parse import urlparse
import requests
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
_API_ROOT = "https://api.github.com"
_DEFAULT_HEADERS = {
"Accept": "application/vnd.github.v3+json",
"User-Agent": "MCPMark/RepoExporter/1.0",
}
# ---------------------------------------------------------------------------
# Helper utilities
# ---------------------------------------------------------------------------
def _make_session(token: Optional[str] = None) -> requests.Session:
sess = requests.Session()
sess.headers.update(_DEFAULT_HEADERS)
if token:
sess.headers["Authorization"] = f"Bearer {token}"
return sess
def _parse_repo(url: str) -> tuple[str, str]:
parsed = urlparse(url)
parts = parsed.path.strip("/").split("/")
if len(parts) < 2:
raise ValueError(f"Invalid GitHub repo URL: {url}")
return parts[0], parts[1]
# ---------------------------------------------------------------------------
# Core export logic
# ---------------------------------------------------------------------------
def export_repository(
source_repo_url: str,
out_dir: str = "./github_state",
github_token: str | None = None,
max_issues: int | None = None,
max_pulls: int | None = None,
) -> str:
"""Export repository code plus Issues/PRs to ``out_dir``.
``max_issues`` / ``max_pulls`` – when supplied, export **only** the most
recently created *open* Issues or Pull Requests (respectively).
Returns the absolute path of the export folder.
"""
owner, repo = _parse_repo(source_repo_url)
export_root = Path(out_dir).expanduser().resolve()
repo_dir = export_root / f"{owner}-{repo}"
repo_dir.mkdir(parents=True, exist_ok=True)
# ------------------------------------------------------------------
# 1. Clone repository – full or shallow *working* clone (no bare repo)
# ------------------------------------------------------------------
repo_path = repo_dir / "repo"
if repo_path.exists():
logger.info("[clone] Repository already exists, skipping clone: %s", repo_path)
else:
logger.info("[clone] Cloning %s/%s to %s", owner, repo, repo_path)
env = {
**os.environ,
"GIT_TERMINAL_PROMPT": "0",
"GIT_LFS_SKIP_SMUDGE": "1",
}
tmp_dir = mkdtemp(prefix="mcp_export_")
try:
# Always perform a full clone (no shallow depth limitation).
clone_cmd = [
"git",
"clone",
"--no-single-branch",
f"https://github.com/{owner}/{repo}.git",
tmp_dir,
]
subprocess.run(clone_cmd, check=True, capture_output=True, env=env)
subprocess.run(["mv", tmp_dir, str(repo_path)], check=True)
logger.info("[clone] Clone completed")
finally:
# tmp_dir moved if success; remove if left
if os.path.isdir(tmp_dir):
subprocess.run(["rm", "-rf", tmp_dir])
# ------------------------------------------------------------------
# 2. Dump Issues & Pull Requests
# ------------------------------------------------------------------
sess = _make_session(github_token)
def _paginate(url: str, state: str = "all", extra_params: dict | None = None):
page = 1
while True:
params = {"state": state, "per_page": 100, "page": page}
if extra_params:
params.update(extra_params)
resp = sess.get(url, params=params)
if resp.status_code != 200:
logger.warning("Failed to list: %s – %s", url, resp.text)
break
items = resp.json()
if not items:
break
yield from items
page += 1
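    # Illustrative use of the generator above (hypothetical endpoint): it yields items
    # across pages of 100 until an empty page or a non-200 response is returned:
    #
    #     for issue in _paginate(f"{_API_ROOT}/repos/{owner}/{repo}/issues", state="open"):
    #         print(issue["number"], issue["title"])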
# --------------------------------------------------------------
# Helper: fetch all issue comments for a given issue / PR number
# --------------------------------------------------------------
def _fetch_issue_comments(number: int) -> list[dict]:
"""Return a list of {user, body} comment dicts for the given issue/PR."""
comments = []
for c in _paginate(
f"{_API_ROOT}/repos/{owner}/{repo}/issues/{number}/comments"
):
comments.append(
{
"user": c.get("user", {}).get("login", "unknown"),
"body": c.get("body", ""),
}
)
return comments
# --------------------------------------------------------------
# Helper: fetch all *review* comments (code comments) for a PR
# --------------------------------------------------------------
def _fetch_review_comments(number: int) -> list[dict]:
"""Return a list of {user, body} review comments for the given PR."""
comments = []
for c in _paginate(f"{_API_ROOT}/repos/{owner}/{repo}/pulls/{number}/comments"):
comments.append(
{
"user": c.get("user", {}).get("login", "unknown"),
"body": c.get("body", ""),
}
)
return comments
# Issues (non-PR)
issues = []
# If max_issues is 0, skip fetching issues entirely
if max_issues == 0:
logger.info("[export] Skipping issues (max_issues=0)")
else:
for itm in _paginate(
f"{_API_ROOT}/repos/{owner}/{repo}/issues",
extra_params={"sort": "created", "direction": "desc"},
):
if "pull_request" in itm:
continue
issues.append(
{
"title": itm.get("title"),
"body": itm.get("body", ""),
"labels": [lbl.get("name") for lbl in itm.get("labels", [])],
"state": itm.get("state", "open"), # Store issue state
"number": itm.get("number"), # Store issue number for reference
"comments": _fetch_issue_comments(itm.get("number")),
}
)
if max_issues is not None and len(issues) >= max_issues:
break
(repo_dir / "issues.json").write_text(json.dumps(issues, indent=2))
logger.info("[export] Saved %d issues", len(issues))
# Pull requests – include *all* PRs including those from forks
pulls = []
pr_head_refs: set[str] = set()
fork_pr_branches: dict[str, dict] = {} # Maps PR branch names to fork info
# If max_pulls is 0, skip fetching pull requests entirely
if max_pulls == 0:
logger.info("[export] Skipping pull requests (max_pulls=0)")
else:
for pr in _paginate(
f"{_API_ROOT}/repos/{owner}/{repo}/pulls",
state="open",
extra_params={"sort": "created", "direction": "desc"},
):
pr_number = pr.get("number")
head = pr.get("head", {})
if head is None:
logger.warning("PR #%s has no head (deleted fork), skipping", pr_number)
continue # skip PRs with missing head (deleted fork)
head_repo = head.get("repo")
head_ref = head.get("ref")
head_sha = head.get("sha")
if head_repo is None:
logger.warning("PR #%s source repo was deleted, skipping", pr_number)
continue # skip PRs where source repo was deleted
head_repo_full = head_repo.get("full_name")
is_from_fork = head_repo_full != f"{owner}/{repo}"
# Create PR data with fork information
pr_data = {
"number": pr_number,
"title": pr.get("title"),
"body": pr.get("body", ""),
"head": head_ref,
"base": pr.get("base", {}).get("ref"),
"is_from_fork": is_from_fork,
}
if is_from_fork:
# Store additional metadata for forked PRs
pr_data["fork_owner"] = head_repo.get("owner", {}).get("login")
pr_data["fork_repo"] = head_repo.get("name")
pr_data["head_sha"] = head_sha
# Create a unique branch name for this forked PR
fork_branch_name = f"pr/{pr_number}-{pr_data['fork_owner']}-{head_ref}"
pr_data["local_branch"] = fork_branch_name
fork_pr_branches[fork_branch_name] = {
"clone_url": head_repo.get("clone_url"),
"ref": head_ref,
"sha": head_sha,
"pr_number": pr_number,
}
else:
# For non-fork PRs, keep the original branch reference
pr_head_refs.add(head_ref)
# Attach comments
pr_data["comments"] = _fetch_issue_comments(pr_number)
pr_data["review_comments"] = _fetch_review_comments(pr_number)
pulls.append(pr_data)
if max_pulls is not None and len(pulls) >= max_pulls:
break
(repo_dir / "pulls.json").write_text(json.dumps(pulls, indent=2))
logger.info("[export] Saved %d pull requests", len(pulls))
# Get default branch info first (needed for fetching)
sess = _make_session(github_token)
try:
repo_info = sess.get(f"{_API_ROOT}/repos/{owner}/{repo}")
default_branch = repo_info.json().get("default_branch", "main")
except Exception:
default_branch = "main"
# Fetch branches from non-fork PRs (branches from the same repository)
non_fork_branches = list(pr_head_refs) # These are branches from the same repo
# Always include the default branch in the branches to fetch
if default_branch not in non_fork_branches:
non_fork_branches.append(default_branch)
pr_head_refs.add(default_branch)
if non_fork_branches:
logger.info(
"[fetch] Fetching %d branches from same repository (including default branch '%s')",
len(non_fork_branches),
default_branch,
)
try:
# Fetch all remote branches to ensure we have the PR branches
subprocess.run(
["git", "-C", str(repo_path), "fetch", "origin", "--no-tags"],
check=True,
capture_output=True,
)
# Create local branches for each PR branch
for branch in non_fork_branches:
try:
# Create local branch tracking the remote branch
subprocess.run(
[
"git",
"-C",
str(repo_path),
"branch",
"--track",
branch,
f"origin/{branch}",
],
check=False,
capture_output=True,
) # check=False because branch might already exist
logger.info("[fetch] Created local branch %s", branch)
except subprocess.CalledProcessError:
# Branch might already exist, which is fine
pass
except subprocess.CalledProcessError as e:
logger.warning(
"[fetch] Failed to fetch branches from origin: %s",
e.stderr.decode(errors="ignore") if e.stderr else str(e),
)
# Fetch branches from forks for PRs
if fork_pr_branches:
logger.info(
"[fetch] Fetching branches from %d forked PRs", len(fork_pr_branches)
)
for branch_name, fork_info in fork_pr_branches.items():
try:
logger.info(
"[fetch] Fetching branch %s from fork %s",
fork_info["ref"],
fork_info["clone_url"],
)
# Add fork as remote and fetch the specific branch
remote_name = f"fork-pr-{fork_info['pr_number']}"
# Add remote
subprocess.run(
[
"git",
"-C",
str(repo_path),
"remote",
"add",
remote_name,
fork_info["clone_url"],
],
check=True,
capture_output=True,
)
# Fetch the specific branch from the fork
subprocess.run(
[
"git",
"-C",
str(repo_path),
"fetch",
remote_name,
f"{fork_info['ref']}:refs/heads/{branch_name}",
],
check=True,
capture_output=True,
)
# Remove the remote after fetching
subprocess.run(
["git", "-C", str(repo_path), "remote", "remove", remote_name],
check=True,
capture_output=True,
)
# Add the fork branch to pr_head_refs so it gets pushed
pr_head_refs.add(branch_name)
logger.info("[fetch] Successfully fetched branch %s", branch_name)
except subprocess.CalledProcessError as e:
logger.warning(
"[fetch] Failed to fetch branch from fork PR #%s: %s",
fork_info["pr_number"],
e.stderr.decode(errors="ignore") if e.stderr else str(e),
)
except Exception as e:
logger.warning(
"[fetch] Unexpected error fetching fork PR #%s: %s",
fork_info["pr_number"],
str(e),
)
meta = {
"owner": owner,
"repo": repo,
"default_branch": default_branch,
"pr_head_refs": sorted(pr_head_refs),
}
(repo_dir / "meta.json").write_text(json.dumps(meta, indent=2))
logger.info("[done] Export finished – data stored at %s", repo_dir)
return str(repo_dir)
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
if __name__ == "__main__":
import argparse
load_dotenv(".mcp_env")
parser = argparse.ArgumentParser(
description="Export public GitHub repository with Issues/PRs"
)
parser.add_argument(
"--source_repo_url", required=True, help="HTTPS URL of the public repository"
)
parser.add_argument(
"--out-dir", default="./github_state", help="Output directory root"
)
parser.add_argument(
"--max-issues",
type=int,
default=20,
help="Export only the latest N issues (optional)",
)
parser.add_argument(
"--max-pulls",
type=int,
default=5,
help="Export only the latest N pull requests (optional)",
)
args = parser.parse_args()
token = os.getenv("GITHUB_TOKEN")
export_repository(
args.source_repo_url, args.out_dir, token, args.max_issues, args.max_pulls
)
================================================
FILE: src/mcp_services/github/repo_importer.py
================================================
"""
repo_importer.py – Restore previously exported GitHub repository into an org/user
===============================================================================
Given a local export folder created by ``repo_exporter.py`` that contains a
``repo`` working clone and JSON dumps of Issues/PRs, this script:
1. Creates an empty repository under the specified owner (user/org) using the
   provided GitHub token.
2. Pushes the Git history from the local clone to the target repo
   (falling back to per-ref pushes to avoid timeouts).
3. Re-creates the exported Issues & Pull Requests from the JSON dump.
CLI usage
---------
$ python -m src.mcp_services.github.repo_importer \
    --template_dir ./github_template_repo/octocat-Hello-World \
    --target-owner EvalOrg
The GitHub token is read from the ``GITHUB_TOKEN`` environment variable (or
``.mcp_env``); the target repository is always created as private.
"""
from __future__ import annotations
import json
import logging
import os
import subprocess
import time
from pathlib import Path
from typing import Iterable
import requests
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
_API_ROOT = "https://api.github.com"
_HEADERS = {
"Accept": "application/vnd.github.v3+json",
"User-Agent": "MCPMark/RepoImporter/1.0",
}
# ---------------------------------------------------------------------------
# Helper functions copied / simplified from repo_mirror (shallow clone logic removed)
# ---------------------------------------------------------------------------
def _make_session(token: str) -> requests.Session:
sess = requests.Session()
sess.headers.update(_HEADERS | {"Authorization": f"Bearer {token}"})
return sess
def _create_target_repo(
sess: requests.Session, owner: str, repo_name: str, description: str, private: bool
) -> str:
data = {
"name": repo_name,
"description": description,
"private": private,
"auto_init": False,
"has_issues": True,
"has_projects": True,
"has_wiki": False,
}
# Determine if owner == auth user
auth_user = _get_authenticated_user(sess)
create_url = (
f"{_API_ROOT}/user/repos"
if owner == auth_user
else f"{_API_ROOT}/orgs/{owner}/repos"
)
resp = sess.post(create_url, json=data)
if resp.status_code == 422 and "name already exists" in resp.text:
logger.warning("Repository already exists; attempting to delete and recreate …")
_delete_repo(sess, owner, repo_name)
resp = sess.post(create_url, json=data)
if resp.status_code not in (200, 201):
raise RuntimeError(f"Failed to create repo: {resp.status_code} {resp.text}")
html_url = resp.json()["html_url"]
logger.info("[init] Target repository created: %s", html_url)
return html_url
def _get_authenticated_user(sess: requests.Session) -> str:
resp = sess.get(f"{_API_ROOT}/user")
resp.raise_for_status()
return resp.json()["login"]
def _delete_repo(sess: requests.Session, owner: str, repo: str):
sess.delete(f"{_API_ROOT}/repos/{owner}/{repo}")
def _list_refs(repo_dir: str) -> list[str]:
result = subprocess.run(
["git", "-C", repo_dir, "for-each-ref", "--format=%(refname)"],
check=True,
capture_output=True,
text=True,
)
return result.stdout.strip().splitlines()
def _push_repo(
repo_path: Path,
owner: str,
repo_name: str,
token: str,
required_refs: Iterable[str] | None = None,
):
"""Push repository to GitHub.
Strategy:
1. Attempt a full `git push --mirror`.
2. If that fails (e.g. due to large repo), fall back to pushing refs one-by-one.
"""
dst_url = f"https://x-access-token:{token}@github.com/{owner}/{repo_name}.git"
# First try mirror push (fast path)
try:
subprocess.run(
["git", "-C", str(repo_path), "push", "--mirror", dst_url],
check=True,
capture_output=True,
)
logger.info("[push] Mirror push succeeded")
return
except subprocess.CalledProcessError as err:
logger.warning(
"[push] Mirror push failed (%s). Falling back to per-ref",
err.stderr.decode(errors="ignore"),
)
# ------------------------------------------------------------------
# Fallback: push each ref individually (robust but slower)
# ------------------------------------------------------------------
refs = required_refs or _list_refs(str(repo_path))
logger.info("[push] Pushing %d refs individually …", len(refs))
for ref in refs:
for attempt in range(3):
try:
subprocess.run(
["git", "-C", str(repo_path), "push", dst_url, f"{ref}:{ref}"],
check=True,
capture_output=True,
)
break
except subprocess.CalledProcessError as ref_err:
if attempt == 2:
raise RuntimeError(
f"Failed to push ref {ref}: {ref_err.stderr}"
) from ref_err
time.sleep(2 * (attempt + 1))
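# Illustrative call (hypothetical values) – push only the refs the import needs,
# trying the fast mirror push first and falling back to per-ref pushes:
#
#     _push_repo(
#         Path("./github_state/octocat-Hello-World/repo"),
#         "mcpmark-eval",
#         "Hello-World",
#         token,
#         required_refs=["refs/heads/main", "refs/heads/feature-1"],
#     )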
def _create_comment(
sess: requests.Session, owner: str, repo: str, issue_number: int, body: str
):
"""Create a comment on an Issue or Pull Request. Returns True on success."""
resp = sess.post(
f"{_API_ROOT}/repos/{owner}/{repo}/issues/{issue_number}/comments",
json={"body": body},
)
if resp.status_code not in (200, 201):
logger.debug("Failed to create comment on #%s: %s", issue_number, resp.text)
return False
return True
def _create_issue(
sess: requests.Session,
owner: str,
repo: str,
title: str,
body: str,
labels: list[str],
state: str = "open",
number: int = None,
):
"""Create a new Issue and return the *new* issue number (or None on failure)."""
data = {"title": title, "body": body, "labels": labels}
resp = sess.post(f"{_API_ROOT}/repos/{owner}/{repo}/issues", json=data)
if resp.status_code not in (200, 201):
logger.debug("Failed to create issue #%s: %s", number, resp.text)
return None
new_number = resp.json().get("number")
# Close issue if original state was closed
if state == "closed":
close_resp = sess.patch(
f"{_API_ROOT}/repos/{owner}/{repo}/issues/{new_number}",
json={"state": "closed"},
)
if close_resp.status_code not in (200, 201):
logger.debug("Failed to close issue #%s: %s", new_number, close_resp.text)
return new_number
def _create_pull(
sess: requests.Session,
owner: str,
repo: str,
title: str,
body: str,
head: str,
base: str,
pr_number: int = None,
):
"""Create a Pull Request and return the *new* PR number (or None on failure)."""
data = {"title": title, "body": body, "head": head, "base": base}
resp = sess.post(f"{_API_ROOT}/repos/{owner}/{repo}/pulls", json=data)
if resp.status_code not in (200, 201):
logger.warning(
"Failed to create PR #%s (head: %s, base: %s): %s",
pr_number,
head,
base,
resp.text,
)
return None
return resp.json().get("number")
def _enable_github_actions(sess: requests.Session, owner: str, repo_name: str):
"""Enable GitHub Actions for the repository using REST API."""
try:
url = f"{_API_ROOT}/repos/{owner}/{repo_name}/actions/permissions"
response = sess.put(url, json={"enabled": True, "allowed_actions": "all"})
if response.status_code in [200, 204]:
logger.info(
"Successfully enabled GitHub Actions for %s/%s", owner, repo_name
)
else:
logger.warning(
"Failed to enable GitHub Actions: %s %s",
response.status_code,
response.text,
)
except Exception as e:
logger.error("Failed to enable GitHub Actions: %s", e)
def _disable_repository_notifications(
sess: requests.Session, owner: str, repo_name: str
):
"""Disable repository notifications to prevent email spam."""
try:
url = f"{_API_ROOT}/repos/{owner}/{repo_name}/subscription"
response = sess.put(url, json={"subscribed": False, "ignored": True})
if response.status_code in [200, 201]:
logger.info(
"Successfully disabled notifications for %s/%s", owner, repo_name
)
elif response.status_code == 403:
# This is expected if the token doesn't have notifications scope
logger.debug(
"Cannot disable notifications for %s/%s (token lacks notifications scope - this is OK)",
owner,
repo_name,
)
else:
logger.warning(
"Failed to disable repository notifications: %s %s",
response.status_code,
response.text,
)
except Exception as e:
logger.error("Failed to disable repository notifications: %s", e)
def _set_default_branch(
sess: requests.Session, owner: str, repo_name: str, default_branch: str
):
"""Set the default branch for a repository."""
if default_branch != "main": # Only update if not already main
logger.info("[import] Setting default branch to '%s'", default_branch)
url = f"{_API_ROOT}/repos/{owner}/{repo_name}"
data = {"default_branch": default_branch}
resp = sess.patch(url, json=data)
if resp.status_code in (200, 201):
logger.info(
"[import] Successfully set default branch to '%s'", default_branch
)
else:
logger.warning(
"[import] Failed to set default branch: %s %s",
resp.status_code,
resp.text,
)
def _remove_github_directory(repo_path: Path, owner: str, repo_name: str, token: str):
"""Remove .github directory after pushing and commit the deletion."""
import shutil
github_dir = repo_path / ".github"
if github_dir.exists():
logger.info("[import] Removing .github directory after push …")
shutil.rmtree(github_dir)
# Commit the deletion
subprocess.run(
["git", "-C", str(repo_path), "add", "-A"], check=True, capture_output=True
)
subprocess.run(
["git", "-C", str(repo_path), "commit", "-m", "Remove .github directory"],
capture_output=True,
)
# Push the new commit
dst_url = f"https://x-access-token:{token}@github.com/{owner}/{repo_name}.git"
subprocess.run(
["git", "-C", str(repo_path), "push", dst_url],
check=True,
capture_output=True,
)
# ---------------------------------------------------------------------------
# Main import logic
# ---------------------------------------------------------------------------
def import_repository(
template_dir: str, github_token: str, target_owner: str, private: bool = False
) -> str:
"""Import repository from a local template directory to GitHub."""
# ------------------------------------------------------------------
# Ensure Git HTTP buffer large enough to avoid 400 errors on big pushes
# ------------------------------------------------------------------
try:
subprocess.run(
[
"git",
"config",
"--global",
"http.postBuffer",
"157286400", # 150 MiB
],
check=True,
capture_output=True,
)
logger.debug("[init] Set git http.postBuffer to 150MiB globally")
except subprocess.CalledProcessError as cfg_err:
logger.warning(
"[init] Failed to set http.postBuffer – proceeding anyway: %s",
cfg_err.stderr.decode(errors="ignore"),
)
tdir = Path(template_dir).expanduser().resolve()
meta = json.loads((tdir / "meta.json").read_text())
repo_name = meta["repo"]
pr_head_refs = meta.get("pr_head_refs", [])
default_branch = meta.get("default_branch", "main")
# Also include fork PR branches that were fetched
pulls = json.loads((tdir / "pulls.json").read_text())
fork_branches = [
pr["local_branch"]
for pr in pulls
if pr.get("is_from_fork", False) and "local_branch" in pr
]
needed_refs = (
[f"refs/heads/{default_branch}"]
+ [f"refs/heads/{h}" for h in pr_head_refs]
+ [f"refs/heads/{b}" for b in fork_branches]
)
sess = _make_session(github_token)
# 1. Create target repo
html_url = _create_target_repo(
sess, target_owner, repo_name, f"Restored mirror of {repo_name}", private
)
# 2. Push code
repo_path = tdir / "repo"
logger.info("[phase] Pushing git history …")
_push_repo(repo_path, target_owner, repo_name, github_token, needed_refs)
# Set the default branch if it's not 'main'
_set_default_branch(sess, target_owner, repo_name, default_branch)
# Remove .github directory right after pushing, before creating issues/PRs
_remove_github_directory(repo_path, target_owner, repo_name, github_token)
# 3. Re-create issues & PRs
logger.info("[phase] Re-creating issues …")
issues = json.loads((tdir / "issues.json").read_text())
created_issues = 0
for itm in issues:
new_issue_no = _create_issue(
sess,
target_owner,
repo_name,
itm["title"],
itm.get("body", ""),
itm.get("labels", []),
itm.get("state", "open"),
itm.get("number"),
)
if new_issue_no:
created_issues += 1
for c in itm.get("comments", []):
comment_body = f"*Original author: @{c['user']}*\n\n{c['body']}"
_create_comment(
sess, target_owner, repo_name, new_issue_no, comment_body
)
logger.info("[phase] Created %d out of %d issues", created_issues, len(issues))
logger.info("[phase] Re-creating pull requests …")
pulls = json.loads((tdir / "pulls.json").read_text())
created_prs = 0
skipped_prs = 0
for pr in pulls:
# Use local_branch for forked PRs, otherwise use original head
head_branch = pr.get("local_branch", pr["head"])
# Add note to PR body if it's from a fork
body = pr.get("body", "")
if pr.get("is_from_fork", False):
fork_note = f"\n\n---\n_This PR was originally from a fork: **{pr.get('fork_owner')}/{pr.get('fork_repo')}** (branch: `{pr['head']}`)_"
body = (
body + fork_note if body else fork_note[2:]
) # Remove leading newlines if body is empty
new_pr_number = _create_pull(
sess,
target_owner,
repo_name,
pr["title"],
body,
head_branch,
pr["base"],
pr.get("number"),
)
if new_pr_number:
created_prs += 1
for c in pr.get("comments", []):
comment_body = f"*Original author: @{c['user']}*\n\n{c['body']}"
_create_comment(
sess, target_owner, repo_name, new_pr_number, comment_body
)
for rc in pr.get("review_comments", []):
comment_body = (
f"*Original author: @{rc['user']}* (review)\n\n{rc['body']}"
)
_create_comment(
sess, target_owner, repo_name, new_pr_number, comment_body
)
else:
skipped_prs += 1
logger.info("[phase] Created %d PRs, skipped %d PRs", created_prs, skipped_prs)
# Enable GitHub Actions after creating issues and PRs
logger.info("[import] Enabling GitHub Actions …")
_enable_github_actions(sess, target_owner, repo_name)
# Disable notifications to prevent email spam
logger.info("[import] Disabling repository notifications …")
_disable_repository_notifications(sess, target_owner, repo_name)
logger.info("[done] Import complete: %s", html_url)
return html_url
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
if __name__ == "__main__":
import argparse
load_dotenv(".mcp_env")
parser = argparse.ArgumentParser(
description="Import repository from local template into GitHub"
)
parser.add_argument("--template_dir", help="Path to exported template directory")
parser.add_argument(
"--target-owner",
"-o",
default="mcpmark-eval",
help="User or organisation that will own the new repository",
)
args = parser.parse_args()
token = os.getenv("GITHUB_TOKEN")
if not token:
parser.error("GITHUB_TOKEN not set in environment or .mcp_env")
# Always create the target repository as private
import_repository(args.template_dir, token, args.target_owner, True)
================================================
FILE: src/mcp_services/github/token_pool.py
================================================
"""
GitHub Token Pool Manager
=========================
Simple round-robin token pool for distributing API requests across multiple tokens
to avoid rate limit issues.
"""
from typing import List
from src.logger import get_logger
logger = get_logger(__name__)
class GitHubTokenPool:
"""
Manages a pool of GitHub tokens with round-robin selection.
"""
def __init__(self, tokens: List[str]):
"""
Initialize token pool.
Args:
tokens: List of GitHub personal access tokens
"""
if not tokens:
raise ValueError("Token pool must contain at least one token")
self.tokens = tokens
self.current_index = 0
logger.info(f"Initialized GitHub token pool with {len(tokens)} token(s)")
def get_next_token(self) -> str:
"""
Get the next token in round-robin fashion.
Returns:
The next GitHub token to use
"""
token = self.tokens[self.current_index]
self.current_index = (self.current_index + 1) % len(self.tokens)
return token
def get_current_token(self) -> str:
"""
Get the current token without advancing the index.
Returns:
The current GitHub token
"""
return self.tokens[self.current_index]
@property
def pool_size(self) -> int:
"""Get the number of tokens in the pool."""
return len(self.tokens)
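# Illustrative round-robin behaviour (hypothetical tokens):
#
#     pool = GitHubTokenPool(["ghp_tokenA", "ghp_tokenB"])
#     pool.get_current_token()  # -> "ghp_tokenA"
#     pool.get_next_token()     # -> "ghp_tokenA", then the index advances
#     pool.get_current_token()  # -> "ghp_tokenB"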
================================================
FILE: src/mcp_services/insforge/__init__.py
================================================
"""Insforge MCP Service Implementation for MCPMark."""
================================================
FILE: src/mcp_services/insforge/insforge_login_helper.py
================================================
"""
Insforge Login Helper for MCPMark
==================================
Handles Insforge backend authentication and connection validation.
"""
import json
import requests
from pathlib import Path
from typing import Optional, Dict, Any
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class InsforgeLoginHelper(BaseLoginHelper):
"""Handles Insforge backend authentication and connection validation."""
def __init__(
self,
api_key: str,
backend_url: str,
state_path: Optional[Path] = None,
):
"""Initialize Insforge login helper.
Args:
api_key: Insforge backend API key for authentication
backend_url: Insforge backend URL (e.g., https://your-app.insforge.app)
state_path: Path to save connection state
"""
super().__init__()
self.api_key = api_key
self.backend_url = backend_url.rstrip('/')
self.state_path = state_path or Path.home() / ".mcpbench" / "insforge_auth.json"
# Ensure state directory exists
self.state_path.parent.mkdir(parents=True, exist_ok=True)
def login(self, **kwargs) -> bool:
"""Test Insforge backend connection and validate API key.
Returns:
bool: True if connection successful and API key valid
"""
try:
# Test 1: Basic connectivity - try to get backend metadata
logger.info(f"Testing connection to Insforge backend: {self.backend_url}")
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
# Test with a simple API endpoint - get current user or backend info
# Try the auth current session endpoint first
test_url = f"{self.backend_url}/api/auth/sessions/current"
response = requests.get(
test_url,
headers=headers,
timeout=10,
)
if response.status_code == 200:
# API key is valid and can authenticate
logger.info("✓ Insforge API key authentication successful")
connection_info = {
"backend_url": self.backend_url,
"authenticated": True,
"authenticated_at": self._get_current_timestamp(),
}
elif response.status_code == 401:
# Invalid API key
logger.error("✗ Invalid Insforge API key")
return False
else:
# API key might be admin key, try a different endpoint
# Try listing tables/backend metadata as a test
logger.info("Testing with backend metadata endpoint...")
# Simple connectivity test - just check if backend is reachable
health_url = f"{self.backend_url}/api/health"
try:
health_response = requests.get(health_url, timeout=5)
if health_response.status_code in [200, 404]: # 404 is ok, backend is reachable
logger.info("✓ Insforge backend is reachable")
connection_info = {
"backend_url": self.backend_url,
"api_key_type": "admin",
"authenticated": True,
"authenticated_at": self._get_current_timestamp(),
}
else:
logger.warning(f"Unexpected response from backend: {health_response.status_code}")
connection_info = {
"backend_url": self.backend_url,
"authenticated": True,
"authenticated_at": self._get_current_timestamp(),
}
except Exception as e:
logger.warning(f"Health check failed, but proceeding: {e}")
# Still consider it successful if we have credentials
connection_info = {
"backend_url": self.backend_url,
"authenticated": True,
"authenticated_at": self._get_current_timestamp(),
}
# Save connection state
self._save_connection_state(connection_info)
logger.info(f"Insforge backend connection validated: {self.backend_url}")
return True
except requests.exceptions.Timeout:
logger.error(f"Connection timeout to Insforge backend: {self.backend_url}")
return False
except requests.exceptions.ConnectionError:
logger.error(f"Cannot connect to Insforge backend: {self.backend_url}")
return False
except Exception as e:
logger.error(f"Unexpected error during Insforge authentication: {e}")
return False
def _save_connection_state(self, state: Dict[str, Any]):
"""Save connection state to file."""
try:
# Don't save API key
safe_state = {k: v for k, v in state.items() if k not in ["api_key", "access_token"]}
with open(self.state_path, "w") as f:
json.dump(safe_state, f, indent=2)
# Set restrictive permissions
self.state_path.chmod(0o600)
logger.info(f"Connection state saved to: {self.state_path}")
except Exception as e:
logger.error(f"Failed to save connection state: {e}")
def _get_current_timestamp(self) -> str:
"""Get current timestamp in ISO format."""
from datetime import datetime, timezone
return datetime.now(timezone.utc).isoformat()
def is_connected(self) -> bool:
"""Check if we can connect to Insforge backend."""
return self.login()
def get_connection_params(self) -> Dict[str, Any]:
"""Get connection parameters (without API key)."""
return {
"backend_url": self.backend_url,
}
================================================
FILE: src/mcp_services/insforge/insforge_state_manager.py
================================================
"""
Insforge State Manager for MCPMark
===================================
Manages backend state for Insforge tasks including setup via prepare_environment.py
and resource cleanup tracking.
"""
import os
import sys
import subprocess
import requests
from pathlib import Path
from typing import Optional, Dict, Any, List
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
logger = get_logger(__name__)
class InsforgeStateManager(BaseStateManager):
"""Manages Insforge backend state for task evaluation."""
def __init__(
self,
api_key: str,
backend_url: str,
):
"""Initialize Insforge state manager.
Args:
api_key: Insforge backend API key for authentication
backend_url: Insforge backend URL (e.g., https://your-app.insforge.app)
"""
super().__init__(service_name="insforge")
self.api_key = api_key
self.backend_url = backend_url.rstrip('/')
# HTTP headers for API requests
self.headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
# Track current task context for agent configuration
self._current_task_context: Optional[Dict[str, Any]] = None
# Validate connection on initialization
try:
self._test_connection()
logger.info("Insforge state manager initialized successfully")
except Exception as e:
raise RuntimeError(f"Insforge initialization failed: {e}")
# Store baseline tables (system tables that exist before any tasks run)
self._baseline_tables = set(
(t['schema'], t['name']) for t in self._get_all_tables()
)
logger.debug(f"Stored baseline: {len(self._baseline_tables)} tables")
def _test_connection(self):
"""Test backend connection."""
try:
# Simple connectivity test - try any endpoint
response = requests.get(
f"{self.backend_url}/api/health",
timeout=5,
)
# Any response (even 404) means backend is reachable
logger.debug(f"Insforge backend connectivity test: {response.status_code}")
except requests.exceptions.RequestException:
# Try with API key
try:
response = requests.get(
f"{self.backend_url}/api/auth/sessions/current",
headers=self.headers,
timeout=5,
)
logger.debug(f"Insforge backend auth test: {response.status_code}")
except Exception as inner_e:
raise RuntimeError(f"Cannot connect to Insforge backend: {inner_e}")
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
"""Create initial backend state for a task.
Restores from backup which may place tables in public or task-specific schema.
Args:
task: Task for which to create initial state
Returns:
InitialStateInfo object or None if creation failed
"""
try:
# Generate unique state ID for this task run
state_id = f"{task.category_id}_{task.task_id}_{self._get_timestamp()}"
schema_name = task.category_id
logger.info(f"| Creating initial state for Insforge task: {task.name}")
# Drop schema first (cleanup from previous runs)
self._drop_schema(schema_name)
# Get list of existing tables before restore (to track what we create)
tables_before = self._get_all_tables()
logger.info(f"| Tables before restore: {len(tables_before)}")
# Note: Don't create schema here - pg_restore will create it from the backup
# Restore from backup if backup exists (may create tables in public or task schema)
if self._restore_from_backup(schema_name):
logger.info(f"| ✓ Restored '{schema_name}' from backup")
else:
logger.info(f"| ○ No backup found for '{schema_name}'")
# Run prepare_environment.py if it exists
task_prepared = self._run_prepare_environment(task)
if not task_prepared:
logger.debug(f"| No prepare_environment.py found for task {task.name}")
# Get list of tables after restore (to track what we need to clean up)
tables_after = self._get_all_tables()
# Track ALL new tables created by the restore (compare before/after)
tables_before_set = {(t['schema'], t['name']) for t in tables_before}
created_tables = [
t for t in tables_after
if (t['schema'], t['name']) not in tables_before_set
]
logger.info(f"| Tracked {len(created_tables)} new tables for cleanup")
for t in created_tables:
logger.debug(f"| - {t['schema']}.{t['name']}")
# Track the task context including created tables
context = {
"state_id": state_id,
"category_id": task.category_id,
"task_id": task.task_id,
"task_name": task.name,
"schema": schema_name,
"created_tables": created_tables, # Track all created tables
}
return InitialStateInfo(
state_id=state_id,
state_url=self.backend_url,
metadata=context,
)
except Exception as e:
logger.error(f"Failed to create initial state for {task.name}: {e}")
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
"""Store backend info in task object for agent access."""
if hasattr(task, "__dict__"):
task.backend_url = self.backend_url
task.api_key = self.api_key
task.state_id = state_info.state_id
# Store current task context for agent configuration
self._current_task_context = state_info.metadata
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up task-specific resources.
Drops ALL tables created during task (both setup and agent-created)
by comparing against baseline.
Args:
task: Task whose initial state should be cleaned up
Returns:
True if cleanup successful
"""
try:
logger.info(f"| Cleaning up initial state for task: {task.name}")
if self._current_task_context:
schema_name = self._current_task_context.get("schema")
# Get ALL current tables
all_current_tables = self._get_all_tables()
# Find tables to drop: anything not in baseline
tables_to_drop = [
t for t in all_current_tables
if (t['schema'], t['name']) not in self._baseline_tables
]
logger.info(f"| Found {len(tables_to_drop)} tables to clean up (setup + agent-created)")
# Drop individual tables
for table_info in tables_to_drop:
try:
self._drop_table(table_info["schema"], table_info["name"])
logger.debug(f"| ✓ Dropped table: {table_info['schema']}.{table_info['name']}")
except Exception as e:
logger.warning(f"| Failed to drop table {table_info}: {e}")
# Drop the task schema (may be empty if all tables were in public)
if schema_name:
try:
self._drop_schema(schema_name)
logger.info(f"| ✓ Dropped schema: {schema_name}")
except Exception as e:
logger.warning(f"| Failed to drop schema {schema_name}: {e}")
# Clear task context
if self._current_task_context.get("task_name") == task.name:
self._current_task_context = None
logger.info(f"| ✓ Initial state cleanup completed for {task.name}")
return True
except Exception as e:
logger.error(f"Failed to cleanup task initial state for {task.name}: {e}")
return False
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single tracked resource.
This is a placeholder for resource-specific cleanup logic.
Tasks should handle their own cleanup via cleanup scripts.
Args:
resource: Resource dictionary with type, id, and metadata
Returns:
True if cleanup successful
"""
resource_type = resource["type"]
resource_id = resource["id"]
logger.debug(f"| Cleanup for {resource_type} {resource_id} (handled by task scripts)")
return True
def _run_prepare_environment(self, task: BaseTask) -> bool:
"""Run prepare_environment.py script if it exists in the task directory.
The script should use Insforge MCP tools or HTTP API to set up required state.
Args:
task: Task for which to prepare environment
Returns:
True if script ran successfully, False if script doesn't exist
"""
task_dir = task.task_instruction_path.parent
prepare_script = task_dir / "prepare_environment.py"
if not prepare_script.exists():
logger.debug(f"No prepare_environment.py found for task {task.name}")
return False
logger.info(f"| Running prepare_environment.py for task {task.name}")
# Set up environment variables for the script
env = os.environ.copy()
env.update({
"INSFORGE_BACKEND_URL": self.backend_url,
"INSFORGE_API_KEY": self.api_key,
})
try:
# Run the prepare_environment.py script
result = subprocess.run(
[sys.executable, str(prepare_script)],
cwd=str(task_dir), # Run from task directory
env=env,
capture_output=True,
text=True,
timeout=300, # 5 minute timeout
)
if result.returncode == 0:
logger.info(f"| ✓ Environment preparation completed for {task.name}")
if result.stdout.strip():
logger.debug(f"| prepare_environment.py output: {result.stdout}")
return True
else:
logger.error(f"| ✗ Environment preparation failed for {task.name}")
logger.error(f"| Error output: {result.stderr}")
raise RuntimeError(f"prepare_environment.py failed with exit code {result.returncode}")
except subprocess.TimeoutExpired:
logger.error(f"✗ Environment preparation timed out for {task.name}")
raise RuntimeError("prepare_environment.py execution timed out")
except Exception as e:
logger.error(f"✗ Failed to run prepare_environment.py for {task.name}: {e}")
raise
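# Illustrative sketch only (not part of this manager): a task's
# prepare_environment.py receives INSFORGE_BACKEND_URL and INSFORGE_API_KEY
# through the environment assembled above. A minimal script might simply
# confirm the backend is reachable before seeding data; the actual seeding
# calls depend on the task and the Insforge API, so none are assumed here.
#
#     import os
#     import sys
#     import urllib.request
#
#     backend_url = os.environ["INSFORGE_BACKEND_URL"]
#     api_key = os.environ["INSFORGE_API_KEY"]  # used by the real seeding calls
#     try:
#         urllib.request.urlopen(backend_url, timeout=10)
#     except Exception as exc:
#         print(f"Backend unreachable: {exc}", file=sys.stderr)
#         sys.exit(1)
#     sys.exit(0)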
def _get_timestamp(self) -> str:
"""Get timestamp for unique naming."""
from datetime import datetime
return datetime.now().strftime("%Y%m%d%H%M%S")
def _drop_schema(self, schema_name: str) -> None:
"""Drop schema and all its contents."""
import psycopg2
from psycopg2 import sql
conn_params = {
"host": "localhost",
"port": 5432,
"user": "postgres",
"password": "postgres",
"database": "insforge",
}
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
try:
with conn.cursor() as cur:
cur.execute(
sql.SQL("DROP SCHEMA IF EXISTS {} CASCADE").format(
sql.Identifier(schema_name)
)
)
logger.debug(f"| Dropped schema: {schema_name}")
finally:
conn.close()
def _create_schema(self, schema_name: str) -> None:
"""Create empty schema."""
import psycopg2
from psycopg2 import sql
conn_params = {
"host": "localhost",
"port": 5432,
"user": "postgres",
"password": "postgres",
"database": "insforge",
}
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
try:
with conn.cursor() as cur:
cur.execute(
sql.SQL("CREATE SCHEMA {}").format(sql.Identifier(schema_name))
)
logger.debug(f"| Created schema: {schema_name}")
finally:
conn.close()
def _get_all_tables(self) -> List[Dict[str, str]]:
"""Get list of all user tables.
Returns:
List of dicts with 'schema' and 'name' keys
"""
import psycopg2
conn_params = {
"host": "localhost",
"port": 5432,
"user": "postgres",
"password": "postgres",
"database": "insforge",
}
conn = psycopg2.connect(**conn_params)
try:
with conn.cursor() as cur:
cur.execute("""
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_type = 'BASE TABLE'
AND table_schema NOT IN ('information_schema', 'pg_catalog')
AND table_schema NOT LIKE 'pg_%'
AND table_name NOT LIKE '\\_%'
ORDER BY table_schema, table_name
""")
rows = cur.fetchall()
return [{"schema": row[0], "name": row[1]} for row in rows]
finally:
conn.close()
def _drop_table(self, schema_name: str, table_name: str) -> None:
"""Drop a specific table or materialized view."""
import psycopg2
from psycopg2 import sql
conn_params = {
"host": "localhost",
"port": 5432,
"user": "postgres",
"password": "postgres",
"database": "insforge",
}
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
try:
with conn.cursor() as cur:
# Try dropping as table first
cur.execute(
sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format(
sql.Identifier(schema_name),
sql.Identifier(table_name)
)
)
# Also try dropping as materialized view (in case agent created one)
cur.execute(
sql.SQL("DROP MATERIALIZED VIEW IF EXISTS {}.{} CASCADE").format(
sql.Identifier(schema_name),
sql.Identifier(table_name)
)
)
logger.debug(f"| Dropped table/view: {schema_name}.{table_name}")
finally:
conn.close()
def _restore_from_backup(self, category_name: str) -> bool:
"""Restore from backup file.
Tables may be restored into public schema or category-specific schema
depending on how the backup was created.
Args:
category_name: Name of category (e.g., 'employees', 'chinook', 'lego')
Returns:
True if backup was restored, False if no backup exists
"""
# Path to backup file
backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state"
backup_file = backup_dir / f"{category_name}.backup"
logger.debug(f"| Looking for backup at: {backup_file}")
logger.debug(f"| Backup exists: {backup_file.exists()}")
if not backup_file.exists():
logger.info(f"| ○ No backup file found: {backup_file}")
return False
logger.info(f"| Restoring {category_name} from backup...")
# Set up environment for pg_restore
env = os.environ.copy()
env["PGPASSWORD"] = "postgres"
try:
# Restore backup without schema filter (tables go to whatever schema they're in)
result = subprocess.run(
[
"pg_restore",
"-h", "localhost",
"-p", "5432",
"-U", "postgres",
"-d", "insforge",
"-v",
str(backup_file),
],
env=env,
capture_output=True,
text=True,
timeout=120, # 2 minute timeout
)
if result.returncode != 0 and "ERROR" in result.stderr:
logger.warning(f"| pg_restore had errors for {category_name}: {result.stderr}")
return False
logger.info(f"| ✓ {category_name} restored successfully")
return True
except subprocess.TimeoutExpired:
logger.error(f"| ✗ Restore timed out for {category_name}")
return False
except Exception as e:
logger.error(f"| ✗ Failed to restore {category_name}: {e}")
return False
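# Illustrative sketch only: pg_restore expects a custom-format archive, so a
# matching backup for a category (e.g. 'employees') could be produced along
# these lines. The "-n employees" schema filter is an assumption about how the
# category data is laid out, not something this manager requires.
#
#     import os
#     import subprocess
#
#     env = dict(os.environ, PGPASSWORD="postgres")
#     subprocess.run(
#         ["pg_dump", "-h", "localhost", "-p", "5432", "-U", "postgres",
#          "-d", "insforge", "-n", "employees", "-Fc",
#          "-f", "postgres_state/employees.backup"],
#         env=env, check=True,
#     )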
def get_service_config_for_agent(self) -> dict:
"""Get configuration for agent execution.
This configuration is passed to the agent/MCP server so it can
connect to the Insforge backend.
Returns:
Dictionary containing backend URL and API key
"""
config = {
"backend_url": self.backend_url,
"api_key": self.api_key,
}
# Include current task context if available
if self._current_task_context:
config["task_context"] = self._current_task_context
return config
def set_verification_environment(self, messages_path: str = None) -> None:
"""Set environment variables needed for verification scripts.
Args:
messages_path: Optional path to messages.json file for verification
"""
os.environ["INSFORGE_BACKEND_URL"] = self.backend_url
os.environ["INSFORGE_API_KEY"] = self.api_key
# Set PostgreSQL connection details for direct database verification
# (Insforge exposes its internal postgres database for verification)
os.environ["POSTGRES_HOST"] = "localhost"
os.environ["POSTGRES_PORT"] = "5432"
os.environ["POSTGRES_DATABASE"] = "insforge"
os.environ["POSTGRES_USERNAME"] = "postgres"
os.environ["POSTGRES_PASSWORD"] = "postgres"
if messages_path:
os.environ["MCP_MESSAGES"] = str(messages_path)
logger.debug("Verification environment variables set for Insforge (including direct postgres access)")
================================================
FILE: src/mcp_services/insforge/insforge_task_manager.py
================================================
"""
Insforge Task Manager for MCPMark
===================================
Manages Insforge task discovery, execution, and verification.
"""
import os
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class InsforgeTask(BaseTask):
"""Insforge-specific task with backend information."""
task_name: str = ""
backend_url: Optional[str] = None
api_key: Optional[str] = None
class InsforgeTaskManager(BaseTaskManager):
"""Manages Insforge tasks for MCPMark evaluation."""
def __init__(self, tasks_root: Path = None):
"""Initialize Insforge task manager.
Args:
tasks_root: Path to tasks directory
"""
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
super().__init__(
tasks_root,
mcp_service="insforge",
task_class=InsforgeTask,
task_organization="file", # Insforge uses file-based tasks
)
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> Optional[InsforgeTask]:
"""Instantiate an `InsforgeTask` from the dictionary returned by `_find_task_files`."""
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return InsforgeTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="insforge",
category_id=final_category_id,
task_id=task_id,
task_name=task_files_info["task_id"],
)
def _get_verification_command(self, task: InsforgeTask) -> List[str]:
"""Get verification command with Insforge backend info."""
cmd = [sys.executable, str(task.task_verification_path)]
return cmd
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
"""Run verification with Insforge environment."""
env = os.environ.copy()
# Pass Insforge connection info to verification script
if hasattr(task, "backend_url") and task.backend_url:
env["INSFORGE_BACKEND_URL"] = task.backend_url
if hasattr(task, "api_key") and task.api_key:
env["INSFORGE_API_KEY"] = task.api_key
return subprocess.run(
self._get_verification_command(task),
capture_output=True,
text=True,
timeout=300,
env=env,
)
def _format_task_instruction(self, base_instruction: str) -> str:
"""Add Insforge-specific instructions."""
return (
base_instruction
+ "\n\nNote: Use Insforge MCP tools to complete this task. The backend connection is already configured."
)
================================================
FILE: src/mcp_services/notion/__init__.py
================================================
"""
Notion-specific modules for MCPMark.
"""
from .notion_task_manager import NotionTaskManager, NotionTask
from .notion_state_manager import NotionStateManager
__all__ = ["NotionTaskManager", "NotionTask", "NotionStateManager"]
================================================
FILE: src/mcp_services/notion/notion_login_helper.py
================================================
"""
Notion Login Helper for MCPMark
=================================
This module provides a utility class and CLI script for logging into Notion
using Playwright. It saves the authenticated session state to a file,
which can be used for subsequent automated tasks.
"""
import argparse
from pathlib import Path
from typing import Optional
from playwright.sync_api import (
BrowserContext,
Page,
TimeoutError as PlaywrightTimeoutError,
sync_playwright,
)
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
# Initialize logger
logger = get_logger(__name__)
class NotionLoginHelper(BaseLoginHelper):
"""
Utility helper for logging into Notion using Playwright.
"""
SUPPORTED_BROWSERS = {"chromium", "firefox"}
def __init__(
self,
*,
url: Optional[str] = None,
headless: bool = True,
state_path: Optional[str | Path] = None,
browser: str = "firefox",
) -> None:
"""
Initializes the Notion login helper.
Args:
url: The Notion URL to open after launching the browser.
headless: Whether to run Playwright in headless mode.
state_path: The path to save the authenticated session state.
browser: The browser engine to use ('chromium' or 'firefox').
"""
super().__init__()
if browser not in self.SUPPORTED_BROWSERS:
raise ValueError(
f"Unsupported browser '{browser}'. Supported browsers are: {', '.join(self.SUPPORTED_BROWSERS)}"
)
self.url = url or "https://www.notion.so/login"
self.headless = headless
self.browser_name = browser
self.state_path = (
Path(state_path or Path.cwd() / "notion_state.json").expanduser().resolve()
)
self._browser_context: Optional[BrowserContext] = None
self._playwright = None
self._browser = None
def login(self) -> BrowserContext:
"""
Launches a browser, performs login, and saves the session state.
"""
if self.state_path.exists():
try:
self.state_path.unlink()
except OSError as e:
logger.warning("Unable to remove existing state file: %s", e)
if self._playwright is None:
self._playwright = sync_playwright().start()
browser_type = getattr(self._playwright, self.browser_name)
self._browser = browser_type.launch(headless=self.headless)
context = self._browser.new_context()
page = context.new_page()
logger.info("Navigating to Notion URL: %s", self.url)
page.goto(self.url, wait_until="load")
if self.headless:
self._handle_headless_login(context)
else:
logger.info(
"A browser window has been opened. Please complete the Notion login."
)
logger.info(
"After you see your workspace, return to this terminal and press ."
)
initial_url = page.url
input()
try:
page.wait_for_url(lambda u: u != initial_url, timeout=10_000)
except PlaywrightTimeoutError:
pass # It's okay if the URL doesn't change
try:
page.wait_for_load_state("domcontentloaded", timeout=5_000)
except PlaywrightTimeoutError:
pass
context.storage_state(path=str(self.state_path))
logger.info("✅ Login successful! Session state saved to %s", self.state_path)
self._browser_context = context
return context
def close(self) -> None:
"""Closes the underlying browser and Playwright instance."""
if self._browser_context:
try:
self._browser_context.close()
finally:
self._browser_context = None
if self._browser:
try:
self._browser.close()
finally:
self._browser = None
if self._playwright:
self._playwright.stop()
self._playwright = None
def _handle_headless_login(self, context: BrowserContext) -> None:
"""
Guides the user through the login process in headless mode.
"""
page: Page = context.pages[0]
login_url = "https://www.notion.so/login"
page.goto(login_url, wait_until="domcontentloaded")
email = input("Enter your Notion email address: ").strip()
try:
email_input = page.locator(
'input[placeholder="Enter your email address..."]'
)
email_input.wait_for(state="visible", timeout=120_000)
email_input.fill(email)
email_input.press("Enter")
except PlaywrightTimeoutError:
raise RuntimeError("Timed out waiting for the email input field.")
except Exception:
page.get_by_role("button", name="Continue", exact=True).click()
try:
code_input = page.locator('input[placeholder="Enter code"]')
code_input.wait_for(state="visible", timeout=120_000)
code = input("Enter the verification code from your email: ").strip()
code_input.fill(code)
code_input.press("Enter")
except PlaywrightTimeoutError:
raise RuntimeError("Timed out waiting for the verification code input.")
except Exception:
page.get_by_role("button", name="Continue", exact=True).click()
try:
page.wait_for_url(lambda url: url != login_url, timeout=180_000)
except PlaywrightTimeoutError:
logger.warning("Login redirect timed out, but proceeding to save state.")
if self.url and self.url != login_url:
page.goto(self.url, wait_until="domcontentloaded")
def __enter__(self) -> "NotionLoginHelper":
self.login()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
def main():
"""Main entry point for the Notion login CLI script."""
parser = argparse.ArgumentParser(
description="Authenticate to Notion and generate a session state file.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--headless",
action="store_true",
help="Run the login flow in headless mode (prompts for credentials).",
)
parser.add_argument(
"--browser",
default="firefox",
choices=["chromium", "firefox"],
help="The browser engine to use for Playwright.",
)
args = parser.parse_args()
helper = NotionLoginHelper(headless=args.headless, browser=args.browser)
with helper:
logger.info("Login process completed.")
if __name__ == "__main__":
main()
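# Illustrative usage (the module path below assumes this repository layout):
#
#     # Interactive login in a visible Firefox window; saves notion_state.json
#     python -m src.mcp_services.notion.notion_login_helper
#
#     # Headless login driven by email / verification-code prompts
#     python -m src.mcp_services.notion.notion_login_helper --headless --browser chromium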
================================================
FILE: src/mcp_services/notion/notion_state_manager.py
================================================
"""
Notion State Manager for MCPMark
=================================
This module handles the duplication and management of Notion initial-state
pages for consistent task evaluation using Playwright automation.
"""
import time
from pathlib import Path
from typing import Optional, Tuple, Dict, Any, Set
from notion_client import Client
from playwright.sync_api import (
Browser,
BrowserContext,
Page,
Playwright,
TimeoutError as PlaywrightTimeoutError,
sync_playwright,
)
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
from src.mcp_services.notion.notion_task_manager import NotionTask
import re
# Initialize logger
logger = get_logger(__name__)
# Pattern to match orphan pages with "(n)" suffix, e.g., "Title (1)", "Title (2)"
ORPHAN_PAGE_PATTERN = re.compile(r".+\s+\(\d+\)$")
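# For illustration: "Project Tracker (1)" and "Employees (12)" match this
# pattern, while "Project Tracker" and "Q4 (draft)" do not.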
# Selectors for Notion UI elements
PAGE_MENU_BUTTON_SELECTOR = '[data-testid="more-button"], div.notion-topbar-more-button, [aria-label="More"], button[aria-label="More"]'
DUPLICATE_MENU_ITEM_SELECTOR = 'text="Duplicate"'
DUPLICATE_WITH_CONTENT_SELECTOR = 'text="Duplicate with content"'
MOVE_TO_MENU_ITEM_SELECTOR = 'text="Move to"'
MOVE_TO_SEARCH_INPUT_SELECTOR = (
'input[placeholder*="Move page to"], textarea[placeholder*="Move page to"]'
)
class NotionStateManager(BaseStateManager):
"""
Manages the state of Notion initial states using Playwright and the Notion API.
"""
def __init__(
self,
source_notion_key: str,
eval_notion_key: str,
headless: bool = True,
browser: str = "firefox",
eval_parent_page_title: str = "MCPMark Eval Hub",
source_parent_page_title: str = "MCPMark Source Hub",
):
"""
Initializes the Notion state manager.
Args:
source_notion_key: The Notion API key for source workspace.
eval_notion_key: The Notion API key for evaluation workspace.
headless: Whether to run Playwright in headless mode.
browser: The browser engine to use ('chromium' or 'firefox').
eval_parent_page_title: Parent page title for evaluation workspace.
source_parent_page_title: Title of the source hub page that contains the initial-state templates.
"""
super().__init__(service_name="notion")
supported_browsers = {"chromium", "firefox"}
if browser not in supported_browsers:
raise ValueError(
f"Unsupported browser '{browser}'. Supported browsers are: {', '.join(supported_browsers)}"
)
self.browser_name = browser
# Initialize separate Notion clients with provided keys
if not source_notion_key or not eval_notion_key:
raise ValueError(
"Both source_notion_key and eval_notion_key must be provided to NotionStateManager."
)
self.source_notion_client = Client(auth=source_notion_key)
self.eval_notion_client = Client(auth=eval_notion_key)
self.headless = headless
self.state_file = Path("notion_state.json")
# Parent page under which duplicated pages should be moved for evaluation
self.eval_parent_page_title = eval_parent_page_title
# Source hub page that contains all initial-state templates
self.source_parent_page_title = source_parent_page_title
# Cache resolved parent page IDs to avoid repeated workspace-wide searches
self._eval_parent_page_id: Optional[str] = None
self._source_hub_page_id: Optional[str] = None
# Browser instance management for reuse within session
self._playwright: Optional[Playwright] = None
self._browser: Optional[Browser] = None
self._context: Optional[BrowserContext] = None
# Validate initialization
if not self.source_notion_client or not self.eval_notion_client:
raise ValueError(
"Both source_notion_key and eval_notion_key must be provided and valid"
)
if not self.state_file.exists():
raise FileNotFoundError(
"Authentication state 'notion_state.json' not found. Run the Notion login helper first."
)
logger.info("Notion state manager initialized successfully")
# =========================================================================
# Core Template Methods (Required by BaseStateManager)
# =========================================================================
def _cleanup_eval_hub_orphans(self) -> None:
"""Clean up all pages in MCPMark Eval Hub before creating new task state."""
try:
parent_page_id = self._ensure_eval_parent_page_id()
if not parent_page_id:
logger.debug(
"| ✗ Parent page '%s' not found in eval workspace, skipping cleanup",
self.eval_parent_page_title,
)
return
# Get all child pages and archive them
children = self.eval_notion_client.blocks.children.list(
block_id=parent_page_id
)
orphan_count = 0
for child in children.get("results", []):
if child.get("type") == "child_page":
try:
self.eval_notion_client.pages.update(
page_id=child["id"], archived=True
)
orphan_count += 1
logger.debug("| ✓ Archived orphan page: %s", child["id"])
except Exception as e:
logger.warning(
"| ✗ Failed to archive orphan page %s: %s", child["id"], e
)
if orphan_count > 0:
logger.info(
"| ✓ Cleaned up %d orphan page(s) from MCPMark Eval Hub", orphan_count
)
except Exception as e:
logger.warning("Orphan cleanup failed (non-critical, continuing): %s", e)
# Don't raise exception - allow execution to continue
def _cleanup_source_hub_orphans(self, exclude_page_ids: Optional[Set[str]] = None) -> int:
"""Clean up all orphan pages in source hub matching 'xxx (n)' pattern.
Args:
exclude_page_ids: Page IDs to exclude from cleanup (e.g., pages currently being operated on)
Returns:
Number of pages archived
"""
exclude_page_ids = exclude_page_ids or set()
source_hub_id = self._ensure_source_hub_page_id()
if not source_hub_id:
return 0
orphan_count = 0
next_cursor = None
try:
while True:
kwargs: Dict[str, Any] = {"block_id": source_hub_id}
if next_cursor:
kwargs["start_cursor"] = next_cursor
children = self.source_notion_client.blocks.children.list(**kwargs)
for child in children.get("results", []):
if child.get("type") != "child_page":
continue
child_id = child.get("id")
if child_id in exclude_page_ids:
continue
child_title = (child.get("child_page", {}) or {}).get("title", "").strip()
# Match "xxx (n)" pattern where n is any digit(s)
if ORPHAN_PAGE_PATTERN.match(child_title):
try:
self.source_notion_client.pages.update(
page_id=child_id, archived=True
)
orphan_count += 1
logger.info("| ✓ Archived source hub orphan: %s (%s)", child_title, child_id)
except Exception as e:
logger.warning("| ✗ Failed to archive orphan %s: %s", child_id, e)
if not children.get("has_more"):
break
next_cursor = children.get("next_cursor")
if orphan_count > 0:
logger.info("| ✓ Cleaned up %d orphan page(s) from source hub", orphan_count)
except Exception as e:
logger.warning("Source hub orphan cleanup failed (non-critical, continuing): %s", e)
return orphan_count
def _ensure_eval_parent_page_id(self) -> Optional[str]:
"""Resolve and cache the evaluation hub parent page ID."""
if self._eval_parent_page_id:
return self._eval_parent_page_id
try:
response = self.eval_notion_client.search(
query=self.eval_parent_page_title,
filter={"property": "object", "value": "page"},
)
for result in response.get("results", []):
props = result.get("properties", {})
title_prop = props.get("title", {}).get("title") or props.get(
"Name", {}
).get("title")
if not title_prop:
continue
title = "".join(t.get("plain_text", "") for t in title_prop).strip()
if title == self.eval_parent_page_title:
self._eval_parent_page_id = result.get("id")
break
if not self._eval_parent_page_id:
logger.debug(
"| ✗ Eval parent page '%s' not found via search",
self.eval_parent_page_title,
)
except Exception as e:
logger.error(
"| ✗ Failed to resolve eval parent page '%s': %s",
self.eval_parent_page_title,
e,
)
return self._eval_parent_page_id
def _ensure_source_hub_page_id(self) -> Optional[str]:
"""Resolve and cache the source hub parent page ID used for initial states."""
if self._source_hub_page_id:
return self._source_hub_page_id
try:
hub_search = self.source_notion_client.search(
query=self.source_parent_page_title,
filter={"property": "object", "value": "page"},
)
for result in hub_search.get("results", []):
props = result.get("properties", {})
title_prop = props.get("title", {}).get("title") or props.get(
"Name", {}
).get("title")
current_title = "".join(
t.get("plain_text", "") for t in (title_prop or [])
).strip()
if current_title == self.source_parent_page_title:
self._source_hub_page_id = result.get("id")
break
if not self._source_hub_page_id:
logger.error(
"| ✗ Source hub page '%s' not found.",
self.source_parent_page_title,
)
except Exception as e:
logger.error(
"| ✗ Failed to resolve source hub page '%s': %s",
self.source_parent_page_title,
e,
)
return self._source_hub_page_id
def _wait_for_database_ready(
self,
page_id: str,
max_retries: int = 10,
retry_delay: int = 2
) -> bool:
"""
Wait for the database backend to be ready by checking page accessibility.
Args:
page_id: The ID of the page to check
max_retries: Maximum number of retry attempts
retry_delay: Delay between retries in seconds
Returns:
True if the database is ready, False if timeout
"""
logger.info("| ○ Starting heartbeat detection for page %s", page_id)
for attempt in range(max_retries):
try:
# Try to retrieve the page from the evaluation workspace
result = self.eval_notion_client.pages.retrieve(page_id=page_id)
# Check if we got a valid response
if result and isinstance(result, dict):
# Additional check: try to get page properties
if "properties" in result:
logger.info(
"| ✓ Database backend is ready (attempt %d/%d)",
attempt + 1,
max_retries
)
return True
except Exception as e:
logger.debug(
"| ✗ Database not ready yet (attempt %d/%d): %s",
attempt + 1,
max_retries,
str(e)
)
# Wait before next retry
if attempt < max_retries - 1:
time.sleep(retry_delay)
logger.error(
"| ✗ Database backend failed to become ready after %d attempts",
max_retries
)
return False
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
"""Create initial state by duplicating Notion page."""
if not isinstance(task, NotionTask):
logger.error("Task must be NotionTask for Notion state manager")
return None
# Clean up any orphan pages in eval hub before creating new state
self._cleanup_eval_hub_orphans()
# Clean up orphan pages in source hub before duplication
self._cleanup_source_hub_orphans()
try:
initial_state_title = self._category_to_initial_state_title(task.category_id)
initial_state_info = self._find_initial_state_by_title(initial_state_title)
if not initial_state_info:
logger.error(
"| ✗ Initial state not found for category '%s' (title: '%s')",
task.category_id,
initial_state_title,
)
return None
_, initial_state_url = initial_state_info
duplicated_url, duplicated_id = self._duplicate_initial_state_for_task(
initial_state_url, task.category_id, task.name
)
# Wait for database backend to be ready
logger.info("| ○ Checking database backend accessibility for duplicated page...")
if not self._wait_for_database_ready(duplicated_id):
logger.error(
"| ✗ Database backend is not accessible after duplication for task %s",
task.name
)
# Clean up the duplicated page if database is not ready
try:
self.eval_notion_client.pages.update(
page_id=duplicated_id, archived=True
)
logger.info("| ✓ Cleaned up inaccessible duplicated page: %s", duplicated_id)
except Exception as cleanup_error:
logger.error("| ✗ Failed to clean up duplicated page: %s", cleanup_error)
raise RuntimeError(
f"| ✗ Database backend failed to become ready for duplicated page {duplicated_id}"
)
time.sleep(5) # allow the page to fully load
return InitialStateInfo(
state_id=duplicated_id,
state_url=duplicated_url,
metadata={
"original_url": initial_state_url,
"category": task.category_id,
"task_name": task.name,
},
)
except Exception as e:
logger.error(f"| ✗ Failed to create initial state for {task.name}: {e}")
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
"""Store initial state information in NotionTask object."""
if isinstance(task, NotionTask):
task.duplicated_initial_state_id = state_info.state_id
task.duplicated_initial_state_url = state_info.state_url
task.original_initial_state_url = state_info.metadata.get("original_url")
# Track the duplicated page for cleanup
self.track_resource("page", state_info.state_id, state_info.metadata)
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up initial state for a specific Notion task."""
if not isinstance(task, NotionTask):
return True # Nothing to clean up for non-Notion tasks
initial_state_id = task.duplicated_initial_state_id
if not initial_state_id:
logger.warning(
"| ✗ No duplicated initial state ID found for task %s, skipping cleanup.",
task.name,
)
return False
try:
# Archive the duplicated page
self.eval_notion_client.pages.update(
page_id=initial_state_id, archived=True
)
logger.info("| ✓ Archived page initial state: %s", initial_state_id)
# Remove from tracked resources to avoid duplicate cleanup
self.tracked_resources = [
r
for r in self.tracked_resources
if not (r["type"] == "page" and r["id"] == initial_state_id)
]
return True
except Exception as e:
logger.error("| ✗ Failed to archive initial state %s: %s", initial_state_id, e)
return False
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single Notion resource."""
if resource["type"] == "page":
try:
self.eval_notion_client.pages.update(
page_id=resource["id"], archived=True
)
logger.info(f"| ✓ Archived Notion page: {resource['id']}")
return True
except Exception as e:
logger.error(f"| ✗ Failed to archive Notion page {resource['id']}: {e}")
return False
logger.warning(f"| ? Unknown resource type for cleanup: {resource['type']}")
return False
# =========================================================================
# Notion API Operations
# =========================================================================
def _rename_initial_state_via_api(
self, initial_state_id: str, new_title: str
) -> None:
"""Renames a Notion page using the API."""
try:
self.eval_notion_client.pages.update(
page_id=initial_state_id,
properties={"title": {"title": [{"text": {"content": new_title}}]}},
)
except Exception as e:
logger.error("| ✗ Failed to rename page via API: %s", e)
# ------------------------------------------------------------------
# Playwright helpers
# ------------------------------------------------------------------
def _ensure_browser(self) -> Tuple[Browser, BrowserContext]:
"""Ensure browser instance is available, reusing existing or creating new.
Returns:
Tuple of (Browser, BrowserContext)
"""
if self._playwright is None:
self._playwright = sync_playwright().start()
if self._browser is None:
browser_type = getattr(self._playwright, self.browser_name)
self._browser = browser_type.launch(headless=self.headless)
if self._context is None:
self._context = self._browser.new_context(
storage_state=str(self.state_file),
locale="en-US",
)
return self._browser, self._context
def close(self) -> None:
"""Clean up browser resources. Should be called when session ends."""
if self._context:
try:
# Save storage state before closing
self._context.storage_state(path=str(self.state_file))
self._context.close()
except Exception:
pass
self._context = None
if self._browser:
try:
self._browser.close()
except Exception:
pass
self._browser = None
if self._playwright:
try:
self._playwright.stop()
except Exception:
pass
self._playwright = None
def _recover_duplicate_via_ui(
self,
page: Page,
original_title: str,
*,
timeout: int = 30_000,
) -> Optional[str]:
"""Recover duplicate page URL by navigating via UI when API-based recovery fails.
This method navigates to the source hub and locates the duplicate page
(e.g., "Title (1)") in the Notion sidebar, then clicks on it to obtain
the URL directly from the browser.
Args:
page: The Playwright page instance
original_title: The original page title (without suffix)
timeout: Timeout for UI operations in milliseconds
Returns:
The URL of the duplicate page if found, None otherwise
"""
try:
source_hub_id = self._ensure_source_hub_page_id()
if not source_hub_id:
logger.warning("| ✗ Cannot resolve source hub for UI-based recovery")
return None
# Build URL to navigate to source hub
# Format: https://www.notion.so/<page-id-without-dashes>
clean_hub_id = source_hub_id.replace("-", "")
source_hub_url = f"https://www.notion.so/{clean_hub_id}"
logger.info("| ○ Navigating to source hub for UI-based recovery...")
page.goto(source_hub_url, wait_until="domcontentloaded", timeout=60_000)
time.sleep(3) # Allow page to settle
# Look for page title with "(n)" suffix pattern in sidebar or page content
# The duplicate will be named "Original Title (1)" or similar
duplicate_pattern = re.compile(rf"^{re.escape(original_title)}\s*\(\d+\)$")
# Try to find the duplicate page in the page list/sidebar
# Notion uses different selectors for page links, try common patterns
page_link_selectors = [
f'a:has-text("{original_title} (1)")',
f'div[data-block-id]:has-text("{original_title} (1)")',
f'[role="treeitem"]:has-text("{original_title} (1)")',
]
for selector in page_link_selectors:
try:
locator = page.locator(selector).first
if locator.is_visible(timeout=5000):
logger.info("| ○ Found duplicate page in UI, clicking...")
locator.click()
page.wait_for_load_state("domcontentloaded", timeout=timeout)
time.sleep(3)
recovered_url = page.url
logger.info("| ✓ Recovered duplicate URL via UI: %s", recovered_url)
return recovered_url
except Exception:
continue
# If specific selectors didn't work, try a broader search
try:
# Look for any visible text matching the pattern and click it
all_text_elements = page.locator(f'text="{original_title} ("')
count = all_text_elements.count()
if count > 0:
for i in range(count):
element = all_text_elements.nth(i)
text_content = element.text_content() or ""
if duplicate_pattern.match(text_content.strip()):
logger.info("| ○ Found duplicate via text search, clicking...")
element.click()
page.wait_for_load_state("domcontentloaded", timeout=timeout)
time.sleep(3)
recovered_url = page.url
logger.info("| ✓ Recovered duplicate URL via UI text search: %s", recovered_url)
return recovered_url
except Exception as e:
logger.debug("| ✗ Broad text search failed: %s", e)
logger.warning("| ✗ Could not locate duplicate '%s (n)' in UI", original_title)
return None
except Exception as e:
logger.warning("| ✗ UI-based recovery failed: %s", e)
return None
# =========================================================================
# Playwright Automation Methods
# =========================================================================
def _move_current_page_to_env(
self, page: Page, *, wait_timeout: int = 60_000
) -> None:
"""Moves the currently open page into the designated evaluation parent page.
This operation is done via Playwright UI automation because the Notion API
does not yet expose a direct "move" endpoint for pages. It relies on the
following sequence:
1. Open the page action menu (same selector as duplication).
2. Choose the "Move to" menu item.
3. In the search field that appears (placeholder starts with
"Move page to"), type the target parent page title.
4. Click the matching search result to complete the move.
"""
logger.info(
"| ○ Moving duplicated page to evaluation parent '%s'...",
self.eval_parent_page_title,
)
try:
# Step 1: Open the page menu
page.wait_for_selector(
PAGE_MENU_BUTTON_SELECTOR, state="visible", timeout=30_000
)
page.click(PAGE_MENU_BUTTON_SELECTOR)
# Step 2: Select "Move to"
page.hover(MOVE_TO_MENU_ITEM_SELECTOR)
page.click(MOVE_TO_MENU_ITEM_SELECTOR)
# Step 3: Fill the destination title
page.wait_for_selector(
MOVE_TO_SEARCH_INPUT_SELECTOR, state="visible", timeout=15_000
)
# Ensure focus then type the destination title – using type() triggers
# key events Notion relies on for search filtering.
search_input = page.locator(MOVE_TO_SEARCH_INPUT_SELECTOR).first
search_input.click()
search_input.fill("") # Clear any residual text (safety)
search_input.type(self.eval_parent_page_title, delay=50)
# Step 4: Wait for the search result matching the page title, then click it
# Selector for the menu item row; ensure we click the outer container, not a nested element
result_selector = (
f'div[role="menuitem"]:has-text("{self.eval_parent_page_title}")'
)
page.wait_for_selector(
result_selector, state="visible", timeout=wait_timeout
)
page.locator(result_selector).first.click(force=True)
# Wait for the dialog to disappear – indicates move finished
page.wait_for_selector(
MOVE_TO_SEARCH_INPUT_SELECTOR, state="detached", timeout=wait_timeout
)
# Give Notion a brief moment to process the move
time.sleep(3)
except PlaywrightTimeoutError as e:
logger.error(
"| ✗ Playwright timed out while moving page to evaluation parent – move may have failed."
)
raise RuntimeError("Playwright timeout during move-to operation") from e
except Exception as exc:
logger.error("| ✗ Unexpected error during move-to operation: %s", exc)
# Propagate the error to allow retry logic at higher level if necessary
raise
def _category_to_initial_state_title(self, category: str) -> str:
"""Converts a category name to a capitalized initial state title."""
return " ".join(word.capitalize() for word in category.split("_"))
def _extract_initial_state_id_from_url(self, url: str) -> str:
"""Extracts the initial state ID from a Notion URL."""
slug = url.split("?")[0].split("#")[0].rstrip("/").split("/")[-1]
compact = "".join(c for c in slug if c.isalnum())
if len(compact) < 32:
raise ValueError(f"Could not parse initial state ID from URL: {url}")
compact = compact[-32:]
return f"{compact[:8]}-{compact[8:12]}-{compact[12:16]}-{compact[16:20]}-{compact[20:]}"
# =========================================================================
# URL and State Utilities
# =========================================================================
def _get_slug_base(self, url: str) -> str:
"""Returns the slug part without its trailing 32-char ID (hyphen separated)."""
slug = url.split("?", 1)[0].split("#", 1)[0].rstrip("/").split("/")[-1]
match = re.match(r"^(.*)-([0-9a-fA-F]{32})$", slug)
if match:
return match.group(1)
return slug
def _is_valid_duplicate_url(self, original_url: str, duplicated_url: str) -> bool:
"""Checks whether duplicated_url looks like a Notion duplicate (original slug + '-N')."""
orig_base = self._get_slug_base(original_url)
dup_base = self._get_slug_base(duplicated_url)
if not dup_base.startswith(orig_base + "-"):
return False
suffix = dup_base[len(orig_base) + 1 :]
return suffix.isdigit()
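# For illustration (hypothetical slugs): with an original slug base of
# "My-Page", a duplicate slug base of "My-Page-1" passes this check
# (suffix "1" is numeric), while "Other-Page-1" does not.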
def _find_initial_state_by_title(self, title: str) -> Optional[Tuple[str, str]]:
"""Find a child page under the source hub by exact title.
Strategy:
- Locate the source hub page ("MCPBench Source Hub") via search to get its ID.
- List its first-level children via `blocks.children.list`.
- Find a `child_page` whose title exactly matches `title`.
- Return the page ID and URL (retrieved via `pages.retrieve`).
"""
try:
# 1) Resolve the source hub page once and reuse its ID
source_hub_id = self._ensure_source_hub_page_id()
if not source_hub_id:
return None
# 2) List first-level children of the hub page and find exact title match
matched_child_id: Optional[str] = None
next_cursor = None
while True:
kwargs = {"block_id": source_hub_id}
if next_cursor:
kwargs["start_cursor"] = next_cursor
children = self.source_notion_client.blocks.children.list(**kwargs)
for child in children.get("results", []):
if child.get("type") != "child_page":
continue # Only consider child pages
child_title = (child.get("child_page", {}) or {}).get("title", "").strip()
if child_title == title:
matched_child_id = child.get("id")
break
if matched_child_id or not children.get("has_more"):
break
next_cursor = children.get("next_cursor")
if not matched_child_id:
logger.debug("| ✗ No child page titled '%s' under '%s'", title, self.source_parent_page_title)
return None
# 3) Retrieve the page to get its canonical URL
try:
page_obj = self.source_notion_client.pages.retrieve(page_id=matched_child_id)
page_url = page_obj.get("url")
except Exception as e:
logger.warning("| ✗ Failed to retrieve page URL for '%s' (%s): %s", title, matched_child_id, e)
page_url = None
if not page_url:
# Fall back to returning just the ID if URL couldn't be retrieved
logger.debug("| ○ Returning page ID without URL for '%s'", title)
return matched_child_id, ""
return matched_child_id, page_url
except Exception as e:
logger.error("| ✗ Error locating initial state '%s' via children listing: %s", title, e)
return None
# =========================================================================
# Duplication and State Management
# =========================================================================
# NOTE: Initial state type detection logic has been removed because all initial states are pages.
def _duplicate_current_initial_state(
self,
page: Page,
new_title: Optional[str] = None,
*,
original_initial_state_id: str,
original_initial_state_title: str,
wait_timeout: int = 180_000,
) -> str:
"""Duplicates the currently open Notion initial state using Playwright."""
try:
logger.info("| ○ Opening page menu...")
page.wait_for_selector(
PAGE_MENU_BUTTON_SELECTOR, state="visible", timeout=30_000
)
page.click(PAGE_MENU_BUTTON_SELECTOR)
logger.info("| ○ Clicking 'Duplicate'...")
page.hover(DUPLICATE_MENU_ITEM_SELECTOR)
page.click(DUPLICATE_MENU_ITEM_SELECTOR)
original_url = page.url
logger.info(
"| ○ Waiting for duplicated initial state to load (up to %.1f s)...",
wait_timeout / 1000,
)
page.wait_for_url(lambda url: url != original_url, timeout=wait_timeout)
# wait for the page to fully load
time.sleep(5)
duplicated_url = page.url
# Validate that the resulting URL is a genuine duplicate of the original template.
if not self._is_valid_duplicate_url(original_url, duplicated_url):
# Sometimes duplication succeeds but UI navigates to parent instead of the new page.
# In that case, try to find the most recently created page named exactly "<original title> (1)".
logger.warning(
"| ✗ Duplicate URL pattern mismatch. Attempting recovery by searching for latest '%s (1)' page...",
original_initial_state_title,
)
target_title = f"{original_initial_state_title} (1)"
try:
# Wait 5 seconds before the first search to allow Notion to index the new page
time.sleep(5)
attempts = 3
source_hub_id = self._ensure_source_hub_page_id()
if not source_hub_id:
logger.error(
"| ✗ Cannot resolve source hub ID while locating '%s' duplicate.",
target_title,
)
else:
for retry_idx in range(attempts):
candidates = []
next_cursor = None
while True:
kwargs: Dict[str, Any] = {"block_id": source_hub_id}
if next_cursor:
kwargs["start_cursor"] = next_cursor
children = self.source_notion_client.blocks.children.list(**kwargs)
for child in children.get("results", []):
if child.get("type") != "child_page":
continue
child_id = child.get("id")
if child_id == original_initial_state_id:
continue
child_title = (
(child.get("child_page", {}) or {})
.get("title", "")
.strip()
)
if child_title != target_title:
continue
created_time = child.get("created_time") or child.get(
"last_edited_time"
)
candidates.append((created_time or "", child_id))
if not children.get("has_more"):
break
next_cursor = children.get("next_cursor")
if candidates:
latest_child_id = max(candidates, key=lambda x: x[0])[1]
fallback_url = None
try:
page_obj = self.source_notion_client.pages.retrieve(
page_id=latest_child_id
)
fallback_url = page_obj.get("url")
except Exception as retrieve_error:
logger.warning(
"| ✗ Failed to resolve URL for duplicate '%s': %s",
latest_child_id,
retrieve_error,
)
if fallback_url:
logger.info(
"| ○ Navigating directly to latest '%s' duplicate via children list...",
target_title,
)
page.goto(fallback_url, wait_until="domcontentloaded", timeout=120_000)
time.sleep(5)
duplicated_url = page.url
break
if retry_idx < attempts - 1:
logger.debug(
"| ○ '%s' not visible yet via children listing. Waiting 5s before retry %d/%d...",
target_title,
retry_idx + 1,
attempts - 1,
)
time.sleep(5)
# Re-validate after attempted recovery
if not self._is_valid_duplicate_url(original_url, duplicated_url):
# API-based recovery failed, try UI-based recovery as last resort
logger.warning(
"| ✗ API-based recovery failed. Trying UI-based recovery..."
)
ui_recovered_url = self._recover_duplicate_via_ui(
page,
original_initial_state_title,
timeout=wait_timeout,
)
if ui_recovered_url and self._is_valid_duplicate_url(original_url, ui_recovered_url):
duplicated_url = ui_recovered_url
logger.info("| ✓ UI-based recovery successful")
else:
logger.error(
"| ✗ Could not locate a valid '%s' duplicate after all recovery attempts.\n| Original: %s\n| Observed: %s",
target_title,
original_url,
duplicated_url,
)
# Attempt to clean up stray duplicate before propagating error.
self._cleanup_orphan_duplicate(
original_initial_state_id, original_initial_state_title
)
raise RuntimeError(
"Duplicate URL pattern mismatch – duplication likely failed"
)
except Exception as search_exc:
logger.error(
"| ✗ Failed during recovery search for '%s': %s",
target_title,
search_exc,
)
# Attempt to clean up stray duplicate before propagating error.
self._cleanup_orphan_duplicate(
original_initial_state_id, original_initial_state_title
)
raise RuntimeError(
"Duplicate URL pattern mismatch – duplication likely failed"
) from search_exc
duplicated_initial_state_id = self._extract_initial_state_id_from_url(
duplicated_url
)
# Always move to evaluation parent
self._move_current_page_to_env(page, wait_timeout=wait_timeout)
# Rename if new title is provided
if new_title:
self._rename_initial_state_via_api(
duplicated_initial_state_id, new_title
)
# verify whether the page is moved to the evaluation parent page
try:
result = self.eval_notion_client.pages.retrieve(
page_id=duplicated_initial_state_id
)
if not result or not isinstance(result, dict):
logger.error(
"| ✗ Playwright move to error: Notion API did not return a valid page dict after move."
)
raise RuntimeError(
"Playwright move to error: Notion API did not return a valid page dict after move."
)
logger.info(
"| ✓ Page moved to '%s' successfully.", self.eval_parent_page_title
)
except Exception as move_exc:
logger.error(f"Playwright move to error: {move_exc}")
raise RuntimeError(
"Playwright move to error: Notion client failed to retrieve page after move."
) from move_exc
return duplicated_initial_state_id
except PlaywrightTimeoutError as e:
logger.error("Playwright timed out while duplicating initial state.")
raise RuntimeError("Playwright timeout during duplication") from e
# =========================================================================
# Cleanup and Maintenance
# =========================================================================
def _cleanup_orphan_duplicate(
self,
original_initial_state_id: str,
initial_state_title: str,
) -> bool:
"""Finds and archives a stray duplicate ("orphan") that matches pattern 'Title (n)'.
Returns True if at least one orphan duplicate was archived.
"""
try:
source_hub_id = self._ensure_source_hub_page_id()
if not source_hub_id:
logger.error(
"| ✗ Cannot resolve source hub while cleaning up duplicates for '%s'",
initial_state_title,
)
return False
# Match any numbered duplicate "Title (n)" where n is any digit(s)
title_regex = re.compile(rf"^{re.escape(initial_state_title)}\s*\(\d+\)$")
archived_any = False
next_cursor = None
while True:
kwargs: Dict[str, Any] = {"block_id": source_hub_id}
if next_cursor:
kwargs["start_cursor"] = next_cursor
children = self.source_notion_client.blocks.children.list(**kwargs)
for child in children.get("results", []):
if child.get("type") != "child_page":
continue
dup_id = child.get("id")
if dup_id == original_initial_state_id:
continue
title_plain = (
(child.get("child_page", {}) or {}).get("title", "")
).strip()
if not title_regex.match(title_plain):
continue # not a numbered duplicate
try:
self.source_notion_client.pages.update(
page_id=dup_id, archived=True
)
logger.info("| ✓ Archived orphan duplicate (%s): %s", "page", dup_id)
archived_any = True
except Exception as exc:
logger.warning("| ✗ Failed to archive orphan page %s: %s", dup_id, exc)
if not children.get("has_more"):
break
next_cursor = children.get("next_cursor")
return archived_any
except Exception as exc:
logger.warning(
"Error while attempting to cleanup orphan duplicate: %s", exc
)
return False
def _duplicate_initial_state_for_task(
self,
initial_state_url: str,
category: str,
task_name: str,
*,
max_retries: int = 2,
initial_wait_ms: int = 180_000,
) -> Tuple[str, str]:
"""Duplicates an initial state for a task, with retries for reliability."""
if not self.state_file.exists():
raise FileNotFoundError(
"Authentication state 'notion_state.json' not found. "
"Run the Notion login helper first."
)
last_exc = None
for attempt in range(max_retries + 1):
wait_timeout = initial_wait_ms * (attempt + 1)
page = None
try:
# Reuse browser instance within session
_, context = self._ensure_browser()
page = context.new_page()
logger.info("| ○ Navigating to initial state for %s...", category)
# Start timing from the moment we begin navigating to the initial state page.
start_time = time.time()
page.goto(initial_state_url, wait_until="domcontentloaded", timeout=120_000)
context.storage_state(path=str(self.state_file))
initial_state_id = self._extract_initial_state_id_from_url(
initial_state_url
)
initial_state_title = self._category_to_initial_state_title(
category
)
duplicated_id = self._duplicate_current_initial_state(
page,
new_title=initial_state_title, # Use original initial state name without (1) suffix
original_initial_state_id=initial_state_id,
original_initial_state_title=initial_state_title,
wait_timeout=wait_timeout,
)
duplicated_url = page.url
# Validate URL pattern again at this higher level (should already be validated inside).
context.storage_state(path=str(self.state_file))
# Log how long the whole duplication (navigate → duplicate) took.
elapsed = time.time() - start_time
logger.info(
"| ✓ Initial state duplicated successfully in %.2f seconds (task: %s).",
elapsed,
task_name,
)
return duplicated_url, duplicated_id
except Exception as e:
# No additional cleanup here; it is handled inside _duplicate_current_initial_state.
last_exc = e
if attempt < max_retries:
logger.warning(
"| ✗ Duplication attempt %d failed: %s. Retrying...",
attempt + 1,
e,
)
time.sleep(120 * attempt + 120)
finally:
# Close the page to prevent accumulation within reused context
if page:
try:
page.close()
except Exception:
pass
raise RuntimeError(
f"Initial state duplication failed for task '{task_name}' after {max_retries + 1} attempts: {last_exc}"
)
def get_service_config_for_agent(self) -> dict:
"""
Get service-specific configuration for agent execution.
Returns:
Dictionary containing configuration needed by the agent/MCP server
"""
from src.config.config_schema import ConfigRegistry
# Get the eval_api_key from config registry
config = ConfigRegistry.get_config("notion").get_all()
service_config = {}
if "eval_api_key" in config:
service_config["notion_key"] = config["eval_api_key"]
return service_config
================================================
FILE: src/mcp_services/notion/notion_task_manager.py
================================================
"""
Notion Task Manager for MCPMark Evaluation Pipeline
====================================================
This module provides utilities for discovering, filtering, and managing
evaluation tasks within the MCPMark project structure for Notion service.
The task manager is responsible for:
- Task discovery and filtering
- Task verification and result processing
- Task-specific logic (NOT LLM execution)
"""
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class NotionTask(BaseTask):
"""Represents a single evaluation task for Notion service."""
# Additional Notion-specific fields
# A human-readable slug for the task directory (e.g. "employee_onboarding")
task_name: str = ""
original_initial_state_url: Optional[str] = None
duplicated_initial_state_url: Optional[str] = None
duplicated_initial_state_id: Optional[str] = None
def __post_init__(self):
# Ensure base class fields are set if not provided
if (
not hasattr(self, "task_instruction_path")
or self.task_instruction_path is None
):
self.task_instruction_path = self.description_path
if (
not hasattr(self, "task_verification_path")
or self.task_verification_path is None
):
self.task_verification_path = self.verify_path
@property
def description_path(self) -> Path:
"""Alias for task_instruction_path."""
return self.task_instruction_path
@property
def verify_path(self) -> Path:
"""Alias for task_verification_path."""
return self.task_verification_path
def get_description(self) -> str:
"""Read and return the task description."""
if self.description_path.exists():
return self.description_path.read_text(encoding="utf-8")
return ""
class NotionTaskManager(BaseTaskManager):
"""Manages task discovery, filtering, and verification for Notion-based MCPMark evaluation."""
def __init__(self, tasks_root: Path = None, task_suite: str = "standard"):
"""Initialize with the tasks directory path.
Args:
tasks_root: Path to the tasks directory
task_suite: Logical task suite (e.g., 'standard', 'easy')
"""
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
# Call parent constructor
super().__init__(tasks_root, mcp_service="notion", task_suite=task_suite)
# =========================================================================
# Service-specific implementations for template methods
# =========================================================================
# No custom task discovery methods needed; relying entirely on BaseTaskManager defaults.
def _get_service_directory_name(self) -> str:
"""Return the service directory name for Notion."""
return "notion"
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> Optional[NotionTask]:
"""Instantiate a `NotionTask` from the dictionary returned by `_find_task_files`."""
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return NotionTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="notion",
category_id=final_category_id,
task_id=task_id,
task_name=task_files_info["task_id"],
)
def _get_verification_command(self, task: NotionTask) -> List[str]:
"""Get the verification command for Notion tasks.
Notion verification requires the duplicated initial state ID.
"""
return [
sys.executable,
str(task.task_verification_path),
task.duplicated_initial_state_id or "",
]
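# Illustrative sketch only: a Notion task's verify.py receives the duplicated
# initial-state page ID as its first CLI argument and presumably reports the
# outcome through its exit code; how it queries Notion (e.g. which API key it
# reads) is task-specific and not assumed here.
#
#     import sys
#
#     page_id = sys.argv[1] if len(sys.argv) > 1 else ""
#     if not page_id:
#         print("No duplicated initial-state ID supplied", file=sys.stderr)
#         sys.exit(1)
#     # ... inspect the duplicated page here ...
#     sys.exit(0)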
================================================
FILE: src/mcp_services/playwright/__init__.py
================================================
#!/usr/bin/env python3
"""
Playwright MCP Service for MCPMark
==================================
This package provides Playwright MCP integration for web automation tasks.
"""
================================================
FILE: src/mcp_services/playwright/playwright_login_helper.py
================================================
"""
Playwright Login Helper for MCPMark
====================================
This module provides browser session management and authentication utilities
for Playwright-based web automation tasks. Handles browser context setup,
session persistence, and state management.
"""
from pathlib import Path
from typing import Optional
from playwright.sync_api import (
BrowserContext,
sync_playwright,
)
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class PlaywrightLoginHelper(BaseLoginHelper):
"""
Login helper for Playwright web automation tasks.
Manages browser contexts, session persistence, and authentication state
for web automation scenarios.
"""
SUPPORTED_BROWSERS = {"chromium", "firefox"}
def __init__(
self,
*,
browser: str = "chromium",
headless: bool = True,
state_path: Optional[str | Path] = None,
) -> None:
"""
Initialize the Playwright login helper.
Args:
browser: Browser engine to use ('chromium' or 'firefox')
headless: Whether to run browser in headless mode
state_path: Path to save browser session state
"""
super().__init__()
if browser not in self.SUPPORTED_BROWSERS:
raise ValueError(
f"Unsupported browser '{browser}'. Supported: {', '.join(self.SUPPORTED_BROWSERS)}"
)
self.browser_name = browser
self.headless = headless
self.state_path = (
Path(state_path or Path.cwd() / "playwright_state.json")
.expanduser()
.resolve()
)
# Browser management
self._playwright = None
self._browser = None
self._browser_context: Optional[BrowserContext] = None
logger.info(f"Initialized PlaywrightLoginHelper with {browser} browser")
def login(self, **kwargs) -> bool:
"""
Set up browser context and session state.
For most Playwright tasks, this creates a clean browser context
that can be used for web automation. More complex authentication
can be handled in specific implementations.
Returns:
bool: True if browser setup successful
"""
try:
# Clean up any existing browser instances
self.close()
# Start Playwright
self._playwright = sync_playwright().start()
browser_type = getattr(self._playwright, self.browser_name)
self._browser = browser_type.launch(headless=self.headless)
# Create browser context
context_options = {}
# Load existing state if available
if self.state_path.exists():
try:
context_options["storage_state"] = str(self.state_path)
logger.info(f"Loaded browser state from {self.state_path}")
except Exception as e:
logger.warning(f"Failed to load browser state: {e}")
self._browser_context = self._browser.new_context(**context_options)
# Save current state
self._save_browser_state()
logger.info("✅ Browser context setup successful")
return True
except Exception as e:
logger.error(f"Browser setup failed: {e}")
self.close()
return False
def get_browser_context(self) -> Optional[BrowserContext]:
"""
Get the current browser context.
Returns:
BrowserContext or None if not initialized
"""
return self._browser_context
def is_authenticated(self) -> bool:
"""
Check if browser context is ready for use.
Returns:
bool: True if browser context is available
"""
return self._browser_context is not None
def get_credentials(self) -> dict:
"""
Get browser configuration for MCP integration.
Returns:
dict: Browser configuration parameters
"""
return {
"browser": self.browser_name,
"headless": self.headless,
"state_path": str(self.state_path),
}
def _save_browser_state(self) -> None:
"""Save current browser state to file."""
if self._browser_context:
try:
self._browser_context.storage_state(path=str(self.state_path))
logger.debug(f"Browser state saved to {self.state_path}")
except Exception as e:
logger.warning(f"Failed to save browser state: {e}")
def close(self) -> None:
"""Clean up browser resources."""
if self._browser_context:
try:
# Save state before closing
self._save_browser_state()
self._browser_context.close()
except Exception as e:
logger.warning(f"Error closing browser context: {e}")
finally:
self._browser_context = None
if self._browser:
try:
self._browser.close()
except Exception as e:
logger.warning(f"Error closing browser: {e}")
finally:
self._browser = None
if self._playwright:
try:
self._playwright.stop()
except Exception as e:
logger.warning(f"Error stopping Playwright: {e}")
finally:
self._playwright = None
================================================
FILE: src/mcp_services/playwright/playwright_state_manager.py
================================================
"""
Playwright State Manager for MCPMark
======================================
This module manages browser contexts and test environments for Playwright-based
web automation tasks. Handles browser isolation, test page setup, and cleanup.
"""
import time
from pathlib import Path
from typing import Optional, Dict, Any, List
from playwright.sync_api import (
BrowserContext,
Page,
TimeoutError as PlaywrightTimeoutError,
)
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
logger = get_logger(__name__)
class PlaywrightStateManager(BaseStateManager):
"""
Manages browser state and test environments for Playwright tasks.
Provides browser context isolation, test page setup, and resource cleanup
for web automation evaluation.
"""
def __init__(
self,
browser: str = "chromium",
headless: bool = True,
state_path: Optional[Path] = None,
network_origins: str = "*",
user_profile: str = "isolated",
viewport_width: int = 1280,
viewport_height: int = 720,
):
"""
Initialize Playwright state manager.
Args:
browser: Browser engine to use ('chromium' or 'firefox')
headless: Whether to run browser in headless mode
state_path: Path to browser state file
network_origins: Allowed network origins (comma-separated or *)
user_profile: User profile type (isolated or persistent)
viewport_width: Browser viewport width
viewport_height: Browser viewport height
"""
super().__init__(service_name="playwright")
self.browser_name = browser
self.headless = headless
self.state_path = state_path or Path.cwd() / "playwright_state.json"
self.network_origins = network_origins
self.user_profile = user_profile
self.viewport_width = viewport_width
self.viewport_height = viewport_height
# Browser management
self._playwright = None
self._browser = None
self._current_context: Optional[BrowserContext] = None
# Task-specific tracking
self._current_task_pages: List[Page] = []
# Test environment URLs for different task categories
self.test_environments = {
"element_extraction": "https://mcp-eval-website.vercel.app/extraction",
"form_interaction": "https://mcp-eval-website.vercel.app/forms/",
"web_navigation": "https://mcp-eval-website.vercel.app/navigation",
"authentication": "https://mcp-eval-website.vercel.app/auth/turnstile",
}
logger.info("Playwright state manager initialized")
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
"""
Create a lightweight stub initial state for task execution.
No browser is launched here; the Playwright MCP client manages the
browser itself. A synthetic context id is tracked so cleanup stays
symmetrical, and the canonical test URL (if any) is exposed.
Args:
task: Task for which to create the initial state
Returns:
InitialStateInfo with the stub context id and test URL
"""
try:
logger.info(
"| Skipping Playwright browser launch – no initial browser state "
"needed for task: %s",
task.name,
)
# Generate a lightweight identifier to allow resource tracking even
# though no real browser context is created.
context_id = f"noop_{task.category_id}_{task.task_id}_{int(time.time())}"
# We still expose the canonical test URL (if any) because some
# consumers add it to the task metadata.
test_url = self.test_environments.get(task.category_id)
# Record a dummy resource so cleanup logic remains symmetrical.
self.track_resource(
"browser_context",
context_id,
{
"task_name": task.name,
"task_category": task.category_id,
"test_url": test_url,
},
)
return InitialStateInfo(
state_id=context_id,
state_url=test_url,
metadata={
"browser": self.browser_name,
"headless": self.headless,
"test_url": test_url,
"task_category": task.category_id,
},
)
except Exception as e:
logger.error(f"Failed to create stub initial state for {task.name}: {e}")
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
"""Store browser context information in task object."""
if hasattr(task, "__dict__"):
task.browser_context_id = state_info.state_id
task.test_url = state_info.state_url
task.browser_config = state_info.metadata
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up browser context for specific task."""
try:
success = True
# Close any open pages
if self._current_task_pages:
for page in self._current_task_pages:
try:
page.close()
except Exception as e:
logger.warning(f"Failed to close page: {e}")
success = False
self._current_task_pages.clear()
# Close browser context
if self._current_context:
try:
self._current_context.close()
logger.info("Closed browser context")
except Exception as e:
logger.error(f"Failed to close browser context: {e}")
success = False
finally:
self._current_context = None
return success
except Exception as e:
logger.error(f"Error during browser cleanup for {task.name}: {e}")
return False
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single browser resource."""
try:
if resource["type"] == "browser_context":
# Context cleanup is handled in _cleanup_task_initial_state
logger.debug(f"Browser context {resource['id']} marked for cleanup")
return True
logger.warning(f"Unknown resource type for cleanup: {resource['type']}")
return False
except Exception as e:
logger.error(f"Failed to cleanup resource {resource}: {e}")
return False
def _get_context_options(self, task: BaseTask) -> Dict[str, Any]:
"""Get browser context options based on task requirements."""
options = {
"viewport": {"width": self.viewport_width, "height": self.viewport_height}
}
# Load browser state if available
if self.state_path.exists():
try:
options["storage_state"] = str(self.state_path)
except Exception as e:
logger.warning(f"Failed to load browser state: {e}")
# Task-specific context options
if task.category_id == "form_interaction":
# Enable form interactions
options["permissions"] = ["geolocation"]
elif task.category_id == "web_navigation":
# Allow navigation between pages
options["accept_downloads"] = False
return options
def _setup_test_environment(self, task: BaseTask) -> Optional[str]:
"""Set up test environment for task category."""
try:
test_url = self.test_environments.get(task.category_id)
if not test_url:
logger.warning(
f"No test environment defined for category: {task.category_id}"
)
return None
# Create a page and navigate to test environment
if self._current_context:
page = self._current_context.new_page()
# Navigate to test URL to ensure it's accessible
page.goto(test_url, wait_until="networkidle", timeout=30000)
logger.info(f"Test environment ready: {test_url}")
# Track the page for cleanup
self._current_task_pages.append(page)
# Verify page loaded correctly
title = page.title()
if title:
logger.debug(f"Page loaded with title: {title}")
return test_url
except PlaywrightTimeoutError:
logger.error(f"Timeout loading test environment: {test_url}")
except Exception as e:
logger.error(f"Failed to setup test environment: {e}")
return None
def get_current_context(self) -> Optional[BrowserContext]:
"""Get the current browser context."""
return self._current_context
def get_test_page(self) -> Optional[Page]:
"""Get a page for testing (creates new one if needed)."""
if self._current_context:
try:
page = self._current_context.new_page()
self._current_task_pages.append(page)
return page
except Exception as e:
logger.error(f"Failed to create test page: {e}")
return None
def navigate_to_test_url(self, task: BaseTask) -> Optional[Page]:
"""Navigate to the test URL for a specific task."""
test_url = self.test_environments.get(task.category_id)
if not test_url:
logger.error(f"No test URL defined for category: {task.category_id}")
return None
page = self.get_test_page()
if page:
try:
page.goto(test_url, wait_until="networkidle", timeout=30000)
logger.info(f"Navigated to test URL: {test_url}")
return page
except Exception as e:
logger.error(f"Failed to navigate to {test_url}: {e}")
return None
def get_service_config_for_agent(self) -> dict:
"""
Get service-specific configuration for agent execution.
Returns:
Dictionary containing browser configuration for MCP server
"""
config = {
"browser": self.browser_name,
"headless": self.headless,
}
# Add browser state file if it exists
if self.state_path.exists():
config["browser_state"] = str(self.state_path)
# Add test environment URLs
config["test_environments"] = self.test_environments
return config
def close_all(self) -> None:
"""Close all browser resources."""
try:
# Close all pages
for page in self._current_task_pages:
try:
page.close()
except Exception:
pass
self._current_task_pages.clear()
# Close context
if self._current_context:
self._current_context.close()
self._current_context = None
# Close browser
if self._browser:
self._browser.close()
self._browser = None
# Stop Playwright
if self._playwright:
self._playwright.stop()
self._playwright = None
logger.info("All browser resources closed")
except Exception as e:
logger.error(f"Error closing browser resources: {e}")
def set_verification_environment(self, messages_path: str = None) -> None:
"""
Set Playwright-specific environment variables for verification scripts.
Args:
messages_path: Optional path to messages.json file for verification
"""
import os
# Set common MCP_MESSAGES if provided
if messages_path:
os.environ["MCP_MESSAGES"] = str(messages_path)
# Also set PLAYWRIGHT_WORK_DIR to the directory containing messages.json
work_dir = str(Path(messages_path).parent)
os.environ["PLAYWRIGHT_WORK_DIR"] = work_dir
logger.info(f"| Set PLAYWRIGHT_WORK_DIR to: {work_dir}")
logger.info(f"| Set MCP_MESSAGES to: {messages_path}")
def __del__(self):
"""Ensure cleanup on deletion."""
self.close_all()
================================================
FILE: src/mcp_services/playwright/playwright_task_manager.py
================================================
"""
Playwright Task Manager for MCPMark
====================================
Simple task manager for Playwright MCP tasks.
Follows anti-over-engineering principles: keep it simple, do what's needed.
"""
import sys
import os
import subprocess
from pathlib import Path
from typing import List, Dict, Any
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
class PlaywrightTask(BaseTask):
"""Playwright-specific task that uses directory name as task name."""
class PlaywrightTaskManager(BaseTaskManager):
"""Simple task manager for Playwright MCP tasks."""
def __init__(self, tasks_root: Path = None, task_suite: str = "standard"):
"""Initialize with tasks directory."""
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
super().__init__(
tasks_root,
mcp_service="playwright",
task_class=PlaywrightTask,
task_organization="directory",
task_suite=task_suite,
)
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> PlaywrightTask:
"""Instantiate a `PlaywrightTask` from the dictionary returned by `_find_task_files`."""
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return PlaywrightTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="playwright",
category_id=final_category_id,
task_id=task_id,
)
def _get_verification_command(self, task: BaseTask) -> List[str]:
"""Get verification command - just run the verify.py script."""
return [sys.executable, str(task.task_verification_path)]
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
"""Run verification with Playwright-specific environment."""
env = os.environ.copy()
# Pass messages.json path and working directory to verification script
messages_path = os.getenv("MCP_MESSAGES")
work_dir = os.getenv("PLAYWRIGHT_WORK_DIR")
if messages_path:
env["MCP_MESSAGES"] = messages_path
logger.debug(f"Setting MCP_MESSAGES to: {messages_path}")
if work_dir:
env["PLAYWRIGHT_WORK_DIR"] = work_dir
logger.debug(f"Setting PLAYWRIGHT_WORK_DIR to: {work_dir}")
return subprocess.run(
self._get_verification_command(task),
capture_output=True,
text=True,
timeout=90,
env=env,
)
def _format_task_instruction(self, base_instruction: str) -> str:
"""Add Playwright-specific note to instructions."""
return (
base_instruction
+ "\n\nUse Playwright MCP tools to complete this web automation task."
)
================================================
FILE: src/mcp_services/playwright_webarena/playwright_login_helper.py
================================================
"""
WebArena (Docker) Login Helper for MCPMark
==========================================
This helper exposes basic browser configuration for agents. Authentication is
not required for the public WebArena environment; isolation is handled via
Docker containerization in the state manager.
"""
from __future__ import annotations
from pathlib import Path
from typing import Optional
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class PlaywrightLoginHelper(BaseLoginHelper):
"""
Minimal login helper. It does not launch browsers; that is handled by
the Playwright MCP client. It simply exposes configuration parameters such
as headless mode and an optional storage state file path.
"""
def __init__(
self,
*,
browser: str = "chromium",
headless: bool = True,
state_path: Optional[str | Path] = None,
base_url: Optional[str] = None,
) -> None:
super().__init__()
self.browser_name = browser
self.headless = headless
self.state_path = (
Path(state_path or Path.cwd() / "playwright_state.json")
.expanduser()
.resolve()
)
self.base_url = base_url
logger.info(
"Initialized WebArenaLoginHelper (browser=%s, headless=%s)",
browser,
headless,
)
def login(self, **kwargs) -> bool:
"""
No-op login. For WebArena we don't need credentials; we only provide
configuration for the MCP to open a browser.
"""
logger.info("WebArenaLoginHelper login: no-op")
return True
def is_authenticated(self) -> bool:
return True
def get_credentials(self) -> dict:
return {
"browser": self.browser_name,
"headless": self.headless,
"state_path": str(self.state_path),
"base_url": self.base_url,
}
def close(self) -> None:
# No resources to release
pass
================================================
FILE: src/mcp_services/playwright_webarena/playwright_state_manager.py
================================================
"""
WebArena (Docker) State Manager for MCPMark
===========================================
This module manages a WebArena environment that runs inside a Docker container.
It is responsible for starting the container in the initial state phase and
stopping/removing it during cleanup. It exposes the target URL (e.g.
http://localhost:9999) for Playwright MCP-based automation.
"""
from __future__ import annotations
import socket
import subprocess
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Dict, Any
from urllib.parse import urlparse
import requests
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class DockerConfig:
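# Defaults target the shopping_admin WebArena container; _create_initial_state
# overrides image/container/port/readiness_path per task category (see CATEGORY_CONFIGS).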
image_name: str = "shopping_admin_final_0719"
image_tar_path: Optional[Path] = None
container_name: str = "shopping_admin"
host_port: int = 7780
container_port: int = 80
readiness_path: str = "/admin"
readiness_timeout_seconds: int = 600
readiness_poll_interval_seconds: float = 2.0
@property
def base_url(self) -> str:
return f"http://localhost:{self.host_port}"
class PlaywrightStateManager(BaseStateManager):
"""
Manage Docker lifecycle for WebArena-backed tasks.
- Initial state: ensure image is present (optionally load from tar), then
run container and wait until HTTP endpoint is ready.
- Cleanup: stop and remove the container.
"""
# Category-specific Docker configurations
CATEGORY_CONFIGS = {
"reddit": {
"image_name": "postmill-populated-exposed-withimg",
"container_name": "forum",
"host_port": 9999,
"readiness_path": "/"
},
"shopping": {
"image_name": "shopping_final_0712",
"container_name": "shopping",
"host_port": 7770,
"readiness_path": "/"
},
"shopping_admin": {
"image_name": "shopping_admin_final_0719",
"container_name": "shopping_admin",
"host_port": 7780,
"readiness_path": "/admin"
}
}
def __init__(
self,
*,
docker_image_name: str = "shopping_admin_final_0719",
docker_container_name: str = "shopping_admin",
host_port: int = 7780,
container_port: int = 80,
image_tar_path: Optional[str | Path] = None,
readiness_path: str = "/admin",
readiness_timeout_seconds: int = 600,
readiness_poll_interval_seconds: float = 2.0,
# Playwright browser config params (ignored by this state manager)
browser: Optional[str] = None,
headless: Optional[bool] = None,
network_origins: Optional[str] = None,
user_profile: Optional[str] = None,
viewport_width: Optional[int] = None,
viewport_height: Optional[int] = None,
# Debug mode - skip container cleanup
skip_cleanup: bool = False,
) -> None:
super().__init__(service_name="playwright_webarena")
self.config = DockerConfig(
image_name=docker_image_name,
image_tar_path=Path(image_tar_path).expanduser().resolve()
if image_tar_path
else None,
container_name=docker_container_name,
host_port=host_port,
container_port=container_port,
readiness_path=readiness_path,
readiness_timeout_seconds=readiness_timeout_seconds,
readiness_poll_interval_seconds=readiness_poll_interval_seconds,
)
self.skip_cleanup = skip_cleanup
logger.info(
"Initialized WebArenaStateManager (image=%s, container=%s, port=%s, skip_cleanup=%s)",
self.config.image_name,
self.config.container_name,
self.config.host_port,
self.skip_cleanup,
)
# ---- Helpers ---------------------------------------------------------
def _run_cmd(
self, args: list[str], *, check: bool = False
) -> subprocess.CompletedProcess:
logger.debug("| Running command: %s", " ".join(args))
return subprocess.run(
args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=check
)
def _image_exists(self, image: str) -> bool:
result = self._run_cmd(
["docker", "images", "--format", "{{.Repository}}:{{.Tag}}"]
)
lines = [line.strip() for line in result.stdout.splitlines() if line.strip()]
# Parse target image (allow optional tag; default latest)
if ":" in image:
target_repo, target_tag = image.split(":", 1)
else:
target_repo, target_tag = image, "latest"
for repo_tag in lines:
if ":" in repo_tag:
repo, tag = repo_tag.split(":", 1)
else:
repo, tag = repo_tag, "latest"
if repo == target_repo and tag == target_tag:
logger.debug("| Found Docker image %s:%s", repo, tag)
return True
logger.debug("| Docker image not found: %s:%s", target_repo, target_tag)
return False
def _load_image_from_tar_if_needed(self) -> None:
if self.config.image_tar_path and not self._image_exists(
self.config.image_name
):
logger.info("| Loading Docker image from tar: %s", self.config.image_tar_path)
result = self._run_cmd(
["docker", "load", "--input", str(self.config.image_tar_path)]
)
if result.returncode != 0:
logger.error("| Failed to load Docker image: %s", result.stderr.strip())
raise RuntimeError(f"docker load failed: {result.stderr}")
logger.info("| Docker image loaded")
def _stop_and_remove_container(self, name: str) -> None:
# Stop (ignore errors if not running)
self._run_cmd(["docker", "stop", name])
# Remove (ignore errors if not exists)
self._run_cmd(["docker", "rm", name])
def _container_is_running(self, name: str) -> bool:
result = self._run_cmd(
["docker", "ps", "--filter", f"name=^{name}$", "--format", "{{.Names}}"]
)
running = any(line.strip() == name for line in result.stdout.splitlines())
logger.debug("| Container '%s' running: %s", name, running)
return running
def _port_open(self, host: str, port: int) -> bool:
try:
with socket.create_connection((host, port), timeout=1.0):
return True
except OSError:
return False
def _http_ready(self, url: str) -> bool:
try:
resp = requests.get(url, timeout=3)
return resp.status_code < 500
except Exception:
return False
def _get_entry_url(self) -> str:
base = self.config.base_url.rstrip("/")
path = self.config.readiness_path
if not path or path == "/":
return base
return f"{base}{path}"
def _wait_until_ready(self) -> bool:
deadline = time.time() + self.config.readiness_timeout_seconds
base_url = self.config.base_url.rstrip("/")
url = self._get_entry_url()
# Determine host and port from URL for port checks
parsed = urlparse(base_url)
host = parsed.hostname or "localhost"
port = parsed.port or self.config.host_port
# First wait for port to open to avoid long HTTP errors
while time.time() < deadline:
if self._port_open(host, port):
break
time.sleep(self.config.readiness_poll_interval_seconds)
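# Then poll the HTTP endpoint until it answers with a non-5xx response.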
while time.time() < deadline:
if self._http_ready(url):
logger.info("| WebArena HTTP endpoint ready: %s", url)
return True
time.sleep(self.config.readiness_poll_interval_seconds)
logger.error("| Timed out waiting for WebArena at %s", url)
return False
def _wait_for_mysql_ready(self, max_wait_seconds: int = 120) -> bool:
"""Wait for MySQL to be ready in the container."""
deadline = time.time() + max_wait_seconds
while time.time() < deadline:
result = self._run_cmd([
"docker", "exec", self.config.container_name,
"mysql", "-u", "magentouser", "-pMyPassword",
"magentodb", "-e", "SELECT 1;"
])
if result.returncode == 0:
logger.info("| MySQL is ready in container %s", self.config.container_name)
return True
time.sleep(2)
logger.warning("| MySQL not ready after %d seconds", max_wait_seconds)
return False
def _wait_for_magento_ready(self, max_wait_seconds: int = 180) -> bool:
"""Wait for Magento to be fully initialized."""
deadline = time.time() + max_wait_seconds
while time.time() < deadline:
# Check if Magento's setup is complete by trying to access config
result = self._run_cmd([
"docker", "exec", self.config.container_name,
"/var/www/magento2/bin/magento", "config:show", "web/unsecure/base_url"
])
if result.returncode == 0:
logger.info("| Magento is ready in container %s", self.config.container_name)
return True
time.sleep(5)
logger.warning("| Magento not ready after %d seconds", max_wait_seconds)
return False
def _configure_shopping_post_start(self) -> None:
"""Run Magento-specific steps for shopping container.
Waits for services to be ready before configuring.
"""
logger.info("| Running shopping post-start setup")
# Wait for MySQL to be ready first
if not self._wait_for_mysql_ready():
logger.warning("| MySQL not ready, attempting configuration anyway")
# Wait for Magento to be ready
if not self._wait_for_magento_ready():
logger.warning("| Magento not ready, attempting configuration anyway")
base_url = f"http://localhost:{self.config.host_port}"
cmds = [
[
"docker",
"exec",
self.config.container_name,
"/var/www/magento2/bin/magento",
"setup:store-config:set",
f"--base-url={base_url}",
],
[
"docker",
"exec",
self.config.container_name,
"mysql",
"-u",
"magentouser",
"-pMyPassword",
"magentodb",
"-e",
f"UPDATE core_config_data SET value='{base_url}/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');",
],
[
"docker",
"exec",
self.config.container_name,
"/var/www/magento2/bin/magento",
"cache:flush",
],
]
for cmd in cmds:
result = self._run_cmd(cmd)
if result.returncode != 0:
logger.warning(
"| Shopping setup step failed (%s): %s",
" ".join(cmd),
result.stderr.strip(),
)
else:
logger.debug(
"| Shopping setup step ok (%s): %s",
" ".join(cmd),
result.stdout.strip(),
)
def _configure_shopping_admin_post_start(self) -> None:
"""Run Magento-specific steps for shopping_admin container.
Waits for services to be ready before configuring.
"""
logger.info("| Running shopping_admin post-start setup")
# Wait for MySQL to be ready first
if not self._wait_for_mysql_ready():
logger.warning("| MySQL not ready, attempting configuration anyway")
# Wait for Magento to be ready
if not self._wait_for_magento_ready():
logger.warning("| Magento not ready, attempting configuration anyway")
base_url = f"http://localhost:{self.config.host_port}"
cmds = [
[
"docker",
"exec",
self.config.container_name,
"/var/www/magento2/bin/magento",
"setup:store-config:set",
f"--base-url={base_url}",
],
[
"docker",
"exec",
self.config.container_name,
"mysql",
"-u",
"magentouser",
"-pMyPassword",
"magentodb",
"-e",
f"UPDATE core_config_data SET value='{base_url}/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');",
],
[
"docker",
"exec",
self.config.container_name,
"/var/www/magento2/bin/magento",
"config:set",
"admin/security/password_is_forced",
"0",
],
[
"docker",
"exec",
self.config.container_name,
"/var/www/magento2/bin/magento",
"config:set",
"admin/security/password_lifetime",
"0",
],
[
"docker",
"exec",
self.config.container_name,
"/var/www/magento2/bin/magento",
"cache:flush",
],
]
for cmd in cmds:
result = self._run_cmd(cmd)
if result.returncode != 0:
logger.warning(
"| Shopping_admin setup step failed (%s): %s",
" ".join(cmd),
result.stderr.strip(),
)
else:
logger.debug(
"| Shopping_admin setup step ok (%s): %s",
" ".join(cmd),
result.stdout.strip(),
)
# ---- BaseStateManager hooks -----------------------------------------
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
try:
# Dynamically update config based on task category
if hasattr(task, 'category_id') and task.category_id in self.CATEGORY_CONFIGS:
category_config = self.CATEGORY_CONFIGS[task.category_id]
logger.info(f"| Using category-specific config for '{task.category_id}': {category_config}")
# Update the config with category-specific values
self.config.image_name = category_config["image_name"]
self.config.container_name = category_config["container_name"]
self.config.host_port = category_config["host_port"]
self.config.readiness_path = category_config["readiness_path"]
# Ensure image exists (load from tar if configured)
self._load_image_from_tar_if_needed()
# Ensure any stale container is gone
self._stop_and_remove_container(self.config.container_name)
# Run container
run_cmd = [
"docker",
"run",
"--name",
self.config.container_name,
"-p",
f"{self.config.host_port}:{self.config.container_port}",
"-d",
self.config.image_name,
]
print("| Docker run command: ", run_cmd)
result = self._run_cmd(run_cmd)
if result.returncode != 0:
logger.error("| Failed to start container: %s", result.stderr.strip())
return None
container_id = result.stdout.strip()
logger.info(
"| Started container %s (%s)", self.config.container_name, container_id
)
# Special handling for shopping and shopping_admin
if self.config.container_name == "shopping":
self._configure_shopping_post_start()
if self.config.container_name == "shopping_admin":
self._configure_shopping_admin_post_start()
# Wait for readiness
if not self._wait_until_ready():
# Cleanup on failure
self._stop_and_remove_container(self.config.container_name)
return None
entry_url = self._get_entry_url()
# Track resource for cleanup
self.track_resource(
"docker_container",
self.config.container_name,
{
"image": self.config.image_name,
"host_port": self.config.host_port,
"container_port": self.config.container_port,
"base_url": entry_url,
},
)
# Provide initial state info
return InitialStateInfo(
state_id=self.config.container_name,
state_url=entry_url,
metadata={
"docker_image": self.config.image_name,
"container_name": self.config.container_name,
"host_port": self.config.host_port,
"container_port": self.config.container_port,
"base_url": entry_url,
"category": task.category_id,
},
)
except Exception as exc:
logger.error("| Failed to create WebArena initial state: %s", exc)
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
if hasattr(task, "__dict__"):
task.docker_container_name = state_info.state_id
task.base_url = state_info.state_url
task.docker_metadata = state_info.metadata
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
if self.skip_cleanup:
logger.info("| Skipping container cleanup (skip_cleanup=True)")
logger.info("| Container is still running at: %s", self._get_entry_url())
logger.info(
"| To manually stop: docker stop %s && docker rm %s",
self.config.container_name,
self.config.container_name,
)
return True
try:
self._stop_and_remove_container(self.config.container_name)
return True
except Exception as exc:
logger.error("| Failed to cleanup container for %s: %s", task.name, exc)
return False
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
if self.skip_cleanup:
logger.info(
"| Skipping resource cleanup for %s (skip_cleanup=True)",
resource.get("id"),
)
return True
try:
if resource.get("type") == "docker_container":
self._stop_and_remove_container(resource["id"])
return True
logger.warning(
"| Unknown resource type for cleanup: %s", resource.get("type")
)
return False
except Exception as exc:
logger.error("| Resource cleanup failed: %s", exc)
return False
def get_service_config_for_agent(self) -> dict:
"""
Provide configuration to the agent. The key piece is the base URL that
agents should navigate to when starting tasks.
"""
return {
"environment": "webarena-docker",
"base_url": self._get_entry_url(),
"docker": {
"image": self.config.image_name,
"container": self.config.container_name,
"host_port": self.config.host_port,
"container_port": self.config.container_port,
},
}
def close_all(self) -> None:
if self.skip_cleanup:
logger.info("| Skipping container cleanup in close_all (skip_cleanup=True)")
return
try:
self._stop_and_remove_container(self.config.container_name)
except Exception:
# Best effort
pass
def __del__(self) -> None:
if not self.skip_cleanup:
self.close_all()
================================================
FILE: src/mcp_services/playwright_webarena/playwright_task_manager.py
================================================
"""
WebArena Playwright Task Manager for MCPMark
============================================
Simple task manager for WebArena-backed Playwright MCP tasks.
"""
from __future__ import annotations
import sys
import os
import subprocess
from pathlib import Path
from typing import List, Dict, Any
from src.logger import get_logger
from src.base.task_manager import BaseTask, BaseTaskManager
logger = get_logger(__name__)
class PlaywrightTaskManager(BaseTaskManager):
"""Task manager for Playwright tasks against a WebArena environment."""
def __init__(
self,
tasks_root: Path | None = None,
base_url: str | None = None,
task_suite: str = "standard",
):
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
super().__init__(
tasks_root,
mcp_service="playwright_webarena",
task_class=BaseTask,
task_organization="directory",
task_suite=task_suite,
)
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> BaseTask:
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
task = BaseTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="playwright_webarena",
category_id=final_category_id,
task_id=task_id,
)
return task
# NEW: inject a unified prefix based on the base_url the state manager stored on the task
def get_task_instruction(self, task: BaseTask) -> str:
base_instruction = task.get_task_instruction().strip()
base_url = getattr(task, "base_url", None)
if not base_url:
return self._format_task_instruction(base_instruction)
prefix = f"Navigate to {base_url.rstrip('/')} and complete the following task."
# Prefix + original task instruction
return self._format_task_instruction(f"{prefix}\n\n{base_instruction}")
def _get_verification_command(self, task: BaseTask) -> List[str]:
return [sys.executable, str(task.task_verification_path)]
# Pass base_url to verify.py via an environment variable
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
env = os.environ.copy()
base_url = getattr(task, "base_url", None)
if base_url:
env["WEBARENA_BASE_URL"] = base_url.rstrip("/")
return subprocess.run(
self._get_verification_command(task),
capture_output=True,
text=True,
timeout=300,
env=env,
)
def _format_task_instruction(self, base_instruction: str) -> str:
note = "Use Playwright MCP tools to complete this task."
return (base_instruction
+ "\n\n"
+ note + "\n\nNote: Based on your understanding, solve the task all at once by yourself, don't ask for my opinions on anything.")
================================================
FILE: src/mcp_services/playwright_webarena/reddit_env_setup.md
================================================
# WebArena Reddit Environment Setup Guide
This guide describes how to set up the WebArena Reddit environment for Playwright MCP automation testing.
## System Requirements
- Ubuntu 22.04+ or another Linux distribution
- A working Docker environment
- At least 50 GB of free disk space
- At least 4 GB of RAM
## Quick Setup Steps
### 1. Download the Reddit Docker Image
WebArena provides three image mirrors; pick whichever is fastest for your network:
```bash
# Option 1: Google Drive (usually the fastest)
pip install gdown
gdown 17Qpp1iu_mPqzgO_73Z9BnFjHrzmX9DGf
# Option 2: Archive.org
wget https://archive.org/download/webarena-env-forum-image/postmill-populated-exposed-withimg.tar
# Option 3: CMU server
wget http://metis.lti.cs.cmu.edu/webarena-images/postmill-populated-exposed-withimg.tar
```
### 2. Install Docker (if not already installed)
```bash
sudo apt update
sudo apt install docker.io -y
sudo systemctl start docker
sudo systemctl enable docker
sudo usermod -aG docker $USER
newgrp docker
```
### 3. Start the Reddit Environment
```bash
# Load the Docker image (about 50 GB; this takes a few minutes)
docker load --input postmill-populated-exposed-withimg.tar
# Start the container
docker run --name forum -p 9999:80 -d postmill-populated-exposed-withimg
# Wait for the services to start (about 1-2 minutes)
sleep 120
# Check the service status
docker logs forum | tail -10
curl -I http://localhost:9999
```
### 4. Verify the Environment
Visiting `http://localhost:9999` should show the Postmill forum homepage (a quick programmatic check is sketched after this list), including:
- Navigation bar (Forums, Wiki)
- Search box
- Log in / Sign up links
- Forum list (AskReddit, technology, gaming, etc.)
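As a quick sanity check, the snippet below is a minimal sketch using the `requests` library. It assumes the container from step 3 is published on port 9999 and that the homepage markup mentions "Postmill"; adjust the URL and the marker string for your setup.
```python
import requests

# Assumes the forum container from step 3 is published on localhost:9999.
BASE_URL = "http://localhost:9999"

resp = requests.get(BASE_URL, timeout=10)
print("HTTP status:", resp.status_code)

# The homepage should load and (typically) mention Postmill in its markup.
if resp.ok and "Postmill" in resp.text:
    print("Reddit environment looks ready")
else:
    print("Environment not ready yet - check `docker logs forum`")
```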
## Port Exposure Strategies
Choose the port exposure strategy that fits your use case:
### Strategy 1: GCP Firewall Rule (recommended for production)
**Best for**: long-term use, team collaboration, stable public access
```bash
# Install the gcloud CLI (if not already installed)
curl https://sdk.cloud.google.com | bash
exec -l $SHELL
# Authenticate
gcloud auth login
# Create the firewall rule
gcloud compute firewall-rules create allow-reddit-9999 \
--allow tcp:9999 \
--source-ranges 0.0.0.0/0 \
--description "Allow access to WebArena Reddit on port 9999"
# Get the external IP
gcloud compute instances list
```
**Pros**: permanent, stable, no extra dependencies
**Cons**: requires GCP permissions; fully open to the public internet
### Strategy 2: ngrok Tunnel (quick sharing)
**Best for**: temporary demos, quick tests, no GCP permissions required
```bash
# Install ngrok
wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz
tar xvzf ngrok-v3-stable-linux-amd64.tgz
sudo mv ngrok /usr/local/bin
# Create the tunnel
ngrok http 9999
```
**Pros**: works instantly, HTTPS support, no server configuration needed
**Cons**: temporary URL, must stay running, free tier has limits
### Strategy 3: Cloudflared Tunnel (free and persistent)
**Best for**: long-term free use, no GCP required, stable access needed
```bash
# Install cloudflared
wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
sudo mv cloudflared-linux-amd64 /usr/local/bin/cloudflared
sudo chmod +x /usr/local/bin/cloudflared
# Create a temporary tunnel
cloudflared tunnel --url http://localhost:9999
# Or create a permanent tunnel (requires a Cloudflare account)
cloudflared tunnel login
cloudflared tunnel create webarena-reddit
cloudflared tunnel route dns webarena-reddit reddit.yourdomain.com
```
**Pros**: free, persistent, custom domain support
**Cons**: requires a Cloudflare account; setup is slightly more involved
### Strategy 4: SSH Port Forwarding (development and debugging)
**Best for**: local development, high security requirements, team-internal access
```bash
# Run on your local machine
ssh -L 8080:localhost:9999 user@your-server-ip
# Then open http://localhost:8080
```
**Pros**: most secure; no public port needs to be opened
**Cons**: requires SSH access; local use only
## Playwright MCP Testing
Once the environment is up, you can use Playwright MCP for automated testing:
```javascript
// Basic connectivity test
await page.goto('http://your-reddit-url:9999');
// Navigation test
await page.click('text=Forums');
await page.click('text=AskReddit');
// Form interaction test
await page.click('text=Log in');
await page.fill('[placeholder="Username"]', 'testuser');
await page.fill('[placeholder="Password"]', 'testpass');
```
## Troubleshooting
### Container Fails to Start
```bash
# Check container status
docker ps -a
# Inspect the logs
docker logs forum
# Restart the container
docker restart forum
```
### Service Not Ready
```bash
# Check whether PostgreSQL has fully started
docker logs forum | grep "database system is ready"
# Wait longer (database recovery takes time)
sleep 300
```
### Port Already in Use
```bash
# Check what is using the port
netstat -tlnp | grep 9999
# Use a different port
docker run --name forum -p 8888:80 -d postmill-populated-exposed-withimg
```
## Resetting the Environment
Reset the environment after testing:
```bash
# Stop and remove the container
docker stop forum
docker rm forum
# Start it again
docker run --name forum -p 9999:80 -d postmill-populated-exposed-withimg
```
## Advanced Configuration
### Environment Variables (WebArena convention)
```bash
export REDDIT="your-server-hostname:9999"
export REDDIT_URL="http://your-server-hostname:9999"
```
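When tasks are driven through MCPMark, `PlaywrightTaskManager.run_verification` exports `WEBARENA_BASE_URL` for `verify.py`. A minimal sketch of how a verification script might resolve the base URL, falling back to the `REDDIT_URL` convention above (the localhost default is purely illustrative):
```python
import os

# WEBARENA_BASE_URL is exported by PlaywrightTaskManager.run_verification;
# REDDIT_URL follows the WebArena convention shown above. The localhost
# fallback is only an illustrative default.
BASE_URL = (
    os.environ.get("WEBARENA_BASE_URL")
    or os.environ.get("REDDIT_URL")
    or "http://localhost:9999"
).rstrip("/")

print(f"Verifying against {BASE_URL}")
```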
### Batch Task Testing
```bash
# Prepare the WebArena test configuration
mkdir -p ~/.webarena
echo "REDDIT=your-server-hostname:9999" >> ~/.webarena/config
```
---
**Note**: This Reddit environment ships with thousands of pre-populated posts and users and closely mirrors a real Reddit deployment, which makes it well suited for testing complex web automation tasks.
================================================
FILE: src/mcp_services/postgres/__init__.py
================================================
"""
PostgreSQL MCP Service for MCPMark
===================================
This module provides PostgreSQL database integration for MCPMark evaluation.
"""
from .postgres_login_helper import PostgresLoginHelper
from .postgres_state_manager import PostgresStateManager
from .postgres_task_manager import PostgresTaskManager, PostgresTask
__all__ = [
"PostgresLoginHelper",
"PostgresStateManager",
"PostgresTaskManager",
"PostgresTask",
]
================================================
FILE: src/mcp_services/postgres/postgres_login_helper.py
================================================
"""
PostgreSQL Login Helper for MCPMark
====================================
Handles PostgreSQL authentication and connection validation.
"""
import json
import psycopg2
from pathlib import Path
from typing import Optional, Dict, Any
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class PostgresLoginHelper(BaseLoginHelper):
"""Handles PostgreSQL authentication and connection validation."""
def __init__(
self,
host: str = "localhost",
port: int = 5432,
database: str = "postgres",
username: str = "postgres",
password: str = None,
state_path: Optional[Path] = None,
):
"""Initialize PostgreSQL login helper.
Args:
host: Database host
port: Database port
database: Database name
username: Database username
password: Database password
state_path: Path to save connection state
"""
super().__init__()
self.host = host
self.port = port
self.database = database
self.username = username
self.password = password
self.state_path = state_path or Path.home() / ".mcpbench" / "postgres_auth.json"
# Ensure state directory exists
self.state_path.parent.mkdir(parents=True, exist_ok=True)
def login(self, **kwargs) -> bool:
"""Test PostgreSQL connection and save state.
Returns:
bool: True if connection successful
"""
try:
# Test connection
conn = psycopg2.connect(
host=self.host,
port=self.port,
database=self.database,
user=self.username,
password=self.password,
connect_timeout=10,
)
# Execute test query
with conn.cursor() as cur:
cur.execute("SELECT version()")
version = cur.fetchone()[0]
logger.info(f"PostgreSQL connection successful: {version}")
# Check permissions
cur.execute(
"""
SELECT has_database_privilege(%s, 'CREATE')
""",
(self.database,),
)
can_create = cur.fetchone()[0]
if not can_create:
logger.warning("User does not have CREATE privilege on database")
conn.close()
# Save connection state
self._save_connection_state(
{
"host": self.host,
"port": self.port,
"database": self.database,
"username": self.username,
"version": version,
"can_create": can_create,
"authenticated_at": self._get_current_timestamp(),
}
)
return True
except psycopg2.Error as e:
logger.error(f"PostgreSQL connection failed: {e}")
return False
except Exception as e:
logger.error(f"Unexpected error during PostgreSQL login: {e}")
return False
def _save_connection_state(self, state: Dict[str, Any]):
"""Save connection state to file."""
try:
# Don't save password
safe_state = {k: v for k, v in state.items() if k != "password"}
with open(self.state_path, "w") as f:
json.dump(safe_state, f, indent=2)
# Set restrictive permissions
self.state_path.chmod(0o600)
logger.info(f"Connection state saved to: {self.state_path}")
except Exception as e:
logger.error(f"Failed to save connection state: {e}")
def _get_current_timestamp(self) -> str:
"""Get current timestamp in ISO format."""
from datetime import datetime, timezone
return datetime.now(timezone.utc).isoformat()
def is_connected(self) -> bool:
"""Check if we can connect to PostgreSQL."""
return self.login()
def get_connection_params(self) -> Dict[str, Any]:
"""Get connection parameters (without password)."""
return {
"host": self.host,
"port": self.port,
"database": self.database,
"user": self.username,
}
================================================
FILE: src/mcp_services/postgres/postgres_state_manager.py
================================================
"""
PostgreSQL State Manager for MCPMark
=====================================
Manages database state for PostgreSQL tasks including schema setup,
test data creation, and cleanup.
"""
import os
import subprocess
import sys
import psycopg2
from psycopg2 import sql
from pathlib import Path
from typing import Optional, Dict, Any, List
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
logger = get_logger(__name__)
class PostgresStateManager(BaseStateManager):
"""Manages PostgreSQL database state for task evaluation."""
def __init__(
self,
host: str = "localhost",
port: int = 5432,
database: str = "postgres",
username: str = "postgres",
password: str = None,
):
"""Initialize PostgreSQL state manager.
Args:
host: Database host
port: Database port
database: Main database name
username: Database username
password: Database password
"""
super().__init__(service_name="postgres")
self.host = host
self.port = port
self.database = database
self.username = username
self.password = password
# Connection parameters
self.conn_params = {
"host": host,
"port": port,
"user": username,
"password": password,
}
# Track created databases for cleanup
self.created_databases: List[str] = []
# Track current task database for agent configuration
self._current_task_database: Optional[str] = None
# Validate connection on initialization
try:
self._test_connection()
logger.info("PostgreSQL state manager initialized successfully")
self._setup_database()
except Exception as e:
raise RuntimeError(f"PostgreSQL initialization failed: {e}")
def _test_connection(self):
"""Test database connection."""
conn = psycopg2.connect(**self.conn_params, database="postgres")
conn.close()
def _setup_database(self):
"""Setup all required databases by downloading and restoring from backup."""
databases = ['employees', 'chinook', 'dvdrental', 'sports', 'lego']
for db_name in databases:
if not self._database_exists(db_name):
logger.info(f"Setting up {db_name} database...")
# Path to backup file
backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state"
backup_file = backup_dir / f"{db_name}.backup"
# Download backup if not exists
if not backup_file.exists():
backup_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Downloading {db_name} backup...")
try:
import urllib.request
urllib.request.urlretrieve(
f'https://storage.mcpmark.ai/postgres/{db_name}.backup',
str(backup_file)
)
logger.info(f"{db_name} backup downloaded")
except Exception as e:
logger.warning(f"Failed to download {db_name} backup: {e}")
continue
# Create database
try:
self._create_empty_database(db_name)
logger.info(f"Created {db_name} database")
except Exception as e:
logger.warning(f"Failed to create {db_name} database: {e}")
continue
# Restore from backup
env = os.environ.copy()
env['PGPASSWORD'] = self.password
try:
result = subprocess.run([
'pg_restore',
'-h', str(self.host),
'-p', str(self.port),
'-U', self.username,
'-d', db_name,
'-v',
str(backup_file)
], env=env, capture_output=True, text=True)
if result.returncode != 0 and "ERROR" in result.stderr:
logger.warning(f"pg_restore had errors for {db_name}: {result.stderr}")
else:
logger.info(f"{db_name} database restored successfully")
except Exception as e:
logger.warning(f"Failed to restore {db_name} database: {e}")
else:
logger.debug(f"{db_name} database already exists")
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
"""Create initial database state for a task."""
try:
# Generate unique database name
db_name = f"mcpmark_{task.category_id}_{task.task_id}_{self._get_timestamp()}"
# Create database from template if exists, otherwise empty
if self._database_exists(task.category_id):
self._create_database_from_template(db_name, task.category_id)
logger.info(
f"| Created database '{db_name}' from template '{task.category_id}'"
)
else:
self._create_empty_database(db_name)
logger.info(f"| Created empty database '{db_name}'")
# Run prepare_environment.py if it exists
self._run_prepare_environment(db_name, task)
logger.info(f"| Prepared environment for database '{db_name}'")
# Track for cleanup
self.created_databases.append(db_name)
self.track_resource("database", db_name, {"task": task.name})
return InitialStateInfo(
state_id=db_name,
state_url=f"postgresql://{self.username}@{self.host}:{self.port}/{db_name}",
metadata={
"database": db_name,
"category": task.category_id,
"task_id": task.task_id,
},
)
except Exception as e:
logger.error(f"Failed to create initial state for {task.name}: {e}")
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
"""Store database info in task object."""
if hasattr(task, "__dict__"):
task.database_name = state_info.state_id
task.database_url = state_info.state_url
# Store current task database for agent configuration
self._current_task_database = state_info.state_id
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up task database."""
if hasattr(task, "database_name") and task.database_name:
try:
self._drop_database(task.database_name)
logger.info(f"| Dropped database: {task.database_name}")
# Remove from tracking
self.created_databases = [
db for db in self.created_databases if db != task.database_name
]
# Clear current task database
if self._current_task_database == task.database_name:
self._current_task_database = None
return True
except Exception as e:
logger.error(f"Failed to drop database {task.database_name}: {e}")
return False
return True
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single PostgreSQL resource."""
if resource["type"] == "database":
try:
self._drop_database(resource["id"])
logger.info(f"| Dropped database: {resource['id']}")
return True
except Exception as e:
logger.error(f"| Failed to drop database {resource['id']}: {e}")
return False
return False
def _database_exists(self, db_name: str) -> bool:
"""Check if database exists."""
conn = psycopg2.connect(**self.conn_params, database="postgres")
try:
with conn.cursor() as cur:
cur.execute("SELECT 1 FROM pg_database WHERE datname = %s", (db_name,))
return cur.fetchone() is not None
finally:
conn.close()
def _create_database_from_template(self, new_db: str, template_db: str):
"""Create database from template."""
conn = psycopg2.connect(**self.conn_params, database="postgres")
conn.autocommit = True
try:
with conn.cursor() as cur:
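# CREATE DATABASE ... WITH TEMPLATE fails if any other session is
# connected to the template database, so terminate those sessions first.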
cur.execute(
sql.SQL("""
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE datname = %s AND pid <> pg_backend_pid()
"""),
(template_db,),
)
cur.execute(
sql.SQL("CREATE DATABASE {} WITH TEMPLATE {}").format(
sql.Identifier(new_db), sql.Identifier(template_db)
)
)
finally:
conn.close()
def _create_empty_database(self, db_name: str):
"""Create empty database."""
conn = psycopg2.connect(**self.conn_params, database="postgres")
conn.autocommit = True
try:
with conn.cursor() as cur:
cur.execute(
sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name))
)
finally:
conn.close()
def _drop_database(self, db_name: str):
"""Drop database."""
conn = psycopg2.connect(**self.conn_params, database="postgres")
conn.autocommit = True
try:
with conn.cursor() as cur:
# Terminate connections
cur.execute(
sql.SQL("""
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE datname = %s AND pid <> pg_backend_pid()
"""),
(db_name,),
)
# Drop database
cur.execute(
sql.SQL("DROP DATABASE IF EXISTS {}").format(
sql.Identifier(db_name)
)
)
finally:
conn.close()
def _run_prepare_environment(self, db_name: str, task: BaseTask):
"""Run prepare_environment.py script if it exists in the task directory."""
# Find the task directory containing prepare_environment.py
task_dir = task.task_instruction_path.parent
prepare_script = task_dir / "prepare_environment.py"
if not prepare_script.exists():
logger.debug(f"No prepare_environment.py found for task {task.name}")
return
logger.info(f"| Running prepare_environment.py for task {task.name}")
# Set up environment variables for the script
env = os.environ.copy()
env.update({
"POSTGRES_HOST": str(self.host),
"POSTGRES_PORT": str(self.port),
"POSTGRES_DATABASE": db_name,
"POSTGRES_USERNAME": self.username,
"POSTGRES_PASSWORD": self.password or "",
})
try:
# Run the prepare_environment.py script
result = subprocess.run(
[sys.executable, str(prepare_script)],
cwd=str(task_dir), # Run from task directory to access data/ folder
env=env,
capture_output=True,
text=True,
timeout=300, # 5 minute timeout
)
if result.returncode == 0:
logger.info(f"| ✓ Environment preparation completed for {task.name}")
if result.stdout.strip():
logger.debug(f"| prepare_environment.py output: {result.stdout}")
else:
logger.error(f"| ❌ Environment preparation failed for {task.name}")
logger.error(f"| Error output: {result.stderr}")
raise RuntimeError(f"prepare_environment.py failed with exit code {result.returncode}")
except subprocess.TimeoutExpired:
logger.error(f"❌ Environment preparation timed out for {task.name}")
raise RuntimeError("prepare_environment.py execution timed out")
except Exception as e:
logger.error(f"❌ Failed to run prepare_environment.py for {task.name}: {e}")
raise
def _setup_task_specific_data(self, db_name: str, task: BaseTask):
"""Set up task-specific schema and data."""
conn = psycopg2.connect(**self.conn_params, database=db_name)
try:
with conn.cursor() as cur:
if task.category_id == "basic_queries":
self._setup_basic_queries_data(cur)
elif task.category_id == "data_manipulation":
self._setup_data_manipulation_data(cur)
elif task.category_id == "table_operations":
self._setup_table_operations_data(cur)
# Add more categories as needed
conn.commit()
except Exception as e:
conn.rollback()
logger.error(f"Failed to setup task data: {e}")
raise
finally:
conn.close()
def _setup_basic_queries_data(self, cursor):
"""Set up data for basic query tasks."""
cursor.execute("""
CREATE TABLE employees (
id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL,
department VARCHAR(50),
salary DECIMAL(10, 2),
hire_date DATE
);
INSERT INTO employees (name, department, salary, hire_date) VALUES
('John Doe', 'Engineering', 75000.00, '2020-01-15'),
('Jane Smith', 'Marketing', 65000.00, '2019-03-22'),
('Bob Johnson', 'Engineering', 80000.00, '2018-07-01'),
('Alice Brown', 'HR', 55000.00, '2021-02-10');
""")
def _setup_data_manipulation_data(self, cursor):
"""Set up data for data manipulation tasks."""
cursor.execute("""
CREATE TABLE products (
id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL,
category VARCHAR(50),
price DECIMAL(10, 2),
stock INTEGER DEFAULT 0
);
CREATE TABLE orders (
id SERIAL PRIMARY KEY,
product_id INTEGER REFERENCES products(id),
quantity INTEGER NOT NULL,
order_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
def _setup_table_operations_data(self, cursor):
"""Set up for table operation tasks."""
# Start with minimal schema that tasks will modify
cursor.execute("""
CREATE TABLE test_table (
id SERIAL PRIMARY KEY,
data VARCHAR(255)
);
""")
def _get_timestamp(self) -> str:
"""Get timestamp for unique naming."""
from datetime import datetime
return datetime.now().strftime("%Y%m%d%H%M%S")
def get_service_config_for_agent(self) -> dict:
"""Get configuration for agent execution."""
config = {
"host": self.host,
"port": self.port,
"username": self.username,
"password": self.password,
}
# If there's a current task database, include it
if hasattr(self, "_current_task_database") and self._current_task_database:
config["current_database"] = self._current_task_database
config["database_url"] = (
f"postgresql://{self.username}:{self.password}@{self.host}:{self.port}/{self._current_task_database}"
)
else:
# Fallback to default database
config["database"] = self.database
config["database_url"] = (
f"postgresql://{self.username}:{self.password}@{self.host}:{self.port}/{self.database}"
)
return config
================================================
FILE: src/mcp_services/postgres/postgres_task_manager.py
================================================
"""
PostgreSQL Task Manager for MCPMark
====================================
Manages PostgreSQL task discovery, execution, and verification.
"""
import os
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class PostgresTask(BaseTask):
"""PostgreSQL-specific task with database information."""
task_name: str = ""
database_name: Optional[str] = None
database_url: Optional[str] = None
expected_queries: Optional[List[str]] = None
expected_tables: Optional[List[str]] = None
class PostgresTaskManager(BaseTaskManager):
"""Manages PostgreSQL tasks for MCPMark evaluation."""
def __init__(self, tasks_root: Path = None, task_suite: str = "standard"):
"""Initialize PostgreSQL task manager.
Args:
tasks_root: Path to tasks directory
task_suite: Logical task suite (e.g., 'standard', 'easy')
"""
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
super().__init__(
tasks_root,
mcp_service="postgres",
task_class=PostgresTask,
task_organization="file", # PostgreSQL uses file-based tasks
task_suite=task_suite,
)
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> Optional[PostgresTask]:
"""Instantiate a `PostgresTask` from the dictionary returned by `_find_task_files`."""
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return PostgresTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="postgres",
category_id=final_category_id,
task_id=task_id,
task_name=task_files_info["task_id"],
)
def _get_verification_command(self, task: PostgresTask) -> List[str]:
"""Get verification command with database info."""
cmd = [sys.executable, str(task.task_verification_path)]
# Pass database name as argument if available
if task.database_name:
cmd.append(task.database_name)
return cmd
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
"""Run verification with PostgreSQL environment."""
env = os.environ.copy()
# Pass database connection info to verification script
if hasattr(task, "database_name") and task.database_name:
env["POSTGRES_DATABASE"] = task.database_name
if hasattr(task, "database_url") and task.database_url:
env["DATABASE_URL"] = task.database_url
return subprocess.run(
self._get_verification_command(task),
capture_output=True,
text=True,
timeout=300,
env=env,
)
def _format_task_instruction(self, base_instruction: str) -> str:
"""Add PostgreSQL-specific instructions."""
return (
base_instruction
+ "\n\nNote: Use PostgreSQL MCP tools to complete this task. The database connection is already configured."
)
================================================
FILE: src/mcp_services/supabase/__init__.py
================================================
"""Supabase MCP service integration for MCPMark."""
from .supabase_login_helper import SupabaseLoginHelper
from .supabase_state_manager import SupabaseStateManager
from .supabase_task_manager import SupabaseTaskManager
__all__ = [
"SupabaseLoginHelper",
"SupabaseStateManager",
"SupabaseTaskManager",
]
================================================
FILE: src/mcp_services/supabase/supabase_login_helper.py
================================================
"""
Supabase Login Helper for MCPMark
===================================
Handles configuration and validation for Supabase MCP service.
"""
import os
from typing import Dict, Any, Optional
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class SupabaseLoginHelper(BaseLoginHelper):
"""Login helper for Supabase MCP service.
Validates PostgREST API URL and API key configuration.
"""
def __init__(self):
super().__init__("supabase")
def prepare_credentials(self) -> Dict[str, Any]:
"""Prepare credentials for Supabase/PostgREST connection.
Returns:
Dictionary containing api_url, api_key, and postgres connection details
"""
# Get PostgREST API configuration (from Supabase CLI)
api_url = os.getenv("SUPABASE_API_URL", "http://localhost:54321")
api_key = os.getenv("SUPABASE_API_KEY")
# Get PostgreSQL connection details (Supabase CLI defaults)
postgres_host = os.getenv("SUPABASE_DB_HOST", "localhost")
postgres_port = int(os.getenv("SUPABASE_DB_PORT", "54322"))
postgres_user = os.getenv("SUPABASE_DB_USER", "postgres")
postgres_password = os.getenv("SUPABASE_DB_PASSWORD", "postgres")
postgres_database = os.getenv("SUPABASE_DB_NAME", "postgres")
if not api_key:
logger.warning(
"SUPABASE_API_KEY not set.\n"
"Run 'supabase status' to get your anon or service_role key.\n"
"Set SUPABASE_API_KEY in your .mcp_env file."
)
# Try to get it from supabase status
api_key = self._get_key_from_supabase_status()
return {
"api_url": api_url,
"api_key": api_key or "",
"postgres_host": postgres_host,
"postgres_port": postgres_port,
"postgres_user": postgres_user,
"postgres_password": postgres_password,
"postgres_database": postgres_database,
}
def _get_key_from_supabase_status(self) -> Optional[str]:
"""Try to get anon key from supabase status command.
Returns:
Anon key if found, None otherwise
"""
import subprocess
try:
result = subprocess.run(
["supabase", "status"],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0:
# Parse output for anon key
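# Expected line shape (assumption based on typical `supabase status` output):
#   anon key: eyJhbGciOiJIUzI1NiIs...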
for line in result.stdout.split('\n'):
if 'anon key:' in line.lower():
# Extract the key after the colon
key = line.split(':', 1)[1].strip()
logger.info("Found anon key from 'supabase status'")
return key
except (subprocess.SubprocessError, FileNotFoundError):
logger.debug("Could not run 'supabase status' to get anon key")
return None
def test_credentials(self, credentials: Dict[str, Any]) -> bool:
"""Test if Supabase credentials are valid.
Args:
credentials: Dictionary with api_url, api_key, and postgres connection details
Returns:
True if credentials are valid
"""
import requests
import psycopg2
api_url = credentials["api_url"]
api_key = credentials.get("api_key", "")
# Test PostgreSQL connection
try:
conn_params = {
"host": credentials["postgres_host"],
"port": credentials["postgres_port"],
"user": credentials["postgres_user"],
"password": credentials["postgres_password"],
"database": credentials["postgres_database"],
}
conn = psycopg2.connect(**conn_params)
conn.close()
logger.info("✓ PostgreSQL connection successful")
except Exception as e:
logger.error(f"✗ PostgreSQL connection failed: {e}")
return False
# Test PostgREST API connection (optional - may not be running yet)
try:
headers = {}
if api_key:
headers["apikey"] = api_key
headers["Authorization"] = f"Bearer {api_key}"
response = requests.get(api_url, headers=headers, timeout=5)
# Any response (including 404, 401) means the API is reachable
logger.info(f"✓ PostgREST API reachable at {api_url} (status: {response.status_code})")
return True
except requests.exceptions.ConnectionError:
logger.warning(
f"⚠ PostgREST API not reachable at {api_url}.\n"
"Make sure PostgREST is running (e.g., docker run -p 3000:3000 postgrest/postgrest)\n"
"or use a cloud Supabase instance URL."
)
# Still return True as PostgreSQL connection works
return True
except Exception as e:
logger.warning(f"⚠ PostgREST API test failed: {e}")
# Still return True as PostgreSQL connection works
return True
def format_credentials_info(self, credentials: Dict[str, Any]) -> str:
"""Format credentials info for display.
Args:
credentials: Dictionary with connection details
Returns:
Formatted string describing the credentials
"""
api_url = credentials["api_url"]
has_api_key = bool(credentials.get("api_key"))
postgres_host = credentials["postgres_host"]
postgres_db = credentials["postgres_database"]
return (
f"Supabase Configuration:\n"
f" API URL: {api_url}\n"
f" API Key: {'✓ Configured' if has_api_key else '✗ Not set'}\n"
f" PostgreSQL: {postgres_host}/{postgres_db}"
)
================================================
FILE: src/mcp_services/supabase/supabase_state_manager.py
================================================
"""
Supabase State Manager for MCPMark
====================================
Manages database state for Supabase tasks using the same PostgreSQL backend
as Insforge, but accessed via PostgREST/Supabase MCP server.
"""
import os
import sys
import subprocess
import psycopg2
from psycopg2 import sql
from pathlib import Path
from typing import Optional, Dict, Any, List
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
logger = get_logger(__name__)
class SupabaseStateManager(BaseStateManager):
"""Manages Supabase/PostgREST database state for task evaluation.
Uses the same PostgreSQL database as Insforge but exposes it via
PostgREST API for the Supabase MCP server to access.
"""
def __init__(
self,
api_url: str,
api_key: str,
postgres_host: str = "localhost",
postgres_port: int = 54322, # Supabase CLI default port
postgres_user: str = "postgres",
postgres_password: str = "postgres",
postgres_database: str = "postgres", # Supabase CLI default database
):
"""Initialize Supabase state manager.
Args:
api_url: PostgREST API URL from Supabase CLI (default: http://localhost:54321)
api_key: API key from Supabase CLI (anon or service_role key)
postgres_host: PostgreSQL host for direct database operations
postgres_port: PostgreSQL port (Supabase CLI uses 54322)
postgres_user: PostgreSQL username
postgres_password: PostgreSQL password
postgres_database: Main PostgreSQL database name
"""
super().__init__(service_name="supabase")
self.api_url = api_url.rstrip('/')
self.api_key = api_key
# PostgreSQL connection for state management (Supabase CLI instance)
self.postgres_host = postgres_host
self.postgres_port = postgres_port
self.postgres_user = postgres_user
self.postgres_password = postgres_password
self.postgres_database = postgres_database
# Track current task context for agent configuration
self._current_task_context: Optional[Dict[str, Any]] = None
# Validate connection on initialization
try:
self._test_connection()
logger.info("Supabase state manager initialized successfully")
except Exception as e:
raise RuntimeError(f"Supabase initialization failed: {e}")
# Store baseline tables (system tables that exist before any tasks run)
self._baseline_tables = set(
(t['schema'], t['name']) for t in self._get_all_tables()
)
logger.debug(f"Stored baseline: {len(self._baseline_tables)} tables")
def _test_connection(self):
"""Test PostgreSQL connection."""
try:
conn_params = {
"host": self.postgres_host,
"port": self.postgres_port,
"user": self.postgres_user,
"password": self.postgres_password,
"database": self.postgres_database,
}
conn = psycopg2.connect(**conn_params)
conn.close()
logger.debug("PostgreSQL connection test successful")
except Exception as e:
raise RuntimeError(f"Cannot connect to PostgreSQL: {e}")
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
"""Create initial backend state for a task.
Restores from backup which may place tables in public or task-specific schema.
Args:
task: Task for which to create initial state
Returns:
InitialStateInfo object or None if creation failed
"""
try:
# Generate unique state ID for this task run
state_id = f"{task.category_id}_{task.task_id}_{self._get_timestamp()}"
schema_name = task.category_id
logger.info(f"| Creating initial state for Supabase task: {task.name}")
# Drop schema first (cleanup from previous runs)
self._drop_schema(schema_name)
# Get list of existing tables before restore (to track what we create)
tables_before = self._get_all_tables()
logger.info(f"| Tables before restore: {len(tables_before)}")
# Note: Don't create schema here - pg_restore will create it from the backup
# Restore from backup if backup exists (may create tables in public or task schema)
if self._restore_from_backup(schema_name):
logger.info(f"| ✓ Restored '{schema_name}' from backup")
else:
logger.info(f"| ○ No backup found for '{schema_name}'")
# Run prepare_environment.py if it exists
task_prepared = self._run_prepare_environment(task)
if not task_prepared:
logger.debug(f"| No prepare_environment.py found for task {task.name}")
# Get list of tables after restore (to track what we need to clean up)
tables_after = self._get_all_tables()
# Track ALL new tables created by the restore (compare before/after)
tables_before_set = {(t['schema'], t['name']) for t in tables_before}
created_tables = [
t for t in tables_after
if (t['schema'], t['name']) not in tables_before_set
]
logger.info(f"| Tracked {len(created_tables)} new tables for cleanup")
for t in created_tables:
logger.debug(f"| - {t['schema']}.{t['name']}")
# Track the task context including created tables
context = {
"state_id": state_id,
"category_id": task.category_id,
"task_id": task.task_id,
"task_name": task.name,
"schema": schema_name,
"created_tables": created_tables,
}
return InitialStateInfo(
state_id=state_id,
state_url=self.api_url,
metadata=context,
)
except Exception as e:
logger.error(f"Failed to create initial state for {task.name}: {e}")
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
"""Store backend info in task object for agent access."""
if hasattr(task, "__dict__"):
task.api_url = self.api_url
task.api_key = self.api_key
task.state_id = state_info.state_id
# Store current task context for agent configuration
self._current_task_context = state_info.metadata
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up task-specific resources.
Drops ALL tables created during task (both setup and agent-created)
by comparing against baseline.
Args:
task: Task whose initial state should be cleaned up
Returns:
True if cleanup successful
"""
try:
logger.info(f"| Cleaning up initial state for task: {task.name}")
if self._current_task_context:
schema_name = self._current_task_context.get("schema")
# Get ALL current tables
all_current_tables = self._get_all_tables()
# Find tables to drop: anything not in baseline
tables_to_drop = [
t for t in all_current_tables
if (t['schema'], t['name']) not in self._baseline_tables
]
logger.info(f"| Found {len(tables_to_drop)} tables to clean up (setup + agent-created)")
# Drop individual tables
for table_info in tables_to_drop:
try:
self._drop_table(table_info["schema"], table_info["name"])
logger.debug(f"| ✓ Dropped table: {table_info['schema']}.{table_info['name']}")
except Exception as e:
logger.warning(f"| Failed to drop table {table_info}: {e}")
# Drop the task schema (may be empty if all tables were in public)
if schema_name:
try:
self._drop_schema(schema_name)
logger.info(f"| ✓ Dropped schema: {schema_name}")
except Exception as e:
logger.warning(f"| Failed to drop schema {schema_name}: {e}")
# Clear task context
if self._current_task_context.get("task_name") == task.name:
self._current_task_context = None
logger.info(f"| ✓ Initial state cleanup completed for {task.name}")
return True
except Exception as e:
logger.error(f"Failed to cleanup task initial state for {task.name}: {e}")
return False
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single tracked resource.
Args:
resource: Resource dictionary with type, id, and metadata
Returns:
True if cleanup successful
"""
resource_type = resource["type"]
resource_id = resource["id"]
logger.debug(f"| Cleanup for {resource_type} {resource_id} (handled by task scripts)")
return True
def _run_prepare_environment(self, task: BaseTask) -> bool:
"""Run prepare_environment.py script if it exists in the task directory.
The script should use database operations to set up required state.
Args:
task: Task for which to prepare environment
Returns:
True if script ran successfully, False if script doesn't exist
"""
task_dir = task.task_instruction_path.parent
prepare_script = task_dir / "prepare_environment.py"
if not prepare_script.exists():
logger.debug(f"No prepare_environment.py found for task {task.name}")
return False
logger.info(f"| Running prepare_environment.py for task {task.name}")
# Set up environment variables for the script
env = os.environ.copy()
env.update({
"SUPABASE_API_URL": self.api_url,
"SUPABASE_API_KEY": self.api_key,
"POSTGRES_HOST": self.postgres_host,
"POSTGRES_PORT": str(self.postgres_port),
"POSTGRES_DATABASE": self.postgres_database,
"POSTGRES_USERNAME": self.postgres_user,
"POSTGRES_PASSWORD": self.postgres_password,
})
try:
# Run the prepare_environment.py script
result = subprocess.run(
[sys.executable, str(prepare_script)],
cwd=str(task_dir), # Run from task directory
env=env,
capture_output=True,
text=True,
timeout=300, # 5 minute timeout
)
if result.returncode == 0:
logger.info(f"| ✓ Environment preparation completed for {task.name}")
if result.stdout.strip():
logger.debug(f"| prepare_environment.py output: {result.stdout}")
return True
else:
logger.error(f"| ✗ Environment preparation failed for {task.name}")
logger.error(f"| Error output: {result.stderr}")
raise RuntimeError(f"prepare_environment.py failed with exit code {result.returncode}")
except subprocess.TimeoutExpired:
logger.error(f"✗ Environment preparation timed out for {task.name}")
raise RuntimeError("prepare_environment.py execution timed out")
except Exception as e:
logger.error(f"✗ Failed to run prepare_environment.py for {task.name}: {e}")
raise
def _get_timestamp(self) -> str:
"""Get timestamp for unique naming."""
from datetime import datetime
return datetime.now().strftime("%Y%m%d%H%M%S")
def _drop_schema(self, schema_name: str) -> None:
"""Drop schema and all its contents."""
conn_params = {
"host": self.postgres_host,
"port": self.postgres_port,
"user": self.postgres_user,
"password": self.postgres_password,
"database": self.postgres_database,
}
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
try:
with conn.cursor() as cur:
cur.execute(
sql.SQL("DROP SCHEMA IF EXISTS {} CASCADE").format(
sql.Identifier(schema_name)
)
)
logger.debug(f"| Dropped schema: {schema_name}")
finally:
conn.close()
def _create_schema(self, schema_name: str) -> None:
"""Create empty schema."""
conn_params = {
"host": self.postgres_host,
"port": self.postgres_port,
"user": self.postgres_user,
"password": self.postgres_password,
"database": self.postgres_database,
}
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
try:
with conn.cursor() as cur:
cur.execute(
sql.SQL("CREATE SCHEMA {}").format(sql.Identifier(schema_name))
)
logger.debug(f"| Created schema: {schema_name}")
finally:
conn.close()
def _get_all_tables(self) -> List[Dict[str, str]]:
"""Get list of all user tables.
Returns:
List of dicts with 'schema' and 'name' keys
"""
conn_params = {
"host": self.postgres_host,
"port": self.postgres_port,
"user": self.postgres_user,
"password": self.postgres_password,
"database": self.postgres_database,
}
conn = psycopg2.connect(**conn_params)
try:
with conn.cursor() as cur:
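# Only user tables are listed: system schemas are excluded, as are tables
# whose names start with "_" (treated as internally managed).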
cur.execute("""
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_type = 'BASE TABLE'
AND table_schema NOT IN ('information_schema', 'pg_catalog')
AND table_schema NOT LIKE 'pg_%'
AND table_name NOT LIKE '\\_%'
ORDER BY table_schema, table_name
""")
rows = cur.fetchall()
return [{"schema": row[0], "name": row[1]} for row in rows]
finally:
conn.close()
def _drop_table(self, schema_name: str, table_name: str) -> None:
"""Drop a specific table or materialized view."""
conn_params = {
"host": self.postgres_host,
"port": self.postgres_port,
"user": self.postgres_user,
"password": self.postgres_password,
"database": self.postgres_database,
}
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
try:
with conn.cursor() as cur:
# Try dropping as table first
cur.execute(
sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format(
sql.Identifier(schema_name),
sql.Identifier(table_name)
)
)
# Also try dropping as materialized view (in case agent created one)
cur.execute(
sql.SQL("DROP MATERIALIZED VIEW IF EXISTS {}.{} CASCADE").format(
sql.Identifier(schema_name),
sql.Identifier(table_name)
)
)
logger.debug(f"| Dropped table/view: {schema_name}.{table_name}")
finally:
conn.close()
def _restore_from_backup(self, category_name: str) -> bool:
"""Restore from backup file.
Tables may be restored into public schema or category-specific schema
depending on how the backup was created.
Args:
category_name: Name of category (e.g., 'employees', 'chinook', 'lego')
Returns:
True if backup was restored, False if no backup exists
"""
# Path to backup file (same as used by Insforge/Postgres)
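# Resolves to <repo_root>/postgres_state/<category_name>.backup, i.e. four
# directory levels up from src/mcp_services/supabase/.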
backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state"
backup_file = backup_dir / f"{category_name}.backup"
logger.debug(f"| Looking for backup at: {backup_file}")
if not backup_file.exists():
logger.info(f"| ○ No backup file found: {backup_file}")
return False
logger.info(f"| Restoring {category_name} from backup...")
# Set up environment for pg_restore
env = os.environ.copy()
env["PGPASSWORD"] = self.postgres_password
try:
# Restore backup
result = subprocess.run(
[
"pg_restore",
"-h", self.postgres_host,
"-p", str(self.postgres_port),
"-U", self.postgres_user,
"-d", self.postgres_database,
"-v",
str(backup_file),
],
env=env,
capture_output=True,
text=True,
timeout=120, # 2 minute timeout
)
if result.returncode != 0 and "ERROR" in result.stderr:
logger.warning(f"| pg_restore had errors for {category_name}: {result.stderr}")
return False
logger.info(f"| ✓ {category_name} restored successfully")
return True
except subprocess.TimeoutExpired:
logger.error(f"| ✗ Restore timed out for {category_name}")
return False
except Exception as e:
logger.error(f"| ✗ Failed to restore {category_name}: {e}")
return False
def get_service_config_for_agent(self) -> dict:
"""Get configuration for agent execution.
This configuration is passed to the agent/MCP server so it can
connect to the Supabase/PostgREST endpoint.
Returns:
Dictionary containing API URL and API key
"""
config = {
"api_url": self.api_url,
"api_key": self.api_key,
"schema": "public", # Default schema for PostgREST
}
# Include current task context if available
if self._current_task_context:
config["task_context"] = self._current_task_context
# If task uses a specific schema, include it
if self._current_task_context.get("schema"):
config["schema"] = self._current_task_context["schema"]
return config
def set_verification_environment(self, messages_path: str = None) -> None:
"""Set environment variables needed for verification scripts.
Args:
messages_path: Optional path to messages.json file for verification
"""
os.environ["SUPABASE_API_URL"] = self.api_url
os.environ["SUPABASE_API_KEY"] = self.api_key
# Set PostgreSQL connection details for direct database verification
os.environ["POSTGRES_HOST"] = self.postgres_host
os.environ["POSTGRES_PORT"] = str(self.postgres_port)
os.environ["POSTGRES_DATABASE"] = self.postgres_database
os.environ["POSTGRES_USERNAME"] = self.postgres_user
os.environ["POSTGRES_PASSWORD"] = self.postgres_password
if messages_path:
os.environ["MCP_MESSAGES"] = str(messages_path)
logger.debug("Verification environment variables set for Supabase (including direct postgres access)")
================================================
FILE: src/mcp_services/supabase/supabase_task_manager.py
================================================
"""
Supabase Task Manager for MCPMark
===================================
Manages Supabase task discovery, execution, and verification.
Reuses Postgres tasks but accesses them via PostgREST/Supabase MCP.
"""
import os
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class SupabaseTask(BaseTask):
"""Supabase-specific task with API information."""
task_name: str = ""
api_url: Optional[str] = None
api_key: Optional[str] = None
class SupabaseTaskManager(BaseTaskManager):
"""Manages Supabase tasks for MCPMark evaluation.
Uses the same task structure as Postgres tasks but accessed via
PostgREST/Supabase MCP server.
"""
def __init__(self, tasks_root: Path = None):
"""Initialize Supabase task manager.
Args:
tasks_root: Path to tasks directory
"""
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
super().__init__(
tasks_root,
mcp_service="supabase",
task_class=SupabaseTask,
task_organization="file", # Supabase uses file-based tasks (like Postgres)
)
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> Optional[SupabaseTask]:
"""Instantiate a `SupabaseTask` from the dictionary returned by `_find_task_files`."""
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return SupabaseTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="supabase",
category_id=final_category_id,
task_id=task_id,
task_name=task_files_info["task_id"],
)
def _get_verification_command(self, task: SupabaseTask) -> List[str]:
"""Get verification command with Supabase API info."""
cmd = [sys.executable, str(task.task_verification_path)]
return cmd
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
"""Run verification with Supabase environment."""
env = os.environ.copy()
# Pass Supabase connection info to verification script
if hasattr(task, "api_url") and task.api_url:
env["SUPABASE_API_URL"] = task.api_url
if hasattr(task, "api_key") and task.api_key:
env["SUPABASE_API_KEY"] = task.api_key
return subprocess.run(
self._get_verification_command(task),
capture_output=True,
text=True,
timeout=300,
env=env,
)
def _format_task_instruction(self, base_instruction: str) -> str:
"""Add Supabase-specific instructions."""
return (
base_instruction
+ "\n\nNote: Use Supabase MCP tools (PostgREST) to complete this task. The API connection is already configured."
)
================================================
FILE: src/model_config.py
================================================
#!/usr/bin/env python3
"""
Model Configuration for MCPMark
================================
This module provides configuration management for different LLM models,
automatically detecting the required API keys and base URLs based on the model name.
"""
import os
from typing import Dict, List
from src.logger import get_logger
# Initialize logger
logger = get_logger(__name__)
class ModelConfig:
"""
Configuration container for a specific model.
It loads the necessary API key and base URL from environment variables.
"""
# Model configuration mapping
MODEL_CONFIGS = {
# OpenAI models
"gpt-4o": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-4o",
},
"gpt-4.1": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-4.1",
},
"gpt-4.1-mini": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-4.1-mini",
},
"gpt-4.1-nano": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-4.1-nano",
},
"gpt-5.2": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-5.2",
},
"gpt-5": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-5",
},
"gpt-5-mini": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-5-mini",
},
"gpt-5-nano": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-5-nano",
},
"o3": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/o3",
},
"o4-mini": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/o4-mini",
},
"gpt-oss-120b": {
"provider": "openai",
"api_key_var": "OPENROUTER_API_KEY",
"litellm_input_model_name": "openrouter/openai/gpt-oss-120b",
},
# DeepSeek models
"deepseek-v3.2-instruct": {
"provider": "deepseek",
"api_key_var": "DEEPSEEK_API_KEY",
"litellm_input_model_name": "deepseek/deepseek-chat",
},
"deepseek-v3.2-thinking": {
"provider": "deepseek",
"api_key_var": "DEEPSEEK_API_KEY",
"litellm_input_model_name": "deepseek/deepseek-reasoner",
},
# Anthropic models
"claude-3.7-sonnet": {
"provider": "anthropic",
"api_key_var": "ANTHROPIC_API_KEY",
"litellm_input_model_name": "anthropic/claude-3-7-sonnet-20250219",
},
"claude-sonnet-4": {
"provider": "anthropic",
"api_key_var": "ANTHROPIC_API_KEY",
"litellm_input_model_name": "anthropic/claude-sonnet-4-20250514",
},
"claude-sonnet-4.5": {
"provider": "anthropic",
"api_key_var": "ANTHROPIC_API_KEY",
"litellm_input_model_name": "anthropic/claude-sonnet-4-5-20250929",
},
"claude-opus-4": {
"provider": "anthropic",
"api_key_var": "ANTHROPIC_API_KEY",
"litellm_input_model_name": "anthropic/claude-opus-4-20250514",
},
"claude-opus-4.1": {
"provider": "anthropic",
"api_key_var": "ANTHROPIC_API_KEY",
"litellm_input_model_name": "anthropic/claude-opus-4-1-20250805",
},
"claude-opus-4.5": {
"provider": "anthropic",
"api_key_var": "ANTHROPIC_API_KEY",
"litellm_input_model_name": "anthropic/claude-opus-4-5-20251101",
},
# Google models
"gemini-2.5-pro": {
"provider": "google",
"api_key_var": "GEMINI_API_KEY",
"litellm_input_model_name": "gemini/gemini-2.5-pro",
},
"gemini-2.5-flash": {
"provider": "google",
"api_key_var": "GEMINI_API_KEY",
"litellm_input_model_name": "gemini/gemini-2.5-flash",
},
"gemini-3-pro": {
"provider": "google",
"api_key_var": "GEMINI_API_KEY",
"litellm_input_model_name": "gemini/gemini-3-pro-preview",
},
# Moonshot models
"kimi-k2-0711": {
"provider": "moonshot",
"api_key_var": "MOONSHOT_API_KEY",
"litellm_input_model_name": "moonshot/kimi-k2-0711-preview",
},
"kimi-k2-0905": {
"provider": "moonshot",
"api_key_var": "MOONSHOT_API_KEY",
"litellm_input_model_name": "moonshot/kimi-k2-0905-preview",
},
"kimi-k2-thinking": {
"provider": "moonshot",
"api_key_var": "OPENROUTER_API_KEY",
"litellm_input_model_name": "openrouter/moonshotai/kimi-k2-thinking",
},
# Grok models
"grok-4": {
"provider": "xai",
"api_key_var": "GROK_API_KEY",
"litellm_input_model_name": "xai/grok-4-0709",
},
"grok-code-fast-1": {
"provider": "xai",
"api_key_var": "GROK_API_KEY",
"litellm_input_model_name": "xai/grok-code-fast-1",
},
# Qwen models
"qwen-3-coder-plus": {
"provider": "qwen",
"api_key_var": "DASHSCOPE_API_KEY",
"litellm_input_model_name": "dashscope/qwen3-coder-plus",
},
"qwen-3-max": {
"provider": "qwen",
"api_key_var": "DASHSCOPE_API_KEY",
"litellm_input_model_name": "dashscope/qwen3-max-preview",
},
# Zhipu
"glm-4.5": {
"provider": "zhipu",
"api_key_var": "OPENROUTER_API_KEY",
"litellm_input_model_name": "openrouter/z-ai/glm-4.5",
}
}
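# A hypothetical sketch of how an additional MODEL_CONFIGS entry would look
# (the names below are illustrative only, not a supported model):
#
#     "my-model": {
#         "provider": "openai",
#         "api_key_var": "OPENAI_API_KEY",
#         "litellm_input_model_name": "openai/my-model",
#     },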
def __init__(self, model_name: str):
"""
Initializes the model configuration.
Args:
model_name: The name of the model (e.g., 'gpt-4o', 'deepseek-v3.2-instruct').
Raises:
ValueError: If the required API key environment variable is missing.
"""
self.short_model_name = model_name
model_info = self._get_model_info(model_name)
# Load API key, base URL and LiteLLM model name from environment variables
if "base_url_var" in model_info:
self.base_url = os.getenv(model_info["base_url_var"])
else:
self.base_url = None
self.api_key = os.getenv(model_info["api_key_var"])
if not self.api_key:
raise ValueError(
f"Missing required environment variable: {model_info['api_key_var']}"
)
self.litellm_input_model_name = model_info.get("litellm_input_model_name", model_name)
def _get_model_info(self, model_name: str) -> Dict[str, str]:
"""
Retrieves the configuration details for a given model name.
For unsupported models, falls back to the OpenAI provider and OPENAI_API_KEY.
"""
if model_name not in self.MODEL_CONFIGS:
logger.warning(
f"Model '{model_name}' not in supported list. Using default OpenAI configuration."
)
# Return default configuration for unsupported models
return {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": model_name,
}
return self.MODEL_CONFIGS[model_name]
@classmethod
def get_supported_models(cls) -> List[str]:
"""Returns a list of all supported model names."""
return list(cls.MODEL_CONFIGS.keys())
def main():
"""Example usage of the ModelConfig class."""
logger.info("Supported models: %s", ModelConfig.get_supported_models())
try:
# Example: Create a model config for a supported DeepSeek model
model_config = ModelConfig("deepseek-v3.2-instruct")
logger.info("✅ DeepSeek model config created successfully.")
logger.info("Short model name: %s", model_config.short_model_name)
logger.info("API key loaded: %s", bool(model_config.api_key))
except ValueError as e:
logger.error("⚠️ Configuration error: %s", e)
if __name__ == "__main__":
main()
================================================
FILE: src/results_reporter.py
================================================
#!/usr/bin/env python3
"""
Results Reporter for MCPMark Evaluation Pipeline
================================================
This module provides utilities for saving evaluation results in a structured format.
"""
import json
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.logger import get_logger
# Initialize logger
logger = get_logger(__name__)
@dataclass
class TaskResult:
"""
Represents the result of a single task evaluation.
Attributes:
task_name: The full name of the task (e.g., "category_id__task_id").
success: Whether the task completed successfully.
category_id: The task category ID.
task_id: The task identifier (number or slug).
error_message: Error message from agent execution if it failed.
verification_error: Error message from verification if it failed.
verification_output: Captured stdout from verification script.
model_output: Agent conversation trajectory (messages).
token_usage: Token usage statistics.
turn_count: Number of turns taken during task execution.
agent_execution_time: Time for Step 2 (agent execution) in seconds.
task_execution_time: Total time for Steps 1-4 in seconds.
"""
task_name: str
success: bool
category_id: Optional[str] = None
task_id: Optional[str] = None
error_message: Optional[str] = None # Agent execution error
verification_error: Optional[str] = None # Verification error (separate from agent error)
verification_output: Optional[str] = None # Verification stdout/stderr
model_output: Optional[Any] = None # Agent conversation trajectory
token_usage: Optional[Dict[str, int]] = None # Token usage statistics
turn_count: Optional[int] = None # Number of turns taken during task execution
agent_execution_time: float = 0.0 # Time for Step 2 (agent execution) in seconds
task_execution_time: float = 0.0 # Total time for Steps 1-4 in seconds
@property
def status(self) -> str:
"""Returns the status of the task as 'PASS' or 'FAIL'."""
return "PASS" if self.success else "FAIL"
@dataclass
class EvaluationReport:
"""Represents a complete evaluation report for a model."""
model_name: str
model_config: Dict[str, Any]
total_tasks: int
successful_tasks: int
failed_tasks: int
task_results: List[TaskResult]
tasks_filter: Optional[str] = None
@property
def success_rate(self) -> float:
"""Calculates the overall success rate as a percentage."""
if self.total_tasks == 0:
return 0.0
return (self.successful_tasks / self.total_tasks) * 100
@property
def total_input_tokens(self) -> int:
"""Calculate total input tokens across all tasks."""
total = 0
for result in self.task_results:
if result.token_usage:
total += (result.token_usage.get("input_tokens") or 0)
return total
@property
def total_output_tokens(self) -> int:
"""Calculate total output tokens across all tasks."""
total = 0
for result in self.task_results:
if result.token_usage:
total += (result.token_usage.get("output_tokens") or 0)
return total
@property
def total_tokens(self) -> int:
"""Calculate total tokens across all tasks."""
total = 0
for result in self.task_results:
if result.token_usage:
total += (result.token_usage.get("total_tokens") or 0)
return total
@property
def total_reasoning_tokens(self) -> int:
"""Calculate total reasoning tokens across all tasks."""
total = 0
for result in self.task_results:
if result.token_usage:
total += (result.token_usage.get("reasoning_tokens") or 0)
return total
@property
def avg_input_tokens(self) -> float:
"""Calculate average input tokens per task."""
if self.total_tasks == 0:
return 0.0
return self.total_input_tokens / self.total_tasks
@property
def avg_output_tokens(self) -> float:
"""Calculate average output tokens per task."""
if self.total_tasks == 0:
return 0.0
return self.total_output_tokens / self.total_tasks
@property
def avg_total_tokens(self) -> float:
"""Calculate average total tokens per task."""
if self.total_tasks == 0:
return 0.0
return self.total_tokens / self.total_tasks
@property
def avg_reasoning_tokens(self) -> float:
"""Calculate average reasoning tokens per task."""
if self.total_tasks == 0:
return 0.0
return self.total_reasoning_tokens / self.total_tasks
@property
def total_task_execution_time(self) -> float:
"""Calculates the total task execution time from sum of all task execution times."""
# Use sum of individual task execution times instead of pipeline wall clock time
# This ensures resume functionality shows correct total time
return sum(task.task_execution_time for task in self.task_results)
@property
def total_agent_execution_time(self) -> float:
"""Calculates the total agent execution time (Step 2) across all tasks."""
return sum(task.agent_execution_time for task in self.task_results)
def get_category_stats(self) -> Dict[str, Dict[str, Any]]:
"""
Calculates and returns success statistics grouped by task category.
"""
category_stats = {}
for result in self.task_results:
category = result.category_id or "Uncategorized"
if category not in category_stats:
category_stats[category] = {
"total": 0,
"successful": 0,
"failed": 0,
"success_rate": 0.0,
"avg_execution_time": 0.0,
"avg_agent_execution_time": 0.0,
"total_input_tokens": 0,
"total_output_tokens": 0,
"total_tokens": 0,
"total_reasoning_tokens": 0,
"avg_input_tokens": 0.0,
"avg_output_tokens": 0.0,
"avg_total_tokens": 0.0,
"avg_reasoning_tokens": 0.0,
"total_turns": 0,
"avg_turns": 0.0,
}
category_stats[category]["total"] += 1
if result.success:
category_stats[category]["successful"] += 1
else:
category_stats[category]["failed"] += 1
# Add token and turn usage
if result.token_usage:
category_stats[category]["total_input_tokens"] += (
result.token_usage.get("input_tokens") or 0
)
category_stats[category]["total_output_tokens"] += (
result.token_usage.get("output_tokens") or 0
)
category_stats[category]["total_tokens"] += (
result.token_usage.get("total_tokens") or 0
)
category_stats[category]["total_reasoning_tokens"] += result.token_usage.get(
"reasoning_tokens", 0
) or 0
# Accumulate turns
if result.turn_count is not None:
category_stats[category]["total_turns"] += result.turn_count
# Calculate derived metrics like success rate and average time
for category, stats in category_stats.items():
if stats["total"] > 0:
stats["success_rate"] = (stats["successful"] / stats["total"]) * 100
category_results = [
r
for r in self.task_results
if (r.category_id or "Uncategorized") == category
]
total_time = sum(r.task_execution_time for r in category_results)
stats["avg_execution_time"] = total_time / len(category_results)
# Add agent execution time stats
total_agent_time = sum(r.agent_execution_time for r in category_results)
stats["avg_agent_execution_time"] = total_agent_time / len(category_results)
# Calculate average tokens and turns
stats["avg_input_tokens"] = stats["total_input_tokens"] / stats["total"]
stats["avg_output_tokens"] = (
stats["total_output_tokens"] / stats["total"]
)
stats["avg_total_tokens"] = stats["total_tokens"] / stats["total"]
stats["avg_reasoning_tokens"] = stats["total_reasoning_tokens"] / stats["total"]
stats["avg_turns"] = (
stats["total_turns"] / stats["total"] if stats["total"] > 0 else 0
)
return category_stats
class ResultsReporter:
"""Handles saving evaluation results in structured formats."""
def __init__(self):
"""Initialize the results reporter."""
pass
def save_messages_json(self, messages: Any, output_path: Path) -> Path:
"""Saves the conversation messages/trajectory as messages.json."""
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w", encoding="utf-8") as f:
json.dump(messages, f, indent=2, ensure_ascii=False)
return output_path
def save_meta_json(
self,
task_result: TaskResult,
model_config: Dict[str, Any],
start_time: datetime,
end_time: datetime,
output_path: Path,
) -> Path:
"""Saves task metadata (excluding messages) as meta.json."""
output_path.parent.mkdir(parents=True, exist_ok=True)
meta_data = {
"task_name": task_result.task_name,
"model_name": model_config.get("model_name", "unknown"),
"litellm_run_model_name": model_config.get("litellm_run_model_name"),
"reasoning_effort": model_config.get("reasoning_effort"),
"mcp": model_config.get("mcp_service", "unknown"),
"timeout": model_config.get("timeout", 300),
"time": {"start": start_time.isoformat(), "end": end_time.isoformat()},
"agent_execution_time": task_result.agent_execution_time,
"task_execution_time": task_result.task_execution_time,
"execution_result": {
"success": task_result.success,
"error_message": task_result.error_message,
"verification_error": task_result.verification_error,
"verification_output": task_result.verification_output,
},
"token_usage": task_result.token_usage or {},
"turn_count": task_result.turn_count,
}
with output_path.open("w", encoding="utf-8") as f:
json.dump(meta_data, f, indent=2, ensure_ascii=False)
return output_path
def save_model_summary(self, report: EvaluationReport, output_path: Path) -> Path:
"""Saves a concise model-level summary."""
output_path.parent.mkdir(parents=True, exist_ok=True)
category_stats = report.get_category_stats()
# Aggregate turn counts using category_stats
total_turns = sum(stats["total_turns"] for stats in category_stats.values())
avg_turns = total_turns / report.total_tasks if report.total_tasks > 0 else 0
summary = {
"model_name": report.model_name,
"model_config": report.model_config,
"total_tasks": report.total_tasks,
"successful_tasks": report.successful_tasks,
"failed_tasks": report.failed_tasks,
"success_rate": round(report.success_rate, 2),
"total_task_execution_time": report.total_task_execution_time,
"average_task_execution_time": report.total_task_execution_time / report.total_tasks
if report.total_tasks > 0
else 0,
"total_agent_execution_time": report.total_agent_execution_time,
"average_agent_execution_time": report.total_agent_execution_time / report.total_tasks
if report.total_tasks > 0
else 0,
"token_usage": {
"total_input_tokens": report.total_input_tokens,
"total_output_tokens": report.total_output_tokens,
"total_tokens": report.total_tokens,
"total_reasoning_tokens": report.total_reasoning_tokens,
"avg_input_tokens": round(report.avg_input_tokens, 2),
"avg_output_tokens": round(report.avg_output_tokens, 2),
"avg_total_tokens": round(report.avg_total_tokens, 2),
"avg_reasoning_tokens": round(report.avg_reasoning_tokens, 2),
},
"turn_usage": {
"total_turns": total_turns,
"avg_turns": round(avg_turns, 2),
},
"category_breakdown": {
category: {
"total": stats["total"],
"success_rate": round(stats["success_rate"], 2),
"avg_time": round(stats["avg_execution_time"], 2),
"token_usage": {
"total_input": stats["total_input_tokens"],
"total_output": stats["total_output_tokens"],
"total": stats["total_tokens"],
"total_reasoning": stats["total_reasoning_tokens"],
"avg_input": round(stats["avg_input_tokens"], 2),
"avg_output": round(stats["avg_output_tokens"], 2),
"avg_total": round(stats["avg_total_tokens"], 2),
"avg_reasoning": round(stats["avg_reasoning_tokens"], 2),
},
"turn_usage": {
"total_turns": stats["total_turns"],
"avg_turns": round(stats["avg_turns"], 2),
},
}
for category, stats in category_stats.items()
},
}
with output_path.open("w", encoding="utf-8") as f:
json.dump(summary, f, indent=2, ensure_ascii=False)
return output_path
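# A minimal usage sketch (the `result`, `model_config`, timestamps, and paths
# below are illustrative assumptions, not part of this module's API):
#
#     reporter = ResultsReporter()
#     reporter.save_messages_json(result.model_output, run_dir / "messages.json")
#     reporter.save_meta_json(result, model_config, start, end, run_dir / "meta.json")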
================================================
FILE: src/services.py
================================================
"""
Service Definitions for MCPMark
================================
Single source of truth for all MCP service configurations.
Adding a new service only requires modifying this file.
Note: Environment variables are already loaded from .mcp_env when the app starts,
so we can reference them directly via the config system.
MCP server creation is now handled entirely within src.agent.MCPAgent; therefore,
the legacy "mcp_server" and "eval_config" entries in each service definition are
deprecated and set to None for backward-compatibility.
"""
# Service definitions
SERVICES = {
"notion": {
"config_schema": {
"source_api_key": {
"env_var": "SOURCE_NOTION_API_KEY",
"required": True,
"description": "Notion API key for source hub with templates",
},
"eval_api_key": {
"env_var": "EVAL_NOTION_API_KEY",
"required": True,
"description": "Notion API key for evaluation hub",
},
"source_parent_page_title": {
"env_var": "SOURCE_PARENT_PAGE_TITLE",
"default": "MCPMark Source Hub",
"required": False,
"description": "Title of the source hub page that contains all initial states",
},
"eval_parent_page_title": {
"env_var": "EVAL_PARENT_PAGE_TITLE",
"required": True,
"description": "Title of the parent page in evaluation workspace",
},
"playwright_headless": {
"env_var": "PLAYWRIGHT_HEADLESS",
"default": True,
"required": False,
"description": "Run browser in headless mode",
"transform": "bool", # Will be handled by GenericConfigSchema
},
"playwright_browser": {
"env_var": "PLAYWRIGHT_BROWSER",
"default": "firefox",
"required": False,
"description": "Browser to use for Playwright",
"validator": "in:chromium,firefox,webkit", # Simple validator syntax
},
},
"components": {
"task_manager": "src.mcp_services.notion.notion_task_manager.NotionTaskManager",
"state_manager": "src.mcp_services.notion.notion_state_manager.NotionStateManager",
"login_helper": "src.mcp_services.notion.notion_login_helper.NotionLoginHelper",
},
"config_mapping": {
# Maps config schema keys to class constructor parameters
"state_manager": {
"source_notion_key": "source_api_key",
"eval_notion_key": "eval_api_key",
"headless": "playwright_headless",
"browser": "playwright_browser",
"source_parent_page_title": "source_parent_page_title",
"eval_parent_page_title": "eval_parent_page_title",
},
"login_helper": {
"headless": "playwright_headless",
"browser": "playwright_browser",
},
},
# MCP server is now instantiated dynamically in MCPAgent; kept for backward
# compatibility but set to None to indicate deprecation.
"mcp_server": None,
"eval_config": None,
},
"github": {
"config_schema": {
"github_tokens": {
"env_var": "GITHUB_TOKENS",
"required": True,
"description": "GitHub personal access token(s) - comma-separated for round-robin",
"transform": "list", # Will split by comma
},
# Evaluation organisation / user that hosts ephemeral test repositories
"eval_org": {
"env_var": "GITHUB_EVAL_ORG",
"default": "mcpleague-eval",
"required": False,
"description": "Evaluation organisation or user for creating temporary test repositories",
},
# (source_org removed – template repos now imported from local files)
},
"components": {
"task_manager": "src.mcp_services.github.github_task_manager.GitHubTaskManager",
"state_manager": "src.mcp_services.github.github_state_manager.GitHubStateManager",
"login_helper": "src.mcp_services.github.github_login_helper.GitHubLoginHelper",
},
"config_mapping": {
"state_manager": {
"github_token": "github_tokens",
"eval_org": "eval_org",
},
"login_helper": {
# Login helper needs a single token, we'll use the first one
"token": "github_tokens",
},
},
"mcp_server": None,
"eval_config": None,
},
"filesystem": {
"config_schema": {
"test_root": {
"env_var": "FILESYSTEM_TEST_ROOT",
"default": None,
"required": False,
"description": "Root directory for filesystem tests",
"transform": "path", # Convert to Path object
},
"cleanup_on_exit": {
"env_var": "FILESYSTEM_CLEANUP",
"default": True,
"required": False,
"description": "Clean up test directories after tasks",
"transform": "bool",
},
},
"components": {
"task_manager": "src.mcp_services.filesystem.filesystem_task_manager.FilesystemTaskManager",
"state_manager": "src.mcp_services.filesystem.filesystem_state_manager.FilesystemStateManager",
"login_helper": "src.mcp_services.filesystem.filesystem_login_helper.FilesystemLoginHelper",
},
"config_mapping": {
"state_manager": {
"test_root": "test_root",
"cleanup_on_exit": "cleanup_on_exit",
}
},
"mcp_server": None,
"eval_config": None,
},
"playwright": {
"config_schema": {
"browser": {
"env_var": "PLAYWRIGHT_BROWSER",
"default": "chromium",
"required": False,
"description": "Browser to use (chromium, firefox, webkit)",
"validator": "in:chromium,firefox,webkit",
},
"headless": {
"env_var": "PLAYWRIGHT_HEADLESS",
"default": True,
"required": False,
"description": "Run browser in headless mode",
"transform": "bool",
},
"network_origins": {
"env_var": "PLAYWRIGHT_NETWORK_ORIGINS",
"default": "*",
"required": False,
"description": "Allowed network origins (comma-separated or *)",
},
"user_profile": {
"env_var": "PLAYWRIGHT_USER_PROFILE",
"default": "isolated",
"required": False,
"description": "User profile type (isolated or persistent)",
"validator": "in:isolated,persistent",
},
"viewport_width": {
"env_var": "PLAYWRIGHT_VIEWPORT_WIDTH",
"default": 1280,
"required": False,
"description": "Browser viewport width",
"transform": "int",
},
"viewport_height": {
"env_var": "PLAYWRIGHT_VIEWPORT_HEIGHT",
"default": 720,
"required": False,
"description": "Browser viewport height",
"transform": "int",
},
},
"components": {
"task_manager": "src.mcp_services.playwright.playwright_task_manager.PlaywrightTaskManager",
"state_manager": "src.mcp_services.playwright.playwright_state_manager.PlaywrightStateManager",
"login_helper": "src.mcp_services.playwright.playwright_login_helper.PlaywrightLoginHelper",
},
"config_mapping": {
"state_manager": {
"browser": "browser",
"headless": "headless",
"network_origins": "network_origins",
"user_profile": "user_profile",
"viewport_width": "viewport_width",
"viewport_height": "viewport_height",
},
"login_helper": {
"browser": "browser",
"headless": "headless",
},
},
"mcp_server": None,
"eval_config": None,
},
"postgres": {
"config_schema": {
"host": {
"env_var": "POSTGRES_HOST",
"default": "localhost",
"required": False,
"description": "PostgreSQL server host",
},
"port": {
"env_var": "POSTGRES_PORT",
"default": 5432,
"required": False,
"description": "PostgreSQL server port",
"transform": "int",
"validator": "port", # Validates port range 1-65535
},
"database": {
"env_var": "POSTGRES_DATABASE",
"default": "postgres",
"required": False,
"description": "PostgreSQL database name",
},
"username": {
"env_var": "POSTGRES_USERNAME",
"default": "postgres",
"required": False,
"description": "PostgreSQL username",
},
"password": {
"env_var": "POSTGRES_PASSWORD",
"default": "password",
"required": False,
"description": "PostgreSQL password",
},
},
"components": {
"task_manager": "src.mcp_services.postgres.postgres_task_manager.PostgresTaskManager",
"state_manager": "src.mcp_services.postgres.postgres_state_manager.PostgresStateManager",
"login_helper": "src.mcp_services.postgres.postgres_login_helper.PostgresLoginHelper",
},
"config_mapping": {
"state_manager": {
"host": "host",
"port": "port",
"database": "database",
"username": "username",
"password": "password",
},
"login_helper": {
"host": "host",
"port": "port",
"database": "database",
"username": "username",
"password": "password",
},
},
"mcp_server": None,
"eval_config": None,
},
"insforge": {
"config_schema": {
"api_key": {
"env_var": "INSFORGE_API_KEY",
"required": True,
"description": "Insforge backend API key for authentication",
},
"backend_url": {
"env_var": "INSFORGE_BACKEND_URL",
"required": True,
"description": "Insforge backend URL (e.g., https://your-app.insforge.app)",
},
},
"components": {
"task_manager": "src.mcp_services.insforge.insforge_task_manager.InsforgeTaskManager",
"state_manager": "src.mcp_services.insforge.insforge_state_manager.InsforgeStateManager",
"login_helper": "src.mcp_services.insforge.insforge_login_helper.InsforgeLoginHelper",
},
"config_mapping": {
"state_manager": {
"api_key": "api_key",
"backend_url": "backend_url",
},
"login_helper": {
"api_key": "api_key",
"backend_url": "backend_url",
},
},
"mcp_server": None,
"eval_config": None,
},
"supabase": {
"config_schema": {
"api_url": {
"env_var": "SUPABASE_API_URL",
"required": False,
"description": "Supabase PostgREST API URL (default: http://localhost:54321 from CLI)",
"default": "http://localhost:54321",
},
"api_key": {
"env_var": "SUPABASE_API_KEY",
"required": False,
"description": "Supabase API key (anon or service_role key from 'supabase status')",
},
"postgres_host": {
"env_var": "SUPABASE_DB_HOST",
"required": False,
"description": "PostgreSQL host for Supabase CLI instance",
"default": "localhost",
},
"postgres_port": {
"env_var": "SUPABASE_DB_PORT",
"required": False,
"description": "PostgreSQL port for Supabase CLI instance (default: 54322)",
"default": 54322,
},
"postgres_user": {
"env_var": "SUPABASE_DB_USER",
"required": False,
"description": "PostgreSQL username",
"default": "postgres",
},
"postgres_password": {
"env_var": "SUPABASE_DB_PASSWORD",
"required": False,
"description": "PostgreSQL password",
"default": "postgres",
},
"postgres_database": {
"env_var": "SUPABASE_DB_NAME",
"required": False,
"description": "PostgreSQL database name",
"default": "postgres",
},
},
"components": {
"task_manager": "src.mcp_services.supabase.supabase_task_manager.SupabaseTaskManager",
"state_manager": "src.mcp_services.supabase.supabase_state_manager.SupabaseStateManager",
"login_helper": "src.mcp_services.supabase.supabase_login_helper.SupabaseLoginHelper",
},
"config_mapping": {
"state_manager": {
"api_url": "api_url",
"api_key": "api_key",
"postgres_host": "postgres_host",
"postgres_port": "postgres_port",
"postgres_user": "postgres_user",
"postgres_password": "postgres_password",
"postgres_database": "postgres_database",
},
"login_helper": {},
},
"mcp_server": None,
"eval_config": None,
},
"playwright_webarena": {
"config_schema": {
"browser": {
"env_var": "PLAYWRIGHT_BROWSER",
"default": "chromium",
"required": False,
"description": "Browser to use (chromium, firefox, webkit)",
"validator": "in:chromium,firefox,webkit",
},
"headless": {
"env_var": "PLAYWRIGHT_HEADLESS",
"default": True,
"required": False,
"description": "Run browser in headless mode",
"transform": "bool",
},
"network_origins": {
"env_var": "PLAYWRIGHT_NETWORK_ORIGINS",
"default": "*",
"required": False,
"description": "Allowed network origins (comma-separated or *)",
},
"user_profile": {
"env_var": "PLAYWRIGHT_USER_PROFILE",
"default": "isolated",
"required": False,
"description": "User profile type (isolated or persistent)",
"validator": "in:isolated,persistent",
},
"viewport_width": {
"env_var": "PLAYWRIGHT_VIEWPORT_WIDTH",
"default": 1280,
"required": False,
"description": "Browser viewport width",
"transform": "int",
},
"viewport_height": {
"env_var": "PLAYWRIGHT_VIEWPORT_HEIGHT",
"default": 720,
"required": False,
"description": "Browser viewport height",
"transform": "int",
},
"skip_cleanup": {
"env_var": "PLAYWRIGHT_WEBARENA_SKIP_CLEANUP",
"default": False,
"required": False,
"description": "Skip Docker container cleanup for debugging",
"transform": "bool",
},
},
"components": {
"task_manager": "src.mcp_services.playwright_webarena.playwright_task_manager.PlaywrightTaskManager",
"state_manager": "src.mcp_services.playwright_webarena.playwright_state_manager.PlaywrightStateManager",
"login_helper": "src.mcp_services.playwright_webarena.playwright_login_helper.PlaywrightLoginHelper",
},
"config_mapping": {
"state_manager": {
"browser": "browser",
"headless": "headless",
"network_origins": "network_origins",
"user_profile": "user_profile",
"viewport_width": "viewport_width",
"viewport_height": "viewport_height",
"skip_cleanup": "skip_cleanup",
},
"login_helper": {
"browser": "browser",
"headless": "headless",
},
"task_manager": {},
},
"mcp_server": None,
"eval_config": None,
},
}
def get_service_definition(service_name: str) -> dict:
"""Get MCP service definition by name."""
if service_name not in SERVICES:
raise ValueError(f"Unknown MCP service: {service_name}")
return SERVICES[service_name]
def get_supported_mcp_services() -> list:
"""Get list of implemented MCP services."""
return [
name
for name, config in SERVICES.items()
if config["components"]["task_manager"] is not None
]
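# Illustrative usage (a minimal sketch, not part of the evaluation pipeline): resolve
# one service definition by name and list the services that have a task manager wired up.
if __name__ == "__main__":
    supabase_def = get_service_definition("supabase")
    print("Supabase state manager:", supabase_def["components"]["state_manager"])
    print("Implemented MCP services:", get_supported_mcp_services())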
================================================
FILE: tasks/__init__.py
================================================
================================================
FILE: tasks/filesystem/easy/.gitkeep
================================================
================================================
FILE: tasks/filesystem/easy/file_context/file_splitting/description.md
================================================
# File Splitting Task
## 📋 Task Description
You need to split a large text file into multiple smaller files with nearly equal character counts. The task involves creating a new directory and splitting the content into exactly 3 files.
## 🎯 Task Objectives
1. **Create a new directory** named `split` in the test directory
2. **Split the file** `large_file.txt` into exactly 3 files with **similar** character counts (maximum character difference of 100 between any two files)
3. **Name the files** as `split_01.txt`, `split_02.txt`, `split_03.txt` in the `split` directory
================================================
FILE: tasks/filesystem/easy/file_context/file_splitting/meta.json
================================================
{
"task_id": "file_splitting",
"task_name": "File Splitting",
"category_id": "file_context",
"category_name": "File Context",
"description": "Split large_file.txt into three nearly equal chunks stored as split_01.txt-split_03.txt inside a new split directory.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content transformation",
"file automation"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n \u251c\u2500\u2500 file_01.txt\n \u251c\u2500\u2500 file_02.txt\n \u251c\u2500\u2500 file_03.txt\n \u251c\u2500\u2500 file_04.txt\n \u251c\u2500\u2500 file_05.txt\n \u251c\u2500\u2500 file_06.txt\n \u251c\u2500\u2500 file_07.txt\n \u251c\u2500\u2500 file_08.txt\n \u251c\u2500\u2500 file_09.txt\n \u251c\u2500\u2500 file_10.txt\n \u251c\u2500\u2500 file_11.txt\n \u251c\u2500\u2500 file_12.txt\n \u251c\u2500\u2500 file_13.txt\n \u251c\u2500\u2500 file_14.txt\n \u251c\u2500\u2500 file_15.txt\n \u251c\u2500\u2500 file_16.txt\n \u251c\u2500\u2500 file_17.txt\n \u251c\u2500\u2500 file_18.txt\n \u251c\u2500\u2500 file_19.txt\n \u251c\u2500\u2500 file_20.txt\n \u2514\u2500\u2500 large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/file_context/file_splitting/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Splitting Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_split_directory_exists(test_dir: Path) -> bool:
"""Verify that the split directory exists."""
split_dir = test_dir / "split"
if not split_dir.exists():
print("❌ Directory 'split' not found")
return False
if not split_dir.is_dir():
print("❌ 'split' exists but is not a directory")
return False
print("✅ Split directory found")
return True
def verify_all_split_files_exist(test_dir: Path) -> bool:
"""Verify that all 3 split files exist with correct names."""
split_dir = test_dir / "split"
expected_files = [f"split_{i:02d}.txt" for i in range(1, 4)]
missing_files = []
for filename in expected_files:
file_path = split_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing files: {missing_files}")
return False
print("✅ All 3 split files exist with correct names")
return True
def verify_similar_file_lengths(test_dir: Path) -> bool:
"""Verify that all split files have similar character counts (within 30 characters difference)."""
split_dir = test_dir / "split"
file_lengths = []
for i in range(1, 4):
filename = f"split_{i:02d}.txt"
file_path = split_dir / filename
try:
content = file_path.read_text()
file_lengths.append(len(content))
except Exception as e:
print(f"❌ Error reading {filename}: {e}")
return False
# Check that all lengths are within 100 characters of each other
min_length = min(file_lengths)
max_length = max(file_lengths)
length_difference = max_length - min_length
if length_difference > 100:
print(f"❌ File lengths differ by more than 30 characters: {length_difference}")
print(f" Min length: {min_length}, Max length: {max_length}")
print(f" All lengths: {file_lengths}")
return False
print(f"✅ All files have similar lengths (difference: {length_difference} characters)")
print(f" Min: {min_length}, Max: {max_length}")
return True
def verify_content_integrity(test_dir: Path) -> bool:
"""Verify that concatenated split files equal the original file."""
split_dir = test_dir / "split"
original_file = test_dir / "large_file.txt"
# Read original content
try:
original_content = original_file.read_text()
except Exception as e:
print(f"❌ Error reading original file: {e}")
return False
# Concatenate all split files
concatenated_content = ""
for i in range(1, 4):
filename = f"split_{i:02d}.txt"
file_path = split_dir / filename
try:
content = file_path.read_text()
concatenated_content += content
except Exception as e:
print(f"❌ Error reading {filename}: {e}")
return False
# Compare content
if concatenated_content != original_content:
print("❌ Concatenated content does not match original file")
print(f" Original length: {len(original_content)}")
print(f" Concatenated length: {len(concatenated_content)}")
return False
print("✅ Concatenated content matches original file exactly")
return True
def verify_no_extra_files(test_dir: Path) -> bool:
"""Verify that no extra files exist in the split directory."""
split_dir = test_dir / "split"
expected_files = {f"split_{i:02d}.txt" for i in range(1, 4)}
actual_files = {f.name for f in split_dir.iterdir() if f.is_file()}
extra_files = actual_files - expected_files
if extra_files:
print(f"❌ Extra files found in split directory: {extra_files}")
return False
print("✅ No extra files in split directory")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying File Splitting Task...")
# Define verification steps
verification_steps = [
("Split Directory Exists", verify_split_directory_exists),
("All Split Files Exist", verify_all_split_files_exist),
("Similar File Lengths", verify_similar_file_lengths),
("Content Integrity", verify_content_integrity),
("No Extra Files", verify_no_extra_files),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ File splitting task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
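# Invocation sketch (assumed; the MCPMark harness normally sets FILESYSTEM_TEST_DIR
# itself before running the verifier):
#   FILESYSTEM_TEST_DIR=/path/to/prepared/file_context python verify.py
# The script exits with status 0 on PASS and 1 on FAIL, which is how the result is
# reported back to the caller.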
================================================
FILE: tasks/filesystem/easy/file_context/pattern_matching/description.md
================================================
# File Filtering Task: Find Files with Common Substring
## 📋 Task Description
Your task is to find all files that contain a substring of 30 or more characters that also appears in `large_file.txt`. **You are not allowed to use Python code.**
## 🎯 Task Objectives
1. **Read the reference file** `large_file.txt` to understand its content
2. **Examine each file** from file_01.txt to file_20.txt
3. **Find files** that contain a substring of 30 or more characters that matches a substring in `large_file.txt`
4. **Create a file `answer.txt`** and write the results to it with the following format:
- One line per matching file
- Format: `filename.txt`
- Do not add anything else besides the filename itself
================================================
FILE: tasks/filesystem/easy/file_context/pattern_matching/meta.json
================================================
{
"task_id": "pattern_matching",
"task_name": "Pattern Matching",
"category_id": "file_context",
"category_name": "File Context",
"description": "Scan file_01.txt through file_20.txt for any 30+ character substring that also appears in large_file.txt and list each matching filename in answer.txt.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"pattern analysis",
"search and filtering"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n \u251c\u2500\u2500 file_01.txt\n \u251c\u2500\u2500 file_02.txt\n \u251c\u2500\u2500 file_03.txt\n \u251c\u2500\u2500 file_04.txt\n \u251c\u2500\u2500 file_05.txt\n \u251c\u2500\u2500 file_06.txt\n \u251c\u2500\u2500 file_07.txt\n \u251c\u2500\u2500 file_08.txt\n \u251c\u2500\u2500 file_09.txt\n \u251c\u2500\u2500 file_10.txt\n \u251c\u2500\u2500 file_11.txt\n \u251c\u2500\u2500 file_12.txt\n \u251c\u2500\u2500 file_13.txt\n \u251c\u2500\u2500 file_14.txt\n \u251c\u2500\u2500 file_15.txt\n \u251c\u2500\u2500 file_16.txt\n \u251c\u2500\u2500 file_17.txt\n \u251c\u2500\u2500 file_18.txt\n \u251c\u2500\u2500 file_19.txt\n \u251c\u2500\u2500 file_20.txt\n \u2514\u2500\u2500 large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/file_context/pattern_matching/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Filtering Task: Find Files with Common Substring
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists."""
answer_file = test_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found")
return False
print("✅ Answer file found")
return True
def verify_answer_format(test_dir: Path) -> bool:
"""Verify that the answer file has the correct format."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# If file is empty, that's acceptable (no matches found)
if not content:
print("✅ Answer file is empty (no matches found)")
return True
lines = content.split('\n')
for i, line in enumerate(lines, 1):
line = line.strip()
if not line:
continue
# Check format: just filename.txt
if not line.endswith('.txt') or not line.startswith('file_'):
print(f"❌ Line {i} has incorrect format: {line}")
print(" Expected format: filename.txt")
return False
print("✅ Answer format is correct")
return True
except Exception as e:
print(f"❌ Error reading answer file: {e}")
return False
def find_30_plus_char_matches(test_dir: Path) -> set:
"""Find all files that have 30+ character substring matches with large_file.txt."""
large_file = test_dir / "large_file.txt"
if not large_file.exists():
print("❌ large_file.txt not found")
return set()
large_content = large_file.read_text()
matching_files = set()
# Check each file from file_01.txt to file_20.txt
for i in range(1, 21):
filename = f"file_{i:02d}.txt"
file_path = test_dir / filename
if not file_path.exists():
continue
file_content = file_path.read_text()
# Check if there's a substring of 30+ characters that matches
has_match = False
for start_pos in range(len(file_content)):
for end_pos in range(start_pos + 30, len(file_content) + 1):
substring = file_content[start_pos:end_pos]
if substring in large_content:
has_match = True
break
if has_match:
break
if has_match:
matching_files.add(filename)
return matching_files
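# Note: the scan above is brute force: it tries every (start, end) window of length
# >= 30 and tests it with a substring lookup in large_file.txt. Checking only the
# 30-character window at each start position would be sufficient (any match of 30 or
# more characters necessarily contains a 30-character match), but the exhaustive
# version should still be fast enough for these small fixture files.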
def verify_matches_are_correct(test_dir: Path) -> bool:
"""Verify that the files listed in answer.txt actually have 30+ character matches."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# If no content, check if there should actually be no matches
if not content:
expected_matches = find_30_plus_char_matches(test_dir)
if expected_matches:
print("❌ Answer file is empty but matches should exist")
for filename in expected_matches:
print(f" Expected: {filename}")
return False
else:
print("✅ No matches found (correct)")
return True
# Parse answer file
answer_files = set()
lines = content.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
answer_files.add(line)
# Get expected matches
expected_matches = find_30_plus_char_matches(test_dir)
# Check if all answer files actually have matches
for filename in answer_files:
if filename not in expected_matches:
print(f"❌ File {filename} listed in answer but has no valid 30+ character match")
return False
# Check if all expected matches are in answer
for filename in expected_matches:
if filename not in answer_files:
print(f"❌ Missing match for {filename} in answer file")
return False
print("✅ All matches are correct")
return True
except Exception as e:
print(f"❌ Error verifying matches: {e}")
return False
def verify_files_exist(test_dir: Path) -> bool:
"""Verify that all files mentioned in answer.txt actually exist."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
if not content:
return True # No files to verify
lines = content.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
file_path = test_dir / line
if not file_path.exists():
print(f"❌ File mentioned in answer does not exist: {line}")
return False
print("✅ All files mentioned in answer exist")
return True
except Exception as e:
print(f"❌ Error verifying file existence: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying File Filtering Task: Find Files with Common Substring...")
# Define verification steps
verification_steps = [
("Answer File Exists", verify_answer_file_exists),
("Answer Format", verify_answer_format),
("Files Exist", verify_files_exist),
("Matches are Correct", verify_matches_are_correct),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ File filtering task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/file_context/uppercase/description.md
================================================
# File Context Task: Convert Files to Uppercase
## 📋 Task Description
You need to process 5 text files (file_01.txt to file_05.txt) and convert their content to uppercase.
## 🎯 Task Objectives
1. **Create an uppercase directory** in the test environment root
2. **Convert each file** from file_01.txt to file_05.txt to uppercase
3. **Save converted files** in the uppercase/ directory with the same names
================================================
FILE: tasks/filesystem/easy/file_context/uppercase/meta.json
================================================
{
"task_id": "uppercase",
"task_name": "Uppercase",
"category_id": "file_context",
"category_name": "File Context",
"description": "Copy file_01.txt-file_05.txt into an uppercase/ folder and convert the contents of every file to uppercase text.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content transformation",
"batch processing"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n \u251c\u2500\u2500 file_01.txt\n \u251c\u2500\u2500 file_02.txt\n \u251c\u2500\u2500 file_03.txt\n \u251c\u2500\u2500 file_04.txt\n \u251c\u2500\u2500 file_05.txt\n \u251c\u2500\u2500 file_06.txt\n \u251c\u2500\u2500 file_07.txt\n \u251c\u2500\u2500 file_08.txt\n \u251c\u2500\u2500 file_09.txt\n \u251c\u2500\u2500 file_10.txt\n \u251c\u2500\u2500 file_11.txt\n \u251c\u2500\u2500 file_12.txt\n \u251c\u2500\u2500 file_13.txt\n \u251c\u2500\u2500 file_14.txt\n \u251c\u2500\u2500 file_15.txt\n \u251c\u2500\u2500 file_16.txt\n \u251c\u2500\u2500 file_17.txt\n \u251c\u2500\u2500 file_18.txt\n \u251c\u2500\u2500 file_19.txt\n \u251c\u2500\u2500 file_20.txt\n \u2514\u2500\u2500 large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/file_context/uppercase/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Context Task: Convert Files to Uppercase
"""
import sys
from pathlib import Path
import os
import re
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_uppercase_directory_exists(test_dir: Path) -> bool:
"""Verify that the uppercase directory exists."""
uppercase_dir = test_dir / "uppercase"
if not uppercase_dir.exists():
print("❌ Directory 'uppercase' not found")
return False
if not uppercase_dir.is_dir():
print("❌ 'uppercase' exists but is not a directory")
return False
print("✅ Uppercase directory found")
return True
def verify_uppercase_files_exist(test_dir: Path) -> bool:
"""Verify that all 5 uppercase files exist."""
uppercase_dir = test_dir / "uppercase"
for i in range(1, 6):
filename = f"file_{i:02d}.txt"
file_path = uppercase_dir / filename
if not file_path.exists():
print(f"❌ File '{filename}' not found in uppercase directory")
return False
print("✅ All 5 uppercase files found")
return True
def verify_uppercase_content(test_dir: Path) -> bool:
"""Verify that uppercase files contain the correct uppercase content."""
uppercase_dir = test_dir / "uppercase"
for i in range(1, 6):
filename = f"file_{i:02d}.txt"
original_file = test_dir / filename
uppercase_file = uppercase_dir / filename
if not original_file.exists():
print(f"❌ Original file '{filename}' not found")
return False
try:
original_content = original_file.read_text()
uppercase_content = uppercase_file.read_text()
# Check if uppercase content is the uppercase version of original
expected_uppercase = original_content.upper()
if uppercase_content != expected_uppercase:
print(f"❌ File '{filename}' content is not properly converted to uppercase")
return False
except Exception as e:
print(f"❌ Error reading file '{filename}': {e}")
return False
print("✅ All uppercase files contain correct uppercase content")
return True
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists in the uppercase directory."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found in uppercase directory")
return False
print("✅ Answer file found in uppercase directory")
return True
def verify_answer_format(test_dir: Path) -> bool:
"""Verify that the answer file has the correct format."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
if not content:
print("❌ Answer file is empty")
return False
lines = content.split('\n')
# Check if we have exactly 10 lines
if len(lines) != 10:
print(f"❌ Answer file has {len(lines)} lines, expected 10")
return False
for i, line in enumerate(lines, 1):
line = line.strip()
if not line:
print(f"❌ Line {i} is empty")
return False
# Check format: filename:word_count
if ':' not in line:
print(f"❌ Line {i} has incorrect format: {line}")
print(" Expected format: filename:word_count")
return False
parts = line.split(':', 1)
if len(parts) != 2:
print(f"❌ Line {i} has incorrect format: {line}")
print(" Expected format: filename:word_count")
return False
filename, word_count_str = parts
# Check filename format
if not filename.endswith('.txt') or not filename.startswith('file_'):
print(f"❌ Line {i} has invalid filename: {filename}")
return False
# Check word count format (should be integer)
try:
word_count = int(word_count_str)
if word_count <= 0:
print(f"❌ Line {i} has invalid word count: {word_count_str}")
return False
except ValueError:
print(f"❌ Line {i} has non-integer word count: {word_count_str}")
return False
print("✅ Answer format is correct")
return True
except Exception as e:
print(f"❌ Error reading answer file: {e}")
return False
def count_words_in_file(file_path: Path) -> int:
"""Count words in a file."""
try:
content = file_path.read_text()
# Split by whitespace and filter out empty strings
words = [word for word in content.split() if word.strip()]
return len(words)
except Exception as e:
print(f"❌ Error reading file {file_path}: {e}")
return 0
def verify_word_counts_are_correct(test_dir: Path) -> bool:
"""Verify that the word counts in answer.txt are correct."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
lines = content.split('\n')
# Expected word counts based on answer.md
expected_counts = [22, 22, 22, 22, 18, 22, 22, 22, 18, 20]
# Create a set of expected file entries for easier checking
expected_entries = set()
for i in range(1, 11):
filename = f"file_{i:02d}.txt"
expected_count = expected_counts[i - 1]
if i == 6: # Special case for file_06.txt: can be 21 or 22
expected_entries.add(f"{filename}:21")
expected_entries.add(f"{filename}:22")
else:
expected_entries.add(f"{filename}:{expected_count}")
# Check each line in the answer file
found_entries = set()
for line in lines:
line = line.strip()
if line in expected_entries:
found_entries.add(line)
else:
print(f"❌ Invalid entry: {line}")
return False
# Check if we found all expected entries
if len(found_entries) != 10:
print(f"❌ Found {len(found_entries)} entries, expected 10")
missing = expected_entries - found_entries
if missing:
print(f" Missing entries: {missing}")
return False
print("✅ All word counts are correct")
return True
except Exception as e:
print(f"❌ Error verifying word counts: {e}")
return False
def verify_all_files_are_included(test_dir: Path) -> bool:
"""Verify that all 10 files are included in the answer."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
lines = content.split('\n')
# Check that all 10 files are present
found_files = set()
for line in lines:
parts = line.split(':', 1)
filename = parts[0]
found_files.add(filename)
expected_files = {f"file_{i:02d}.txt" for i in range(1, 11)}
if found_files != expected_files:
missing = expected_files - found_files
extra = found_files - expected_files
if missing:
print(f"❌ Missing files in answer: {missing}")
if extra:
print(f"❌ Extra files in answer: {extra}")
return False
print("✅ All 10 files are included in answer")
return True
except Exception as e:
print(f"❌ Error verifying file inclusion: {e}")
return False
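# Note: only the three uppercase checks below are wired into main(); the answer.txt
# helpers above (verify_answer_file_exists through verify_all_files_are_included)
# are defined but not invoked by this verifier.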
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying Uppercase in: {test_dir}")
print()
# Run all verification checks
checks = [
("Uppercase directory exists", verify_uppercase_directory_exists),
("Uppercase files exist", verify_uppercase_files_exist),
("Uppercase content is correct", verify_uppercase_content),
]
all_passed = True
for check_name, check_func in checks:
print(f"📋 {check_name}...")
if not check_func(test_dir):
all_passed = False
print()
if all_passed:
print("🎉 All verification checks passed!")
sys.exit(0)
else:
print("❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/file_property/largest_rename/description.md
================================================
# Largest File Rename Task
## 📋 Task Description
Rename the largest `.jpg` file in the test directory to `largest.jpg` based on file size.
## 🎯 Task Objectives
1. **Find all `.jpg` files** in the test directory
2. **Determine which `.jpg` file is the largest** by file size
3. **Rename the largest `.jpg` file to `largest.jpg`**
================================================
FILE: tasks/filesystem/easy/file_property/largest_rename/meta.json
================================================
{
"task_id": "largest_rename",
"task_name": "Largest File Rename",
"category_id": "file_property",
"category_name": "File Property",
"description": "Identify the largest .jpg in the workspace and rename it to largest.jpg while leaving the other files untouched.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"file organization",
"attribute inspection"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_property/\n \u251c\u2500\u2500 bear.jpg\n \u251c\u2500\u2500 bridge.jpg\n \u251c\u2500\u2500 bus.MOV\n \u251c\u2500\u2500 random_file_1.txt\n \u251c\u2500\u2500 random_file_2.txt\n \u251c\u2500\u2500 random_file_3.txt\n \u251c\u2500\u2500 road.MOV\n \u2514\u2500\u2500 sg.jpg",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_property.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/file_property/largest_rename/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Largest File Rename Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_sg_jpg_not_exists(test_dir: Path) -> bool:
"""Verify that sg.jpg does not exist."""
sg_file = test_dir / "sg.jpg"
if sg_file.exists():
print("❌ sg.jpg still exists (should be renamed)")
return False
print("✅ sg.jpg does not exist")
return True
def verify_largest_jpg_exists(test_dir: Path) -> bool:
"""Verify that largest.jpg exists."""
largest_file = test_dir / "largest.jpg"
if not largest_file.exists():
print("❌ largest.jpg does not exist")
return False
print("✅ largest.jpg exists")
return True
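# Note: in this fixture the largest .jpg is sg.jpg, so the verifier only needs to
# confirm that sg.jpg no longer exists and that largest.jpg does.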
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying largest file rename in: {test_dir}")
# Run all verification checks
checks = [
("sg.jpg does not exist", verify_sg_jpg_not_exists),
("largest.jpg exists", verify_largest_jpg_exists)
]
all_passed = True
for check_name, check_func in checks:
print(f"\n📋 Checking: {check_name}")
if not check_func(test_dir):
all_passed = False
if all_passed:
print("\n🎉 All verification checks passed!")
sys.exit(0)
else:
print("\n❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/file_property/txt_merging/description.md
================================================
# Text File Merging Task
## 📋 Task Description
Merge all `.txt` files in the test directory into a single file called `merge.txt`. The merged file should contain the content from all `.txt` files.
## 🎯 Task Objectives
1. **Read all `.txt` files** in the test directory
2. **Create a new file** called `merge.txt` in the test directory
3. **Write the content** from all `.txt` files into `merge.txt`
4. **The order** of content doesn't matter, as long as all content from every `.txt` file is present in `merge.txt`
================================================
FILE: tasks/filesystem/easy/file_property/txt_merging/meta.json
================================================
{
"task_id": "txt_merging",
"task_name": "Text File Merging",
"category_id": "file_property",
"category_name": "File Property",
"description": "Combine the contents of every .txt file into a single merge.txt file so the archive has one consolidated view.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content consolidation",
"file automation"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_property/\n \u251c\u2500\u2500 bear.jpg\n \u251c\u2500\u2500 bridge.jpg\n \u251c\u2500\u2500 bus.MOV\n \u251c\u2500\u2500 random_file_1.txt\n \u251c\u2500\u2500 random_file_2.txt\n \u251c\u2500\u2500 random_file_3.txt\n \u251c\u2500\u2500 road.MOV\n \u2514\u2500\u2500 sg.jpg",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_property.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/file_property/txt_merging/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Text File Merging Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def get_expected_contents():
"""Return the expected content from each .txt file."""
return [
"O rErmZ4tDgzMNoxn1oNfQhT1TRpy9w0tQPGTcrsaoMFrrgt9bY5mgBxO6q8c8lZywXxEEBWW4i6Jh9NbAtYtRKvkzB4bshGIMzn2G1 rDTpKJj",
"DmRrDFFaIl1mPubzSJJaN4aMeZyBHqVxZe5tpztHQ9zSe6b69Hnl7coqeNJXHXU2EnaDnyhYxZSWHPn3IWLsLGWrx7py8d37Z8blMnh7VDUH7hAMamhLRO8lfUVV1roM8a0njnW9evXRq5AoNTt8Tv7kQ5LmLe6Z66MZwtjckRAXmOB4x3AYbbxLULYZAxitW1KNG1yTaDOYZQhtKdZkX1XqytzBl9dRXI4gk91ZlVHLOiujwUa89EVsdjayKeCc21gCJMXvbhDSOGAs6dXZEHuaHQnnBdM19X3TwPgfDONyhlc pjwoQ45D56UQVWxwNIJUTgwS1vctYOx4XFpMgf3PRQ7zZdfhIuPBFdQwnQvYUeQbWa5gnyMO9FVSU0vm9uccbJQvkcEAJzMkEh9i7z6EEixtbwVedlTGWL2XBwjenRdf2qsOgvJo8Dyuvf35ieCFMG7wR7200rs GJZ5bRdx4R2gGOWVMi3MOBrqcw3KhbcpJtdQoKMALEjBMrY7VYKtAZNI6LoXX OOTJZ3x3usHRJY0gMtKhh6OJ 37aknvBwNYJ0IRWYWaeJ8LBwJyO6ZV3ZJ0palISQvGaHEZ0olHnK2iNCTxqxvF8J7EdIdIPYssl5f0XgPl6",
"aFCzXJbJq02zlCKnyarJnPUiwVIuUrQci3fZvGD53F5fUsKDUlEwO5 ANJ2VgBnJ5cuBJzjILcM9AxTvyNZ5NPIHjSCo5O20K"
]
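# The three strings above are the expected contents of the fixture's .txt files
# (random_file_1.txt through random_file_3.txt per the task's stateContent); the check
# below only requires each string to appear somewhere in merge.txt, in any order.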
def verify_merge_file_exists(test_dir: Path) -> bool:
"""Verify that merge.txt exists in the test directory."""
merge_file = test_dir / "merge.txt"
if not merge_file.exists():
print("❌ merge.txt not found")
return False
if not merge_file.is_file():
print("❌ merge.txt exists but is not a file")
return False
print("✅ merge.txt exists")
return True
def verify_merge_file_contents(test_dir: Path) -> bool:
"""Verify that merge.txt contains all expected content strings."""
merge_file = test_dir / "merge.txt"
expected_contents = get_expected_contents()
try:
with open(merge_file, 'r', encoding='utf-8') as f:
merge_content = f.read()
except Exception as e:
print(f"❌ Failed to read merge.txt: {e}")
return False
# Check that each expected content string is present in the merged file
missing_contents = []
for content in expected_contents:
if content not in merge_content:
missing_contents.append(content[:50] + "..." if len(content) > 50 else content)
if missing_contents:
print(f"❌ Missing content in merge.txt:")
for content in missing_contents:
print(f" - {content}")
return False
print("✅ merge.txt contains all expected content")
return True
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying text file merging in: {test_dir}")
# Run all verification checks
checks = [
("Merge file existence", verify_merge_file_exists),
("Merge file contents", verify_merge_file_contents)
]
all_passed = True
for check_name, check_func in checks:
print(f"\n📋 Checking: {check_name}")
if not check_func(test_dir):
all_passed = False
if all_passed:
print("\n🎉 All verification checks passed!")
sys.exit(0)
else:
print("\n❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/folder_structure/structure_analysis/description.md
================================================
# Directory Structure Analysis Task
You need to recursively traverse the folder structure under the main directory and count the total number of `.py` files (including those in all subdirectories).
Write the answer (just a single number) in a file named `structure_analysis.txt` in the main directory (at the same level as the `complex_structure` folder).
You must not change or delete any existing files.
Do not use Python code.
================================================
FILE: tasks/filesystem/easy/folder_structure/structure_analysis/meta.json
================================================
{
"task_id": "structure_analysis",
"task_name": "Structure Analysis",
"category_id": "folder_structure",
"category_name": "Folder Structure",
"description": "Recursively inspect the complex_structure tree, count all .py files, and save the total as the only line of structure_analysis.txt.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"filesystem traversal"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "folder_structure/\n \u2514\u2500\u2500 complex_structure/\n \u251c\u2500\u2500 deeply/\n \u2502 \u2514\u2500\u2500 nested/\n \u2502 \u2514\u2500\u2500 folder/\n \u2502 \u2514\u2500\u2500 structure/\n \u251c\u2500\u2500 empty_folder/\n \u251c\u2500\u2500 folder_lxkHt_0_1/\n \u2502 \u2514\u2500\u2500 file_PeLzC_0.txt\n \u251c\u2500\u2500 folder_QdTAj_0_2/\n \u2502 \u251c\u2500\u2500 folder_eXccj_1_0/\n \u2502 \u2502 \u251c\u2500\u2500 folder_Mqlwh_2_1/\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_cKxcP_3_3/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_BPTMK_4_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_RHtBP_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_QNqjq_4_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_gRwPE_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_jVlpp_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_vJuHz_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_XdXYJ_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_KvkKi_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_gGxLG_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_Hzkxo_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_XRjeh_1.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_vIBIt_4_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_kRDNS_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_wFSjJ_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_NyBSO_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_EOCNf_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_gmrXA_0.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_NcruA_3_1/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_bLWDj_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_WAftR_0.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_qCDFI_3_2/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_eSMOJ_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_oxADy_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_RTbbc_1.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_QVHUU_3_0/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_FEPTK_4_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_GHoMC_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_rAMYd_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_iBDUY_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_IJCaw_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_VRXgp_5_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_hkUmS_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_nqLAf_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_XflmA_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_FlPoK_4_3/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_hSVNm_5_3/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_klnbn_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_iZuEl_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_LqAmy_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_LcURj_5_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_RgwOS_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 
file_ZHnYb_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_tuZQJ_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_LHuIx_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_asJnB_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_EzLdu_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_ndhsJ_4_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_CUSXK_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_DpiuM_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_pSqeG_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_pstmE_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_YwdJt_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_StlsP_5_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_kriBJ_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_XCEdm_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_ToDjh_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_xbIVx_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_PJBok_4_4/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_mzxaf_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_ILBzj_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_MTGMm_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_zBDqz_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_sULMj_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_BHziw_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_sIjiu_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_VqNkB_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_vypSi_5_3/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_kZbIm_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_sOBtE_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_ZLGHy_5_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_azaFF_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_nAFRe_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_mIkQU_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_sGPxd_1.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_VTbEG_4_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_HtYLg_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_JXjMd_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_tPccB_2.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_BuOSw_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_TpoqE_0.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_wTvun_3_4/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_GyhyE_1.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_POsla_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_tSsvk_0.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_irNju_0.txt\n \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_jYBRm_1.txt\n \u2502 \u2502 \u251c\u2500\u2500 folder_YlJLI_2_0/\n \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_FpFSL_0.txt\n \u2502 \u2502 \u251c\u2500\u2500 file_cFgBr_2.txt\n \u2502 \u2502 \u251c\u2500\u2500 file_lKEWN_1.txt\n \u2502 \u2502 \u2514\u2500\u2500 file_ZEWFP_0.txt\n \u2502 \u2514\u2500\u2500 
file_ayUCH_0.txt\n \u251c\u2500\u2500 folder_xtgyi_0_0/\n \u2502 \u2514\u2500\u2500 file_BvSOB_0.txt\n \u251c\u2500\u2500 mixed_content/\n \u2502 \u2514\u2500\u2500 images_and_text/\n \u2502 \u2514\u2500\u2500 notes.txt\n \u251c\u2500\u2500 project/\n \u2502 \u251c\u2500\u2500 docs/\n \u2502 \u2502 \u2514\u2500\u2500 archive/\n \u2502 \u2502 \u2514\u2500\u2500 2023/\n \u2502 \u2502 \u2514\u2500\u2500 reports/\n \u2502 \u2502 \u251c\u2500\u2500 report_0.txt\n \u2502 \u2502 \u251c\u2500\u2500 report_1.txt\n \u2502 \u2502 \u2514\u2500\u2500 report_2.txt\n \u2502 \u2514\u2500\u2500 src/\n \u2502 \u2514\u2500\u2500 main/\n \u2502 \u2514\u2500\u2500 resources/\n \u2514\u2500\u2500 m.py",
"stateUrl": "https://storage.mcpmark.ai/filesystem/folder_structure.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/folder_structure/structure_analysis/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Directory Structure Analysis Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_structure_analysis_file_exists(test_dir: Path) -> bool:
"""Verify that the structure_analysis.txt file exists."""
analysis_file = test_dir / "structure_analysis.txt"
if not analysis_file.exists():
print("❌ File 'structure_analysis.txt' not found")
return False
print("✅ structure_analysis.txt file found")
return True
def verify_structure_analysis_content(test_dir: Path) -> bool:
"""Verify that the structure_analysis.txt file contains the correct count."""
analysis_file = test_dir / "structure_analysis.txt"
try:
content = analysis_file.read_text().strip()
if not content:
print("❌ structure_analysis.txt file is empty")
return False
# The expected answer is 1
expected_count = 1
# Check if content is exactly "1"
if content != str(expected_count):
print(f"❌ Expected '{expected_count}', but found: '{content}'")
return False
print(f"✅ Python file count is correct: {content}")
return True
except Exception as e:
print(f"❌ Error reading structure_analysis.txt file: {e}")
return False
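# Rationale: the prepared fixture (see "stateContent" in meta.json) appears to contain
# exactly one Python file, m.py, which is why the expected content of
# structure_analysis.txt is "1".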
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying Directory Structure Analysis Task in: {test_dir}")
print()
# Define verification steps
verification_steps = [
("Structure Analysis File Exists", verify_structure_analysis_file_exists),
("Python File Count is Correct", verify_structure_analysis_content),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"📋 {step_name}...")
if not verify_func(test_dir):
all_passed = False
print()
# Final result
if all_passed:
print("🎉 All verification checks passed!")
sys.exit(0)
else:
print("❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/legal_document/file_reorganize/description.md
================================================
# Legal Document File Reorganization Task
**Overview**
The folder "legal_files/" contains multiple versions of the Stock Purchase Agreement (Preferred_Stock_Purchase_Agreement_v0.txt through Preferred_Stock_Purchase_Agreement_v10.txt).
## Task
Your task is to:
1. Identify the final version of the document among the different versions
2. Create a folder named `final_version` inside the `legal_files/` directory
3. Create an **empty file** with the same name as the final version in the newly created `final_version/` folder
4. Keep the original file in its original location
Note: Due to the large file size, you only need to create an empty file (not copy the content). The filename should remain unchanged in the `final_version/` folder.
================================================
FILE: tasks/filesystem/easy/legal_document/file_reorganize/meta.json
================================================
{
"task_id": "file_reorganize",
"task_name": "File Reorganize",
"category_id": "legal_document",
"category_name": "Legal Document",
"description": "Determine the final Stock Purchase Agreement version and create an empty copy of that filename inside legal_files/final_version/.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"file organization",
"version management"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "legal_document/\n \u2514\u2500\u2500 legal_files/\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v0.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v1.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v2.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v3.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v4.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v5.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v6.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v7.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v8.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v9.txt\n \u2514\u2500\u2500 Preferred_Stock_Purchase_Agreement_v10.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/legal_document.zip",
"stateOriginalUrl": "https://www.cooleygo.com/documents/nvca-financing-documents"
}
}
================================================
FILE: tasks/filesystem/easy/legal_document/file_reorganize/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Legal Document File Reorganization Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_final_version_folder_exists(test_dir: Path) -> bool:
"""Verify that the final_version folder exists in legal_files."""
final_version_dir = test_dir / "legal_files" / "final_version"
if not final_version_dir.exists():
print("❌ Folder 'legal_files/final_version' not found")
return False
if not final_version_dir.is_dir():
print("❌ 'legal_files/final_version' exists but is not a directory")
return False
print("✅ Folder 'legal_files/final_version' found")
return True
def verify_target_file_exists(test_dir: Path) -> bool:
"""Verify that Preferred_Stock_Purchase_Agreement_v10.txt exists in final_version folder."""
target_file = test_dir / "legal_files" / "final_version" / "Preferred_Stock_Purchase_Agreement_v10.txt"
if not target_file.exists():
print("❌ File 'legal_files/final_version/Preferred_Stock_Purchase_Agreement_v10.txt' not found")
return False
if not target_file.is_file():
print("❌ 'Preferred_Stock_Purchase_Agreement_v10.txt' exists but is not a file")
return False
print("✅ Target file 'Preferred_Stock_Purchase_Agreement_v10.txt' found in final_version folder")
return True
def verify_original_file_preserved(test_dir: Path) -> bool:
"""Verify that the original v10 file is still in place."""
original_file = test_dir / "legal_files" / "Preferred_Stock_Purchase_Agreement_v10.txt"
if not original_file.exists():
print("❌ Original file 'Preferred_Stock_Purchase_Agreement_v10.txt' was removed")
return False
print("✅ Original file 'Preferred_Stock_Purchase_Agreement_v10.txt' preserved")
return True
def verify_only_v10_in_final_version(test_dir: Path) -> bool:
"""Verify that final_version folder contains only v10 file."""
final_version_dir = test_dir / "legal_files" / "final_version"
# Get all files in final_version folder
files = list(final_version_dir.iterdir())
# Filter out directories, keep only files
files_only = [f for f in files if f.is_file()]
if len(files_only) != 1:
print(f"❌ final_version folder should contain exactly 1 file, but found {len(files_only)}")
for f in files_only:
print(f" - {f.name}")
return False
# Check if the only file is v10
if files_only[0].name != "Preferred_Stock_Purchase_Agreement_v10.txt":
print(f"❌ final_version folder contains wrong file: {files_only[0].name}")
print(" Expected: Preferred_Stock_Purchase_Agreement_v10.txt")
return False
print("✅ final_version folder contains only Preferred_Stock_Purchase_Agreement_v10.txt")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Legal Document File Reorganization Task...")
# Define verification steps
verification_steps = [
("Final Version Folder Exists", verify_final_version_folder_exists),
("Target File Exists", verify_target_file_exists),
("Only V10 in Final Version", verify_only_v10_in_final_version),
("Original File Preserved", verify_original_file_preserved),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Legal document file reorganization completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/papers/papers_counting/description.md
================================================
# File Context Task: Count HTML Files
## 📋 Task Description
You need to count the number of HTML files in the given directory and write the count to a file.
## 🎯 Task Objectives
1. **Count HTML files** in the given directory
2. **Create a file** named `count.txt` in the same directory
3. **Write the count** (just the number) to `count.txt`
## 📝 Expected Output
- File `count.txt` containing only the number of HTML files found
================================================
FILE: tasks/filesystem/easy/papers/papers_counting/meta.json
================================================
{
"task_id": "papers_counting",
"task_name": "Papers Counting",
"category_id": "papers",
"category_name": "Papers",
"description": "Count how many .html papers live in the directory and write just that number into count.txt.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"reporting"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "papers/\n \u251c\u2500\u2500 1707.06347.html\n \u251c\u2500\u2500 2105.04165.html\n \u251c\u2500\u2500 2201.11903.html\n \u251c\u2500\u2500 2303.08774.html\n \u251c\u2500\u2500 2306.08640.html\n \u251c\u2500\u2500 2310.02255.html\n \u251c\u2500\u2500 2310.08446.html\n \u251c\u2500\u2500 2312.00849.html\n \u251c\u2500\u2500 2312.07533.html\n \u251c\u2500\u2500 2312.11805.html\n \u251c\u2500\u2500 2402.00253.html\n \u251c\u2500\u2500 2402.03300.html\n \u251c\u2500\u2500 2403.05530.html\n \u251c\u2500\u2500 2404.13046.html\n \u251c\u2500\u2500 2404.14367.html\n \u251c\u2500\u2500 2404.14396.html\n \u251c\u2500\u2500 2405.09818.html\n \u251c\u2500\u2500 2405.13911.html\n \u251c\u2500\u2500 2405.16473.html\n \u251c\u2500\u2500 2405.16640.html\n \u251c\u2500\u2500 2406.08478.html\n \u251c\u2500\u2500 2406.16852.html\n \u251c\u2500\u2500 2406.17294.html\n \u251c\u2500\u2500 2407.01284.html\n \u251c\u2500\u2500 2407.01509.html\n \u251c\u2500\u2500 2407.21783.html\n \u251c\u2500\u2500 2408.03326.html\n \u251c\u2500\u2500 2408.12528.html\n \u251c\u2500\u2500 2409.19256.html\n \u251c\u2500\u2500 2410.05993.html\n \u251c\u2500\u2500 2410.06166.html\n \u251c\u2500\u2500 2410.10563.html\n \u251c\u2500\u2500 2410.13848.html\n \u251c\u2500\u2500 2410.17885.html\n \u251c\u2500\u2500 2410.21276.html\n \u251c\u2500\u2500 2411.07975.html\n \u251c\u2500\u2500 2411.10442.html\n \u251c\u2500\u2500 2411.11930.html\n \u251c\u2500\u2500 2411.14432.html\n \u251c\u2500\u2500 2412.05271.html\n \u251c\u2500\u2500 2412.08443.html\n \u251c\u2500\u2500 2412.10302.html\n \u251c\u2500\u2500 2412.15115.html\n \u251c\u2500\u2500 2412.16720.html\n \u251c\u2500\u2500 2412.17256.html\n \u251c\u2500\u2500 2412.18319.html\n \u251c\u2500\u2500 2412.20631.html\n \u251c\u2500\u2500 2501.04686.html\n \u251c\u2500\u2500 2501.06186.html\n \u251c\u2500\u2500 2501.12599.html\n \u251c\u2500\u2500 2501.12948.html\n \u251c\u2500\u2500 2501.17811.html\n \u251c\u2500\u2500 2502.01456.html\n \u251c\u2500\u2500 2502.09621.html\n \u251c\u2500\u2500 2502.10391.html\n \u251c\u2500\u2500 2502.13923.html\n \u251c\u2500\u2500 2503.01785.html\n \u251c\u2500\u2500 2503.06520.html\n \u251c\u2500\u2500 2503.06749.html\n \u251c\u2500\u2500 2503.07065.html\n \u251c\u2500\u2500 2503.07365.html\n \u251c\u2500\u2500 2503.07536.html\n \u251c\u2500\u2500 2503.10291.html\n \u251c\u2500\u2500 2503.10615.html\n \u251c\u2500\u2500 2503.12937.html\n \u251c\u2500\u2500 2503.13939.html\n \u251c\u2500\u2500 2503.14476.html\n \u251c\u2500\u2500 2503.17352.html\n \u251c\u2500\u2500 2503.18892.html\n \u251c\u2500\u2500 2503.19786.html\n \u251c\u2500\u2500 2503.20783.html\n \u251c\u2500\u2500 2503.21620.html\n \u251c\u2500\u2500 2503.21776.html\n \u251c\u2500\u2500 2503.22679.html\n \u251c\u2500\u2500 2504.02587.html\n \u251c\u2500\u2500 2504.05599.html\n \u251c\u2500\u2500 2504.07491.html\n \u251c\u2500\u2500 2504.07934.html\n \u251c\u2500\u2500 2504.07954.html\n \u251c\u2500\u2500 2504.11455.html\n \u251c\u2500\u2500 2504.14945.html\n \u251c\u2500\u2500 2504.16656.html\n \u251c\u2500\u2500 2505.00703.html\n \u2514\u2500\u2500 arxiv_2025.bib",
"stateUrl": "https://storage.mcpmark.ai/filesystem/papers.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/papers/papers_counting/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Paper Counting Task: Count HTML Files
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_count_file_exists(test_dir: Path) -> bool:
"""Verify that the count.txt file exists."""
count_file = test_dir / "count.txt"
if not count_file.exists():
print("❌ File 'count.txt' not found")
return False
print("✅ count.txt file found")
return True
def verify_count_content(test_dir: Path) -> bool:
"""Verify that count.txt contains the correct number (83)."""
count_file = test_dir / "count.txt"
try:
content = count_file.read_text().strip()
# Check if content is exactly "83"
if content == "83":
print("✅ count.txt contains the correct number: 83")
return True
else:
print(f"❌ count.txt contains '{content}' but expected '83'")
return False
except Exception as e:
print(f"❌ Error reading count.txt: {e}")
return False
def verify_actual_html_count(test_dir: Path) -> bool:
"""Verify that there are actually 83 HTML files in the directory."""
html_files = list(test_dir.glob("*.html"))
count = len(html_files)
if count == 83:
print(f"✅ Verified: There are exactly {count} HTML files in the directory")
return True
else:
print(f"⚠️ Found {count} HTML files in the directory (expected 83)")
return False
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying HTML file count in: {test_dir}")
# Define verification steps
verification_steps = [
("Count File Exists", verify_count_file_exists),
("Count Content", verify_count_content),
("Actual HTML Count", verify_actual_html_count),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ HTML file count is correct!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/student_database/duplicate_name/description.md
================================================
Please help me identify any duplicate name among the 150 students. Do not use Python code. You only need to find **any one** duplicate name. Then generate a `namesake.txt` file recording the result in the following format, with only three lines. Note: when recording the name, replace underscores with spaces.
name: xxx
count: xxx
ids: xxx, xxx, ...
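For illustration only, a hypothetical record (placeholder name and ids, not the actual answer) would look like:
name: Jane Doe
count: 2
ids: 20100001, 20100002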
================================================
FILE: tasks/filesystem/easy/student_database/duplicate_name/meta.json
================================================
{
"task_id": "duplicate_name",
"task_name": "Duplicate Name",
"category_id": "student_database",
"category_name": "Student Database",
"description": "Search the 150 student folders for any repeated full name and document the name, count, and ids in namesake.txt.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"pattern analysis",
"data validation"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "student_database/\n \u251c\u2500\u2500 20101250_Patricia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20101701_Isabella_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20102572_Michael_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104233_Robert_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104498_Sarah_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104653_Sophia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104675_Michael_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104846_Christopher_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20107487_Mia_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20108742_Sarah_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20109144_Emma_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20109803_Oliver_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20111634_Isabella_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20112439_Christopher_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20113368_William_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20113603_Robert_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20114397_Isabella_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20114869_Ethan_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115252_Mason_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115632_Elizabeth_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115753_Charlotte_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115924_Michael_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20116232_Olivia_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20119528_Thomas_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20122427_Karen_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n 
\u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20122977_Evelyn_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20123376_Joseph_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20125451_Barbara_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126203_Barbara_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126394_Olivia_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126471_Ethan_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20127423_John_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20128249_Oliver_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20128879_Christopher_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20129898_Jessica_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20131271_Olivia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20131518_Sophia_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132026_Isabella_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132370_James_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132669_Noah_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20133527_Mason_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20133697_Isabella_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20135821_Thomas_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20136681_Benjamin_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20136890_Benjamin_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20137514_Lucas_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139234_Harper_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139637_Noah_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139647_Patricia_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20141421_Linda_Gonzalez/\n \u2502 \u251c\u2500\u2500 
basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20142085_William_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20142383_Amelia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20143406_Susan_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20143830_James_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146035_Christopher_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146277_William_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146279_Christopher_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20147301_James_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20147789_James_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20148681_John_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20148778_Susan_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20149712_Jessica_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20151012_Harper_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153174_Benjamin_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153412_Charlotte_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153606_James_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153687_Richard_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20154518_John_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20154710_Benjamin_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156469_Jennifer_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156522_Jennifer_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156851_Noah_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20157943_Harper_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158266_Sophia_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 
20158294_Sophia_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158819_Sarah_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20159113_John_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20159695_James_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20161279_William_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20162253_Mason_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20162542_Mia_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20163356_Ava_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20164515_Patricia_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20164801_Noah_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20165511_Mary_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166436_Christopher_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166487_Barbara_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166564_Ava_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166998_Ava_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20168311_Lucas_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20168491_Karen_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20169515_Thomas_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171050_Christopher_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171406_Mary_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171613_Ethan_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20172106_Isabella_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173259_Michael_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173492_Richard_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173501_Mary_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n 
\u251c\u2500\u2500 20173517_Susan_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20174207_Richard_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20174369_Mary_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20175314_William_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20176169_Lucas_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20176947_Noah_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20177389_James_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20178687_Isabella_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20179461_William_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20179690_Linda_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20181056_Sarah_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20182020_Patricia_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20182390_Ethan_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20183149_David_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20183219_Charlotte_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20184489_Jessica_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20186154_Charlotte_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20186510_James_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187107_David_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187144_Mary_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187892_Christopher_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187921_Mary_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187967_Sarah_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20188937_James_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189123_Mary_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 
recommendation_letter.txt\n \u251c\u2500\u2500 20189192_Olivia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189268_Emma_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189854_William_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20191265_Joseph_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20192725_Robert_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194054_Michael_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194160_Benjamin_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194164_Sarah_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194525_John_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20195164_Jennifer_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20195982_David_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196776_William_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196896_Olivia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196961_Joseph_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196998_Ethan_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20198548_Evelyn_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199036_Benjamin_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199583_Mary_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199735_Mason_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199872_Sophia_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199980_James_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20201385_John_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20201800_John_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20202548_Robert_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20203855_Mia_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 
recommendation_letter.txt\n \u2514\u2500\u2500 20204611_Sarah_Wilson/\n \u251c\u2500\u2500 basic_info.txt\n \u2514\u2500\u2500 recommendation_letter.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/student_database/duplicate_name/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Student Database Task: Find Duplicate Names
Simplified version that only checks against expected results without folder validation
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_namesake_file_exists(test_dir: Path) -> bool:
"""Verify that the namesake.txt file exists."""
namesake_file = test_dir / "namesake.txt"
if not namesake_file.exists():
print("❌ File 'namesake.txt' not found")
return False
print("✅ Namesake file found")
return True
def parse_namesake_file(test_dir: Path) -> dict:
"""Parse the namesake.txt file and return structured data."""
namesake_file = test_dir / "namesake.txt"
try:
content = namesake_file.read_text()
lines = content.strip().split('\n')
namesakes = {}
current_line = 0
while current_line < len(lines):
# Skip blank lines
if not lines[current_line].strip():
current_line += 1
continue
# Check if we have enough lines for a complete group
if current_line + 2 >= len(lines):
print(f"❌ Incomplete group at line {current_line + 1}")
return {}
# Parse group
name_line = lines[current_line].strip()
count_line = lines[current_line + 1].strip()
ids_line = lines[current_line + 2].strip()
# Extract name
if not name_line.startswith("name: "):
print(f"❌ Invalid name line format at line {current_line + 1}: {name_line}")
return {}
name = name_line.replace("name: ", "").strip()
# Extract count
if not count_line.startswith("count: "):
print(f"❌ Invalid count line format at line {current_line + 2}: {count_line}")
return {}
count_str = count_line.replace("count: ", "").strip()
try:
count = int(count_str)
except ValueError:
print(f"❌ Invalid count format: {count_str}")
return {}
# Extract IDs
if not ids_line.startswith("ids: "):
print(f"❌ Invalid ids line format at line {current_line + 3}: {ids_line}")
return {}
ids_str = ids_line.replace("ids: ", "").strip()
ids = [id.strip() for id in ids_str.split(",")]
namesakes[name] = {
'count': count,
'ids': ids
}
current_line += 4 # Skip to next group (after blank line)
return namesakes
except Exception as e:
print(f"❌ Error parsing namesake file: {e}")
return {}
def verify_against_expected_results(namesakes: dict) -> bool:
"""Verify that exactly 1 duplicate name is found and it is correct."""
# Expected duplicate names from answer.md (hardcoded)
expected_duplicates = {
'Isabella Smith': ['20132026', '20133697'],
'Ava Lopez': ['20166564', '20166998'],
'James Moore': ['20159695', '20188937'],
'William Taylor': ['20175314', '20189854'],
'Ethan Wilson': ['20182390', '20196998'],
'Christopher Taylor': ['20128879', '20187892'],
'William Anderson': ['20142085', '20146277'],
'James Anderson': ['20147789', '20153606'],
'Olivia Jones': ['20189192', '20196896'],
'Mason Johnson': ['20115252', '20199735'],
'Benjamin Jackson': ['20153174', '20194160'],
'John Taylor': ['20194525', '20201385'],
'Susan Anderson': ['20148778', '20173517'],
'Christopher Moore': ['20112439', '20146279'],
'Sarah Wilson': ['20158819', '20204611'],
'Sarah Brown': ['20104498', '20108742']
}
# Check if exactly 1 duplicate name is found
if len(namesakes) != 1:
print(f"❌ Expected exactly 1 duplicate name, but found {len(namesakes)}")
return False
print(f"✅ Found exactly 1 duplicate name (as required)")
# Check if the namesake in the file is actually a correct duplicate
for name, data in namesakes.items():
if name not in expected_duplicates:
print(f"❌ '{name}' is not a duplicate name (not in expected list)")
return False
expected_ids = set(expected_duplicates[name])
stated_ids = set(data['ids'])
if expected_ids != stated_ids:
print(f"❌ ID mismatch for '{name}':")
print(f" Expected: {sorted(expected_ids)}")
print(f" Stated: {sorted(stated_ids)}")
return False
# Verify count matches
if data['count'] != 2:
print(f"❌ Count mismatch for '{name}': expected 2, got {data['count']}")
return False
print("✅ The identified duplicate name is correct")
print("✅ All student IDs match expected results")
print("✅ Count is correct (2 for the duplicate name)")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Student Database Task: Find Duplicate Names...")
# Check if namesake file exists
print("\n--- File Existence Check ---")
if not verify_namesake_file_exists(test_dir):
print("\n❌ Basic verification failed, cannot proceed with content verification")
sys.exit(1)
# Parse the file and run content verification
print("\n--- Content Verification ---")
namesakes = parse_namesake_file(test_dir)
if not namesakes:
print("❌ Failed to parse namesake file")
sys.exit(1)
# Verify against expected results
print("\n--- Results Verification ---")
if not verify_against_expected_results(namesakes):
print("\n❌ Task verification: FAIL")
sys.exit(1)
# Final result
print("\n" + "="*50)
print("✅ Namesake identification completed correctly!")
print(f"🎉 Found 1 duplicate name (exactly 1 required)")
print("🎉 Task verification: PASS")
sys.exit(0)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/student_database/recommender_name/description.md
================================================
Please find the recommendation letter for Patricia Jones and identify who wrote it. Generate a `recommender.txt` file with only the author's name.
================================================
FILE: tasks/filesystem/easy/student_database/recommender_name/meta.json
================================================
{
"task_id": "recommender_name",
"task_name": "Recommender Name",
"category_id": "student_database",
"category_name": "Student Database",
"description": "Read Patricia Jones's recommendation letter to capture who signed it and store only that name in recommender.txt.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"document search"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "student_database/\n \u251c\u2500\u2500 20101250_Patricia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20101701_Isabella_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20102572_Michael_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104233_Robert_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104498_Sarah_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104653_Sophia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104675_Michael_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104846_Christopher_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20107487_Mia_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20108742_Sarah_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20109144_Emma_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20109803_Oliver_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20111634_Isabella_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20112439_Christopher_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20113368_William_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20113603_Robert_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20114397_Isabella_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20114869_Ethan_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115252_Mason_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115632_Elizabeth_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115753_Charlotte_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115924_Michael_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20116232_Olivia_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20119528_Thomas_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20122427_Karen_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n 
\u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20122977_Evelyn_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20123376_Joseph_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20125451_Barbara_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126203_Barbara_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126394_Olivia_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126471_Ethan_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20127423_John_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20128249_Oliver_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20128879_Christopher_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20129898_Jessica_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20131271_Olivia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20131518_Sophia_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132026_Isabella_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132370_James_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132669_Noah_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20133527_Mason_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20133697_Isabella_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20135821_Thomas_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20136681_Benjamin_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20136890_Benjamin_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20137514_Lucas_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139234_Harper_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139637_Noah_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139647_Patricia_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20141421_Linda_Gonzalez/\n \u2502 \u251c\u2500\u2500 
basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20142085_William_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20142383_Amelia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20143406_Susan_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20143830_James_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146035_Christopher_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146277_William_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146279_Christopher_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20147301_James_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20147789_James_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20148681_John_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20148778_Susan_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20149712_Jessica_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20151012_Harper_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153174_Benjamin_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153412_Charlotte_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153606_James_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153687_Richard_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20154518_John_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20154710_Benjamin_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156469_Jennifer_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156522_Jennifer_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156851_Noah_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20157943_Harper_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158266_Sophia_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 
20158294_Sophia_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158819_Sarah_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20159113_John_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20159695_James_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20161279_William_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20162253_Mason_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20162542_Mia_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20163356_Ava_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20164515_Patricia_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20164801_Noah_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20165511_Mary_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166436_Christopher_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166487_Barbara_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166564_Ava_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166998_Ava_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20168311_Lucas_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20168491_Karen_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20169515_Thomas_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171050_Christopher_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171406_Mary_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171613_Ethan_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20172106_Isabella_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173259_Michael_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173492_Richard_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173501_Mary_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n 
\u251c\u2500\u2500 20173517_Susan_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20174207_Richard_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20174369_Mary_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20175314_William_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20176169_Lucas_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20176947_Noah_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20177389_James_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20178687_Isabella_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20179461_William_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20179690_Linda_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20181056_Sarah_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20182020_Patricia_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20182390_Ethan_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20183149_David_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20183219_Charlotte_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20184489_Jessica_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20186154_Charlotte_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20186510_James_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187107_David_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187144_Mary_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187892_Christopher_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187921_Mary_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187967_Sarah_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20188937_James_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189123_Mary_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 
recommendation_letter.txt\n \u251c\u2500\u2500 20189192_Olivia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189268_Emma_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189854_William_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20191265_Joseph_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20192725_Robert_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194054_Michael_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194160_Benjamin_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194164_Sarah_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194525_John_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20195164_Jennifer_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20195982_David_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196776_William_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196896_Olivia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196961_Joseph_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196998_Ethan_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20198548_Evelyn_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199036_Benjamin_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199583_Mary_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199735_Mason_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199872_Sophia_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199980_James_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20201385_John_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20201800_John_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20202548_Robert_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20203855_Mia_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 
recommendation_letter.txt\n \u2514\u2500\u2500 20204611_Sarah_Wilson/\n \u251c\u2500\u2500 basic_info.txt\n \u2514\u2500\u2500 recommendation_letter.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/student_database/recommender_name/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Student Database Task: Find Recommender Name
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_recommender_file_exists(test_dir: Path) -> bool:
"""Verify that the recommender.txt file exists."""
recommender_file = test_dir / "recommender.txt"
if not recommender_file.exists():
print("❌ File 'recommender.txt' not found")
return False
print("✅ Recommender file found")
return True
def verify_recommender_content(test_dir: Path) -> bool:
"""Verify that the recommender.txt file contains 'Brown'."""
recommender_file = test_dir / "recommender.txt"
try:
content = recommender_file.read_text()
if "Brown" in content:
print("✅ Recommender name 'Brown' found in file")
return True
else:
print("❌ Recommender name 'Brown' not found in file")
print(f" File content: {content.strip()}")
return False
except Exception as e:
print(f"❌ Error reading recommender file: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Student Database Task: Find Recommender Name...")
# Check if recommender file exists
print("\n--- File Existence Check ---")
if not verify_recommender_file_exists(test_dir):
print("\n❌ Basic verification failed, cannot proceed with content verification")
sys.exit(1)
# Verify content
print("\n--- Content Verification ---")
if not verify_recommender_content(test_dir):
print("\n❌ Task verification: FAIL")
sys.exit(1)
# Final result
print("\n" + "="*50)
print("✅ Recommender identification completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/desktop/music_report/description.md
================================================
Please use FileSystem tools to complete the following task:
### 1. Data Loading
- Read and extract song information from `jay_chou/`
- Read and extract song information from `jj_lin/`
### 2. Popularity Score Calculation
For each song, calculate a popularity score using this formula (keep 3 decimal places):
```
popularity_score = (rating × 0.4) + (play_count_normalized × 0.4) + (year_factor × 0.2)
Where:
- rating: song rating (1-5 scale)
- play_count_normalized: play_count / 250 (0-1 scale)
- year_factor: (2025 - release_year) / 25 (recency bonus)
```
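As a rough illustration of the arithmetic (the values below are hypothetical, not taken from the source files), a song with rating 4.5, a play_count of 200, and a release_year of 2003 would score 2.296:
```
# Hypothetical example values (not read from jay_chou/ or jj_lin/)
rating = 4.5            # 1-5 scale
play_count = 200
release_year = 2003

play_count_normalized = play_count / 250        # 0.8
year_factor = (2025 - release_year) / 25        # 0.88

popularity_score = round(rating * 0.4 + play_count_normalized * 0.4 + year_factor * 0.2, 3)
print(popularity_score)  # 2.296
```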
### 3. Generate Analysis Report
Create a file named `music_analysis_report.txt` in the `music/` folder with the following exact format:
**Lines 1-20**: Each line contains one song in format `songname:popularity_score`
- Sort songs by popularity_score in descending order (highest first)
- Use exact song names as they appear in the source files
- Include all 20 songs from both artists
**Lines 21-25**: Top 5 song names only (one per line)
- List the top 5 songs by popularity_score
- No scores, just song names
- One song name per line
**Important**: The file must contain exactly 25 lines with no additional content, headers, or formatting.
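A minimal sketch of assembling the report, assuming a hypothetical `scores` dict that maps each of the 20 song names to its already-computed popularity_score:
```
# Hypothetical: scores maps song name -> popularity_score for all 20 songs
ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

lines = [f"{name}:{score}" for name, score in ranked]   # lines 1-20
lines += [name for name, _ in ranked[:5]]               # lines 21-25

with open("music/music_analysis_report.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(lines))
```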
================================================
FILE: tasks/filesystem/standard/desktop/music_report/meta.json
================================================
{
"task_id": "music_report",
"task_name": "Music Report",
"category_id": "desktop",
"category_name": "Desktop",
"description": "Search and analyze desktop music files to generate a scored recommendation list using specified computation rules and criteria.",
"author": "Lingjun Chen",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data extraction",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "desktop/\n ├── exp_logs/\n │ ├── aug/\n │ │ └── augmentation_log.txt\n │ ├── project_1/\n │ │ ├── data.csv\n │ │ ├── model.py\n │ │ └── README.md\n │ ├── project_2/\n │ │ ├── analysis_report.md\n │ │ └── data_analysis.py\n │ ├── sep/\n │ │ └── september_summary.csv\n │ ├── exp_record.md\n │ ├── experiment_summary.md\n │ └── results_record.csv\n ├── learning/\n │ ├── 2024/\n │ │ └── learning_progress.csv\n │ ├── 2025/\n │ │ └── learning_roadmap.md\n │ ├── activities/\n │ │ └── study_notes.py\n │ ├── research/\n │ │ └── research_topics.md\n │ ├── schedule/\n │ │ └── weekly_schedule.csv\n │ └── learning_goals.md\n ├── music/\n │ ├── beni/\n │ │ └── playlist_manager.py\n │ ├── jay_chou/\n │ │ └── favorite_songs.csv\n │ ├── jj_lin/\n │ │ └── top_songs.txt\n │ └── music_collection.md\n ├── old_homebrew/\n │ ├── 2023-09-23_22/\n │ │ ├── opt/\n │ │ └── Users/\n │ └── 2023-09-23_23/\n │ ├── opt/\n │ └── Users/\n ├── play/\n │ ├── game_plan/\n │ │ └── gaming_schedule.md\n │ ├── hongkong_tour/\n │ │ └── travel_itinerary.csv\n │ ├── kit&shoes_collection/\n │ │ └── inventory.py\n │ └── others/\n │ └── entertainment_planner.md\n └── travel_plan/\n ├── travel_bucket_list.md\n └── travel_calculator.py\n",
"stateUrl": "https://storage.mcpmark.ai/filesystem/desktop.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/desktop/music_report/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Desktop 2 Music Report Task: Music Collection Analysis
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
# Hardcoded expected data from answer.json
EXPECTED_SONGS = [
{"song_name": "晴天", "popularity_score": 2.576},
{"song_name": "七里香", "popularity_score": 2.488},
{"song_name": "江南", "popularity_score": 2.488},
{"song_name": "夜曲", "popularity_score": 2.448},
{"song_name": "一千年以后", "popularity_score": 2.44},
{"song_name": "稻香", "popularity_score": 2.376},
{"song_name": "青花瓷", "popularity_score": 2.336},
{"song_name": "不为谁而作的歌", "popularity_score": 2.32},
{"song_name": "学不会", "popularity_score": 2.304},
{"song_name": "小酒窝", "popularity_score": 2.264},
{"song_name": "可惜没如果", "popularity_score": 2.248},
{"song_name": "修炼爱情", "popularity_score": 2.24},
{"song_name": "背对背拥抱", "popularity_score": 2.24},
{"song_name": "爱笑的眼睛", "popularity_score": 2.232},
{"song_name": "她说", "popularity_score": 2.216},
{"song_name": "简单爱", "popularity_score": 1.952},
{"song_name": "龙卷风", "popularity_score": 1.936},
{"song_name": "双截棍", "popularity_score": 1.92},
{"song_name": "可爱女人", "popularity_score": 1.912},
{"song_name": "星晴", "popularity_score": 1.896}
]
EXPECTED_TOP_5 = ["晴天", "七里香", "江南", "夜曲", "一千年以后"]
def verify_report_file_exists(test_dir: Path) -> bool:
"""Verify that the music_analysis_report.txt file exists."""
report_file = test_dir / "music" / "music_analysis_report.txt"
if not report_file.exists():
print("❌ 'music_analysis_report.txt' file not found in music/ folder")
return False
if not report_file.is_file():
print("❌ 'music_analysis_report.txt' exists but is not a file")
return False
print("✅ 'music_analysis_report.txt' file exists")
return True
def verify_file_content_structure(test_dir: Path) -> bool:
"""Verify that the file has exactly 25 lines."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
if len(lines) != 25:
print(f"❌ File should have exactly 25 lines, but has {len(lines)}")
return False
print("✅ File has exactly 25 lines")
return True
except Exception as e:
print(f"❌ Error reading file content: {e}")
return False
def verify_song_ranking_format(test_dir: Path) -> bool:
"""Verify that lines 1-20 contain songs with scores in correct format."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
# Check lines 1-20 (index 0-19)
for i in range(20):
line = lines[i].strip()
if not line:
print(f"❌ Line {i+1} is empty")
return False
# Check format: songname:popularity_score
if ':' not in line:
print(f"❌ Line {i+1} missing colon separator: '{line}'")
return False
parts = line.split(':', 1)
if len(parts) != 2:
print(f"❌ Line {i+1} has incorrect format: '{line}'")
return False
song_name, score_str = parts
if not song_name.strip():
print(f"❌ Line {i+1} has empty song name: '{line}'")
return False
try:
score = float(score_str.strip())
if score < 0 or score > 5:
print(f"❌ Line {i+1} has invalid score range: {score}")
return False
except ValueError:
print(f"❌ Line {i+1} has invalid score format: '{score_str}'")
return False
print("✅ Lines 1-20 have correct song:score format")
return True
except Exception as e:
print(f"❌ Error checking song ranking format: {e}")
return False
def verify_song_ranking_order_with_tolerance(test_dir: Path) -> bool:
"""Verify that songs are ranked by popularity score in descending order, allowing equal scores to be swapped."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
scores = []
for i in range(20):
line = lines[i].strip()
parts = line.split(':', 1)
score = float(parts[1].strip())
scores.append(score)
# Check if scores are in descending order, allowing equal scores to be adjacent
for i in range(1, len(scores)):
if scores[i] > scores[i-1]:
print(f"❌ Scores not in descending order: {scores[i-1]} < {scores[i]} at line {i+1}")
return False
print("✅ Songs are ranked by popularity score in descending order (allowing equal scores)")
return True
except Exception as e:
print(f"❌ Error checking song ranking order: {e}")
return False
def verify_song_names_match_expected(test_dir: Path) -> bool:
"""Verify that all expected song names are present in the ranking."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
found_songs = []
for i in range(20):
line = lines[i].strip()
song_name = line.split(':', 1)[0].strip()
found_songs.append(song_name)
# Check if all expected songs are present
missing_songs = []
for expected_song in EXPECTED_SONGS:
if expected_song["song_name"] not in found_songs:
missing_songs.append(expected_song["song_name"])
if missing_songs:
print(f"❌ Missing expected songs: {missing_songs}")
return False
print("✅ All expected song names are present")
return True
except Exception as e:
print(f"❌ Error checking song names: {e}")
return False
def verify_popularity_scores_match_expected(test_dir: Path) -> bool:
"""Verify that popularity scores match the expected values."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
score_errors = []
for i in range(20):
line = lines[i].strip()
parts = line.split(':', 1)
song_name = parts[0].strip()
actual_score = float(parts[1].strip())
# Find expected score for this song
expected_score = None
for expected_song in EXPECTED_SONGS:
if expected_song["song_name"] == song_name:
expected_score = expected_song["popularity_score"]
break
if expected_score is not None:
# Allow small floating point precision differences
if abs(actual_score - expected_score) > 0.001:
score_errors.append(f"{song_name}: expected {expected_score}, got {actual_score}")
if score_errors:
print(f"❌ Score mismatches: {score_errors}")
return False
print("✅ All popularity scores match expected values")
return True
except Exception as e:
print(f"❌ Error checking popularity scores: {e}")
return False
def verify_top_5_songs(test_dir: Path) -> bool:
"""Verify that lines 21-25 contain the top 5 song names, allowing equal scores to be in different order."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
# Check lines 21-25 (index 20-24)
found_top_5 = []
for i in range(5):
line_num = i + 21
line = lines[i + 20].strip() # Index 20-24 for lines 21-25
if not line:
print(f"❌ Line {line_num} is empty")
return False
if ':' in line:
print(f"❌ Line {line_num} should not contain colon: '{line}'")
return False
found_top_5.append(line)
# Check if all expected top 5 songs are present (order doesn't matter for equal scores)
missing_songs = []
for expected_song in EXPECTED_TOP_5:
if expected_song not in found_top_5:
missing_songs.append(expected_song)
if missing_songs:
print(f"❌ Missing expected top 5 songs: {missing_songs}")
return False
# Check if the order is valid (allowing equal scores to be swapped)
# Since 七里香 and 江南 both have score 2.488, they can be in either order
valid_orders = [
["晴天", "七里香", "江南", "夜曲", "一千年以后"], # Original order
["晴天", "江南", "七里香", "夜曲", "一千年以后"], # Swapped 七里香 and 江南
]
order_valid = False
for valid_order in valid_orders:
if found_top_5 == valid_order:
order_valid = True
break
if not order_valid:
print(f"❌ Top 5 songs order is invalid. Found: {found_top_5}")
print(f"Expected one of: {valid_orders}")
return False
print("✅ Lines 21-25 contain correct top 5 song names in valid order")
return True
except Exception as e:
print(f"❌ Error checking top 5 songs: {e}")
return False
def verify_no_extra_content(test_dir: Path) -> bool:
"""Verify that the file contains no extra content beyond the 25 lines."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
if len(lines) != 25:
print(f"❌ File should have exactly 25 lines, but has {len(lines)}")
return False
print("✅ File contains exactly 25 lines with no extra content")
return True
except Exception as e:
print(f"❌ Error checking for extra content: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Desktop 2 Music Report Task: Music Collection Analysis...")
# Define verification steps
verification_steps = [
("Report File Exists", verify_report_file_exists),
("File Content Structure", verify_file_content_structure),
("Song Ranking Format", verify_song_ranking_format),
("Song Ranking Order", verify_song_ranking_order_with_tolerance),
("Song Names Match Expected", verify_song_names_match_expected),
("Popularity Scores Match Expected", verify_popularity_scores_match_expected),
("Top 5 Songs", verify_top_5_songs),
("No Extra Content", verify_no_extra_content),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Music collection analysis completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/desktop/project_management/description.md
================================================
Please use FileSystem tools to finish the following task:
1. **Create the main directory structure** in `desktop_2`:
- Create a new directory in the main directory called `organized_projects`
- Inside `organized_projects`, create 3 main subdirectories: `experiments`, `learning`, and `personal`
- Inside `experiments`, create 2 subdirectories: `ml_projects` and `data_analysis`
- Inside `learning`, create 2 subdirectories: `progress_tracking` and `resources`
- Inside `personal`, create 2 subdirectories: `entertainment` and `collections`
2. **Move all the Python files** to `experiments/ml_projects/`:
3. **Move all the CSV files** to `experiments/data_analysis/`:
4. **Move only learning-related markdown files** to `learning/resources/`:
5. **Move only entertainment planning-related markdown files** to `personal/entertainment/`:
6. **Move only music collection-related markdown files** to `personal/collections/`:
7. **Steps 4, 5, and 6 together should move all of the markdown files.**
8. **Create a project structure documentation file**:
- Create `project_structure.md` in the `organized_projects` directory
- Document the new organization with exact file counts for each subdirectory
- Include a summary of what types of files are in each directory
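As a rough illustration (not part of the task instructions), steps 1-3 could be scripted along the lines of this minimal pathlib sketch; it assumes the task root is the current working directory, and the markdown moves in steps 4-6 still require per-file classification:
```python
from pathlib import Path
import shutil

root = Path(".")                      # assumed task root
base = root / "organized_projects"

# Step 1: create the full directory structure.
for sub in ["experiments/ml_projects", "experiments/data_analysis",
            "learning/progress_tracking", "learning/resources",
            "personal/entertainment", "personal/collections"]:
    (base / sub).mkdir(parents=True, exist_ok=True)

# Steps 2-3: move every .py file and every .csv file into the new structure.
for pattern, target in [("*.py", base / "experiments" / "ml_projects"),
                        ("*.csv", base / "experiments" / "data_analysis")]:
    for src in list(root.rglob(pattern)):
        if base not in src.parents:   # skip files already inside organized_projects
            shutil.move(str(src), str(target / src.name))
```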
================================================
FILE: tasks/filesystem/standard/desktop/project_management/meta.json
================================================
{
"task_id": "project_management",
"task_name": "Project Management",
"category_id": "desktop",
"category_name": "Desktop",
"description": "Reorganize scattered desktop files into a structured project directory system based on content type, purpose, and file format analysis.",
"author": "Lingjun Chen",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"file organization"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "desktop/\n ├── exp_logs/\n │ ├── aug/\n │ │ └── augmentation_log.txt\n │ ├── project_1/\n │ │ ├── data.csv\n │ │ ├── model.py\n │ │ └── README.md\n │ ├── project_2/\n │ │ ├── analysis_report.md\n │ │ └── data_analysis.py\n │ ├── sep/\n │ │ └── september_summary.csv\n │ ├── exp_record.md\n │ ├── experiment_summary.md\n │ └── results_record.csv\n ├── learning/\n │ ├── 2024/\n │ │ └── learning_progress.csv\n │ ├── 2025/\n │ │ └── learning_roadmap.md\n │ ├── activities/\n │ │ └── study_notes.py\n │ ├── research/\n │ │ └── research_topics.md\n │ ├── schedule/\n │ │ └── weekly_schedule.csv\n │ └── learning_goals.md\n ├── music/\n │ ├── beni/\n │ │ └── playlist_manager.py\n │ ├── jay_chou/\n │ │ └── favorite_songs.csv\n │ ├── jj_lin/\n │ │ └── top_songs.txt\n │ └── music_collection.md\n ├── old_homebrew/\n │ ├── 2023-09-23_22/\n │ │ ├── opt/\n │ │ └── Users/\n │ └── 2023-09-23_23/\n │ ├── opt/\n │ └── Users/\n ├── play/\n │ ├── game_plan/\n │ │ └── gaming_schedule.md\n │ ├── hongkong_tour/\n │ │ └── travel_itinerary.csv\n │ ├── kit&shoes_collection/\n │ │ └── inventory.py\n │ └── others/\n │ └── entertainment_planner.md\n └── travel_plan/\n ├── travel_bucket_list.md\n └── travel_calculator.py\n",
"stateUrl": "https://storage.mcpmark.ai/filesystem/desktop.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/desktop/project_management/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Desktop 2 Project Management Task: File Reorganization
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_organized_projects_directory_exists(test_dir: Path) -> bool:
"""Verify that the organized_projects directory exists."""
organized_dir = test_dir / "organized_projects"
if not organized_dir.exists():
print("❌ 'organized_projects' directory not found")
return False
if not organized_dir.is_dir():
print("❌ 'organized_projects' exists but is not a directory")
return False
print("✅ 'organized_projects' directory exists")
return True
def verify_directory_structure(test_dir: Path) -> bool:
"""Verify that all required subdirectories exist."""
organized_dir = test_dir / "organized_projects"
required_dirs = [
"experiments",
"experiments/ml_projects",
"experiments/data_analysis",
"learning",
"learning/progress_tracking",
"learning/resources",
"personal",
"personal/entertainment",
"personal/collections"
]
missing_dirs = []
for dir_path in required_dirs:
full_path = organized_dir / dir_path
if not full_path.exists():
missing_dirs.append(dir_path)
elif not full_path.is_dir():
missing_dirs.append(f"{dir_path} (not a directory)")
if missing_dirs:
print(f"❌ Missing or invalid directories: {missing_dirs}")
return False
print("✅ All required directory structure created correctly")
return True
def verify_python_files_in_ml_projects(test_dir: Path) -> bool:
"""Verify that all Python files are moved to experiments/ml_projects."""
organized_dir = test_dir / "organized_projects"
ml_projects_dir = organized_dir / "experiments" / "ml_projects"
expected_python_files = [
"study_notes.py",
"model.py",
"data_analysis.py",
"travel_calculator.py",
"inventory.py",
"playlist_manager.py"
]
missing_files = []
for filename in expected_python_files:
file_path = ml_projects_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing Python files in ml_projects: {missing_files}")
return False
print("✅ All Python files moved to experiments/ml_projects")
return True
def verify_csv_files_in_data_analysis(test_dir: Path) -> bool:
"""Verify that all CSV files are moved to experiments/data_analysis."""
organized_dir = test_dir / "organized_projects"
data_analysis_dir = organized_dir / "experiments" / "data_analysis"
expected_csv_files = [
"learning_progress.csv",
"weekly_schedule.csv",
"results_record.csv",
"september_summary.csv",
"data.csv",
"favorite_songs.csv",
"travel_itinerary.csv"
]
missing_files = []
for filename in expected_csv_files:
file_path = data_analysis_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing CSV files in data_analysis: {missing_files}")
return False
print("✅ All CSV files moved to experiments/data_analysis")
return True
def verify_learning_md_files_in_resources(test_dir: Path) -> bool:
"""Verify that learning-related markdown files are moved to learning/resources."""
organized_dir = test_dir / "organized_projects"
resources_dir = organized_dir / "learning" / "resources"
expected_learning_files = [
"learning_roadmap.md",
"research_topics.md",
"experiment_summary.md",
"exp_record.md",
"README.md",
"analysis_report.md",
"learning_goals.md"
]
missing_files = []
for filename in expected_learning_files:
file_path = resources_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing learning markdown files in resources: {missing_files}")
return False
print("✅ All learning markdown files moved to learning/resources")
return True
def verify_entertainment_md_files_in_entertainment(test_dir: Path) -> bool:
"""Verify that entertainment planning markdown files are moved to personal/entertainment."""
organized_dir = test_dir / "organized_projects"
entertainment_dir = organized_dir / "personal" / "entertainment"
expected_entertainment_files = [
"gaming_schedule.md",
"entertainment_planner.md",
"travel_bucket_list.md"
]
missing_files = []
for filename in expected_entertainment_files:
file_path = entertainment_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing entertainment markdown files in entertainment: {missing_files}")
return False
print("✅ All entertainment markdown files moved to personal/entertainment")
return True
def verify_music_md_files_in_collections(test_dir: Path) -> bool:
"""Verify that music collection markdown files are moved to personal/collections."""
organized_dir = test_dir / "organized_projects"
collections_dir = organized_dir / "personal" / "collections"
expected_music_files = [
"music_collection.md"
]
missing_files = []
for filename in expected_music_files:
file_path = collections_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing music collection markdown files in collections: {filename}")
return False
print("✅ All music collection markdown files moved to personal/collections")
return True
def verify_progress_tracking_empty(test_dir: Path) -> bool:
"""Verify that progress_tracking directory is empty."""
organized_dir = test_dir / "organized_projects"
progress_dir = organized_dir / "learning" / "progress_tracking"
files_in_progress = list(progress_dir.iterdir())
if files_in_progress:
print(f"❌ progress_tracking directory should be empty, but contains: {[f.name for f in files_in_progress]}")
return False
print("✅ progress_tracking directory is correctly empty")
return True
def verify_project_structure_file_exists(test_dir: Path) -> bool:
"""Verify that project_structure.md file exists."""
organized_dir = test_dir / "organized_projects"
structure_file = organized_dir / "project_structure.md"
if not structure_file.exists():
print("❌ 'project_structure.md' file not found")
return False
if not structure_file.is_file():
print("❌ 'project_structure.md' exists but is not a file")
return False
print("✅ 'project_structure.md' file exists")
return True
def verify_file_counts(test_dir: Path) -> bool:
"""Verify that each directory has the correct number of files."""
organized_dir = test_dir / "organized_projects"
expected_counts = {
"experiments/ml_projects": 6, # 6 Python files
"experiments/data_analysis": 7, # 7 CSV files
"learning/resources": 7, # 7 learning markdown files
"learning/progress_tracking": 0, # 0 files (empty)
"personal/entertainment": 3, # 3 entertainment markdown files
"personal/collections": 1 # 1 music collection markdown file
}
incorrect_counts = []
for dir_path, expected_count in expected_counts.items():
full_path = organized_dir / dir_path
actual_count = len([f for f in full_path.iterdir() if f.is_file()])
if actual_count != expected_count:
incorrect_counts.append(f"{dir_path}: expected {expected_count}, got {actual_count}")
if incorrect_counts:
print(f"❌ Incorrect file counts: {incorrect_counts}")
return False
print("✅ All directories have correct file counts")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Desktop 2 Project Management Task: File Reorganization...")
# Define verification steps
verification_steps = [
("Organized Projects Directory Exists", verify_organized_projects_directory_exists),
("Directory Structure", verify_directory_structure),
("Python Files in ML Projects", verify_python_files_in_ml_projects),
("CSV Files in Data Analysis", verify_csv_files_in_data_analysis),
("Learning Markdown Files in Resources", verify_learning_md_files_in_resources),
("Entertainment Markdown Files in Entertainment", verify_entertainment_md_files_in_entertainment),
("Music Collection Files in Collections", verify_music_md_files_in_collections),
("Progress Tracking Empty", verify_progress_tracking_empty),
("Project Structure File Exists", verify_project_structure_file_exists),
("File Counts", verify_file_counts),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Desktop 2 project reorganization completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/desktop/timeline_extraction/description.md
================================================
Please use FileSystem tools to finish the following task:
Read all the files under the current path, extract every time/plan entry that clearly indicates 2024, integrate them into a list, and create a file named `timeline.txt` in the main directory. Write the timeline into that file in the following format.
### Rules
- If a task only shows month without day, use the 1st day of that month
- If a task only shows year without month and day, skip it.
- If a file shows multiple tasks on the same date, count only once per date
### Output Format
- Each line format: `file_path:time`
- `file_path`: The file path where this time information appears (**relative to the current path**)
- `time`: The specific date in `YYYY-MM-DD` format; if it is a time period, write the start date
### Sorting Requirements
- Sort by chronological order
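For illustration only, a minimal sketch of the output step, assuming the 2024 entries have already been collected as (relative path, date) pairs; the two entries shown are placeholders:
```python
from pathlib import Path

# Hypothetical collected entries; a set enforces "count only once per date" per file.
entries = {
    ("learning/2024/learning_progress.csv", "2024-02-01"),
    ("exp_logs/exp_record.md", "2024-08-01"),
}

# ISO dates sort chronologically as plain strings.
lines = [f"{path}:{date}" for path, date in sorted(entries, key=lambda e: (e[1], e[0]))]
Path("timeline.txt").write_text("\n".join(lines) + "\n", encoding="utf-8")
```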
================================================
FILE: tasks/filesystem/standard/desktop/timeline_extraction/meta.json
================================================
{
"task_id": "timeline_extraction",
"task_name": "Timeline Extraction",
"category_id": "desktop",
"category_name": "Desktop",
"description": "Extract temporal event information from various desktop files and compile a comprehensive chronological timeline of activities and milestones.",
"author": "Lingjun Chen",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data extraction",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "desktop/\n ├── exp_logs/\n │ ├── aug/\n │ │ └── augmentation_log.txt\n │ ├── project_1/\n │ │ ├── data.csv\n │ │ ├── model.py\n │ │ └── README.md\n │ ├── project_2/\n │ │ ├── analysis_report.md\n │ │ └── data_analysis.py\n │ ├── sep/\n │ │ └── september_summary.csv\n │ ├── exp_record.md\n │ ├── experiment_summary.md\n │ └── results_record.csv\n ├── learning/\n │ ├── 2024/\n │ │ └── learning_progress.csv\n │ ├── 2025/\n │ │ └── learning_roadmap.md\n │ ├── activities/\n │ │ └── study_notes.py\n │ ├── research/\n │ │ └── research_topics.md\n │ ├── schedule/\n │ │ └── weekly_schedule.csv\n │ └── learning_goals.md\n ├── music/\n │ ├── beni/\n │ │ └── playlist_manager.py\n │ ├── jay_chou/\n │ │ └── favorite_songs.csv\n │ ├── jj_lin/\n │ │ └── top_songs.txt\n │ └── music_collection.md\n ├── old_homebrew/\n │ ├── 2023-09-23_22/\n │ │ ├── opt/\n │ │ └── Users/\n │ └── 2023-09-23_23/\n │ ├── opt/\n │ └── Users/\n ├── play/\n │ ├── game_plan/\n │ │ └── gaming_schedule.md\n │ ├── hongkong_tour/\n │ │ └── travel_itinerary.csv\n │ ├── kit&shoes_collection/\n │ │ └── inventory.py\n │ └── others/\n │ └── entertainment_planner.md\n └── travel_plan/\n ├── travel_bucket_list.md\n └── travel_calculator.py\n",
"stateUrl": "https://storage.mcpmark.ai/filesystem/desktop.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/desktop/timeline_extraction/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Desktop 2 Timeline Extraction Task
"""
import sys
from pathlib import Path
import os
import re
from datetime import datetime
from typing import List, Tuple, Set
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_timeline_file_exists(test_dir: Path) -> bool:
"""Verify that the timeline.txt file exists in the main directory."""
timeline_file = test_dir / "timeline.txt"
if not timeline_file.exists():
print("❌ 'timeline.txt' file not found in main directory")
return False
if not timeline_file.is_file():
print("❌ 'timeline.txt' exists but is not a file")
return False
print("✅ 'timeline.txt' file exists in main directory")
return True
def verify_timeline_file_readable(test_dir: Path) -> bool:
"""Verify that the timeline.txt file is readable."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
if not content.strip():
print("❌ 'timeline.txt' file is empty")
return False
print("✅ 'timeline.txt' file is readable")
return True
except Exception as e:
print(f"❌ Error reading 'timeline.txt' file: {e}")
return False
def verify_line_count(test_dir: Path) -> bool:
"""Verify that the timeline.txt file has exactly 43 lines."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
lines = [line.strip() for line in content.split('\n') if line.strip()]
if len(lines) != 43:
print(f"❌ Expected 43 lines, but found {len(lines)} lines")
return False
print(f"✅ File contains exactly {len(lines)} lines")
return True
except Exception as e:
print(f"❌ Error checking line count: {e}")
return False
def verify_line_format(test_dir: Path) -> bool:
"""Verify that each line contains both file path and date time information."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
lines = [line.strip() for line in content.split('\n') if line.strip()]
# More flexible pattern: just check if line contains both path-like content and date-like content
date_pattern = r'\d{4}-\d{2}-\d{2}' # YYYY-MM-DD format
invalid_lines = []
for i, line in enumerate(lines, 1):
# Check if line contains a date
if not re.search(date_pattern, line):
invalid_lines.append(f"Line {i}: '{line}' (no valid date found)")
continue
# Check if line contains path-like content (contains '/' or '.' and not just a date)
# More flexible: look for path anywhere in the line, not just at the beginning
path_found = False
# Split line into words and look for path-like content
words = line.split()
for word in words:
# Check if word looks like a file path (contains '/' or '.' and not just a date)
if ('/' in word or '.' in word) and not re.match(r'^\d{4}-\d{2}-\d{2}$', word.strip()):
path_found = True
break
# Also check if line contains path-like content with colon separator
if ':' in line:
parts = line.split(':')
for part in parts:
if ('/' in part or '.' in part) and not re.match(r'^\d{4}-\d{2}-\d{2}$', part.strip()):
path_found = True
break
if not path_found:
invalid_lines.append(f"Line {i}: '{line}' (no valid path found)")
continue
if invalid_lines:
print(f"❌ Invalid line format found: {invalid_lines[:5]}...")
return False
print("✅ All lines contain both file path and date time information")
return True
except Exception as e:
print(f"❌ Error checking line format: {e}")
return False
def verify_date_format(test_dir: Path) -> bool:
"""Verify that all dates are in valid YYYY-MM-DD format."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
lines = [line.strip() for line in content.split('\n') if line.strip()]
invalid_dates = []
for i, line in enumerate(lines, 1):
try:
# Find date pattern in the line (more flexible)
date_match = re.search(r'\d{4}-\d{2}-\d{2}', line)
if not date_match:
invalid_dates.append(f"Line {i}: '{line}' (no date found)")
continue
date_part = date_match.group()
datetime.strptime(date_part, '%Y-%m-%d')
except (IndexError, ValueError) as e:
invalid_dates.append(f"Line {i}: '{line}' (invalid date: {e})")
if invalid_dates:
print(f"❌ Invalid date format found: {invalid_dates[:5]}...")
return False
print("✅ All dates are in valid YYYY-MM-DD format")
return True
except Exception as e:
print(f"❌ Error checking date format: {e}")
return False
def verify_chronological_order(test_dir: Path) -> bool:
"""Verify that dates are in chronological order."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
lines = [line.strip() for line in content.split('\n') if line.strip()]
dates = []
for line in lines:
# Find date pattern in the line (more flexible)
date_match = re.search(r'\d{4}-\d{2}-\d{2}', line)
if date_match:
date_obj = datetime.strptime(date_match.group(), '%Y-%m-%d')
dates.append(date_obj)
# Check if dates are in ascending order
for i in range(1, len(dates)):
if dates[i] < dates[i-1]:
print(f"❌ Date order violation: {dates[i-1].strftime('%Y-%m-%d')} comes after {dates[i].strftime('%Y-%m-%d')}")
return False
print("✅ All dates are in chronological order")
return True
except Exception as e:
print(f"❌ Error checking chronological order: {e}")
return False
def verify_expected_entries(test_dir: Path) -> bool:
"""Verify that all expected entries from answer.txt are present."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
actual_lines = [line.strip() for line in content.split('\n') if line.strip()]
# Expected entries from answer.txt
expected_entries = {
"exp_logs/project_2/analysis_report.md:2024-01-01",
"learning/2024/learning_progress.csv:2024-01-01",
"exp_logs/experiment_summary.md:2024-01-05",
"play/kit&shoes_collection/inventory.py:2024-01-05",
"exp_logs/experiment_summary.md:2024-01-10",
"play/kit&shoes_collection/inventory.py:2024-01-10",
"exp_logs/aug/augmentation_log.txt:2024-01-15",
"exp_logs/experiment_summary.md:2024-01-15",
"play/kit&shoes_collection/inventory.py:2024-01-15",
"learning/2024/learning_progress.csv:2024-02-01",
"learning/2024/learning_progress.csv:2024-03-01",
"play/hongkong_tour/travel_itinerary.csv:2024-03-15",
"travel_plan/travel_calculator.py:2024-03-15",
"play/hongkong_tour/travel_itinerary.csv:2024-03-16",
"play/hongkong_tour/travel_itinerary.csv:2024-03-17",
"play/hongkong_tour/travel_itinerary.csv:2024-03-18",
"play/hongkong_tour/travel_itinerary.csv:2024-03-19",
"play/hongkong_tour/travel_itinerary.csv:2024-03-20",
"travel_plan/travel_bucket_list.md:2024-04-01",
"learning/2024/learning_progress.csv:2024-04-01",
"learning/2024/learning_progress.csv:2024-05-01",
"travel_plan/travel_bucket_list.md:2024-06-01",
"learning/2024/learning_progress.csv:2024-06-01",
"learning/2024/learning_progress.csv:2024-07-01",
"exp_logs/exp_record.md:2024-08-01",
"exp_logs/results_record.csv:2024-08-01",
"travel_plan/travel_bucket_list.md:2024-08-01",
"learning/2024/learning_progress.csv:2024-08-01",
"exp_logs/results_record.csv:2024-08-02",
"exp_logs/results_record.csv:2024-08-03",
"exp_logs/results_record.csv:2024-08-04",
"exp_logs/exp_record.md:2024-09-01",
"exp_logs/sep/september_summary.csv:2024-09-01",
"learning/2024/learning_progress.csv:2024-09-01",
"exp_logs/sep/september_summary.csv:2024-09-05",
"exp_logs/sep/september_summary.csv:2024-09-10",
"exp_logs/sep/september_summary.csv:2024-09-15",
"exp_logs/sep/september_summary.csv:2024-09-20",
"exp_logs/sep/september_summary.csv:2024-09-25",
"exp_logs/sep/september_summary.csv:2024-09-30",
"learning/2024/learning_progress.csv:2024-10-01",
"learning/2024/learning_progress.csv:2024-11-01",
"learning/2024/learning_progress.csv:2024-12-01"
}
# Check if each expected entry is found in actual lines (more flexible matching)
missing_entries = []
for expected in expected_entries:
expected_path, expected_date = expected.split(':')
found = False
for actual_line in actual_lines:
# Check if line contains both the expected path and date
# More flexible: path can be anywhere in the line, not just at the beginning
if expected_path in actual_line and expected_date in actual_line:
found = True
break
if not found:
missing_entries.append(expected)
# Check for extra entries (lines that don't match any expected pattern)
extra_entries = []
for actual_line in actual_lines:
# Extract date from actual line
date_match = re.search(r'\d{4}-\d{2}-\d{2}', actual_line)
if not date_match:
continue
actual_date = date_match.group()
# Try to extract file path from the line
actual_path = None
words = actual_line.split()
for word in words:
if ('/' in word or '.' in word) and not re.match(r'^\d{4}-\d{2}-\d{2}$', word.strip()):
actual_path = word
break
if not actual_path:
continue
# Find if this line matches any expected entry
found_expected = False
for expected in expected_entries:
expected_path, expected_date = expected.split(':')
if expected_path in actual_path and expected_date == actual_date:
found_expected = True
break
if not found_expected:
extra_entries.append(actual_line)
if missing_entries:
print(f"❌ Missing {len(missing_entries)} expected entries")
print(f" Examples: {missing_entries[:3]}")
return False
if extra_entries:
print(f"❌ Found {len(extra_entries)} unexpected entries")
print(f" Examples: {extra_entries[:3]}")
return False
print("✅ All expected entries are present, no extra entries")
return True
except Exception as e:
print(f"❌ Error checking expected entries: {e}")
return False
def verify_no_duplicates(test_dir: Path) -> bool:
"""Verify that there are no duplicate entries."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
lines = [line.strip() for line in content.split('\n') if line.strip()]
if len(lines) != len(set(lines)):
print("❌ Duplicate entries found in timeline.txt")
return False
print("✅ No duplicate entries found")
return True
except Exception as e:
print(f"❌ Error checking for duplicates: {e}")
return False
def verify_file_paths_exist(test_dir: Path) -> bool:
"""Verify that all file paths mentioned in timeline.txt actually exist."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
lines = [line.strip() for line in content.split('\n') if line.strip()]
missing_files = []
for line in lines:
# Try to extract file path from the line (more flexible)
file_path_found = False
# Method 1: Split by colon and check each part
if ':' in line:
parts = line.split(':')
for part in parts:
part = part.strip()
if part and ('/' in part or '.' in part) and not re.match(r'^\d{4}-\d{2}-\d{2}$', part):
# This looks like a file path
full_path = test_dir / part
if not full_path.exists():
missing_files.append(part)
file_path_found = True
break
# Method 2: Split into words and look for path-like content
if not file_path_found:
words = line.split()
for word in words:
word = word.strip()
if ('/' in word or '.' in word) and not re.match(r'^\d{4}-\d{2}-\d{2}$', word):
# This looks like a file path
full_path = test_dir / word
if not full_path.exists():
missing_files.append(word)
file_path_found = True
break
# Method 3: Look for path pattern in the entire line
if not file_path_found:
# Use regex to find path-like patterns
path_pattern = r'[a-zA-Z0-9_\-\.\/]+/[a-zA-Z0-9_\-\.\/]+'
path_matches = re.findall(path_pattern, line)
for match in path_matches:
if '.' in match or '/' in match:
full_path = test_dir / match
if not full_path.exists():
missing_files.append(match)
file_path_found = True
break
if missing_files:
print(f"❌ {len(missing_files)} referenced files do not exist")
print(f" Examples: {missing_files[:3]}")
return False
print("✅ All referenced file paths exist")
return True
except Exception as e:
print(f"❌ Error checking file paths: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Desktop Timeline Extraction Task...")
# Define verification steps
verification_steps = [
("Timeline File Exists", verify_timeline_file_exists),
("File is Readable", verify_timeline_file_readable),
("Correct Line Count", verify_line_count),
("Line Format", verify_line_format),
("Date Format", verify_date_format),
("Chronological Order", verify_chronological_order),
("Expected Entries", verify_expected_entries),
("No Duplicates", verify_no_duplicates),
("File Paths Exist", verify_file_paths_exist),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Desktop 2 Timeline Extraction completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/desktop_template/budget_computation/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You need to analyze all the files in the desktop environment to calculate personal life expenses and create a budget summary.
### Task Objectives
1. **Locate and analyze all files** in the desktop environment
2. **Extract personal life expenses** from the files (such as salary, food, living materials, tax, internet expenses, ...), excluding project/work expenses
3. **Create a file named `total_budget.txt`** in the main directory
4. **Format each expense entry** as `file_path;price` (one per line)
5. **Add total sum** as the last line, rounded to 2 decimal places
### Output Format
The `total_budget.txt` file should contain:
- One expense per line in format: `file_path;price`
- File path should be the relative path from the main directory
- Price should be rounded to 2 decimal places
- Last line should be the total sum
- No additional text or explanations
### Important Notes
- Only include personal life expenses (not in project/work)
- Use the cheapest available price when multiple options exist for one thing
- The total should match the sum of all individual expenses
- Hint: If a file contains one item for personal consumption, then every entry in that entire file is for personal consumption
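For illustration only, a minimal sketch of the output step, assuming the personal expenses have already been identified as (relative path, price) pairs; the two entries shown are placeholders:
```python
from pathlib import Path

# Hypothetical collected expenses: (relative file path, price) pairs.
expenses = [
    ("Documents/budget.csv", 250.00),
    ("Downloads/expenses.csv", 45.99),
]

lines = [f"{path};{price:.2f}" for path, price in expenses]
lines.append(f"{sum(price for _, price in expenses):.2f}")   # total as the last line
Path("total_budget.txt").write_text("\n".join(lines) + "\n", encoding="utf-8")
```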
================================================
FILE: tasks/filesystem/standard/desktop_template/budget_computation/meta.json
================================================
{
"task_id": "budget_computation",
"task_name": "Budget Computation",
"category_id": "desktop_template",
"category_name": "Desktop Template",
"description": "Analyze personal expense data extracted from desktop files to create a detailed budget summary report for financial review.",
"author": "Lingjun Chen",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"data extraction",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "desktop_template/\n ├── Archives/\n │ ├── backup_contacts.csv\n │ └── tax_documents_2022.csv\n ├── Desktop/\n │ └── contacts.csv\n ├── Documents/\n │ ├── Personal/\n │ │ └── tax_info_2023.csv\n │ ├── Projects/\n │ │ └── budget_tracker.csv\n │ ├── Work/\n │ │ ├── client_list.csv\n │ │ └── timesheet.csv\n │ ├── budget.csv\n │ └── important_dates.csv\n ├── Downloads/\n │ ├── expenses.csv\n │ ├── fitness_log.csv\n │ └── price_comparisons.csv\n ├── Temp/\n │ └── test_data.csv\n ├── book_list.txt\n ├── bookmark_export.txt\n ├── calculations.txt\n ├── correspondence_2023.txt\n ├── draft_letter.txt\n ├── emergency_contacts.txt\n ├── example.txt\n └── experiment_results.txt\n",
"stateUrl": "https://storage.mcpmark.ai/filesystem/desktop_template.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/desktop_template/budget_computation/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Budget Computation Task
"""
import sys
from pathlib import Path
import os
from collections import Counter
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_total_budget_file_exists(test_dir: Path) -> bool:
"""Verify that the total_budget.txt file exists."""
budget_file = test_dir / "total_budget.txt"
if not budget_file.exists():
print("❌ File 'total_budget.txt' not found")
return False
print("✅ total_budget.txt file found")
return True
def verify_file_format(test_dir: Path) -> bool:
"""Verify that the total_budget.txt file has proper format."""
budget_file = test_dir / "total_budget.txt"
try:
content = budget_file.read_text()
lines = [line.strip() for line in content.split('\n') if line.strip()]
if len(lines) < 2:
print("❌ File must contain at least 2 lines (expenses + total)")
return False
# Check that all lines except the last follow the format file_path;price
for i, line in enumerate(lines[:-1]):
if ';' not in line:
print(f"❌ Line {i+1} does not contain ';' separator: {line}")
return False
parts = line.split(';')
if len(parts) != 2:
print(f"❌ Line {i+1} does not have exactly 2 parts: {line}")
return False
# Check if second part is a valid number
try:
float(parts[1])
except ValueError:
print(f"❌ Line {i+1} price is not a valid number: {parts[1]}")
return False
# Check if last line is a valid number (total)
try:
float(lines[-1])
except ValueError:
print(f"❌ Last line is not a valid number: {lines[-1]}")
return False
print("✅ File format is correct")
return True
except Exception as e:
print(f"❌ Error reading or parsing file: {e}")
return False
def verify_expense_entries(test_dir: Path) -> bool:
"""Verify that all 15 required expense entries are present."""
budget_file = test_dir / "total_budget.txt"
try:
content = budget_file.read_text()
lines = [line.strip() for line in content.split('\n') if line.strip()]
# Should have 16 lines total (15 expenses + 1 total)
if len(lines) != 16:
print(f"❌ Expected 16 lines (15 expenses + 1 total), found {len(lines)}")
return False
# Check that we have exactly 15 expense entries
expense_lines = lines[:-1] # All lines except the last
if len(expense_lines) != 15:
print(f"❌ Expected 15 expense entries, found {len(expense_lines)}")
return False
print("✅ File contains exactly 15 expense entries")
return True
except Exception as e:
print(f"❌ Error checking expense entries: {e}")
return False
def verify_file_paths_and_counts(test_dir: Path) -> bool:
"""Verify that all required file paths are present with correct counts."""
budget_file = test_dir / "total_budget.txt"
try:
content = budget_file.read_text()
lines = [line.strip() for line in content.split('\n') if line.strip()]
expense_lines = lines[:-1] # All lines except the last
# Extract file paths from expense lines
file_paths = []
for line in expense_lines:
file_path = line.split(';')[0]
file_paths.append(file_path)
# Count occurrences of each path
path_counts = Counter(file_paths)
# Expected file paths and their counts based on answer.txt
expected_paths = {
'Archives/tax_documents_2022.csv': 3,
'Documents/Personal/tax_info_2023.csv': 3,
'Documents/budget.csv': 3,
'Downloads/expenses.csv': 3,
'Downloads/price_comparisons.csv': 3
}
# Helper function to check if a path contains the expected path
def path_matches_expected(actual_path: str, expected_path: str) -> bool:
"""Check if actual path contains the expected path (allowing for prefixes like './')"""
# Remove common prefixes like './', '../', etc.
normalized_actual = actual_path
while normalized_actual.startswith('./') or normalized_actual.startswith('../'):
normalized_actual = normalized_actual[2:] if normalized_actual.startswith('./') else normalized_actual[3:]
# Check if the normalized path contains the expected path
return expected_path in normalized_actual or normalized_actual == expected_path
# Check if all expected paths are present with correct counts
for expected_path, expected_count in expected_paths.items():
# Find matching actual paths
matching_paths = []
for actual_path in path_counts.keys():
if path_matches_expected(actual_path, expected_path):
matching_paths.append(actual_path)
if not matching_paths:
print(f"❌ Missing expected file path: {expected_path}")
return False
# Sum up the counts from all matching paths
total_count = sum(path_counts[path] for path in matching_paths)
if total_count != expected_count:
print(f"❌ Path {expected_path} has wrong count: expected {expected_count}, found {total_count}")
print(f" Matching paths: {matching_paths}")
return False
# Check if there are any completely unexpected paths (not matching any expected path)
all_matching_paths = set()
for expected_path in expected_paths.keys():
for actual_path in path_counts.keys():
if path_matches_expected(actual_path, expected_path):
all_matching_paths.add(actual_path)
unexpected_paths = set(path_counts.keys()) - all_matching_paths
if unexpected_paths:
print(f"❌ Unexpected file paths found: {unexpected_paths}")
return False
print("✅ All expected file paths are present with correct counts")
return True
except Exception as e:
print(f"❌ Error checking file paths: {e}")
return False
def verify_individual_prices(test_dir: Path) -> bool:
"""Verify that all individual prices match the expected values."""
budget_file = test_dir / "total_budget.txt"
try:
content = budget_file.read_text()
lines = [line.strip() for line in content.split('\n') if line.strip()]
expense_lines = lines[:-1] # All lines except the last
# Expected prices based on answer.txt
expected_expenses = [
('Archives/tax_documents_2022.csv', 42000.00),
('Archives/tax_documents_2022.csv', 1800.00),
('Archives/tax_documents_2022.csv', 950.00),
('Documents/Personal/tax_info_2023.csv', 45000.00),
('Documents/Personal/tax_info_2023.csv', 2500.00),
('Documents/Personal/tax_info_2023.csv', 1200.00),
('Documents/budget.csv', 250.00),
('Documents/budget.csv', 180.00),
('Documents/budget.csv', 120.00),
('Downloads/expenses.csv', 45.99),
('Downloads/expenses.csv', 99.00),
('Downloads/expenses.csv', 234.50),
('Downloads/price_comparisons.csv', 879.99),
('Downloads/price_comparisons.csv', 289.99),
('Downloads/price_comparisons.csv', 74.99)
]
# Helper function to check if a path contains the expected path
def path_matches_expected(actual_path: str, expected_path: str) -> bool:
"""Check if actual path contains the expected path (allowing for prefixes like './')"""
# Remove common prefixes like './', '../', etc.
normalized_actual = actual_path
while normalized_actual.startswith('./') or normalized_actual.startswith('../'):
normalized_actual = normalized_actual[2:] if normalized_actual.startswith('./') else normalized_actual[3:]
# Check if the normalized path contains the expected path
return expected_path in normalized_actual or normalized_actual == expected_path
# Parse actual expenses
actual_expenses = []
for line in expense_lines:
parts = line.split(';')
file_path = parts[0]
price = float(parts[1])
actual_expenses.append((file_path, price))
# Create a counter for expected expenses to handle duplicates
expected_expenses_counter = Counter(expected_expenses)
actual_expenses_counter = Counter(actual_expenses)
# Check if all expected expenses are present with correct counts
for expected_expense, expected_count in expected_expenses_counter.items():
expected_path, expected_price = expected_expense
# Find matching actual expenses
matching_expenses = []
for actual_expense, actual_count in actual_expenses_counter.items():
actual_path, actual_price = actual_expense
if path_matches_expected(actual_path, expected_path) and abs(actual_price - expected_price) < 0.01:
matching_expenses.append(actual_expense)
if not matching_expenses:
print(f"❌ Missing expected expense: {expected_expense}")
return False
# Sum up the counts from all matching expenses
total_count = sum(actual_expenses_counter[expense] for expense in matching_expenses)
if total_count != expected_count:
print(f"❌ Expense {expected_expense} has wrong count: expected {expected_count}, found {total_count}")
print(f" Matching expenses: {matching_expenses}")
return False
# Check if there are any completely unexpected expenses (not matching any expected expense)
all_matching_expenses = set()
for expected_expense in expected_expenses_counter.keys():
expected_path, expected_price = expected_expense
for actual_expense in actual_expenses_counter.keys():
actual_path, actual_price = actual_expense
if path_matches_expected(actual_path, expected_path) and abs(actual_price - expected_price) < 0.01:
all_matching_expenses.add(actual_expense)
unexpected_expenses = set(actual_expenses_counter.keys()) - all_matching_expenses
if unexpected_expenses:
print(f"❌ Unexpected expenses found: {unexpected_expenses}")
return False
print("✅ All individual prices match expected values")
return True
except Exception as e:
print(f"❌ Error checking individual prices: {e}")
return False
def verify_total_price(test_dir: Path) -> bool:
"""Verify that the total price is correct."""
budget_file = test_dir / "total_budget.txt"
try:
content = budget_file.read_text()
lines = [line.strip() for line in content.split('\n') if line.strip()]
# Get the total from the last line
total_line = lines[-1]
try:
actual_total = float(total_line)
except ValueError:
print(f"❌ Last line is not a valid number: {total_line}")
return False
# Expected total based on answer.txt
expected_total = 95624.46
if abs(actual_total - expected_total) > 0.01: # Allow small floating point differences
print(f"❌ Expected total {expected_total}, found {actual_total}")
return False
print("✅ Total price is correct")
return True
except Exception as e:
print(f"❌ Error checking total price: {e}")
return False
def verify_total_calculation(test_dir: Path) -> bool:
"""Verify that the total matches the sum of individual expenses."""
budget_file = test_dir / "total_budget.txt"
try:
content = budget_file.read_text()
lines = [line.strip() for line in content.split('\n') if line.strip()]
expense_lines = lines[:-1] # All lines except the last
# Calculate sum of individual expenses
calculated_total = 0.0
for line in expense_lines:
price = float(line.split(';')[1])
calculated_total += price
# Get the stated total from the last line
stated_total = float(lines[-1])
# Check if they match (allow small floating point differences)
if abs(calculated_total - stated_total) > 0.01:
print(f"❌ Total calculation mismatch: calculated {calculated_total:.2f}, stated {stated_total:.2f}")
return False
print("✅ Total calculation is correct")
return True
except Exception as e:
print(f"❌ Error verifying total calculation: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Budget Computation Task...")
# Define verification steps
verification_steps = [
("Total Budget File Exists", verify_total_budget_file_exists),
("File Format", verify_file_format),
("Expense Entries Count", verify_expense_entries),
("File Paths and Counts", verify_file_paths_and_counts),
("Individual Prices", verify_individual_prices),
("Total Price", verify_total_price),
("Total Calculation", verify_total_calculation),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Budget computation task completed successfully!")
print("🎉 All verification steps passed")
print("📊 Summary:")
print(" - 15 expense entries found")
print(" - 5 different file paths covered")
print(" - All individual prices correct")
print(" - Total price: $95,624.46")
print(" - Calculation verified")
sys.exit(0)
else:
print("❌ Budget computation task verification: FAIL")
print("Please check the errors above and ensure all requirements are met")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/desktop_template/contact_information/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
Your task is to compile all contact information from all the files into a single CSV table. You need to extract all people's contact information and organize it systematically.
### Task Objectives
1. **Scan all files** in the directory
2. **Extract contact information** for all individuals and organizations found
3. **Create a CSV file** named `contact_info.csv` in the main directory
4. **Structure the CSV** with the following columns:
- First column: Name (required)
- Second column: Email (required)
- Third column: Phone (required)
- Additional columns: Any other contact information types found
5. **Consolidate information** by merging the same types of information across entries into single columns
6. **Leave cells blank** if specific information is not available for a person/organization
7. Entries from different files should be processed and listed separately, without any secondary processing.
### Expected Output
- **File name**: `contact_info.csv`
- **Format**: CSV with headers and data rows
### Reasoning Task
After creating the contact_info.csv file, analyze the data to answer:
**What is Charlie Davis's job/profession?**
Hint: focus on the contact information in contact_info.csv.
Write your answer in a file named `answer.txt` in the main directory.
### Important Notes
- Do not modify any existing files
- Only create the two new files: `contact_info.csv` and `answer.txt`
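For illustration only, a minimal sketch of writing the CSV with `csv.DictWriter`, which leaves cells blank when a field is missing; the rows and the extra column shown are placeholders:
```python
import csv

# Hypothetical extracted rows; any key beyond Name/Email/Phone becomes an extra column.
rows = [
    {"Name": "John Smith", "Email": "john@email.com", "Phone": "555-0101"},
    {"Name": "Acme Corp", "Email": "acme@corp.com", "Industry": "Technology"},
]

fieldnames = ["Name", "Email", "Phone", "Industry"]
with open("contact_info.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")  # restval="" leaves missing cells blank
    writer.writeheader()
    writer.writerows(rows)
```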
================================================
FILE: tasks/filesystem/standard/desktop_template/contact_information/meta.json
================================================
{
"task_id": "contact_information",
"task_name": "Contact Information",
"category_id": "desktop_template",
"category_name": "Desktop Template",
"description": "Extract contact details from various file formats on desktop and perform reasoning analysis on the collected relationship data.",
"author": "Lingjun Chen",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"data extraction",
"cross-referencing"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "desktop_template/\n ├── Archives/\n │ ├── backup_contacts.csv\n │ └── tax_documents_2022.csv\n ├── Desktop/\n │ └── contacts.csv\n ├── Documents/\n │ ├── Personal/\n │ │ └── tax_info_2023.csv\n │ ├── Projects/\n │ │ └── budget_tracker.csv\n │ ├── Work/\n │ │ ├── client_list.csv\n │ │ └── timesheet.csv\n │ ├── budget.csv\n │ └── important_dates.csv\n ├── Downloads/\n │ ├── expenses.csv\n │ ├── fitness_log.csv\n │ └── price_comparisons.csv\n ├── Temp/\n │ └── test_data.csv\n ├── book_list.txt\n ├── bookmark_export.txt\n ├── calculations.txt\n ├── correspondence_2023.txt\n ├── draft_letter.txt\n ├── emergency_contacts.txt\n ├── example.txt\n └── experiment_results.txt\n",
"stateUrl": "https://storage.mcpmark.ai/filesystem/desktop_template.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/desktop_template/contact_information/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Contact Information Compilation Task
"""
import sys
from pathlib import Path
import csv
import os
import re
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_contact_info_csv_exists(test_dir: Path) -> bool:
"""Verify that the contact_info.csv file exists in the main directory."""
contact_file = test_dir / "contact_info.csv"
if not contact_file.exists():
print("❌ File 'contact_info.csv' not found in main directory")
return False
print("✅ contact_info.csv file found")
return True
def verify_answer_txt_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists in the main directory."""
answer_file = test_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found in main directory")
return False
print("✅ answer.txt file found")
return True
def verify_csv_structure(test_dir: Path) -> bool:
"""Verify that the CSV file has the correct structure."""
contact_file = test_dir / "contact_info.csv"
try:
with open(contact_file, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
rows = list(reader)
if len(rows) < 2: # Need at least header + 1 data row
print("❌ CSV file has insufficient rows")
return False
headers = rows[0]
if not headers:
print("❌ CSV file has no headers")
return False
# Check that Name is the first column
if headers[0].lower() != 'name':
print("❌ First column is not 'Name'")
return False
# Check that Email and Phone are present (order may vary)
header_lower = [h.lower() for h in headers]
if 'email' not in header_lower:
print("❌ 'Email' column not found")
return False
if 'phone' not in header_lower:
print("❌ 'Phone' column not found")
return False
print("✅ CSV structure is correct")
return True
except Exception as e:
print(f"❌ Error reading CSV file: {e}")
return False
def verify_csv_content_accuracy(test_dir: Path) -> bool:
"""Verify that the CSV content contains all required data, regardless of row order or extra entries."""
contact_file = test_dir / "contact_info.csv"
try:
with open(contact_file, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
rows = list(reader)
# Expected data from answer.csv (hardcoded as required)
expected_data = [
{"Name": "John Smith", "Email": "john@email.com", "Phone": "555-0101", "Status": "", "Industry": ""},
{"Name": "Jane Doe", "Email": "jane@email.com", "Phone": "555-0102", "Status": "", "Industry": ""},
{"Name": "Bob Johnson", "Email": "bob@email.com", "Phone": "555-0103", "Status": "", "Industry": ""},
{"Name": "Alice Brown", "Email": "alice@email.com", "Phone": "555-0201", "Status": "Inactive", "Industry": ""},
{"Name": "Charlie Davis", "Email": "charlie@email.com", "Phone": "555-0202", "Status": "Active", "Industry": ""},
{"Name": "David Wilson", "Email": "david@email.com", "Phone": "555-0203", "Status": "Inactive", "Industry": ""},
{"Name": "Acme Corp", "Email": "acme@corp.com", "Phone": "", "Status": "", "Industry": "Technology"},
{"Name": "Global Inc", "Email": "global@inc.com", "Phone": "", "Status": "", "Industry": "Finance"},
{"Name": "Local Business", "Email": "local@biz.com", "Phone": "", "Status": "", "Industry": "Retail"},
{"Name": "Spouse", "Email": "", "Phone": "+1-555-0124", "Status": "", "Industry": ""},
{"Name": "Parent", "Email": "", "Phone": "+1-555-0125", "Status": "", "Industry": ""},
{"Name": "Sibling", "Email": "", "Phone": "+1-555-0126", "Status": "", "Industry": ""},
{"Name": "Primary Doctor", "Email": "", "Phone": "+1-555-0201", "Status": "", "Industry": ""},
{"Name": "Dentist", "Email": "", "Phone": "+1-555-0202", "Status": "", "Industry": ""},
{"Name": "Pharmacy", "Email": "", "Phone": "+1-555-0203", "Status": "", "Industry": ""}
]
# Convert expected data to a dictionary for easier lookup
# We'll use Name as the key since it should be unique
expected_dict = {}
for entry in expected_data:
expected_dict[entry["Name"]] = entry
# Check each row for accuracy, regardless of order
# Allow extra entries and mixed content
found_entries = set()
extra_entries = []
for i, row in enumerate(rows):
row_name = row.get('Name', '')
if not row_name:
# Skip rows without names (they're not valid entries)
continue
if row_name in expected_dict:
# This is one of our expected entries
if row_name in found_entries:
print(f"❌ Duplicate name found: '{row_name}'")
return False
found_entries.add(row_name)
expected = expected_dict[row_name]
# Check all columns for this entry
for key, expected_value in expected.items():
if key in row:
actual_value = row[key] if row[key] else ""
if actual_value != expected_value:
print(f"❌ Entry '{row_name}', column '{key}': expected '{expected_value}', got '{actual_value}'")
return False
else:
print(f"❌ Entry '{row_name}' missing column '{key}'")
return False
else:
# This is an extra entry - record it for informational purposes
extra_entries.append(row_name)
# Verify all expected entries were found
if len(found_entries) != len(expected_data):
missing = set(expected_dict.keys()) - found_entries
print(f"❌ Missing entries: {missing}")
return False
# Report extra entries if any
if extra_entries:
print(f"ℹ️ Found {len(extra_entries)} extra entries: {extra_entries}")
print(f"✅ CSV content accuracy verified: found all {len(expected_data)} required entries (plus {len(extra_entries)} extra entries)")
return True
except Exception as e:
print(f"❌ Error verifying CSV content: {e}")
return False
def verify_csv_data_completeness(test_dir: Path) -> bool:
"""Verify that all required data is present and no entries are missing."""
contact_file = test_dir / "contact_info.csv"
try:
with open(contact_file, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
rows = list(reader)
# Check that all expected names are present
expected_names = [
"John Smith", "Jane Doe", "Bob Johnson", "Alice Brown",
"Charlie Davis", "David Wilson", "Acme Corp", "Global Inc",
"Local Business", "Spouse", "Parent", "Sibling",
"Primary Doctor", "Dentist", "Pharmacy"
]
actual_names = [row.get('Name', '') for row in rows if row.get('Name')]
missing_names = set(expected_names) - set(actual_names)
if missing_names:
print(f"❌ Missing names: {missing_names}")
return False
extra_names = set(actual_names) - set(expected_names)
if extra_names:
print(f"⚠️ Extra names found: {extra_names}")
# This is a warning, not an error
print("✅ CSV data completeness verified")
return True
except Exception as e:
print(f"❌ Error checking data completeness: {e}")
return False
def verify_answer_content(test_dir: Path) -> bool:
"""Verify that the answer.txt contains the correct answer about Charlie Davis."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip().lower()
# The answer should contain "dentist" (as per answer.txt)
if "dentist" in content:
print("✅ Answer about Charlie Davis's job is correct")
return True
else:
print(f"❌ Answer does not contain 'dentist'. Found: '{content}'")
return False
except Exception as e:
print(f"❌ Error reading answer.txt: {e}")
return False
def verify_file_locations(test_dir: Path) -> bool:
"""Verify that files are in the correct locations."""
contact_file = test_dir / "contact_info.csv"
answer_file = test_dir / "answer.txt"
# Check that files are in the main directory, not in subdirectories
if contact_file.parent != test_dir:
print(f"❌ contact_info.csv is not in main directory: {contact_file}")
return False
if answer_file.parent != test_dir:
print(f"❌ answer.txt is not in main directory: {answer_file}")
return False
print("✅ Files are in correct locations")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Contact Information Compilation Task...")
# Define verification steps
verification_steps = [
("Contact Info CSV Exists", verify_contact_info_csv_exists),
("Answer TXT Exists", verify_answer_txt_exists),
("Files in Correct Locations", verify_file_locations),
("CSV Structure", verify_csv_structure),
("CSV Content Accuracy (Flexible)", verify_csv_content_accuracy),
("CSV Data Completeness", verify_csv_data_completeness),
("Answer Content", verify_answer_content),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Contact Information Compilation Task completed successfully!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/desktop_template/file_arrangement/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You are tasked with organizing files on an AI researcher's desktop into a structured folder system. You need to create specific folders and move files to their designated locations according to the provided organization scheme.
### Task Objectives
1. **Create the following folder structure** in the main directory:
- `work/` - for work, research, and project-related files
- `life/` - for files related to personal life
- `archives/` - for archived files or files with past dates in their file names
- `temp/` - for temporary files and drafts
- `others/` - for files that cannot be classified elsewhere
### Important Notes
- All files must be moved from their current locations to the specified folders
- The `others/` folder is for files that don't fit the other categories
- Do not modify the contents of any files, only move them to the correct locations
- If you are not sure which folder a file belongs to, read its contents before deciding
- **Do not change any file names**
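For illustration, a minimal sketch of the folder-creation and move mechanics using Python's `pathlib` and `shutil`; the `plan` mapping and its example entry are hypothetical, since deciding where each file belongs is the actual task:

```python
import shutil
from pathlib import Path

def arrange(root: Path, plan: dict[str, str]) -> None:
    """Create the five category folders and move files per `plan`.

    `plan` maps a file's current relative path to a category folder name,
    e.g. {"Documents/Work/client_list.csv": "work"}; building that mapping
    (reading file contents where the name is ambiguous) is the task itself.
    """
    for folder in ("work", "life", "archives", "temp", "others"):
        (root / folder).mkdir(exist_ok=True)
    for rel_path, folder in plan.items():
        src = root / rel_path
        # Keep the original file name; only the location changes.
        shutil.move(str(src), str(root / folder / src.name))
```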
================================================
FILE: tasks/filesystem/standard/desktop_template/file_arrangement/meta.json
================================================
{
"task_id": "file_arrangement",
"task_name": "File Arrangement",
"category_id": "desktop_template",
"category_name": "Desktop Template",
"description": "Classify and organize desktop files into appropriate categories following specified classification rules and naming convention standards.",
"author": "Lingjun Chen",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"file organization"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "desktop_template/\n ├── Archives/\n │ ├── backup_contacts.csv\n │ └── tax_documents_2022.csv\n ├── Desktop/\n │ └── contacts.csv\n ├── Documents/\n │ ├── Personal/\n │ │ └── tax_info_2023.csv\n │ ├── Projects/\n │ │ └── budget_tracker.csv\n │ ├── Work/\n │ │ ├── client_list.csv\n │ │ └── timesheet.csv\n │ ├── budget.csv\n │ └── important_dates.csv\n ├── Downloads/\n │ ├── expenses.csv\n │ ├── fitness_log.csv\n │ └── price_comparisons.csv\n ├── Temp/\n │ └── test_data.csv\n ├── book_list.txt\n ├── bookmark_export.txt\n ├── calculations.txt\n ├── correspondence_2023.txt\n ├── draft_letter.txt\n ├── emergency_contacts.txt\n ├── example.txt\n └── experiment_results.txt\n",
"stateUrl": "https://storage.mcpmark.ai/filesystem/desktop_template.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/desktop_template/file_arrangement/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Desktop File Organization Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_folder_structure(test_dir: Path) -> bool:
"""Verify that all required folders exist."""
required_folders = ["work", "life", "archives", "temp", "others"]
missing_folders = []
for folder in required_folders:
folder_path = test_dir / folder
if not folder_path.exists() or not folder_path.is_dir():
missing_folders.append(folder)
if missing_folders:
print(f"❌ Missing required folders: {missing_folders}")
return False
print("✅ All required folders exist")
return True
def verify_work_folder_files(test_dir: Path) -> bool:
"""Verify that work folder contains the required files."""
work_dir = test_dir / "work"
required_files = [
"client_list.csv",
"timesheet.csv",
"experiment_results.txt",
"budget_tracker.csv",
"expenses.csv"
]
missing_files = []
for file_name in required_files:
file_path = work_dir / file_name
if not file_path.exists():
missing_files.append(file_name)
if missing_files:
print(f"❌ Missing required files in work/ folder: {missing_files}")
return False
# Count total files in work folder for info
total_files = len([f for f in work_dir.iterdir() if f.is_file()])
print(f"✅ All required files found in work/ folder (total: {total_files} files)")
return True
def verify_life_folder_files(test_dir: Path) -> bool:
"""Verify that life folder contains the required files."""
life_dir = test_dir / "life"
required_files = [
"contacts.csv",
"budget.csv",
"fitness_log.csv",
"price_comparisons.csv",
"book_list.txt",
"bookmark_export.txt",
"emergency_contacts.txt"
]
missing_files = []
for file_name in required_files:
file_path = life_dir / file_name
if not file_path.exists():
missing_files.append(file_name)
if missing_files:
print(f"❌ Missing required files in life/ folder: {missing_files}")
return False
# Count total files in life folder for info
total_files = len([f for f in life_dir.iterdir() if f.is_file()])
print(f"✅ All required files found in life/ folder (total: {total_files} files)")
return True
def verify_archives_folder_files(test_dir: Path) -> bool:
"""Verify that archives folder contains the required files."""
archives_dir = test_dir / "archives"
required_files = [
"backup_contacts.csv",
"tax_documents_2022.csv",
"correspondence_2023.txt",
"tax_info_2023.csv"
]
missing_files = []
for file_name in required_files:
file_path = archives_dir / file_name
if not file_path.exists():
missing_files.append(file_name)
if missing_files:
print(f"❌ Missing required files in archives/ folder: {missing_files}")
return False
# Count total files in archives folder for info
total_files = len([f for f in archives_dir.iterdir() if f.is_file()])
print(f"✅ All required files found in archives/ folder (total: {total_files} files)")
return True
def verify_temp_folder_files(test_dir: Path) -> bool:
"""Verify that temp folder contains the required files."""
temp_dir = test_dir / "temp"
required_files = [
"test_data.csv",
"draft_letter.txt"
]
missing_files = []
for file_name in required_files:
file_path = temp_dir / file_name
if not file_path.exists():
missing_files.append(file_name)
if missing_files:
print(f"❌ Missing required files in temp/ folder: {missing_files}")
return False
# Count total files in temp folder for info
total_files = len([f for f in temp_dir.iterdir() if f.is_file()])
print(f"✅ All required files found in temp/ folder (total: {total_files} files)")
return True
def verify_others_folder_files(test_dir: Path) -> bool:
"""Verify that others folder exists and can contain any files."""
others_dir = test_dir / "others"
if not others_dir.exists() or not others_dir.is_dir():
print("❌ others/ folder not found")
return False
# Count files in others folder for info
total_files = len([f for f in others_dir.iterdir() if f.is_file()])
print(f"✅ others/ folder exists (contains {total_files} files)")
return True
def verify_required_files_in_correct_folders(test_dir: Path) -> bool:
"""Verify that all 18 required files are in their correct designated folders."""
# Define the mapping of required files to their correct folders
required_file_mapping = {
"work": [
"client_list.csv",
"timesheet.csv",
"experiment_results.txt",
"budget_tracker.csv",
"expenses.csv",
],
"life": [
"contacts.csv",
"budget.csv",
"fitness_log.csv",
"price_comparisons.csv",
"book_list.txt",
"bookmark_export.txt",
"emergency_contacts.txt"
],
"archives": [
"backup_contacts.csv",
"tax_documents_2022.csv",
"correspondence_2023.txt",
"tax_info_2023.csv"
],
"temp": [
"test_data.csv",
"draft_letter.txt"
]
}
missing_files = []
# Check each required file is in its correct folder
for folder, files in required_file_mapping.items():
folder_path = test_dir / folder
for file_name in files:
file_path = folder_path / file_name
if not file_path.exists():
missing_files.append(f"{folder}/{file_name}")
if missing_files:
print(f"❌ Missing required files: {missing_files}")
return False
print("✅ All 18 required files are in their correct designated folders")
return True
def verify_no_duplicate_required_files(test_dir: Path) -> bool:
"""Verify that the 18 required files are not duplicated across folders."""
required_files = [
"client_list.csv", "timesheet.csv", "experiment_results.txt", "budget_tracker.csv",
"contacts.csv", "budget.csv", "expenses.csv", "fitness_log.csv",
"price_comparisons.csv", "book_list.txt", "bookmark_export.txt", "emergency_contacts.txt",
"backup_contacts.csv", "tax_documents_2022.csv", "correspondence_2023.txt", "tax_info_2023.csv",
"test_data.csv", "draft_letter.txt"
]
# Check for duplicates of required files
file_locations = {}
duplicates = []
for folder in ["work", "life", "archives", "temp", "others"]:
folder_path = test_dir / folder
if folder_path.exists() and folder_path.is_dir():
for file_path in folder_path.iterdir():
if file_path.is_file() and file_path.name in required_files:
if file_path.name in file_locations:
duplicates.append(f"{file_path.name} (in {file_locations[file_path.name]} and {folder}/)")
else:
file_locations[file_path.name] = f"{folder}/"
if duplicates:
print(f"❌ Duplicate required files found: {duplicates}")
return False
print("✅ No duplicate required files found")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Desktop File Organization Task...")
# Define verification steps
verification_steps = [
("Folder Structure", verify_folder_structure),
("Required Files in Work Folder", verify_work_folder_files),
("Required Files in Life Folder", verify_life_folder_files),
("Required Files in Archives Folder", verify_archives_folder_files),
("Required Files in Temp Folder", verify_temp_folder_files),
("Others Folder Exists", verify_others_folder_files),
("All Required Files in Correct Folders", verify_required_files_in_correct_folders),
("No Duplicate Required Files", verify_no_duplicate_required_files),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Desktop file organization task completed successfully!")
print("🎉 All 18 required files are correctly placed in their designated folders")
print("📊 Summary:")
print(" - work/ folder: 5 required files")
print(" - life/ folder: 7 required files")
print(" - archives/ folder: 4 required files")
print(" - temp/ folder: 2 required files")
print(" - others/ folder: can contain any files")
print(" - Total required files: 18")
print(" - Note: Other files can be placed in any folder")
sys.exit(0)
else:
print("❌ Desktop file organization task verification: FAIL")
print("Please check the errors above and ensure all 18 required files are in their correct locations")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_context/duplicates_searching/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You are given a directory containing multiple text files. Some files have identical content and need to be organized. Your task is to identify all files with duplicate content and move them to a newly created 'duplicates' directory.
### Task Objectives
1. **Scan all text files** in the test directory to identify groups with identical content
2. **Create a 'duplicates' directory** in the test directory root
3. **Move all duplicate files** into the 'duplicates' directory
4. **Leave unique files** in their original location
### Expected Output
After completing the task, the directory structure should be:
- `duplicates/` directory containing all files with duplicate content
- Original directory containing only files with unique content
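For illustration, a minimal sketch of one way to find the duplicate groups: hash each file's content (the verification script below uses the same MD5 approach for its integrity check) and move every member of a group that contains more than one file:

```python
import hashlib
import shutil
from collections import defaultdict
from pathlib import Path

def move_duplicates(test_dir: Path) -> None:
    """Group .txt files by content hash and move duplicate groups."""
    groups: dict[str, list[Path]] = defaultdict(list)
    for path in sorted(test_dir.glob("*.txt")):
        digest = hashlib.md5(path.read_bytes()).hexdigest()
        groups[digest].append(path)

    dup_dir = test_dir / "duplicates"
    dup_dir.mkdir(exist_ok=True)
    for paths in groups.values():
        if len(paths) > 1:  # every member of a duplicate group moves
            for path in paths:
                shutil.move(str(path), str(dup_dir / path.name))
```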
================================================
FILE: tasks/filesystem/standard/file_context/duplicates_searching/meta.json
================================================
{
"task_id": "duplicates_searching",
"task_name": "Duplicates Searching",
"category_id": "file_context",
"category_name": "File Context",
"description": "Scan directory to identify files with identical content, then organize all duplicate files into a separate dedicated directory for cleanup.",
"author": "Lingjun Chen",
"created_at": "2025-08-06",
"difficulty": "L3",
"tags": [
"pattern analysis",
"file organization"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_context/duplicates_searching/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Duplicates Detection and Organization Task
"""
import sys
from pathlib import Path
import os
import hashlib
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def calculate_file_hash(file_path: Path) -> str:
"""Calculate MD5 hash of file content."""
try:
with open(file_path, 'rb') as f:
return hashlib.md5(f.read()).hexdigest()
except Exception as e:
print(f"❌ Error reading file {file_path}: {e}")
return None
def verify_duplicates_directory_exists(test_dir: Path) -> bool:
"""Verify that the duplicates directory exists."""
duplicates_dir = test_dir / "duplicates"
if not duplicates_dir.exists():
print("❌ 'duplicates' directory not found")
return False
if not duplicates_dir.is_dir():
print("❌ 'duplicates' exists but is not a directory")
return False
print("✅ 'duplicates' directory exists")
return True
def get_expected_duplicate_groups():
"""Return the expected duplicate file groups based on content analysis."""
# Based on the answer.md and content analysis
return {
# Group 1: file_01.txt, file_02.txt (identical content)
"group1": ["file_01.txt", "file_02.txt"],
# Group 2: file_03.txt, file_04.txt (identical content)
"group2": ["file_03.txt", "file_04.txt"],
# Group 3: file_07.txt, file_08.txt (identical content)
"group3": ["file_07.txt", "file_08.txt"],
# Group 4: file_10.txt, file_11.txt (identical content)
"group4": ["file_10.txt", "file_11.txt"],
# Group 5: file_13.txt, file_14.txt (identical content)
"group5": ["file_13.txt", "file_14.txt"],
# Group 6: file_15.txt, file_16.txt (identical content)
"group6": ["file_15.txt", "file_16.txt"],
# Group 7: file_18.txt, file_19.txt (identical content)
"group7": ["file_18.txt", "file_19.txt"]
}
def get_expected_unique_files():
"""Return the expected unique files that should remain in original location."""
return [
"file_05.txt", "file_06.txt", "file_09.txt",
"file_12.txt", "file_17.txt", "file_20.txt"
]
def verify_duplicate_files_moved(test_dir: Path) -> bool:
"""Verify that all duplicate files have been moved to the duplicates directory."""
duplicates_dir = test_dir / "duplicates"
expected_duplicate_groups = get_expected_duplicate_groups()
# Check that all expected duplicate files are in the duplicates directory
missing_files = []
for group_name, files in expected_duplicate_groups.items():
for filename in files:
file_path = duplicates_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing duplicate files in 'duplicates' directory: {missing_files}")
return False
print("✅ All expected duplicate files are in the 'duplicates' directory")
return True
def verify_unique_files_remain(test_dir: Path) -> bool:
"""Verify that unique files remain in the original location."""
expected_unique_files = get_expected_unique_files()
missing_files = []
for filename in expected_unique_files:
file_path = test_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing unique files in original location: {missing_files}")
return False
print("✅ All expected unique files remain in the original location")
return True
def verify_no_duplicate_files_in_original(test_dir: Path) -> bool:
"""Verify that no duplicate files remain in the original location."""
expected_duplicate_groups = get_expected_duplicate_groups()
remaining_duplicates = []
for group_name, files in expected_duplicate_groups.items():
for filename in files:
file_path = test_dir / filename
if file_path.exists():
remaining_duplicates.append(filename)
if remaining_duplicates:
print(f"❌ Duplicate files still exist in original location: {remaining_duplicates}")
return False
print("✅ No duplicate files remain in the original location")
return True
def verify_content_integrity(test_dir: Path) -> bool:
"""Verify that file content integrity is maintained after moving."""
duplicates_dir = test_dir / "duplicates"
expected_duplicate_groups = get_expected_duplicate_groups()
# Check that files in each duplicate group have identical content
for group_name, files in expected_duplicate_groups.items():
if len(files) < 2:
continue
# Calculate hash of the first file in the group
first_file = duplicates_dir / files[0]
if not first_file.exists():
print(f"❌ First file of group {group_name} not found: {files[0]}")
return False
first_hash = calculate_file_hash(first_file)
if first_hash is None:
return False
# Check that all other files in the group have the same hash
for filename in files[1:]:
file_path = duplicates_dir / filename
if not file_path.exists():
print(f"❌ File in group {group_name} not found: {filename}")
return False
file_hash = calculate_file_hash(file_path)
if file_hash is None:
return False
if file_hash != first_hash:
print(f"❌ Files in group {group_name} have different content: {files[0]} vs {filename}")
return False
print("✅ Content integrity verified - duplicate files have identical content")
return True
def verify_total_file_count(test_dir: Path) -> bool:
"""Verify that the duplicates directory contains exactly 14 files."""
duplicates_dir = test_dir / "duplicates"
# Count files in original location (excluding the duplicates directory itself)
original_files = [f for f in test_dir.iterdir() if f.is_file()]
# Count files in duplicates directory
duplicate_files = [f for f in duplicates_dir.iterdir() if f.is_file()]
# Expected: 14 files in duplicates directory
expected_duplicates = 14
actual_duplicates = len(duplicate_files)
if actual_duplicates != expected_duplicates:
print(f"❌ Wrong number of files in duplicates directory. Expected: {expected_duplicates}, Actual: {actual_duplicates}")
return False
print(f"✅ Duplicates directory has correct number of files: {actual_duplicates}")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying File Duplicates Detection and Organization Task...")
# Define verification steps
verification_steps = [
("Duplicates Directory Exists", verify_duplicates_directory_exists),
("Duplicate Files Moved", verify_duplicate_files_moved),
("Unique Files Remain", verify_unique_files_remain),
("No Duplicates in Original", verify_no_duplicate_files_in_original),
("Content Integrity", verify_content_integrity),
("Duplicates Count", verify_total_file_count),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ File duplicates detection and organization completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_context/file_merging/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You are given a directory containing multiple text files of varying sizes. Your task is to identify the 10 smallest .txt files, merge their content in alphabetical order, and create a consolidated file called "merged_content.txt" with proper formatting.
### Task Objectives
1. **Identify the 10 smallest .txt files** in the test directory
2. **Sort the selected files alphabetically** by filename
3. **Merge the content** of these files into a single file
4. **Add file headers** (file name) before each file's content
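For illustration, a minimal sketch of the selection and merge steps; the `=== name ===` header style is an assumption, since the task only requires that each section be preceded by the file's name:

```python
from pathlib import Path

def merge_smallest(test_dir: Path, count: int = 10) -> None:
    """Merge the `count` smallest .txt files, alphabetically, with headers."""
    candidates = [p for p in test_dir.glob("*.txt") if p.name != "merged_content.txt"]
    # Select by file size, then order the selection alphabetically by name.
    smallest = sorted(candidates, key=lambda p: p.stat().st_size)[:count]
    parts = []
    for path in sorted(smallest, key=lambda p: p.name):
        parts.append(f"=== {path.name} ===\n{path.read_text()}")
    (test_dir / "merged_content.txt").write_text("\n".join(parts))
```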
================================================
FILE: tasks/filesystem/standard/file_context/file_merging/meta.json
================================================
{
"task_id": "file_merging",
"task_name": "File Merging",
"category_id": "file_context",
"category_name": "File Context",
"description": "Identify the 10 smallest text files in the directory, then merge their content in alphabetical order into a single consolidated file.",
"author": "Lingjun Chen",
"created_at": "2025-08-07",
"difficulty": "L3",
"tags": [
"content transformation",
"file organization"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_context/file_merging/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Merging Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def get_expected_files() -> list:
"""Get the expected 10 files in alphabetical order."""
# The 10 smallest files (excluding file_12.txt) in alphabetical order
expected_files = [
"file_10.txt",
"file_11.txt",
"file_13.txt",
"file_14.txt",
"file_15.txt",
"file_16.txt",
"file_17.txt",
"file_18.txt",
"file_19.txt",
"file_20.txt"
]
return expected_files
def verify_merged_file_exists(test_dir: Path) -> bool:
"""Verify that the merged_content.txt file exists."""
merged_file = test_dir / "merged_content.txt"
if not merged_file.exists():
print("❌ File 'merged_content.txt' not found")
return False
print("✅ Merged content file found")
return True
def verify_correct_files_selected(test_dir: Path) -> bool:
"""Verify that the correct 10 files were selected and included."""
expected_files = get_expected_files()
merged_file = test_dir / "merged_content.txt"
try:
content = merged_file.read_text()
# Check if all expected files are present
for expected_file in expected_files:
if expected_file not in content:
print(f"❌ Expected file '{expected_file}' not found in merged content")
return False
# Check if file_12.txt is NOT present (should be excluded)
if "file_12.txt" in content:
print("❌ file_12.txt should be excluded but was found in merged content")
return False
print("✅ Correct files selected and included")
return True
except Exception as e:
print(f"❌ Error verifying file selection: {e}")
return False
def verify_alphabetical_order(test_dir: Path) -> bool:
"""Verify that files are in alphabetical order."""
expected_files = get_expected_files()
merged_file = test_dir / "merged_content.txt"
try:
content = merged_file.read_text()
lines = content.split('\n')
# Extract filenames from the content (lines that contain .txt)
found_files = []
for line in lines:
line = line.strip()
# Check if this line contains any of the expected filenames
for expected_file in expected_files:
if expected_file in line:
found_files.append(expected_file)
break
# Check if files are in alphabetical order
if found_files != expected_files:
print(f"❌ Files not in correct alphabetical order")
print(f" Expected: {expected_files}")
print(f" Found: {found_files}")
return False
print("✅ Files are in correct alphabetical order")
return True
except Exception as e:
print(f"❌ Error verifying alphabetical order: {e}")
return False
def verify_file_content_integrity(test_dir: Path) -> bool:
"""Verify that the content of each file is preserved correctly."""
expected_files = get_expected_files()
merged_file = test_dir / "merged_content.txt"
try:
content = merged_file.read_text()
lines = content.split('\n')
for expected_file in expected_files:
# Get the original file content
original_file = test_dir / expected_file
original_content = original_file.read_text().strip()
# Find the line index where this file's header appears
header_line_index = -1
for i, line in enumerate(lines):
if expected_file in line:
header_line_index = i
break
if header_line_index == -1:
print(f"❌ Could not find header for {expected_file}")
return False
# Find the next header line or end of file
next_header_index = len(lines)
for i in range(header_line_index + 1, len(lines)):
for other_file in expected_files:
if other_file != expected_file and other_file in lines[i]:
next_header_index = i
break
if next_header_index != len(lines):
break
# Extract content lines (from header + 1 to next header)
content_lines = lines[header_line_index + 1:next_header_index]
merged_content = '\n'.join(content_lines).strip()
if merged_content != original_content:
print(f"❌ Content mismatch for {expected_file}")
print(f" Expected: {original_content}")
print(f" Found: {merged_content}")
return False
print("✅ All file contents preserved correctly")
return True
except Exception as e:
print(f"❌ Error verifying content integrity: {e}")
return False
def verify_filename_headers(test_dir: Path) -> bool:
"""Verify that each file section starts with the correct filename header."""
expected_files = get_expected_files()
merged_file = test_dir / "merged_content.txt"
try:
content = merged_file.read_text()
for expected_file in expected_files:
# Check if the filename appears anywhere in the content (as part of a line)
if expected_file not in content:
print(f"❌ Filename header '{expected_file}' not found")
return False
print("✅ All filename headers present and correctly formatted")
return True
except Exception as e:
print(f"❌ Error verifying filename headers: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying File Merging Task...")
# Show expected files for debugging
expected_files = get_expected_files()
print(f"📋 Expected files (10 smallest, excluding file_12.txt): {expected_files}")
# Define verification steps
verification_steps = [
("Merged File Exists", verify_merged_file_exists),
("Correct Files Selected", verify_correct_files_selected),
("Alphabetical Order", verify_alphabetical_order),
("Filename Headers", verify_filename_headers),
("Content Integrity", verify_file_content_integrity),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ File merging task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_context/file_splitting/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You need to split a large text file into multiple smaller files with equal character counts. The task involves creating a new directory and splitting the content into exactly 10 files.
### Task Objectives
1. **Create a new directory** named `split` in the test directory
2. **Split the file** `large_file.txt` into exactly 10 files with equal character counts
3. **Name the files** as `split_01.txt`, `split_02.txt`, ..., `split_10.txt` in the `split` directory
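For illustration, a minimal sketch of the splitting arithmetic, assuming the character count of `large_file.txt` divides evenly by 10 (otherwise the trailing remainder would need separate handling):

```python
from pathlib import Path

def split_file(test_dir: Path, parts: int = 10) -> None:
    """Split large_file.txt into `parts` files with equal character counts."""
    text = (test_dir / "large_file.txt").read_text()
    size = len(text) // parts  # assumes len(text) divides evenly by `parts`
    out_dir = test_dir / "split"
    out_dir.mkdir(exist_ok=True)
    for i in range(parts):
        chunk = text[i * size : (i + 1) * size]
        (out_dir / f"split_{i + 1:02d}.txt").write_text(chunk)
```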
================================================
FILE: tasks/filesystem/standard/file_context/file_splitting/meta.json
================================================
{
"task_id": "file_splitting",
"task_name": "File Splitting",
"category_id": "file_context",
"category_name": "File Context",
"description": "Split a large text file into multiple equal-length segments for easier processing, distribution, and parallel handling of content.",
"author": "Lingjun Chen",
"created_at": "2025-08-08",
"difficulty": "L3",
"tags": [
"content transformation"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_context/file_splitting/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Splitting Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_split_directory_exists(test_dir: Path) -> bool:
"""Verify that the split directory exists."""
split_dir = test_dir / "split"
if not split_dir.exists():
print("❌ Directory 'split' not found")
return False
if not split_dir.is_dir():
print("❌ 'split' exists but is not a directory")
return False
print("✅ Split directory found")
return True
def verify_all_split_files_exist(test_dir: Path) -> bool:
"""Verify that all 10 split files exist with correct names."""
split_dir = test_dir / "split"
expected_files = [f"split_{i:02d}.txt" for i in range(1, 11)]
missing_files = []
for filename in expected_files:
file_path = split_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing files: {missing_files}")
return False
print("✅ All 10 split files exist with correct names")
return True
def verify_equal_file_lengths(test_dir: Path) -> bool:
"""Verify that all split files have equal character counts."""
split_dir = test_dir / "split"
file_lengths = []
for i in range(1, 11):
filename = f"split_{i:02d}.txt"
file_path = split_dir / filename
try:
content = file_path.read_text()
file_lengths.append(len(content))
except Exception as e:
print(f"❌ Error reading {filename}: {e}")
return False
# Check if all lengths are equal
if len(set(file_lengths)) != 1:
print(f"❌ File lengths are not equal: {file_lengths}")
return False
print(f"✅ All files have equal length: {file_lengths[0]} characters")
return True
def verify_content_integrity(test_dir: Path) -> bool:
"""Verify that concatenated split files equal the original file."""
split_dir = test_dir / "split"
original_file = test_dir / "large_file.txt"
# Read original content
try:
original_content = original_file.read_text()
except Exception as e:
print(f"❌ Error reading original file: {e}")
return False
# Concatenate all split files
concatenated_content = ""
for i in range(1, 11):
filename = f"split_{i:02d}.txt"
file_path = split_dir / filename
try:
content = file_path.read_text()
concatenated_content += content
except Exception as e:
print(f"❌ Error reading {filename}: {e}")
return False
# Compare content
if concatenated_content != original_content:
print("❌ Concatenated content does not match original file")
print(f" Original length: {len(original_content)}")
print(f" Concatenated length: {len(concatenated_content)}")
return False
print("✅ Concatenated content matches original file exactly")
return True
def verify_no_extra_files(test_dir: Path) -> bool:
"""Verify that no extra files exist in the split directory."""
split_dir = test_dir / "split"
expected_files = {f"split_{i:02d}.txt" for i in range(1, 11)}
actual_files = {f.name for f in split_dir.iterdir() if f.is_file()}
extra_files = actual_files - expected_files
if extra_files:
print(f"❌ Extra files found in split directory: {extra_files}")
return False
print("✅ No extra files in split directory")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying File Splitting Task...")
# Define verification steps
verification_steps = [
("Split Directory Exists", verify_split_directory_exists),
("All Split Files Exist", verify_all_split_files_exist),
("Equal File Lengths", verify_equal_file_lengths),
("Content Integrity", verify_content_integrity),
("No Extra Files", verify_no_extra_files),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ File splitting task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_context/pattern_matching/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
Your task is to find all files that contain a substring of 30 or more characters that also appears in `large_file.txt`. **You are not allowed to use Python code.**
### Task Objectives
1. **Read the reference file** `large_file.txt` to understand its content
2. **Examine each file** from file_01.txt to file_20.txt
3. **Find files** that contain a substring of 30 or more characters that matches a substring in `large_file.txt`
4. **Create a file `answer.txt`** and write the results to it with the following format:
- One line per matching file
- Format: `filename.txt,start_position`
- Where start_position is the character position (1-indexed) of the matching substring in `large_file.txt`
- Do not add anything other than `filename.txt,start_position`.
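The task itself must be solved without Python, but for reference, the check being performed is a brute-force shared-substring search, essentially what the verification script below recomputes; a minimal sketch:

```python
from typing import Optional

def longest_shared_start(file_text: str, large_text: str, min_len: int = 30) -> Optional[int]:
    """Return the 1-indexed position in `large_text` of the longest substring
    of `file_text` (at least `min_len` characters long) that also occurs in
    `large_text`, or None if no such substring exists."""
    best = ""
    best_start = None
    for start in range(len(file_text)):
        for end in range(start + min_len, len(file_text) + 1):
            candidate = file_text[start:end]
            if candidate in large_text and len(candidate) > len(best):
                best = candidate
                best_start = large_text.find(candidate) + 1  # 1-indexed
    return best_start
```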
================================================
FILE: tasks/filesystem/standard/file_context/pattern_matching/meta.json
================================================
{
"task_id": "pattern_matching",
"task_name": "Pattern Matching",
"category_id": "file_context",
"category_name": "File Context",
"description": "Search multiple files for shared character sequences and precisely locate all matching pattern occurrences within the target files.",
"author": "Lingjun Chen",
"created_at": "2025-08-06",
"difficulty": "L3",
"tags": [
"pattern analysis",
"cross-referencing"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_context/pattern_matching/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Filtering Task: Find Files with Common Substring
"""
import sys
from pathlib import Path
import os
import re
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists."""
answer_file = test_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found")
return False
print("✅ Answer file found")
return True
def verify_answer_format(test_dir: Path) -> bool:
"""Verify that the answer file has the correct format."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# If file is empty, that's acceptable (no matches found)
if not content:
print("✅ Answer file is empty (no matches found)")
return True
lines = content.split('\n')
for i, line in enumerate(lines, 1):
line = line.strip()
if not line:
continue
# Check format: filename.txt,start_position
parts = line.split(',')
if len(parts) != 2:
print(f"❌ Line {i} has incorrect format: {line}")
print(" Expected format: filename.txt,start_position")
return False
filename, start_pos = parts
# Check filename format
if not filename.endswith('.txt') or not filename.startswith('file_'):
print(f"❌ Line {i} has invalid filename: {filename}")
return False
# Check position format (should be integer)
try:
start_int = int(start_pos)
if start_int <= 0:
print(f"❌ Line {i} has invalid position: {start_pos}")
return False
except ValueError:
print(f"❌ Line {i} has non-integer position: {start_pos}")
return False
print("✅ Answer format is correct")
return True
except Exception as e:
print(f"❌ Error reading answer file: {e}")
return False
def find_30_plus_char_matches(test_dir: Path) -> dict:
"""Find all matches with 30 or more characters between files and large_file.txt."""
large_file = test_dir / "large_file.txt"
if not large_file.exists():
print("❌ large_file.txt not found")
return {}
large_content = large_file.read_text()
matches = {}
# Check each file from file_01.txt to file_20.txt
for i in range(1, 21):
filename = f"file_{i:02d}.txt"
file_path = test_dir / filename
if not file_path.exists():
continue
file_content = file_path.read_text()
# Find the longest matching substring (30+ characters)
longest_match = ""
longest_match_start = -1
# Check all possible substrings in the file
for start_pos in range(len(file_content)):
for end_pos in range(start_pos + 30, len(file_content) + 1): # At least 30 characters
substring = file_content[start_pos:end_pos]
# Check if this substring exists in large_file.txt
if substring in large_content:
if len(substring) > len(longest_match):
longest_match = substring
# Find the position in large_file.txt where this substring starts
large_start_pos = large_content.find(substring)
longest_match_start = large_start_pos + 1 # 1-indexed
# If we found a match of 30+ characters, record it
if longest_match and len(longest_match) >= 30:
matches[filename] = longest_match_start
return matches
def verify_matches_are_correct(test_dir: Path) -> bool:
"""Verify that the matches found in answer.txt are actually correct."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# If no content, check if there should actually be no matches
if not content:
expected_matches = find_30_plus_char_matches(test_dir)
if expected_matches:
print("❌ Answer file is empty but matches should exist")
for filename, start_pos in expected_matches.items():
print(f" Expected: {filename},{start_pos}")
return False
else:
print("✅ No matches found (correct)")
return True
# Parse answer file
answer_matches = {}
lines = content.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
filename, start_pos = line.split(',')
answer_matches[filename] = int(start_pos)
# Get expected matches
expected_matches = find_30_plus_char_matches(test_dir)
# Check if all answer matches are correct
for filename, start_pos in answer_matches.items():
if filename not in expected_matches:
print(f"❌ File {filename} listed in answer but has no valid 30+ character match")
return False
expected_start = expected_matches[filename]
if start_pos != expected_start:
print(f"❌ Incorrect match position for {filename}")
print(f" Expected: {expected_start}")
print(f" Found: {start_pos}")
return False
# Check if all expected matches are in answer
for filename in expected_matches:
if filename not in answer_matches:
print(f"❌ Missing match for {filename} in answer file")
return False
print("✅ All matches are correct")
return True
except Exception as e:
print(f"❌ Error verifying matches: {e}")
return False
def verify_match_length_is_30_plus(test_dir: Path) -> bool:
"""Verify that all matches are at least 30 characters long."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
if not content:
return True # No matches to verify
large_file = test_dir / "large_file.txt"
large_content = large_file.read_text()
lines = content.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
filename, start_pos = line.split(',')
start_int = int(start_pos)
# Get the file content to check the match
file_path = test_dir / filename
file_content = file_path.read_text()
# Find the longest matching substring starting from the given position
longest_match = ""
for end_pos in range(start_int + 30 - 1, len(large_content) + 1): # At least 30 characters
substring = large_content[start_int - 1:end_pos] # Convert to 0-indexed
if substring in file_content:
longest_match = substring
else:
break
if len(longest_match) < 30:
print(f"❌ Match in {filename} is {len(longest_match)} characters, less than 30")
print(f" Starting position: {start_int}")
return False
print("✅ All matches are at least 30 characters long")
return True
except Exception as e:
print(f"❌ Error verifying match lengths: {e}")
return False
def verify_files_exist(test_dir: Path) -> bool:
"""Verify that all files mentioned in answer.txt actually exist."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
if not content:
return True # No files to verify
lines = content.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
filename = line.split(',')[0]
file_path = test_dir / filename
if not file_path.exists():
print(f"❌ File mentioned in answer does not exist: {filename}")
return False
print("✅ All files mentioned in answer exist")
return True
except Exception as e:
print(f"❌ Error verifying file existence: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Pattern Matching Task: Find Files with Common Substring...")
# Define verification steps
verification_steps = [
("Answer File Exists", verify_answer_file_exists),
("Answer Format", verify_answer_format),
("Files Exist", verify_files_exist),
("Match Length is 30+", verify_match_length_is_30_plus),
("Matches are Correct", verify_matches_are_correct),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ File filtering task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_context/uppercase/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You need to process 10 text files (file_01.txt to file_10.txt) and convert their content to uppercase format.
### Task Objectives
1. **Create an uppercase directory** in the test environment root
2. **Convert each file** from file_01.txt to file_10.txt to uppercase
3. **Save converted files** in the uppercase/ directory with the same names
4. **Count words** in each original file (file_01.txt to file_10.txt)
5. **Create answer.txt** with word counts in the specified format.
### Specified Format of answer.txt
Create a file named `answer.txt` in the `uppercase/` directory.
**Requirements:**
- Each line should follow the format: `filename:word_count`
- Include all 10 files: file_01.txt, file_02.txt, ..., file_10.txt
- Use the exact filename format (file_01.txt, file_02.txt, etc.)
- One entry per line
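For illustration, a minimal sketch of the conversion and per-file word count (whitespace-delimited, matching how the verification script below counts words); the helper name is an assumption:

```python
from pathlib import Path

def convert_and_count(test_dir: Path) -> None:
    """Convert file_01.txt..file_10.txt to uppercase and record word counts."""
    out_dir = test_dir / "uppercase"
    out_dir.mkdir(exist_ok=True)
    lines = []
    for i in range(1, 11):
        name = f"file_{i:02d}.txt"
        text = (test_dir / name).read_text()
        (out_dir / name).write_text(text.upper())
        lines.append(f"{name}:{len(text.split())}")  # whitespace-separated word count
    (out_dir / "answer.txt").write_text("\n".join(lines))
```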
================================================
FILE: tasks/filesystem/standard/file_context/uppercase/meta.json
================================================
{
"task_id": "uppercase",
"task_name": "Uppercase",
"category_id": "file_context",
"category_name": "File Context",
"description": "Convert the content of 10 specified files to uppercase format and calculate the total word count across all processed files.",
"author": "Lingjun Chen",
"created_at": "2025-08-19",
"difficulty": "L3",
"tags": [
"content transformation",
"data extraction"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_context/uppercase/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Context Task: Convert Files to Uppercase
"""
import sys
from pathlib import Path
import os
import re
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_uppercase_directory_exists(test_dir: Path) -> bool:
"""Verify that the uppercase directory exists."""
uppercase_dir = test_dir / "uppercase"
if not uppercase_dir.exists():
print("| ❌ Directory 'uppercase' not found")
return False
if not uppercase_dir.is_dir():
print("| ❌ 'uppercase' exists but is not a directory")
return False
print("| ✓ Uppercase directory found")
return True
def verify_uppercase_files_exist(test_dir: Path) -> bool:
"""Verify that all 10 uppercase files exist."""
uppercase_dir = test_dir / "uppercase"
for i in range(1, 11):
filename = f"file_{i:02d}.txt"
file_path = uppercase_dir / filename
if not file_path.exists():
print(f"| ❌ File '{filename}' not found in uppercase directory")
return False
print("| ✓ All 10 uppercase files found")
return True
def verify_uppercase_content(test_dir: Path) -> bool:
"""Verify that uppercase files contain the correct uppercase content."""
uppercase_dir = test_dir / "uppercase"
for i in range(1, 11):
filename = f"file_{i:02d}.txt"
original_file = test_dir / filename
uppercase_file = uppercase_dir / filename
if not original_file.exists():
print(f"| ❌ Original file '{filename}' not found")
return False
try:
original_content = original_file.read_text()
uppercase_content = uppercase_file.read_text()
# Check if uppercase content is the uppercase version of original
expected_uppercase = original_content.upper()
if uppercase_content != expected_uppercase:
print(f"| ❌ File '{filename}' content is not properly converted to uppercase")
return False
except Exception as e:
print(f"| ❌ Error reading file '{filename}': {e}")
return False
print("| ✓ All uppercase files contain correct uppercase content")
return True
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists in the uppercase directory."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
if not answer_file.exists():
print("| ❌ File 'answer.txt' not found in uppercase directory")
return False
print("| ✓ Answer file found in uppercase directory")
return True
def verify_answer_format(test_dir: Path) -> bool:
"""Verify that the answer file has the correct format."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
if not content:
print("| ❌ Answer file is empty")
return False
lines = content.split('\n')
# Check if we have exactly 10 lines
if len(lines) != 10:
print(f"| ❌ Answer file has {len(lines)} lines, expected 10")
return False
for i, line in enumerate(lines, 1):
line = line.strip()
if not line:
print(f"| ❌ Line {i} is empty")
return False
# Check format: filename:word_count
if ':' not in line:
print(f"| ❌ Line {i} has incorrect format: {line}")
print(" Expected format: filename:word_count")
return False
parts = line.split(':', 1)
if len(parts) != 2:
print(f"| ❌ Line {i} has incorrect format: {line}")
print(" Expected format: filename:word_count")
return False
filename, word_count_str = parts
# Check filename format
if not filename.endswith('.txt') or not filename.startswith('file_'):
print(f"| ❌ Line {i} has invalid filename: {filename}")
return False
# Check word count format (should be integer)
try:
word_count = int(word_count_str)
if word_count <= 0:
print(f"| ❌ Line {i} has invalid word count: {word_count_str}")
return False
except ValueError:
print(f"| ❌ Line {i} has non-integer word count: {word_count_str}")
return False
print("| ✓ Answer format is correct")
return True
except Exception as e:
print(f"| ❌ Error reading answer file: {e}")
return False
def count_words_in_file(file_path: Path) -> int:
"""Count words in a file."""
try:
content = file_path.read_text()
# Split by whitespace and filter out empty strings
words = [word for word in content.split() if word.strip()]
return len(words)
except Exception as e:
print(f"| ❌ Error reading file {file_path}: {e}")
return 0
def verify_word_counts_are_correct(test_dir: Path) -> bool:
"""Verify that the word counts in answer.txt are correct."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
lines = content.split('\n')
# Expected word counts based on answer.md
expected_counts = [22, 22, 22, 22, 18, 22, 22, 22, 18, 20]
# Create a set of expected file entries for easier checking
expected_entries = set()
for i in range(1, 11):
filename = f"file_{i:02d}.txt"
expected_count = expected_counts[i - 1]
if i == 6: # Special case for file_06.txt: can be 21 or 22
expected_entries.add(f"{filename}:21")
expected_entries.add(f"{filename}:22")
else:
expected_entries.add(f"{filename}:{expected_count}")
# Check each line in the answer file
found_entries = set()
for line in lines:
line = line.strip()
if line in expected_entries:
found_entries.add(line)
else:
print(f"| ❌ Invalid entry: {line}")
return False
# Check if we found all expected entries
if len(found_entries) != 10:
print(f"| ❌ Found {len(found_entries)} entries, expected 10")
missing = expected_entries - found_entries
if missing:
print(f" Missing entries: {missing}")
return False
print("| ✓ All word counts are correct")
return True
except Exception as e:
print(f"| ❌ Error verifying word counts: {e}")
return False
def verify_all_files_are_included(test_dir: Path) -> bool:
"""Verify that all 10 files are included in the answer."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
lines = content.split('\n')
# Check that all 10 files are present
found_files = set()
for line in lines:
parts = line.split(':', 1)
filename = parts[0]
found_files.add(filename)
expected_files = {f"file_{i:02d}.txt" for i in range(1, 11)}
if found_files != expected_files:
missing = expected_files - found_files
extra = found_files - expected_files
if missing:
print(f"| ❌ Missing files in answer: {missing}")
if extra:
print(f"| ❌ Extra files in answer: {extra}")
return False
print("| ✓ All 10 files are included in answer")
return True
except Exception as e:
print(f"| ❌ Error verifying file inclusion: {e}")
return False
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"| 🔍 Verifying Uppercase in: {test_dir}")
print('|')
# Run all verification checks
checks = [
("Uppercase directory exists", verify_uppercase_directory_exists),
("Uppercase files exist", verify_uppercase_files_exist),
("Uppercase content is correct", verify_uppercase_content),
("Answer file exists in uppercase directory", verify_answer_file_exists),
("Answer format is correct", verify_answer_format),
("All files are included", verify_all_files_are_included),
("Word counts are correct", verify_word_counts_are_correct),
]
all_passed = True
for check_name, check_func in checks:
print(f"| Checking {check_name}...")
if not check_func(test_dir):
all_passed = False
print('|')
if all_passed:
print("| 🎉 All verification checks passed!")
sys.exit(0)
else:
print("| ❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"| ❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_property/size_classification/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
Classify all files in the test directory into three categories based on their file size. Create three subdirectories and move files accordingly.
### Task Objectives
1. **Create three directories** in the test directory:
- `small_files/` - for files smaller than 300 bytes
- `medium_files/` - for files between 300 and 700 bytes (inclusive)
- `large_files/` - for files larger than 700 bytes
2. **Move all files** from the test directory into the appropriate subdirectory based on their size
3. **Handle all file types** - classify all files regardless of their extension (.txt, .jpg, .MOV, etc.)
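For reference, a minimal Python sketch of the size rules above (illustrative only; the task itself should be completed with the FileSystem tools, and the directory path is a placeholder):

```python
import shutil
from pathlib import Path

def classify_by_size(test_dir: Path) -> None:
    """Move each regular file into small_files/, medium_files/, or large_files/ by byte size."""
    for name in ("small_files", "medium_files", "large_files"):
        (test_dir / name).mkdir(exist_ok=True)
    for path in list(test_dir.iterdir()):
        if not path.is_file() or path.name == ".DS_Store":
            continue  # classify regular files only; skip system files
        size = path.stat().st_size
        if size < 300:
            target = "small_files"
        elif size <= 700:  # 300-700 bytes inclusive
            target = "medium_files"
        else:
            target = "large_files"
        shutil.move(str(path), str(test_dir / target / path.name))
```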
================================================
FILE: tasks/filesystem/standard/file_property/size_classification/meta.json
================================================
{
"task_id": "size_classification",
"task_name": "Size Classification",
"category_id": "file_property",
"category_name": "File Property",
"description": "Classify all files in the folder by size into distinct categories (small/medium/large) and generate a comprehensive summary report with statistics.",
"author": "Lingjun Chen",
"created_at": "2025-08-07",
"difficulty": "L3",
"tags": [
"file organization",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_property/\n ├── bear.jpg\n ├── bridge.jpg\n ├── bus.MOV\n ├── random_file_1.txt\n ├── random_file_2.txt\n ├── random_file_3.txt\n ├── road.MOV\n └── sg.jpg",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_property.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_property/size_classification/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Classification Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def get_expected_classification():
"""Return the expected file classification based on answer.md."""
return {
"small_files": ["random_file_1.txt", "random_file_3.txt"],
"medium_files": ["random_file_2.txt"],
"large_files": ["bear.jpg", "sg.jpg", "road.MOV", "bus.MOV", "bridge.jpg"]
}
def verify_directories_exist(test_dir: Path) -> bool:
"""Verify that all three required directories exist."""
required_dirs = ["small_files", "medium_files", "large_files"]
for dir_name in required_dirs:
dir_path = test_dir / dir_name
if not dir_path.exists():
print(f"❌ Directory '{dir_name}' not found")
return False
if not dir_path.is_dir():
print(f"❌ '{dir_name}' exists but is not a directory")
return False
print("✅ All required directories exist")
return True
def verify_file_classification(test_dir: Path) -> bool:
"""Verify that files are correctly classified into the right directories."""
expected_classification = get_expected_classification()
for dir_name, expected_files in expected_classification.items():
dir_path = test_dir / dir_name
# Check that all expected files are in the directory
missing_files = []
for filename in expected_files:
file_path = dir_path / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing files in '{dir_name}': {missing_files}")
return False
# Check that no unexpected files are in the directory (ignore .DS_Store and similar system files)
actual_files = [f.name for f in dir_path.iterdir() if f.is_file()]
# Filter out system files that are commonly present
system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store']
unexpected_files = [f for f in actual_files if f not in expected_files and f not in system_files]
if unexpected_files:
print(f"❌ Unexpected files in '{dir_name}': {unexpected_files}")
return False
print("✅ All files are correctly classified")
return True
def verify_no_files_in_root(test_dir: Path) -> bool:
"""Verify that no files remain in the root test directory."""
root_files = [f for f in test_dir.iterdir() if f.is_file()]
# Filter out system files that are commonly present
system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store']
non_system_files = [f for f in root_files if f.name not in system_files]
if non_system_files:
print(f"❌ Files still present in root directory: {[f.name for f in non_system_files]}")
return False
print("✅ No files remain in root directory")
return True
def verify_file_sizes(test_dir: Path) -> bool:
"""Verify that files are actually in the correct size categories."""
size_ranges = {
"small_files": (0, 299), # < 300 bytes
"medium_files": (300, 700), # 300-700 bytes (inclusive)
"large_files": (701, float('inf')) # > 700 bytes
}
for dir_name, (min_size, max_size) in size_ranges.items():
dir_path = test_dir / dir_name
for file_path in dir_path.iterdir():
if file_path.is_file():
file_size = file_path.stat().st_size
if dir_name == "small_files" and file_size >= 300:
print(f"❌ File {file_path.name} in small_files but size is {file_size} bytes")
return False
elif dir_name == "medium_files" and (file_size < 300 or file_size > 700):
print(f"❌ File {file_path.name} in medium_files but size is {file_size} bytes")
return False
elif dir_name == "large_files" and file_size <= 700:
print(f"❌ File {file_path.name} in large_files but size is {file_size} bytes")
return False
print("✅ All files are in correct size categories")
return True
def verify_total_file_count(test_dir: Path) -> bool:
"""Verify that all original files are accounted for."""
expected_classification = get_expected_classification()
total_expected = sum(len(files) for files in expected_classification.values())
total_actual = 0
for dir_name in ["small_files", "medium_files", "large_files"]:
dir_path = test_dir / dir_name
if dir_path.exists():
# Count only non-system files
system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store']
files_in_dir = [f for f in dir_path.iterdir() if f.is_file() and f.name not in system_files]
total_actual += len(files_in_dir)
if total_actual != total_expected:
print(f"❌ Expected {total_expected} files total, found {total_actual}")
return False
print(f"✅ Total file count is correct: {total_actual}")
return True
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying file classification in: {test_dir}")
# Run all verification checks
checks = [
("Directory existence", verify_directories_exist),
("File classification", verify_file_classification),
("No files in root", verify_no_files_in_root),
("File size validation", verify_file_sizes),
("Total file count", verify_total_file_count)
]
all_passed = True
for check_name, check_func in checks:
print(f"\n📋 Checking: {check_name}")
if not check_func(test_dir):
all_passed = False
if all_passed:
print("\n🎉 All verification checks passed!")
sys.exit(0)
else:
print("\n❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_property/time_classification/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
Analyze the creation time (ctime) of all files in the test directory and organize them into a hierarchical directory structure based on their creation dates.
### Task Objectives
1. **Read metadata** of all files in the test directory
2. **Analyze creation times** (ctime) of all files (excluding .DS_Store)
3. **Create directory structure** organized by month/day based on creation time
4. **Move files** to appropriate directories
5. **Create metadata analysis files** in each directory
### Expected Output
#### Directory Structure
Create directories in the format: `MM/DD/` where:
- MM = month (two digits, e.g., 01, 02)
- DD = day (two digits, e.g., 07, 09, 11, 26)
#### Metadata Analysis Files
Create a file named `metadata_analyse.txt` in each directory containing exactly two lines:
- **Line 1**: Oldest filename and its creation time (excluding .DS_Store)
- **Line 2**: Latest filename and its creation time (excluding .DS_Store)
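A minimal sketch of the grouping rule, assuming `st_ctime` reflects creation time on the platform in use (a reference only; the task should be completed with the FileSystem tools):

```python
from collections import defaultdict
from datetime import datetime
from pathlib import Path

def group_by_creation_date(test_dir: Path) -> dict:
    """Map 'MM/DD' date keys to the names of files created on that date (ignoring .DS_Store)."""
    groups = defaultdict(list)
    for path in test_dir.iterdir():
        if not path.is_file() or path.name == ".DS_Store":
            continue
        created = datetime.fromtimestamp(path.stat().st_ctime)
        groups[created.strftime("%m/%d")].append(path.name)
    return dict(groups)
```

Each `MM/DD` key corresponds to one target directory; the oldest and newest file in each group supply the two lines of that directory's `metadata_analyse.txt`.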
================================================
FILE: tasks/filesystem/standard/file_property/time_classification/meta.json
================================================
{
"task_id": "time_classification",
"task_name": "Time Classification",
"category_id": "file_property",
"category_name": "File Property",
"description": "Organize files based on modification timestamps into temporal categories and create a detailed time-based classification report with groupings.",
"author": "Lingjun Chen",
"created_at": "2025-08-07",
"difficulty": "L3",
"tags": [
"file organization",
"data extraction",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_property/\n ├── bear.jpg\n ├── bridge.jpg\n ├── bus.MOV\n ├── random_file_1.txt\n ├── random_file_2.txt\n ├── random_file_3.txt\n ├── road.MOV\n └── sg.jpg",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_property.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_property/time_classification/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Organization by Creation Time Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def get_month_mapping():
"""Return mapping for both numeric and alphabetic month representations."""
return {
"07": ["07", "7", "jul", "Jul", "JUL"],
"08": ["08", "8", "aug", "Aug", "AUG"]
}
def get_day_mapping():
"""Return mapping for day representations."""
return {
"09": ["09", "9"],
"25": ["25"],
"26": ["26"],
"06": ["06", "6"]
}
def get_expected_directory_structure():
"""Return the expected directory structure based on answer.md."""
return {
"07": {
"09": ["sg.jpg"],
"25": ["bus.MOV"],
"26": ["road.MOV"]
},
"08": {
"06": ["bear.jpg", "bridge.jpg", "random_file_1.txt", "random_file_2.txt", "random_file_3.txt"]
}
}
def find_month_directory(test_dir: Path, expected_month: str) -> Path:
"""Find the actual month directory, handling both numeric and alphabetic representations."""
month_mapping = get_month_mapping()
valid_month_names = month_mapping.get(expected_month, [expected_month])
for month_name in valid_month_names:
month_dir = test_dir / month_name
if month_dir.exists() and month_dir.is_dir():
return month_dir
return None
def find_day_directory(month_dir: Path, expected_day: str) -> Path:
"""Find the actual day directory, handling both numeric representations."""
day_mapping = get_day_mapping()
valid_day_names = day_mapping.get(expected_day, [expected_day])
for day_name in valid_day_names:
day_dir = month_dir / day_name
if day_dir.exists() and day_dir.is_dir():
return day_dir
return None
def verify_directory_structure(test_dir: Path) -> bool:
"""Verify that the correct directory structure exists."""
expected_structure = get_expected_directory_structure()
for expected_month, days in expected_structure.items():
month_dir = find_month_directory(test_dir, expected_month)
if month_dir is None:
valid_names = get_month_mapping().get(expected_month, [expected_month])
print(f"❌ Month directory not found. Expected one of: {valid_names}")
return False
for day, expected_files in days.items():
day_dir = find_day_directory(month_dir, day)
if day_dir is None:
valid_day_names = get_day_mapping().get(day, [day])
print(f"❌ Day directory '{month_dir.name}/{day}' not found. Expected one of: {valid_day_names}")
return False
if not day_dir.is_dir():
print(f"❌ '{month_dir.name}/{day_dir.name}' exists but is not a directory")
return False
print("✅ Directory structure is correct")
return True
def verify_files_in_directories(test_dir: Path) -> bool:
"""Verify that files are in the correct directories."""
expected_structure = get_expected_directory_structure()
for expected_month, days in expected_structure.items():
month_dir = find_month_directory(test_dir, expected_month)
if month_dir is None:
continue # Already handled in verify_directory_structure
for day, expected_files in days.items():
day_dir = find_day_directory(month_dir, day)
if day_dir is None:
continue # Already handled in verify_directory_structure
# Check that all expected files are in the directory
missing_files = []
for filename in expected_files:
file_path = day_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing files in '{month_dir.name}/{day_dir.name}': {missing_files}")
return False
# Check that no unexpected files are in the directory (ignore .DS_Store and metadata_analyse.txt)
actual_files = [f.name for f in day_dir.iterdir() if f.is_file()]
system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store', 'metadata_analyse.txt']
unexpected_files = [f for f in actual_files if f not in expected_files and f not in system_files]
if unexpected_files:
print(f"❌ Unexpected files in '{month_dir.name}/{day_dir.name}': {unexpected_files}")
return False
print("✅ All files are in correct directories")
return True
def verify_metadata_analysis_files(test_dir: Path) -> bool:
"""Verify that metadata_analyse.txt files exist and have correct content."""
expected_structure = get_expected_directory_structure()
for expected_month, days in expected_structure.items():
month_dir = find_month_directory(test_dir, expected_month)
if month_dir is None:
continue # Already handled in verify_directory_structure
for day, expected_files in days.items():
day_dir = find_day_directory(month_dir, day)
if day_dir is None:
continue # Already handled in verify_directory_structure
metadata_file = day_dir / "metadata_analyse.txt"
if not metadata_file.exists():
print(f"❌ metadata_analyse.txt not found in '{month_dir.name}/{day_dir.name}'")
return False
try:
content = metadata_file.read_text().strip()
lines = content.split('\n')
# Check that there are exactly 2 lines
if len(lines) != 2:
print(f"❌ metadata_analyse.txt in '{month_dir.name}/{day_dir.name}' has {len(lines)} lines, expected 2")
return False
# Check each line - more flexible verification
for line_num, line in enumerate(lines, 1):
line_lower = line.lower()
# Check filename based on expected_month and day
expected_filename = None
if expected_month == "07" and day == "09":
expected_filename = "sg.jpg"
elif expected_month == "07" and day == "25":
expected_filename = "bus.mov"
elif expected_month == "07" and day == "26":
expected_filename = "road.mov"
elif expected_month == "08" and day == "06":
# For 08/06, check if it's one of the expected files
if line_num == 1: # First line should be bear.jpg
expected_filename = "bear.jpg"
else: # Second line should be one of the random files
expected_filenames = ["random_file_1.txt", "random_file_2.txt", "random_file_3.txt"]
if not any(filename in line_lower for filename in expected_filenames):
print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain one of {expected_filenames}: {line}")
return False
continue # Skip other checks for this line
if expected_filename and expected_filename not in line_lower:
print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain '{expected_filename}': {line}")
return False
# Check month letters
month_letters = None
if expected_month == "07":
month_letters = ["jul", "7"]
elif expected_month == "08":
month_letters = ["aug", "8"]
if month_letters and not any(letter in line_lower for letter in month_letters):
print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain month letters: {line}")
return False
# Check year (2025)
if "2025" not in line_lower:
print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain '2025': {line}")
return False
# Check day number - support both formats
valid_day_names = get_day_mapping().get(day, [day])
if not any(day_name in line_lower for day_name in valid_day_names):
print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain day '{day}' (or {valid_day_names}): {line}")
return False
except Exception as e:
print(f"❌ Error reading metadata_analyse.txt in '{month_dir.name}/{day_dir.name}': {e}")
return False
print("✅ All metadata_analyse.txt files are correct")
return True
def verify_no_files_in_root(test_dir: Path) -> bool:
"""Verify that no files remain in the root test directory."""
root_files = [f for f in test_dir.iterdir() if f.is_file()]
# Filter out system files that are commonly present
system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store']
non_system_files = [f for f in root_files if f.name not in system_files]
if non_system_files:
print(f"❌ Files still present in root directory: {[f.name for f in non_system_files]}")
return False
print("✅ No files remain in root directory")
return True
def verify_total_file_count(test_dir: Path) -> bool:
"""Verify that all original files are accounted for."""
expected_structure = get_expected_directory_structure()
total_expected = sum(len(files) for days in expected_structure.values() for files in days.values())
total_actual = 0
for expected_month, days in expected_structure.items():
month_dir = find_month_directory(test_dir, expected_month)
if month_dir is None:
continue
for day in days:
day_dir = find_day_directory(month_dir, day)
if day_dir and day_dir.exists():
# Count only non-system files
system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store', 'metadata_analyse.txt']
files_in_dir = [f for f in day_dir.iterdir() if f.is_file() and f.name not in system_files]
total_actual += len(files_in_dir)
if total_actual != total_expected:
print(f"❌ Expected {total_expected} files total, found {total_actual}")
return False
print(f"✅ Total file count is correct: {total_actual}")
return True
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying Time Classification in: {test_dir}")
# Run all verification checks
checks = [
("Directory structure", verify_directory_structure),
("Files in directories", verify_files_in_directories),
("Metadata analysis files", verify_metadata_analysis_files),
("No files in root", verify_no_files_in_root),
("Total file count", verify_total_file_count)
]
all_passed = True
for check_name, check_func in checks:
print(f"\n📋 Checking: {check_name}")
if not check_func(test_dir):
all_passed = False
if all_passed:
print("\n🎉 All verification checks passed!")
sys.exit(0)
else:
print("\n❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/folder_structure/structure_analysis/description.md
================================================
Please use FileSystem tools to finish the following task:
You need to recursively traverse the entire folder structure under the main directory and generate a detailed statistical report in a file named `structure_analysis.txt`.
In all tasks, ignore `.DS_Store` files.
In all tasks, you should not change or delete any existing files.
Do not try to use python code.
---
### 1. File Statistics
Count the following information for the entire directory structure:
- total number of files
- total number of folders
- total size of the whole folder (in bytes; include .DS_Store only in this subtask)
**Format (one item per line):**
total number of files: X
total number of folders: Y
total size of all files: Z
---
### 2. Depth Analysis
Identify the deepest folder path(s) in the directory and calculate the depth level.
- Use relative paths based on main directory.
- **Write the folder path only up to the folder, not including the file name. For example, if the file path is `./complex_structure/A/B/C/def.txt`, then the path in your report should be `complex_structure/A/B/C`, and the depth is `4`.**
- If multiple deepest paths exist, list only one.
**Format (one item per line):**
depth: N
PATH
---
### 3. File Type Classification
Categorize files by their extensions and count the number of files for each type.
Files without extensions should also be included.
**Format (one extension per line):**
txt: count
py: count
jpg: count
mov: count
(no extension): count
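The report itself must be produced without Python code; the sketch below is purely illustrative of what each statistic refers to (the root path is a placeholder):

```python
import os
from pathlib import Path

def summarize_structure(root: Path) -> dict:
    """Collect file/folder counts, total size, deepest folder, and per-extension counts under root."""
    stats = {"files": 0, "folders": 0, "size": 0, "by_ext": {}, "deepest": ("", 0)}
    for dirpath, dirnames, filenames in os.walk(root):
        rel = Path(dirpath).relative_to(root)
        if len(rel.parts) > stats["deepest"][1]:
            stats["deepest"] = (str(rel), len(rel.parts))
        stats["folders"] += len(dirnames)
        for name in filenames:
            stats["size"] += (Path(dirpath) / name).stat().st_size  # size includes .DS_Store
            if name == ".DS_Store":
                continue  # ignored everywhere else, per the task statement
            stats["files"] += 1
            ext = Path(name).suffix.lstrip(".").lower() or "(no extension)"
            stats["by_ext"][ext] = stats["by_ext"].get(ext, 0) + 1
    return stats
```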
================================================
FILE: tasks/filesystem/standard/folder_structure/structure_analysis/meta.json
================================================
{
"task_id": "structure_analysis",
"task_name": "Structure Analysis",
"category_id": "folder_structure",
"category_name": "Folder Structure",
"description": "Perform thorough analysis of complex folder hierarchy to generate a detailed structural summary report with comprehensive file statistics.",
"author": "Lingjun Chen",
"created_at": "2025-08-16",
"difficulty": "L3",
"tags": [
"pattern analysis",
"data extraction"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "folder_structure/\n └── complex_structure/\n ├── deeply/\n │ └── nested/\n │ └── folder/\n │ └── structure/\n ├── empty_folder/\n ├── folder_lxkHt_0_1/\n │ └── file_PeLzC_0.txt\n ├── folder_QdTAj_0_2/\n │ ├── folder_eXccj_1_0/\n │ │ ├── folder_Mqlwh_2_1/\n │ │ │ ├── folder_cKxcP_3_3/\n │ │ │ │ ├── folder_BPTMK_4_1/\n │ │ │ │ │ └── file_RHtBP_0.txt\n │ │ │ │ ├── folder_QNqjq_4_0/\n │ │ │ │ │ ├── folder_gRwPE_5_1/\n │ │ │ │ │ │ ├── file_jVlpp_0.txt\n │ │ │ │ │ │ └── file_vJuHz_1.txt\n │ │ │ │ │ ├── folder_XdXYJ_5_0/\n │ │ │ │ │ │ └── file_KvkKi_0.txt\n │ │ │ │ │ ├── file_gGxLG_2.txt\n │ │ │ │ │ ├── file_Hzkxo_0.txt\n │ │ │ │ │ └── file_XRjeh_1.txt\n │ │ │ │ ├── folder_vIBIt_4_2/\n │ │ │ │ │ ├── folder_kRDNS_5_0/\n │ │ │ │ │ │ └── file_wFSjJ_0.txt\n │ │ │ │ │ └── file_NyBSO_0.txt\n │ │ │ │ ├── file_EOCNf_1.txt\n │ │ │ │ └── file_gmrXA_0.txt\n │ │ │ ├── folder_NcruA_3_1/\n │ │ │ │ ├── file_bLWDj_1.txt\n │ │ │ │ └── file_WAftR_0.txt\n │ │ │ ├── folder_qCDFI_3_2/\n │ │ │ │ ├── file_eSMOJ_0.txt\n │ │ │ │ ├── file_oxADy_2.txt\n │ │ │ │ └── file_RTbbc_1.txt\n │ │ │ ├── folder_QVHUU_3_0/\n │ │ │ │ ├── folder_FEPTK_4_1/\n │ │ │ │ │ ├── folder_GHoMC_5_1/\n │ │ │ │ │ │ └── file_rAMYd_0.txt\n │ │ │ │ │ ├── folder_iBDUY_5_0/\n │ │ │ │ │ │ └── file_IJCaw_0.txt\n │ │ │ │ │ ├── folder_VRXgp_5_2/\n │ │ │ │ │ │ └── file_hkUmS_0.txt\n │ │ │ │ │ ├── file_nqLAf_1.txt\n │ │ │ │ │ └── file_XflmA_0.txt\n │ │ │ │ ├── folder_FlPoK_4_3/\n │ │ │ │ │ ├── folder_hSVNm_5_3/\n │ │ │ │ │ │ └── file_klnbn_0.txt\n │ │ │ │ │ ├── folder_iZuEl_5_0/\n │ │ │ │ │ │ └── file_LqAmy_0.txt\n │ │ │ │ │ ├── folder_LcURj_5_2/\n │ │ │ │ │ │ ├── file_RgwOS_1.txt\n │ │ │ │ │ │ └── file_ZHnYb_0.txt\n │ │ │ │ │ ├── folder_tuZQJ_5_1/\n │ │ │ │ │ │ └── file_LHuIx_0.txt\n │ │ │ │ │ ├── file_asJnB_1.txt\n │ │ │ │ │ └── file_EzLdu_0.txt\n │ │ │ │ ├── folder_ndhsJ_4_0/\n │ │ │ │ │ ├── folder_CUSXK_5_0/\n │ │ │ │ │ │ ├── file_DpiuM_1.txt\n │ │ │ │ │ │ └── file_pSqeG_0.txt\n │ │ │ │ │ ├── folder_pstmE_5_1/\n │ │ │ │ │ │ └── file_YwdJt_0.txt\n │ │ │ │ │ ├── folder_StlsP_5_2/\n │ │ │ │ │ │ ├── file_kriBJ_0.txt\n │ │ │ │ │ │ └── file_XCEdm_1.txt\n │ │ │ │ │ ├── file_ToDjh_1.txt\n │ │ │ │ │ └── file_xbIVx_0.txt\n │ │ │ │ ├── folder_PJBok_4_4/\n │ │ │ │ │ ├── folder_mzxaf_5_0/\n │ │ │ │ │ │ ├── file_ILBzj_2.txt\n │ │ │ │ │ │ ├── file_MTGMm_1.txt\n │ │ │ │ │ │ └── file_zBDqz_0.txt\n │ │ │ │ │ ├── folder_sULMj_5_1/\n │ │ │ │ │ │ ├── file_BHziw_1.txt\n │ │ │ │ │ │ ├── file_sIjiu_2.txt\n │ │ │ │ │ │ └── file_VqNkB_0.txt\n │ │ │ │ │ ├── folder_vypSi_5_3/\n │ │ │ │ │ │ ├── file_kZbIm_1.txt\n │ │ │ │ │ │ └── file_sOBtE_0.txt\n │ │ │ │ │ ├── folder_ZLGHy_5_2/\n │ │ │ │ │ │ ├── file_azaFF_0.txt\n │ │ │ │ │ │ └── file_nAFRe_1.txt\n │ │ │ │ │ ├── file_mIkQU_0.txt\n │ │ │ │ │ └── file_sGPxd_1.txt\n │ │ │ │ ├── folder_VTbEG_4_2/\n │ │ │ │ │ ├── file_HtYLg_0.txt\n │ │ │ │ │ ├── file_JXjMd_1.txt\n │ │ │ │ │ └── file_tPccB_2.txt\n │ │ │ │ ├── file_BuOSw_1.txt\n │ │ │ │ └── file_TpoqE_0.txt\n │ │ │ ├── folder_wTvun_3_4/\n │ │ │ │ ├── file_GyhyE_1.txt\n │ │ │ │ ├── file_POsla_2.txt\n │ │ │ │ └── file_tSsvk_0.txt\n │ │ │ ├── file_irNju_0.txt\n │ │ │ └── file_jYBRm_1.txt\n │ │ ├── folder_YlJLI_2_0/\n │ │ │ └── file_FpFSL_0.txt\n │ │ ├── file_cFgBr_2.txt\n │ │ ├── file_lKEWN_1.txt\n │ │ └── file_ZEWFP_0.txt\n │ └── file_ayUCH_0.txt\n ├── folder_xtgyi_0_0/\n │ └── file_BvSOB_0.txt\n ├── mixed_content/\n │ └── images_and_text/\n │ └── notes.txt\n ├── project/\n │ ├── docs/\n │ │ └── archive/\n │ │ └── 2023/\n │ │ └── reports/\n │ │ ├── report_0.txt\n │ │ ├── 
report_1.txt\n │ │ └── report_2.txt\n │ └── src/\n │ └── main/\n │ └── resources/\n └── m.py",
"stateUrl": "https://storage.mcpmark.ai/filesystem/folder_structure.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/folder_structure/structure_analysis/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Directory Structure Analysis Task
"""
import sys
from pathlib import Path
import os
import re
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_structure_analysis_file_exists(test_dir: Path) -> bool:
"""Verify that the structure_analysis.txt file exists."""
analysis_file = test_dir / "structure_analysis.txt"
if not analysis_file.exists():
print("❌ File 'structure_analysis.txt' not found")
return False
print("✅ structure_analysis.txt file found")
return True
def verify_structure_analysis_file_readable(test_dir: Path) -> bool:
"""Verify that the structure_analysis.txt file is readable."""
analysis_file = test_dir / "structure_analysis.txt"
try:
content = analysis_file.read_text()
if not content.strip():
print("❌ structure_analysis.txt file is empty")
return False
print("✅ structure_analysis.txt file is readable")
return True
except Exception as e:
print(f"❌ Error reading structure_analysis.txt file: {e}")
return False
def verify_subtask1_file_statistics(test_dir: Path) -> bool:
"""Verify subtask 1: File Statistics - files must be 69, folders must be 51, 58097 allows +-1000."""
analysis_file = test_dir / "structure_analysis.txt"
try:
content = analysis_file.read_text()
# Extract numbers from the content
file_count_match = re.search(r'total number of files:\s*(\d+)', content)
folder_count_match = re.search(r'total number of folders:\s*(\d+)', content)
size_match = re.search(r'total size of all files:\s*(\d+)', content)
if not file_count_match or not folder_count_match or not size_match:
print("❌ Could not extract file statistics from structure_analysis.txt")
return False
file_count = int(file_count_match.group(1))
folder_count = int(folder_count_match.group(1))
total_size = int(size_match.group(1))
print(f"📊 Found: files={file_count}, folders={folder_count}, size={total_size}")
# Check if file count is exactly 69
if file_count != 69:
print(f"❌ File count must be 69, found: {file_count}")
return False
# Check if folder count is exactly 51
if folder_count != 51:
print(f"❌ Folder count must be 51, found: {folder_count}")
return False
# Check if size is within acceptable range (58097 ± 1000)
expected_size = 58097
size_tolerance = 1000
if abs(total_size - expected_size) > size_tolerance:
print(f"❌ Total size ({total_size}) is not within acceptable range ({expected_size} ± {size_tolerance})")
return False
print(f"✅ File statistics verified: files={file_count}, folders={folder_count}, size={total_size} (within tolerance)")
return True
except Exception as e:
print(f"❌ Error verifying file statistics: {e}")
return False
def verify_subtask2_depth_analysis(test_dir: Path) -> bool:
"""Verify subtask 2: Depth Analysis - depth must be 7, verify path exists."""
analysis_file = test_dir / "structure_analysis.txt"
try:
content = analysis_file.read_text()
# Extract depth and path
depth_match = re.search(r'depth:\s*(\d+)', content)
if not depth_match:
print("❌ Could not extract depth from structure_analysis.txt")
return False
depth = int(depth_match.group(1))
# Check if depth is exactly 7
if depth != 7:
print(f"❌ Depth must be 7, found: {depth}")
return False
print(f"✅ Depth verified: {depth}")
# Extract the path (it should be on a separate line after "depth: 7")
lines = content.split('\n')
path_line = None
for i, line in enumerate(lines):
if line.strip() == f"depth: {depth}":
if i + 1 < len(lines):
path_line = lines[i + 1].strip()
break
if not path_line:
print("❌ Could not find path line after depth specification")
return False
print(f"📁 Found path: {path_line}")
# Verify that the path depth matches the declared depth
path_parts = path_line.split('/')
actual_depth = len(path_parts)
if actual_depth != depth:
print(f"❌ Path depth mismatch: declared depth is {depth}, but path has {actual_depth} levels")
print(f" Path: {path_line}")
print(f" Path parts: {path_parts}")
return False
print(f"✅ Path depth verified: {actual_depth} levels")
# Verify that this path exists in the test environment
expected_path = test_dir / path_line
if not expected_path.exists():
print(f"❌ Path does not exist: {expected_path}")
return False
if not expected_path.is_dir():
print(f"❌ Path exists but is not a directory: {expected_path}")
return False
print(f"✅ Path verified and exists: {path_line}")
return True
except Exception as e:
print(f"❌ Error verifying depth analysis: {e}")
return False
def verify_subtask3_file_type_classification(test_dir: Path) -> bool:
"""Verify subtask 3: File Type Classification - 68 and 1 must be accurate."""
analysis_file = test_dir / "structure_analysis.txt"
try:
content = analysis_file.read_text()
# Extract file type counts
txt_match = re.search(r'txt:\s*(\d+)', content)
py_match = re.search(r'py:\s*(\d+)', content)
if not txt_match or not py_match:
print("❌ Could not extract file type counts from structure_analysis.txt")
return False
txt_count = int(txt_match.group(1))
py_count = int(py_match.group(1))
print(f"📁 Found: txt={txt_count}, py={py_count}")
# Check if txt count is exactly 68
if txt_count != 68:
print(f"❌ txt count must be 68, found: {txt_count}")
return False
# Check if py count is exactly 1
if py_count != 1:
print(f"❌ py count must be 1, found: {py_count}")
return False
print(f"✅ File type classification verified: txt={txt_count}, py={py_count}")
return True
except Exception as e:
print(f"❌ Error verifying file type classification: {e}")
return False
def verify_file_format(test_dir: Path) -> bool:
"""Verify that the structure_analysis.txt file has proper format."""
analysis_file = test_dir / "structure_analysis.txt"
try:
content = analysis_file.read_text()
lines = content.split('\n')
# Check if file has the expected structure
if len(lines) < 5: # Should have at least 5 lines
print("❌ File seems too short to contain all required information")
return False
# Basic format check - ensure it's not completely corrupted
if not content.strip():
print("❌ File is completely empty")
return False
print("✅ File format is acceptable")
return True
except Exception as e:
print(f"❌ Error checking file format: {e}")
return False
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying Directory Structure Analysis Task in: {test_dir}")
# Define verification steps
verification_steps = [
("Structure Analysis File Exists", verify_structure_analysis_file_exists),
("File is Readable", verify_structure_analysis_file_readable),
("Subtask 1: File Statistics", verify_subtask1_file_statistics),
("Subtask 2: Depth Analysis", verify_subtask2_depth_analysis),
("Subtask 3: File Type Classification", verify_subtask3_file_type_classification),
("File Format", verify_file_format),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Directory Structure Analysis completed correctly!")
print("🎉 Structure Analysis verification: PASS")
sys.exit(0)
else:
print("❌ Structure Analysis verification: FAIL")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/folder_structure/structure_mirror/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task
Copy the entire directory structure of `complex_structure/` to `complex_structure_mirror/` without copying any file contents. Do not use python code.
### Requirements
- Create the entire directory structure in `complex_structure_mirror/`
- Do not copy any file contents, only create directories
- In each empty directory, create a `placeholder.txt` file containing the absolute path of that directory
- Handle nested directories of any depth
- You should also follow 2 rules:
1. **Discard any directory that directly contains more than 2 files (count only files directly inside that folder).**
2. **If a directory name contains numbers, append "_processed" to the mirror directory name**
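The task itself must not be solved with Python code; the sketch below only illustrates how the two rules and the placeholder requirement interact (it would be invoked as `mirror_structure(main_dir / "complex_structure", main_dir / "complex_structure_mirror")`, where `main_dir` is a placeholder):

```python
import re
from pathlib import Path

def mirror_structure(source: Path, mirror: Path) -> None:
    """Recreate the directory tree of `source` under `mirror`, applying both rules."""
    direct_files = sum(1 for p in source.iterdir() if p.is_file() and p.name != ".DS_Store")
    if direct_files > 2:
        return  # Rule 1: discard directories that directly contain more than 2 files
    mirror.mkdir(parents=True, exist_ok=True)
    for sub in (p for p in source.iterdir() if p.is_dir()):
        # Rule 2: append "_processed" when the directory name contains a digit
        name = (sub.name + "_processed") if re.search(r"\d", sub.name) else sub.name
        mirror_structure(sub, mirror / name)
    if not any(mirror.iterdir()):
        # Empty mirror directories get a placeholder.txt containing their absolute path
        (mirror / "placeholder.txt").write_text(str(mirror.resolve()) + "\n")
```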
================================================
FILE: tasks/filesystem/standard/folder_structure/structure_mirror/meta.json
================================================
{
"task_id": "structure_mirror",
"task_name": "Structure Mirror",
"category_id": "folder_structure",
"category_name": "Folder Structure",
"description": "Create an exact mirror copy of the folder structure in a target location while applying specified transformation rules.",
"author": "Lingjun Chen",
"created_at": "2025-08-08",
"difficulty": "L3",
"tags": [
"file organization",
"content transformation"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "folder_structure/\n └── complex_structure/\n ├── deeply/\n │ └── nested/\n │ └── folder/\n │ └── structure/\n ├── empty_folder/\n ├── folder_lxkHt_0_1/\n │ └── file_PeLzC_0.txt\n ├── folder_QdTAj_0_2/\n │ ├── folder_eXccj_1_0/\n │ │ ├── folder_Mqlwh_2_1/\n │ │ │ ├── folder_cKxcP_3_3/\n │ │ │ │ ├── folder_BPTMK_4_1/\n │ │ │ │ │ └── file_RHtBP_0.txt\n │ │ │ │ ├── folder_QNqjq_4_0/\n │ │ │ │ │ ├── folder_gRwPE_5_1/\n │ │ │ │ │ │ ├── file_jVlpp_0.txt\n │ │ │ │ │ │ └── file_vJuHz_1.txt\n │ │ │ │ │ ├── folder_XdXYJ_5_0/\n │ │ │ │ │ │ └── file_KvkKi_0.txt\n │ │ │ │ │ ├── file_gGxLG_2.txt\n │ │ │ │ │ ├── file_Hzkxo_0.txt\n │ │ │ │ │ └── file_XRjeh_1.txt\n │ │ │ │ ├── folder_vIBIt_4_2/\n │ │ │ │ │ ├── folder_kRDNS_5_0/\n │ │ │ │ │ │ └── file_wFSjJ_0.txt\n │ │ │ │ │ └── file_NyBSO_0.txt\n │ │ │ │ ├── file_EOCNf_1.txt\n │ │ │ │ └── file_gmrXA_0.txt\n │ │ │ ├── folder_NcruA_3_1/\n │ │ │ │ ├── file_bLWDj_1.txt\n │ │ │ │ └── file_WAftR_0.txt\n │ │ │ ├── folder_qCDFI_3_2/\n │ │ │ │ ├── file_eSMOJ_0.txt\n │ │ │ │ ├── file_oxADy_2.txt\n │ │ │ │ └── file_RTbbc_1.txt\n │ │ │ ├── folder_QVHUU_3_0/\n │ │ │ │ ├── folder_FEPTK_4_1/\n │ │ │ │ │ ├── folder_GHoMC_5_1/\n │ │ │ │ │ │ └── file_rAMYd_0.txt\n │ │ │ │ │ ├── folder_iBDUY_5_0/\n │ │ │ │ │ │ └── file_IJCaw_0.txt\n │ │ │ │ │ ├── folder_VRXgp_5_2/\n │ │ │ │ │ │ └── file_hkUmS_0.txt\n │ │ │ │ │ ├── file_nqLAf_1.txt\n │ │ │ │ │ └── file_XflmA_0.txt\n │ │ │ │ ├── folder_FlPoK_4_3/\n │ │ │ │ │ ├── folder_hSVNm_5_3/\n │ │ │ │ │ │ └── file_klnbn_0.txt\n │ │ │ │ │ ├── folder_iZuEl_5_0/\n │ │ │ │ │ │ └── file_LqAmy_0.txt\n │ │ │ │ │ ├── folder_LcURj_5_2/\n │ │ │ │ │ │ ├── file_RgwOS_1.txt\n │ │ │ │ │ │ └── file_ZHnYb_0.txt\n │ │ │ │ │ ├── folder_tuZQJ_5_1/\n │ │ │ │ │ │ └── file_LHuIx_0.txt\n │ │ │ │ │ ├── file_asJnB_1.txt\n │ │ │ │ │ └── file_EzLdu_0.txt\n │ │ │ │ ├── folder_ndhsJ_4_0/\n │ │ │ │ │ ├── folder_CUSXK_5_0/\n │ │ │ │ │ │ ├── file_DpiuM_1.txt\n │ │ │ │ │ │ └── file_pSqeG_0.txt\n │ │ │ │ │ ├── folder_pstmE_5_1/\n │ │ │ │ │ │ └── file_YwdJt_0.txt\n │ │ │ │ │ ├── folder_StlsP_5_2/\n │ │ │ │ │ │ ├── file_kriBJ_0.txt\n │ │ │ │ │ │ └── file_XCEdm_1.txt\n │ │ │ │ │ ├── file_ToDjh_1.txt\n │ │ │ │ │ └── file_xbIVx_0.txt\n │ │ │ │ ├── folder_PJBok_4_4/\n │ │ │ │ │ ├── folder_mzxaf_5_0/\n │ │ │ │ │ │ ├── file_ILBzj_2.txt\n │ │ │ │ │ │ ├── file_MTGMm_1.txt\n │ │ │ │ │ │ └── file_zBDqz_0.txt\n │ │ │ │ │ ├── folder_sULMj_5_1/\n │ │ │ │ │ │ ├── file_BHziw_1.txt\n │ │ │ │ │ │ ├── file_sIjiu_2.txt\n │ │ │ │ │ │ └── file_VqNkB_0.txt\n │ │ │ │ │ ├── folder_vypSi_5_3/\n │ │ │ │ │ │ ├── file_kZbIm_1.txt\n │ │ │ │ │ │ └── file_sOBtE_0.txt\n │ │ │ │ │ ├── folder_ZLGHy_5_2/\n │ │ │ │ │ │ ├── file_azaFF_0.txt\n │ │ │ │ │ │ └── file_nAFRe_1.txt\n │ │ │ │ │ ├── file_mIkQU_0.txt\n │ │ │ │ │ └── file_sGPxd_1.txt\n │ │ │ │ ├── folder_VTbEG_4_2/\n │ │ │ │ │ ├── file_HtYLg_0.txt\n │ │ │ │ │ ├── file_JXjMd_1.txt\n │ │ │ │ │ └── file_tPccB_2.txt\n │ │ │ │ ├── file_BuOSw_1.txt\n │ │ │ │ └── file_TpoqE_0.txt\n │ │ │ ├── folder_wTvun_3_4/\n │ │ │ │ ├── file_GyhyE_1.txt\n │ │ │ │ ├── file_POsla_2.txt\n │ │ │ │ └── file_tSsvk_0.txt\n │ │ │ ├── file_irNju_0.txt\n │ │ │ └── file_jYBRm_1.txt\n │ │ ├── folder_YlJLI_2_0/\n │ │ │ └── file_FpFSL_0.txt\n │ │ ├── file_cFgBr_2.txt\n │ │ ├── file_lKEWN_1.txt\n │ │ └── file_ZEWFP_0.txt\n │ └── file_ayUCH_0.txt\n ├── folder_xtgyi_0_0/\n │ └── file_BvSOB_0.txt\n ├── mixed_content/\n │ └── images_and_text/\n │ └── notes.txt\n ├── project/\n │ ├── docs/\n │ │ └── archive/\n │ │ └── 2023/\n │ │ └── reports/\n │ │ ├── report_0.txt\n │ │ ├── 
report_1.txt\n │ │ └── report_2.txt\n │ └── src/\n │ └── main/\n │ └── resources/\n └── m.py",
"stateUrl": "https://storage.mcpmark.ai/filesystem/folder_structure.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/folder_structure/structure_mirror/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Directory Structure Mirroring with Smart Placeholders Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_mirror_directory_exists(test_dir: Path, mirror_path: Path) -> bool:
"""Verify that a mirror directory exists."""
if not mirror_path.exists():
print(f"❌ Mirror directory not found: {mirror_path.relative_to(test_dir)}")
return False
if not mirror_path.is_dir():
print(f"❌ Mirror path exists but is not a directory: {mirror_path.relative_to(test_dir)}")
return False
print(f"✅ Mirror directory exists: {mirror_path.relative_to(test_dir)}")
return True
def verify_placeholder_file_exists(mirror_path: Path, test_dir: Path) -> bool:
"""Verify that placeholder.txt exists in the mirror directory."""
placeholder_file = mirror_path / "placeholder.txt"
if not placeholder_file.exists():
print(f"❌ placeholder.txt not found in: {mirror_path.relative_to(test_dir)}")
return False
if not placeholder_file.is_file():
print(f"❌ placeholder.txt exists but is not a file in: {mirror_path.relative_to(test_dir)}")
return False
print(f"✅ placeholder.txt exists in: {mirror_path.relative_to(test_dir)}")
return True
def verify_placeholder_content(mirror_path: Path, test_dir: Path) -> bool:
"""Verify that placeholder.txt contains the correct path ending with complex_structure_mirror/..."""
placeholder_file = mirror_path / "placeholder.txt"
try:
content = placeholder_file.read_text().strip()
# Check if content is not empty
if not content:
print(f"❌ placeholder.txt is empty in: {mirror_path.relative_to(test_dir)}")
return False
# Check if it contains the correct path ending with complex_structure_mirror/...
expected_ending = f"complex_structure_mirror/{mirror_path.relative_to(test_dir / 'complex_structure_mirror')}"
if not content.endswith(expected_ending):
print(f"❌ placeholder.txt content incorrect in: {mirror_path.relative_to(test_dir)}")
print(f" Expected ending: {expected_ending}")
print(f" Found: {content}")
return False
print(f"✅ placeholder.txt content is correct in: {mirror_path.relative_to(test_dir)}")
return True
except Exception as e:
print(f"❌ Error reading placeholder.txt in {mirror_path.relative_to(test_dir)}: {e}")
return False
def verify_no_files_copied(test_dir: Path) -> bool:
"""Verify that no file contents were copied, only directory structure."""
source_dir = test_dir / "complex_structure"
mirror_dir = test_dir / "complex_structure_mirror"
if not mirror_dir.exists():
print("❌ Mirror directory 'complex_structure_mirror' not found")
return False
# Check that no files from source were copied (except placeholder.txt files)
for source_file in source_dir.rglob("*"):
if source_file.is_file():
# Calculate the corresponding mirror path
relative_path = source_file.relative_to(source_dir)
mirror_file = mirror_dir / relative_path
# Skip if this would be a placeholder.txt file
if mirror_file.name == "placeholder.txt":
continue
if mirror_file.exists():
print(f"❌ File was copied when it shouldn't be: {relative_path}")
return False
print("✅ No file contents were copied, only directory structure")
return True
def verify_mirror_structure_completeness(test_dir: Path) -> bool:
"""Verify that the mirror structure is complete and matches expected structure."""
mirror_dir = test_dir / "complex_structure_mirror"
if not mirror_dir.exists():
print("❌ Mirror directory 'complex_structure_mirror' not found")
return False
# Define expected directories that should exist (based on backup structure)
expected_dirs = [
"deeply",
"deeply/nested",
"deeply/nested/folder",
"deeply/nested/folder/structure",
"empty_folder",
"folder_lxkHt_0_1_processed",
"folder_QdTAj_0_2_processed",
"folder_xtgyi_0_0_processed",
"mixed_content",
"mixed_content/images_and_text",
"project",
"project/docs",
"project/docs/archive",
"project/docs/archive/2023_processed",
"project/src",
"project/src/main",
"project/src/main/resources"
]
# Define which directories should have placeholder.txt files
placeholder_dirs = [
"deeply/nested/folder/structure",
"empty_folder",
"folder_lxkHt_0_1_processed",
"folder_QdTAj_0_2_processed",
"folder_xtgyi_0_0_processed",
"mixed_content/images_and_text",
"project/docs/archive/2023_processed",
"project/src/main/resources"
]
all_passed = True
# Check that all expected directories exist
for expected_dir in expected_dirs:
mirror_path = mirror_dir / expected_dir
if not verify_mirror_directory_exists(test_dir, mirror_path):
all_passed = False
elif expected_dir in placeholder_dirs:
# Check placeholder.txt for directories that should have it
if not verify_placeholder_file_exists(mirror_path, test_dir):
all_passed = False
elif not verify_placeholder_content(mirror_path, test_dir):
all_passed = False
# Check that no unexpected directories exist
for mirror_subdir in mirror_dir.rglob("*"):
if mirror_subdir.is_dir():
relative_path = mirror_subdir.relative_to(mirror_dir)
if str(relative_path) not in expected_dirs and str(relative_path) != ".":
print(f"❌ Unexpected directory found: {relative_path}")
all_passed = False
return all_passed
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying Directory Structure Mirroring with Smart Placeholders in: {test_dir}")
# Define verification steps
verification_steps = [
("No files copied", verify_no_files_copied),
("Mirror structure completeness", verify_mirror_structure_completeness),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n📋 Checking: {step_name}")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Directory structure mirroring completed correctly!")
print("🎉 Structure Mirror verification: PASS")
sys.exit(0)
else:
print("❌ Structure Mirror verification: FAIL")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/legal_document/dispute_review/description.md
================================================
Please use FileSystem tools to finish the following task:
**Overview**
The folder "legal_files/" contains all versions (Preferred_Stock_Purchase_Agreement_v0.txt -- Preferred_Stock_Purchase_Agreement_v10.txt) of the Stock Purchase Agreement for a corporate investment project.
It contains comments from four people:
- **Bill Harvey** (Company CEO)
- **Michelle Jackson** (Investor)
- **David Russel** (Company Counsel)
- **Tony Taylor** (Investor Counsel)
Between v1 and v9, these four people made comments on the clauses. The comment format is `[name:content]`, where:
- `name` is the commenter's name
- `content` is the revision note
**Special Note:** If the name is "All parties", it represents a joint comment from all parties, which counts as one comment but does not count toward any individual's personal comment count.
## Task
Your task is to review these versions and identify all clauses that have been commented on in **v5, v6, and v7 (in folder legal_files/)**. Generate a file named `dispute_review.txt` in the main directory. In this file, list each commented clause on a separate line and give the number of comments for each clause in the format "Clause number:number of comments". The clause number should be in the format X.X.
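As an illustration of the counting idea only, a minimal sketch is shown below; it assumes clause numbers appear as `X.X` at the start of a line, which is an assumption about the file layout rather than something stated in the task:

```python
import re
from collections import Counter
from pathlib import Path

CLAUSE_RE = re.compile(r"^(\d+\.\d+)\b")          # assumed: clause headings start a line as "X.X"
COMMENT_RE = re.compile(r"\[([^:\]]+):[^\]]*\]")  # [name:content] comment markers

def count_clause_comments(legal_dir: Path, versions=(5, 6, 7)) -> Counter:
    """Count comment markers per clause across the given agreement versions."""
    counts = Counter()
    for v in versions:
        text = (legal_dir / f"Preferred_Stock_Purchase_Agreement_v{v}.txt").read_text()
        clause = None
        for line in text.splitlines():
            heading = CLAUSE_RE.match(line)
            if heading:
                clause = heading.group(1)
            if clause:
                counts[clause] += len(COMMENT_RE.findall(line))
    return counts
```

Each non-zero entry would then be written to `dispute_review.txt` as one `clause:count` line.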
================================================
FILE: tasks/filesystem/standard/legal_document/dispute_review/meta.json
================================================
{
"task_id": "dispute_review",
"task_name": "Dispute Review",
"category_id": "legal_document",
"category_name": "Legal Document",
"description": "Analyze multiple versions of legal documents to track clause discussion frequency and generate a comprehensive dispute summary report.",
"author": "Lingjun Chen",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"data extraction",
"cross-referencing",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "legal_document/\n └── legal_files/\n ├── Preferred_Stock_Purchase_Agreement_v0.txt\n ├── Preferred_Stock_Purchase_Agreement_v1.txt\n ├── Preferred_Stock_Purchase_Agreement_v2.txt\n ├── Preferred_Stock_Purchase_Agreement_v3.txt\n ├── Preferred_Stock_Purchase_Agreement_v4.txt\n ├── Preferred_Stock_Purchase_Agreement_v5.txt\n ├── Preferred_Stock_Purchase_Agreement_v6.txt\n ├── Preferred_Stock_Purchase_Agreement_v7.txt\n ├── Preferred_Stock_Purchase_Agreement_v8.txt\n ├── Preferred_Stock_Purchase_Agreement_v9.txt\n └── Preferred_Stock_Purchase_Agreement_v10.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/legal_document.zip",
"stateOriginalUrl": "https://www.cooleygo.com/documents/nvca-financing-documents"
}
}
================================================
FILE: tasks/filesystem/standard/legal_document/dispute_review/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Legal Document Dispute Review Task
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_output_file_exists(test_dir: Path) -> bool:
"""Verify that the dispute_review.txt file exists."""
output_file = test_dir / "dispute_review.txt"
if not output_file.exists():
print("❌ File 'dispute_review.txt' not found")
return False
print("✅ Output file found")
return True
def verify_output_format(test_dir: Path) -> bool:
"""Verify that the output file has the correct format."""
output_file = test_dir / "dispute_review.txt"
try:
content = output_file.read_text().strip()
# Check if content is not empty
if not content:
print("❌ Output file is empty")
return False
# Check format: each line should be "X.X:number"
lines = content.split('\n')
for i, line in enumerate(lines, 1):
line = line.strip()
if not line:
continue
# Check format: X.X:number
if not re.match(r'^\d+\.\d+:\d+$', line):
print(f"❌ Line {i} has incorrect format: '{line}'")
print(" Expected format: 'X.X:number' (e.g., '1.1:3')")
return False
print("✅ Output format is correct")
return True
except Exception as e:
print(f"❌ Error reading output file: {e}")
return False
def verify_expected_entries(test_dir: Path) -> bool:
"""Verify that the output contains the expected entries with correct counts."""
output_file = test_dir / "dispute_review.txt"
try:
content = output_file.read_text().strip()
lines = content.split('\n')
# Parse the output into a dictionary
output_entries = {}
for line in lines:
line = line.strip()
if not line:
continue
clause, count_str = line.split(':', 1)
output_entries[clause] = int(count_str)
# Expected entries based on answer.txt
expected_entries = {
"1.1": 3,
"1.3": 3,
"4.6": [5, 6], # Can be either 5 or 6
"4.16": 5,
"6.8": 4
}
# Check if all expected entries are present
missing_entries = []
for clause in expected_entries:
if clause not in output_entries:
missing_entries.append(clause)
if missing_entries:
print(f"❌ Missing expected entries: {missing_entries}")
return False
# Check if there are extra entries
extra_entries = []
for clause in output_entries:
if clause not in expected_entries:
extra_entries.append(clause)
if extra_entries:
print(f"❌ Unexpected extra entries: {extra_entries}")
return False
# Check counts for each entry
for clause, expected_count in expected_entries.items():
actual_count = output_entries[clause]
if isinstance(expected_count, list):
# For 4.6, accept either 5 or 6
if actual_count not in expected_count:
print(f"❌ Clause {clause}: expected {expected_count}, got {actual_count}")
return False
else:
if actual_count != expected_count:
print(f"❌ Clause {clause}: expected {expected_count}, got {actual_count}")
return False
print("✅ All expected entries with correct counts")
return True
except Exception as e:
print(f"❌ Error verifying entries: {e}")
return False
def verify_comment_count_accuracy(test_dir: Path) -> bool:
"""Verify that the comment counts are accurate by checking the actual files."""
# Since we already verify the expected entries in verify_expected_entries,
# and the answer.txt contains the correct counts, we can skip this complex verification
# to avoid false negatives due to regex matching issues.
print("✅ Comment count accuracy check skipped - relying on expected entries verification")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Legal Document Dispute Review Task...")
# Define verification steps
verification_steps = [
("Output File Exists", verify_output_file_exists),
("Output Format", verify_output_format),
("Expected Entries", verify_expected_entries),
("Comment Count Accuracy", verify_comment_count_accuracy),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Legal document dispute review completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/legal_document/individual_comments/description.md
================================================
Please use FileSystem tools to finish the following task:
**Overview**
The folder "legal_files/" contains all versions (Preferred_Stock_Purchase_Agreement_v0.txt -- Preferred_Stock_Purchase_Agreement_v10.txt) of the Stock Purchase Agreement for a corporate investment project.
These files contain comments from four people:
- **Bill Harvey** (Company CEO)
- **Michelle Jackson** (Investor)
- **David Russel** (Company Counsel)
- **Tony Taylor** (Investor Counsel)
Between v1 and v9, these four people made comments on the clauses. The comment format is `[name:content]`, where:
- `name` is the commenter's name
- `content` is the revision note
**Special Note:** If the name is "All parties", it represents a joint comment from all parties, which counts as one comment but does not count toward any individual's personal comment count.
## Task
Your task is to count the number of comments made by Bill Harvey (Company CEO), Michelle Jackson (Investor), David Russel (Company Counsel), and Tony Taylor (Investor Counsel) in clauses 1.1, 1.3, 4.6, 4.16, 6.8, and 6.16 **in versions 5-8.** Please generate `individual_comment.csv` in the **main directory**, where the first row contains these clauses (1.1, 1.3, 4.6, 4.16, 6.8, 6.16) and the first column contains the four names (Bill Harvey, Michelle Jackson, David Russel, Tony Taylor). Fill in the table with the number of comments for each person and each clause. If there are no comments, write 0.
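The counting itself is straightforward to script once the comment pattern is fixed. Below is a minimal sketch, assuming comments follow the `[name:content]` format above and that each clause's text can be sliced out by its section number; the file paths and the clause-boundary regex are illustrative, not part of the task.

```python
import re
from collections import defaultdict
from pathlib import Path

PEOPLE = ["Bill Harvey", "Michelle Jackson", "David Russel", "Tony Taylor"]
CLAUSES = ["1.1", "1.3", "4.6", "4.16", "6.8", "6.16"]
COMMENT_RE = re.compile(r"\[([^:\]]+):([^\]]*)\]")  # [name:content]

counts = defaultdict(int)  # (person, clause) -> number of comments
for version in range(5, 9):  # versions 5-8
    text = Path(f"legal_files/Preferred_Stock_Purchase_Agreement_v{version}.txt").read_text()
    for clause in CLAUSES:
        # Illustrative clause slicing: from the clause heading up to the next "X.Y" heading.
        block = re.search(rf"(?ms)^{re.escape(clause)}\s.*?(?=^\d+\.\d+\s|\Z)", text)
        if not block:
            continue
        for name, _content in COMMENT_RE.findall(block.group(0)):
            if name.strip() in PEOPLE:  # "All parties" is not counted for any individual
                counts[(name.strip(), clause)] += 1
```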
================================================
FILE: tasks/filesystem/standard/legal_document/individual_comments/meta.json
================================================
{
"task_id": "individual_comments",
"task_name": "Individual Comments",
"category_id": "legal_document",
"category_name": "Legal Document",
"description": "Extract and analyze individual reviewer comments on legal clauses across multiple document versions to understand personal perspectives.",
"author": "Lingjun Chen",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"data extraction",
"cross-referencing",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "legal_document/\n └── legal_files/\n ├── Preferred_Stock_Purchase_Agreement_v0.txt\n ├── Preferred_Stock_Purchase_Agreement_v1.txt\n ├── Preferred_Stock_Purchase_Agreement_v2.txt\n ├── Preferred_Stock_Purchase_Agreement_v3.txt\n ├── Preferred_Stock_Purchase_Agreement_v4.txt\n ├── Preferred_Stock_Purchase_Agreement_v5.txt\n ├── Preferred_Stock_Purchase_Agreement_v6.txt\n ├── Preferred_Stock_Purchase_Agreement_v7.txt\n ├── Preferred_Stock_Purchase_Agreement_v8.txt\n ├── Preferred_Stock_Purchase_Agreement_v9.txt\n └── Preferred_Stock_Purchase_Agreement_v10.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/legal_document.zip",
"stateOriginalUrl": "https://www.cooleygo.com/documents/nvca-financing-documents"
}
}
================================================
FILE: tasks/filesystem/standard/legal_document/individual_comments/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Legal Document Individual Comments Task
"""
import sys
from pathlib import Path
import csv
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_output_file_exists(test_dir: Path) -> bool:
"""Verify that the individual_comment.csv file exists."""
output_file = test_dir / "individual_comment.csv"
if not output_file.exists():
print("❌ File 'individual_comment.csv' not found")
return False
print("✅ Output file 'individual_comment.csv' found")
return True
def verify_csv_format(test_dir: Path) -> bool:
"""Verify that the CSV file has the correct format."""
output_file = test_dir / "individual_comment.csv"
try:
with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
if not rows:
print("❌ CSV file is empty")
return False
# Check if there are at least 2 rows (header + data)
if len(rows) < 2:
print("❌ CSV file has insufficient rows")
return False
# Check if header row has correct number of columns
header = rows[0]
if len(header) != 7: # First column (can be anything) + 6 clauses
print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 7")
return False
# Check if data rows have correct number of columns
for i, row in enumerate(rows[1:], 1):
if len(row) != 7:
print(f"❌ Data row {i} has incorrect number of columns: {len(row)}, expected 7")
return False
print("✅ CSV format is correct")
return True
except Exception as e:
print(f"❌ Error reading CSV file: {e}")
return False
def verify_csv_content(test_dir: Path) -> bool:
"""Verify that the CSV content matches the expected answer exactly."""
output_file = test_dir / "individual_comment.csv"
try:
with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
# Expected data based on answer.csv
expected_data = {
"Bill Harvey": ["0", "2", "3", "1", "1", "1"],
"Michelle Jackson": ["0", "1", "2", "1", "1", "1"],
"David Russel": ["2", "1", "1", "2", "1", "1"],
"Tony Taylor": ["2", "0", "1", "2", "1", "1"]
}
# Expected header columns (excluding first column which can be anything)
expected_header_columns = ["1.1", "1.3", "4.6", "4.16", "6.8", "6.16"]
# Verify header has correct number of columns
header = rows[0]
if len(header) != 7: # First column + 6 clauses
print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 7")
return False
# Check if all expected clause columns are present (allow order to be different)
# Allow first column to be anything, so we check columns 1-6
header_clauses = header[1:7]
missing_clauses = []
for expected_clause in expected_header_columns:
if expected_clause not in header_clauses:
missing_clauses.append(expected_clause)
if missing_clauses:
print(f"❌ Missing expected clause columns: {missing_clauses}")
return False
# Check if there are extra clause columns
extra_clauses = []
for clause in header_clauses:
if clause not in expected_header_columns:
extra_clauses.append(clause)
if extra_clauses:
print(f"❌ Unexpected extra clause columns: {extra_clauses}")
return False
# Create a mapping from expected clause order to actual column indices
clause_mapping = {}
for i, clause in enumerate(header_clauses):
if clause in expected_header_columns:
clause_mapping[clause] = i
# Parse the CSV data into a dictionary with correct column mapping
csv_data = {}
for row in rows[1:]:
if len(row) >= 7:
name = row[0]
# Map values according to the expected clause order
values = []
for expected_clause in expected_header_columns:
col_index = clause_mapping[expected_clause] + 1 # +1 because we skip first column
values.append(row[col_index])
csv_data[name] = values
# Check if all expected names are present
missing_names = []
for expected_name in expected_data:
if expected_name not in csv_data:
missing_names.append(expected_name)
if missing_names:
print(f"❌ Missing expected names: {missing_names}")
return False
# Check if there are extra names
extra_names = []
for name in csv_data:
if name not in expected_data:
extra_names.append(name)
if extra_names:
print(f"❌ Unexpected extra names: {extra_names}")
return False
# Check values for each person
for name, expected_values in expected_data.items():
actual_values = csv_data[name]
if actual_values != expected_values:
print(f"❌ Values mismatch for {name}:")
print(f" Expected: {expected_values}")
print(f" Got: {actual_values}")
return False
print("✅ CSV content matches expected answer exactly")
return True
except Exception as e:
print(f"❌ Error verifying CSV content: {e}")
return False
def verify_data_accuracy(test_dir: Path) -> bool:
"""Verify that the data values are accurate (all values are non-negative integers)."""
output_file = test_dir / "individual_comment.csv"
try:
with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
# Skip header row
for i, row in enumerate(rows[1:], 1):
if len(row) >= 7:
name = row[0]
values = row[1:7]
for j, value in enumerate(values, 1):
try:
int_val = int(value)
if int_val < 0:
print(f"❌ Row {i}, column {j}: negative value '{value}' for {name}")
return False
except ValueError:
print(f"❌ Row {i}, column {j}: non-integer value '{value}' for {name}")
return False
print("✅ All data values are valid non-negative integers")
return True
except Exception as e:
print(f"❌ Error verifying data accuracy: {e}")
return False
def verify_file_location(test_dir: Path) -> bool:
"""Verify that the file is in the main directory (not in a subdirectory)."""
output_file = test_dir / "individual_comment.csv"
if output_file.exists():
print("✅ File is located in the main directory")
return True
else:
print("❌ File is not in the main directory")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Legal Document Individual Comments Task...")
# Define verification steps
verification_steps = [
("Output File Exists", verify_output_file_exists),
("CSV Format", verify_csv_format),
("CSV Content", verify_csv_content),
("Data Accuracy", verify_data_accuracy),
("File Location", verify_file_location),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Legal document individual comments task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/legal_document/solution_tracing/description.md
================================================
Please use FileSystem tools to finish the following task:
### Overview
The folder "legal_files/" contains all versions (Preferred_Stock_Purchase_Agreement_v0.txt -- Preferred_Stock_Purchase_Agreement_v10.txt) of the Stock Purchase Agreement for a corporate investment project.
These files contain comments from four people:
- **Bill Harvey** (Company CEO)
- **Michelle Jackson** (Investor)
- **David Russel** (Company Counsel)
- **Tony Taylor** (Investor Counsel)
Between v1 and v9, these four people made comments on the clauses. The comment format is `[name:content]`, where:
- `name` is the commenter's name
- `content` is the revision note
**Special Note:** If the name is "All parties", it represents a joint comment from all parties, which counts as one comment but does not count toward any individual's personal comment count.
### Task Description
**Your task is to focus on clauses 4.6, 4.16, 6.8, and 6.16 in v5-9** to determine:
1. Who first proposed the idea that eventually led to the final agreed solution
2. In which version's comment it appeared
**Important:** If the final solution was formed through multiple people's comments, count as the originator the person whose comment first provided the core motivation (or part of the idea) that shaped the final solution. The key is to identify who initially proposed the motivation for the final solution.
### Output Requirements
**File Name:** `tracing.csv` (must be placed in the main directory)
**CSV Structure:**
- **First row** (excluding the top-left cell): `4.6, 4.16, 6.8, 6.16`
- **First column** (excluding the top-left cell): `version_number, name`
- **Remaining cells:** Fill in the `version_number` (the version in which the final solution was first proposed; write only the number, nothing else) and the `name` (the person who proposed it) for each clause, as illustrated below
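For illustration only, a `tracing.csv` with the required shape looks like this (the values are placeholders, not answers):

```
,4.6,4.16,6.8,6.16
version_number,<number>,<number>,<number>,<number>
name,<person>,<person>,<person>,<person>
```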
================================================
FILE: tasks/filesystem/standard/legal_document/solution_tracing/meta.json
================================================
{
"task_id": "solution_tracing",
"task_name": "Solution Tracing",
"category_id": "legal_document",
"category_name": "Legal Document",
"description": "Trace the evolution of clause resolutions across document versions to identify who first proposed each final accepted solution.",
"author": "Lingjun Chen",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"cross-referencing",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "legal_document/\n └── legal_files/\n ├── Preferred_Stock_Purchase_Agreement_v0.txt\n ├── Preferred_Stock_Purchase_Agreement_v1.txt\n ├── Preferred_Stock_Purchase_Agreement_v2.txt\n ├── Preferred_Stock_Purchase_Agreement_v3.txt\n ├── Preferred_Stock_Purchase_Agreement_v4.txt\n ├── Preferred_Stock_Purchase_Agreement_v5.txt\n ├── Preferred_Stock_Purchase_Agreement_v6.txt\n ├── Preferred_Stock_Purchase_Agreement_v7.txt\n ├── Preferred_Stock_Purchase_Agreement_v8.txt\n ├── Preferred_Stock_Purchase_Agreement_v9.txt\n └── Preferred_Stock_Purchase_Agreement_v10.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/legal_document.zip",
"stateOriginalUrl": "https://www.cooleygo.com/documents/nvca-financing-documents"
}
}
================================================
FILE: tasks/filesystem/standard/legal_document/solution_tracing/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Legal Document Solution Tracing Task
"""
import sys
from pathlib import Path
import csv
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_output_file_exists(test_dir: Path) -> bool:
"""Verify that the tracing.csv file exists."""
output_file = test_dir / "tracing.csv"
if not output_file.exists():
print("❌ File 'tracing.csv' not found")
return False
print("✅ Output file 'tracing.csv' found")
return True
def verify_csv_format(test_dir: Path) -> bool:
"""Verify that the CSV file has the correct format."""
output_file = test_dir / "tracing.csv"
try:
with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
if not rows:
print("❌ CSV file is empty")
return False
# Check if there are at least 2 rows (header + data)
if len(rows) < 2:
print("❌ CSV file has insufficient rows")
return False
# Check if header row has correct number of columns
header = rows[0]
if len(header) != 5: # First column (can be anything) + 4 clauses
print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 5")
return False
# Check if data rows have correct number of columns
for i, row in enumerate(rows[1:], 1):
if len(row) != 5:
print(f"❌ Data row {i} has incorrect number of columns: {len(row)}, expected 5")
return False
print("✅ CSV format is correct")
return True
except Exception as e:
print(f"❌ Error reading CSV file: {e}")
return False
def verify_csv_content(test_dir: Path) -> bool:
"""Verify that the CSV content matches the expected answer exactly."""
output_file = test_dir / "tracing.csv"
try:
with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
# Expected data based on answer.csv
expected_data = {
"version_number": ["5", "6", "7", "8"],
"name": ["Bill Harvey", "Michelle Jackson", "Michelle Jackson", "Tony Taylor"]
}
# Expected header columns (excluding first column which can be anything)
expected_header_columns = ["4.6", "4.16", "6.8", "6.16"]
# Verify header has correct number of columns
header = rows[0]
if len(header) != 5: # First column + 4 clauses
print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 5")
return False
# Check if all expected clause columns are present (allow order to be different)
# Allow first column to be anything, so we check columns 1-4
header_clauses = header[1:5]
missing_clauses = []
for expected_clause in expected_header_columns:
if expected_clause not in header_clauses:
missing_clauses.append(expected_clause)
if missing_clauses:
print(f"❌ Missing expected clause columns: {missing_clauses}")
return False
# Check if there are extra clause columns
extra_clauses = []
for clause in header_clauses:
if clause not in expected_header_columns:
extra_clauses.append(clause)
if extra_clauses:
print(f"❌ Unexpected extra clause columns: {extra_clauses}")
return False
# Create a mapping from expected clause order to actual column indices
clause_mapping = {}
for i, clause in enumerate(header_clauses):
if clause in expected_header_columns:
clause_mapping[clause] = i
# Parse the CSV data into a dictionary with correct column mapping
csv_data = {}
for row in rows[1:]:
if len(row) >= 5:
row_type = row[0] # version_number or name
# Map values according to the expected clause order
values = []
for expected_clause in expected_header_columns:
col_index = clause_mapping[expected_clause] + 1 # +1 because we skip first column
values.append(row[col_index])
csv_data[row_type] = values
# Check if all expected row types are present
missing_types = []
for expected_type in expected_data:
if expected_type not in csv_data:
missing_types.append(expected_type)
if missing_types:
print(f"❌ Missing expected row types: {missing_types}")
return False
# Check if there are extra row types
extra_types = []
for row_type in csv_data:
if row_type not in expected_data:
extra_types.append(row_type)
if extra_types:
print(f"❌ Unexpected extra row types: {extra_types}")
return False
# Check values for each row type
for row_type, expected_values in expected_data.items():
actual_values = csv_data[row_type]
if actual_values != expected_values:
print(f"❌ Values mismatch for {row_type}:")
print(f" Expected: {expected_values}")
print(f" Got: {actual_values}")
return False
print("✅ CSV content matches expected answer exactly")
return True
except Exception as e:
print(f"❌ Error verifying CSV content: {e}")
return False
def verify_data_accuracy(test_dir: Path) -> bool:
"""Verify that the data values are accurate."""
output_file = test_dir / "tracing.csv"
try:
with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
# Skip header row
for i, row in enumerate(rows[1:], 1):
if len(row) >= 5:
row_type = row[0]
values = row[1:5]
# Check version_number row
if row_type == "version_number":
for j, value in enumerate(values, 1):
try:
int_val = int(value)
if int_val < 5 or int_val > 8:
print(f"❌ Row {i}, column {j}: version number '{value}' is out of expected range [5-8]")
return False
except ValueError:
print(f"❌ Row {i}, column {j}: non-integer version number '{value}'")
return False
# Check name row
elif row_type == "name":
expected_names = ["Bill Harvey", "Michelle Jackson", "Michelle Jackson", "Tony Taylor"]
for j, value in enumerate(values, 1):
if value not in expected_names:
print(f"❌ Row {i}, column {j}: unexpected name '{value}'")
return False
print("✅ All data values are accurate")
return True
except Exception as e:
print(f"❌ Error verifying data accuracy: {e}")
return False
def verify_file_location(test_dir: Path) -> bool:
"""Verify that the file is in the main directory (not in a subdirectory)."""
output_file = test_dir / "tracing.csv"
if output_file.exists():
print("✅ File is located in the main directory")
return True
else:
print("❌ File is not in the main directory")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Legal Document Solution Tracing Task...")
# Define verification steps
verification_steps = [
("Output File Exists", verify_output_file_exists),
("CSV Format", verify_csv_format),
("CSV Content", verify_csv_content),
("Data Accuracy", verify_data_accuracy),
("File Location", verify_file_location),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Legal document solution tracing task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/papers/author_folders/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You are given a directory containing multiple paper files. You have a collection of academic papers in HTML format from arXiv. Your task is to analyze these papers, identify authors who have published multiple papers, and organize them into author-specific folders based on specified criteria.
### Task Objectives
#### Part 1: Frequent Authors (≥4 papers)
1. **Extract author information** from all HTML papers in the given directory
2. **Identify authors** who appear in 4 or more papers
3. **Create a directory** `frequent_authors`
4. **Create individual folders** within this directory for each frequent author (lowercase names with underscores)
5. **Copy their papers** to their respective folders
#### Part 2: Prolific 2025 Authors (≥3 papers)
1. **Extract publication dates** along with author information
2. **Identify authors** who published 3 or more papers in 2025
3. **Create a directory** `2025_authors` for 2025 authors
4. **Create individual folders** within this directory for each prolific 2025 author (lowercase names with underscores)
5. **Copy their 2025 papers** to their respective folders
### Expected Output
#### Directory Structure:
```
[given_task_folder]/
├── [original HTML files remain untouched]
├── frequent_authors/ # Authors with ≥4 papers total
│ ├── smith_john/
│ │ └── [copied papers]
│ ├── johnson_sarah/
│ │ └── [copied papers]
│ └── ...
└── 2025_authors/ # Authors with ≥3 papers in 2025
├── williams_david/
│ └── [copied 2025 papers]
├── brown_emily/
│ └── [copied 2025 papers]
└── ...
```
#### Requirements:
- Author folder names should be **lowercase** with underscores replacing spaces/commas (e.g., `smith_john`, `williams_david`)
- Papers should be **copied** (not moved) to preserve originals
- Author extraction should handle various name formats correctly
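A minimal sketch of Part 1, assuming authors are exposed through `citation_author` meta tags in the arXiv HTML (the tag name, the `papers` directory path, and the name normalization are illustrative assumptions; Part 2 follows the same pattern restricted to 2025 papers):

```python
import re
import shutil
from collections import defaultdict
from pathlib import Path

papers_dir = Path("papers")  # hypothetical location of the HTML files
papers_by_author = defaultdict(list)

for html_file in papers_dir.glob("*.html"):
    html = html_file.read_text(errors="ignore")
    # Assumed metadata format: <meta name="citation_author" content="Last, First">
    for author in re.findall(r'<meta name="citation_author" content="([^"]+)"', html):
        last, _, first = (part.strip() for part in author.partition(","))
        folder = f"{first.split()[0]}_{last}".lower().replace(" ", "_") if first else last.lower()
        papers_by_author[folder].append(html_file)

for folder, files in papers_by_author.items():
    if len(files) >= 4:  # frequent authors
        target = papers_dir / "frequent_authors" / folder
        target.mkdir(parents=True, exist_ok=True)
        for paper in files:
            shutil.copy2(paper, target / paper.name)  # copy, never move
```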
================================================
FILE: tasks/filesystem/standard/papers/author_folders/meta.json
================================================
{
"task_id": "author_folders",
"task_name": "Author Folders",
"category_id": "papers",
"category_name": "Papers",
"description": "Analyze academic papers to identify and organize by author, creating separate folders for frequent authors (≥4 papers) and prolific 2025 authors (≥3 papers).",
"author": "Xiangyan Liu",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data extraction",
"file organization",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "papers/\n ├── 1707.06347.html\n ├── 2105.04165.html\n ├── 2201.11903.html\n ├── 2303.08774.html\n ├── 2306.08640.html\n ├── 2310.02255.html\n ├── 2310.08446.html\n ├── 2312.00849.html\n ├── 2312.07533.html\n ├── 2312.11805.html\n ├── 2402.00253.html\n ├── 2402.03300.html\n ├── 2403.05530.html\n ├── 2404.13046.html\n ├── 2404.14367.html\n ├── 2404.14396.html\n ├── 2405.09818.html\n ├── 2405.13911.html\n ├── 2405.16473.html\n ├── 2405.16640.html\n ├── 2406.08478.html\n ├── 2406.16852.html\n ├── 2406.17294.html\n ├── 2407.01284.html\n ├── 2407.01509.html\n ├── 2407.21783.html\n ├── 2408.03326.html\n ├── 2408.12528.html\n ├── 2409.19256.html\n ├── 2410.05993.html\n ├── 2410.06166.html\n ├── 2410.10563.html\n ├── 2410.13848.html\n ├── 2410.17885.html\n ├── 2410.21276.html\n ├── 2411.07975.html\n ├── 2411.10442.html\n ├── 2411.11930.html\n ├── 2411.14432.html\n ├── 2412.05271.html\n ├── 2412.08443.html\n ├── 2412.10302.html\n ├── 2412.15115.html\n ├── 2412.16720.html\n ├── 2412.17256.html\n ├── 2412.18319.html\n ├── 2412.20631.html\n ├── 2501.04686.html\n ├── 2501.06186.html\n ├── 2501.12599.html\n ├── 2501.12948.html\n ├── 2501.17811.html\n ├── 2502.01456.html\n ├── 2502.09621.html\n ├── 2502.10391.html\n ├── 2502.13923.html\n ├── 2503.01785.html\n ├── 2503.06520.html\n ├── 2503.06749.html\n ├── 2503.07065.html\n ├── 2503.07365.html\n ├── 2503.07536.html\n ├── 2503.10291.html\n ├── 2503.10615.html\n ├── 2503.12937.html\n ├── 2503.13939.html\n ├── 2503.14476.html\n ├── 2503.17352.html\n ├── 2503.18892.html\n ├── 2503.19786.html\n ├── 2503.20783.html\n ├── 2503.21620.html\n ├── 2503.21776.html\n ├── 2503.22679.html\n ├── 2504.02587.html\n ├── 2504.05599.html\n ├── 2504.07491.html\n ├── 2504.07934.html\n ├── 2504.07954.html\n ├── 2504.11455.html\n ├── 2504.14945.html\n ├── 2504.16656.html\n ├── 2505.00703.html\n └── arxiv_2025.bib",
"stateUrl": "https://storage.mcpmark.ai/filesystem/papers.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/papers/author_folders/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Paper Organization Task: Author-Based Paper Categorization
"""
import sys
from pathlib import Path
import os
import re
from typing import Dict, List, Set
from html.parser import HTMLParser
from datetime import datetime
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
class ArxivHTMLParser(HTMLParser):
"""Parser to extract author and date information from arXiv HTML papers."""
def __init__(self):
super().__init__()
self.authors = []
self.publication_date = None
def handle_starttag(self, tag, attrs):
# Look for author metadata tags
if tag == 'meta':
attr_dict = dict(attrs)
if attr_dict.get('name') == 'citation_author':
content = attr_dict.get('content', '')
if content:
self.authors.append(content)
elif attr_dict.get('name') in ['citation_date', 'citation_online_date']:
content = attr_dict.get('content', '')
if content and not self.publication_date:
self.publication_date = content
def extract_paper_info(html_file: Path) -> tuple[List[str], str]:
"""Extract authors and publication year from an HTML paper."""
try:
with open(html_file, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
parser = ArxivHTMLParser()
parser.feed(content)
# Extract year from date if available
year = None
if parser.publication_date:
# Parse year from date string (e.g., "2025/03/13")
year_match = re.search(r'(\d{4})', parser.publication_date)
if year_match:
year = year_match.group(1)
return parser.authors, year
except Exception as e:
print(f"Warning: Could not parse {html_file.name}: {e}")
return [], None
def normalize_author_name(author: str) -> str:
"""Normalize author name to lowercase with underscores."""
# Author names are in "Last, First Middle" format
# We need to convert to "first_last" format
# Remove any HTML entities or special characters that shouldn't be there
author = author.strip()
# Split by comma to separate last and first names
parts = author.split(',', 1)
if len(parts) == 2:
last_name = parts[0].strip()
first_names = parts[1].strip()
# Take only the first name (not middle names)
first_name_parts = first_names.split()
if first_name_parts:
first_name = first_name_parts[0]
# Format as "first_last"
normalized = f"{first_name}_{last_name}"
else:
normalized = last_name
else:
# If no comma, use as is
normalized = author
# Convert to lowercase and replace spaces/special chars with underscores
normalized = re.sub(r'[^\w\s-]', '', normalized)
normalized = re.sub(r'[\s-]+', '_', normalized)
return normalized.lower()
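# Illustrative behaviour of normalize_author_name (not exhaustive):
#   "Smith, John A." -> "john_smith"   (first given name + last name)
#   "Doe, Jane"      -> "jane_doe"
#   "Madonna"        -> "madonna"      (no comma: lowercased as-is)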
def verify_directories_exist(test_dir: Path) -> bool:
"""Verify that required directories exist."""
frequent_authors_dir = test_dir / "frequent_authors"
authors_2025_dir = test_dir / "2025_authors"
if not frequent_authors_dir.exists():
print("❌ 'frequent_authors' directory not found")
return False
if not authors_2025_dir.exists():
print("❌ '2025_authors' directory not found")
return False
if not frequent_authors_dir.is_dir():
print("❌ 'frequent_authors' exists but is not a directory")
return False
if not authors_2025_dir.is_dir():
print("❌ '2025_authors' exists but is not a directory")
return False
print("✅ Both required directories exist")
return True
def analyze_papers(test_dir: Path) -> tuple[Dict[str, List[Path]], Dict[str, List[Path]]]:
"""Analyze all HTML papers and return author-paper mappings."""
author_papers = {} # author -> list of papers
author_2025_papers = {} # author -> list of 2025 papers
# Find all HTML files
html_files = list(test_dir.glob("*.html"))
for html_file in html_files:
authors, year = extract_paper_info(html_file)
for author in authors:
if not author:
continue
normalized_name = normalize_author_name(author)
if not normalized_name:
continue
# Track all papers by author
if normalized_name not in author_papers:
author_papers[normalized_name] = []
author_papers[normalized_name].append(html_file)
# Track 2025 papers
if year == '2025':
if normalized_name not in author_2025_papers:
author_2025_papers[normalized_name] = []
author_2025_papers[normalized_name].append(html_file)
return author_papers, author_2025_papers
def verify_frequent_authors(test_dir: Path, author_papers: Dict[str, List[Path]]) -> bool:
"""Verify that authors with ≥4 papers have their folders and papers."""
frequent_authors_dir = test_dir / "frequent_authors"
# Find authors with 4 or more papers
frequent_authors = {author: papers for author, papers in author_papers.items()
if len(papers) >= 4}
if not frequent_authors:
print("⚠️ No authors found with 4 or more papers")
# This might be expected depending on the test data
return True
all_correct = True
for author, expected_papers in frequent_authors.items():
author_dir = frequent_authors_dir / author
# Check if author directory exists
if not author_dir.exists():
print(f"❌ Missing directory for frequent author: {author}")
all_correct = False
continue
# Check if all expected papers are present
for paper in expected_papers:
paper_copy = author_dir / paper.name
if not paper_copy.exists():
print(f"❌ Missing paper {paper.name} in {author} directory")
all_correct = False
# Check for unexpected directories
for item in frequent_authors_dir.iterdir():
if item.is_dir():
dir_name = item.name
if dir_name not in frequent_authors:
# Check if this author has less than 4 papers
if dir_name in author_papers and len(author_papers[dir_name]) < 4:
print(f"❌ Author {dir_name} has only {len(author_papers[dir_name])} papers but has a folder in frequent_authors")
all_correct = False
if all_correct:
print(f"✅ Frequent authors correctly organized ({len(frequent_authors)} authors)")
return all_correct
def verify_2025_authors(test_dir: Path, author_2025_papers: Dict[str, List[Path]]) -> bool:
"""Verify that authors with ≥3 papers in 2025 have their folders and papers."""
authors_2025_dir = test_dir / "2025_authors"
# Find authors with 3 or more papers in 2025
prolific_2025_authors = {author: papers for author, papers in author_2025_papers.items()
if len(papers) >= 3}
if not prolific_2025_authors:
print("⚠️ No authors found with 3 or more papers in 2025")
# This might be expected depending on the test data
return True
all_correct = True
for author, expected_papers in prolific_2025_authors.items():
author_dir = authors_2025_dir / author
# Check if author directory exists
if not author_dir.exists():
print(f"❌ Missing directory for 2025 author: {author}")
all_correct = False
continue
# Check if all expected 2025 papers are present
for paper in expected_papers:
paper_copy = author_dir / paper.name
if not paper_copy.exists():
print(f"❌ Missing 2025 paper {paper.name} in {author} directory")
all_correct = False
# Check for unexpected directories
for item in authors_2025_dir.iterdir():
if item.is_dir():
dir_name = item.name
if dir_name not in prolific_2025_authors:
# Check if this author has less than 3 papers in 2025
if dir_name in author_2025_papers and len(author_2025_papers[dir_name]) < 3:
print(f"❌ Author {dir_name} has only {len(author_2025_papers[dir_name])} papers in 2025 but has a folder in 2025_authors")
all_correct = False
if all_correct:
print(f"✅ 2025 authors correctly organized ({len(prolific_2025_authors)} authors)")
return all_correct
def verify_original_files_intact(test_dir: Path) -> bool:
"""Verify that original HTML files are still present (not moved)."""
html_files = list(test_dir.glob("*.html"))
if not html_files:
print("❌ No original HTML files found in root directory")
return False
print(f"✅ Original HTML files remain intact ({len(html_files)} files)")
return True
def verify_naming_convention(test_dir: Path) -> bool:
"""Verify that author folder names follow the correct naming convention."""
frequent_authors_dir = test_dir / "frequent_authors"
authors_2025_dir = test_dir / "2025_authors"
all_correct = True
# Check frequent_authors subdirectories
for author_dir in frequent_authors_dir.iterdir():
if author_dir.is_dir():
name = author_dir.name
# Check for lowercase and underscores only
if not re.match(r'^[a-z0-9_]+$', name):
print(f"❌ Invalid folder name in frequent_authors: {name} (should be lowercase with underscores)")
all_correct = False
# Check 2025_authors subdirectories
for author_dir in authors_2025_dir.iterdir():
if author_dir.is_dir():
name = author_dir.name
# Check for lowercase and underscores only
if not re.match(r'^[a-z0-9_]+$', name):
print(f"❌ Invalid folder name in 2025_authors: {name} (should be lowercase with underscores)")
all_correct = False
if all_correct:
print("✅ All author folder names follow correct naming convention")
return all_correct
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying paper organization in: {test_dir}")
# Analyze papers first
print("\n📊 Analyzing papers...")
author_papers, author_2025_papers = analyze_papers(test_dir)
# Run verification checks
checks = [
("Directory existence", lambda: verify_directories_exist(test_dir)),
("Original files intact", lambda: verify_original_files_intact(test_dir)),
("Frequent authors organization", lambda: verify_frequent_authors(test_dir, author_papers)),
("2025 authors organization", lambda: verify_2025_authors(test_dir, author_2025_papers)),
("Naming conventions", lambda: verify_naming_convention(test_dir))
]
all_passed = True
for check_name, check_func in checks:
print(f"\n📋 Checking: {check_name}")
if not check_func():
all_passed = False
if all_passed:
print("\n🎉 All verification checks passed!")
sys.exit(0)
else:
print("\n❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/papers/find_math_paper/description.md
================================================
Please use FileSystem tools to finish the following task:
You are given a directory containing multiple paper files. Please help me find a math-related benchmark paper. I don’t remember its name, but I remember it not only checks whether the answer is correct, but also analyzes whether the model suffers from insufficient knowledge, lacks generalization ability, or relies on rote memorization. After finding this paper, rename its corresponding HTML file to `answer.html`.
================================================
FILE: tasks/filesystem/standard/papers/find_math_paper/meta.json
================================================
{
"task_id": "find_math_paper",
"task_name": "Find Math Paper",
"category_id": "papers",
"category_name": "Papers",
"description": "Search through academic papers to identify and locate mathematics-related content that satisfies specific mathematical criteria and research requirements.",
"author": "Xiangyan Liu",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"pattern analysis",
"data extraction"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "papers/\n ├── 1707.06347.html\n ├── 2105.04165.html\n ├── 2201.11903.html\n ├── 2303.08774.html\n ├── 2306.08640.html\n ├── 2310.02255.html\n ├── 2310.08446.html\n ├── 2312.00849.html\n ├── 2312.07533.html\n ├── 2312.11805.html\n ├── 2402.00253.html\n ├── 2402.03300.html\n ├── 2403.05530.html\n ├── 2404.13046.html\n ├── 2404.14367.html\n ├── 2404.14396.html\n ├── 2405.09818.html\n ├── 2405.13911.html\n ├── 2405.16473.html\n ├── 2405.16640.html\n ├── 2406.08478.html\n ├── 2406.16852.html\n ├── 2406.17294.html\n ├── 2407.01284.html\n ├── 2407.01509.html\n ├── 2407.21783.html\n ├── 2408.03326.html\n ├── 2408.12528.html\n ├── 2409.19256.html\n ├── 2410.05993.html\n ├── 2410.06166.html\n ├── 2410.10563.html\n ├── 2410.13848.html\n ├── 2410.17885.html\n ├── 2410.21276.html\n ├── 2411.07975.html\n ├── 2411.10442.html\n ├── 2411.11930.html\n ├── 2411.14432.html\n ├── 2412.05271.html\n ├── 2412.08443.html\n ├── 2412.10302.html\n ├── 2412.15115.html\n ├── 2412.16720.html\n ├── 2412.17256.html\n ├── 2412.18319.html\n ├── 2412.20631.html\n ├── 2501.04686.html\n ├── 2501.06186.html\n ├── 2501.12599.html\n ├── 2501.12948.html\n ├── 2501.17811.html\n ├── 2502.01456.html\n ├── 2502.09621.html\n ├── 2502.10391.html\n ├── 2502.13923.html\n ├── 2503.01785.html\n ├── 2503.06520.html\n ├── 2503.06749.html\n ├── 2503.07065.html\n ├── 2503.07365.html\n ├── 2503.07536.html\n ├── 2503.10291.html\n ├── 2503.10615.html\n ├── 2503.12937.html\n ├── 2503.13939.html\n ├── 2503.14476.html\n ├── 2503.17352.html\n ├── 2503.18892.html\n ├── 2503.19786.html\n ├── 2503.20783.html\n ├── 2503.21620.html\n ├── 2503.21776.html\n ├── 2503.22679.html\n ├── 2504.02587.html\n ├── 2504.05599.html\n ├── 2504.07491.html\n ├── 2504.07934.html\n ├── 2504.07954.html\n ├── 2504.11455.html\n ├── 2504.14945.html\n ├── 2504.16656.html\n ├── 2505.00703.html\n └── arxiv_2025.bib",
"stateUrl": "https://storage.mcpmark.ai/filesystem/papers.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/papers/find_math_paper/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Find Math Paper Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that answer.html exists in the papers directory."""
answer_file = test_dir / "answer.html"
if not answer_file.exists():
print("❌ File 'answer.html' not found")
return False
print("✅ answer.html found")
return True
def verify_original_file_removed(test_dir: Path) -> bool:
"""Verify that the original file (2407.01284.html) no longer exists."""
original_file = test_dir / "2407.01284.html"
if original_file.exists():
print("❌ Original file 2407.01284.html still exists")
return False
print("✅ Original file has been renamed")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Find Math Paper Task...")
# Define verification steps
verification_steps = [
("Answer File Exists", verify_answer_file_exists),
("Original File Renamed", verify_original_file_removed),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Paper correctly renamed to answer.html!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/papers/organize_legacy_papers/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You are given a directory containing multiple paper files. You have a collection of arXiv papers saved as HTML files in the papers directory, along with a BibTeX file. Your task is to organize the older papers (2023 and earlier) into a structured year-based hierarchy with proper documentation, while leaving newer papers in the original location.
### Task Objectives
1. **Organize by year**: Create a year-based directory structure for papers from 2023 and earlier
2. **Generate documentation**: Create INDEX.md files for each year with paper metadata
3. **Create summary**: Build a master SUMMARY.md file linking to all year indexes
### Detailed Requirements
#### Step 1: Organization
- Create directory structure: `organized/{year}/` where year is extracted from the arXiv ID
- Example: `1707.06347.html` → `organized/2017/1707.06347.html`
- Move each HTML file from 2023 and earlier to its corresponding year folder, keeping original filenames
- Papers from 2024 onwards (arXiv IDs starting with `24` or `25`) should remain in the original papers directory
#### Step 2: Year Index Files
For each year folder, create an `INDEX.md` file containing:
- A markdown table with three columns: `ArXiv ID | Authors | Local Path`
- Extract authors from `<meta name="citation_author">` tags, keeping only the first 3 authors
- If there are more than 3 authors, list the first 3 followed by "et al."
- Format authors as: "Author1, Author2, Author3" or "Author1, Author2, Author3, et al."
- Local Path should be just the filename (e.g., `1707.06347.html`)
- Sort entries by arXiv ID in ascending order
#### Step 3: Master Summary
Create `organized/SUMMARY.md` with:
- A markdown table with columns: `Year | Paper Count | Index Link`
- Index Link should be a relative markdown link (e.g., `[View Index](2017/INDEX.md)`)
- Sort by year in ascending order
### Expected Output Structure
```
papers/
├── arxiv_2025.bib (remains here)
├── (2024+ HTML files remain here)
└── organized/
├── SUMMARY.md
├── 2017/
│ ├── INDEX.md
│ └── 1707.06347.html
├── 2021/
│ ├── INDEX.md
│ └── 2105.04165.html
├── 2022/
│ ├── INDEX.md
│ └── 2201.11903.html
└── 2023/
├── INDEX.md
├── 2303.08774.html
├── 2306.08640.html
├── 2310.02255.html
├── 2310.08446.html
├── 2312.00849.html
├── 2312.07533.html
└── 2312.11805.html
```
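A minimal sketch of the year extraction and move step, assuming the standard post-2007 arXiv ID form `YYMM.NNNNN` (the `papers` path is illustrative):

```python
import shutil
from pathlib import Path

papers_dir = Path("papers")  # hypothetical location
for html_file in papers_dir.glob("*.html"):
    year = 2000 + int(html_file.stem[:2])  # "1707.06347" -> 2017
    if year <= 2023:  # only legacy papers are organized; 2024+ stay in place
        target = papers_dir / "organized" / str(year)
        target.mkdir(parents=True, exist_ok=True)
        shutil.move(str(html_file), str(target / html_file.name))
```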
================================================
FILE: tasks/filesystem/standard/papers/organize_legacy_papers/meta.json
================================================
{
"task_id": "organize_legacy_papers",
"task_name": "Organize Legacy Papers",
"category_id": "papers",
"category_name": "Papers",
"description": "Structure and organize older academic papers from 2023 and earlier into a year-based hierarchical directory system with proper documentation.",
"author": "Xiangyan Liu",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"file organization",
"data extraction",
"cross-referencing"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "papers/\n ├── 1707.06347.html\n ├── 2105.04165.html\n ├── 2201.11903.html\n ├── 2303.08774.html\n ├── 2306.08640.html\n ├── 2310.02255.html\n ├── 2310.08446.html\n ├── 2312.00849.html\n ├── 2312.07533.html\n ├── 2312.11805.html\n ├── 2402.00253.html\n ├── 2402.03300.html\n ├── 2403.05530.html\n ├── 2404.13046.html\n ├── 2404.14367.html\n ├── 2404.14396.html\n ├── 2405.09818.html\n ├── 2405.13911.html\n ├── 2405.16473.html\n ├── 2405.16640.html\n ├── 2406.08478.html\n ├── 2406.16852.html\n ├── 2406.17294.html\n ├── 2407.01284.html\n ├── 2407.01509.html\n ├── 2407.21783.html\n ├── 2408.03326.html\n ├── 2408.12528.html\n ├── 2409.19256.html\n ├── 2410.05993.html\n ├── 2410.06166.html\n ├── 2410.10563.html\n ├── 2410.13848.html\n ├── 2410.17885.html\n ├── 2410.21276.html\n ├── 2411.07975.html\n ├── 2411.10442.html\n ├── 2411.11930.html\n ├── 2411.14432.html\n ├── 2412.05271.html\n ├── 2412.08443.html\n ├── 2412.10302.html\n ├── 2412.15115.html\n ├── 2412.16720.html\n ├── 2412.17256.html\n ├── 2412.18319.html\n ├── 2412.20631.html\n ├── 2501.04686.html\n ├── 2501.06186.html\n ├── 2501.12599.html\n ├── 2501.12948.html\n ├── 2501.17811.html\n ├── 2502.01456.html\n ├── 2502.09621.html\n ├── 2502.10391.html\n ├── 2502.13923.html\n ├── 2503.01785.html\n ├── 2503.06520.html\n ├── 2503.06749.html\n ├── 2503.07065.html\n ├── 2503.07365.html\n ├── 2503.07536.html\n ├── 2503.10291.html\n ├── 2503.10615.html\n ├── 2503.12937.html\n ├── 2503.13939.html\n ├── 2503.14476.html\n ├── 2503.17352.html\n ├── 2503.18892.html\n ├── 2503.19786.html\n ├── 2503.20783.html\n ├── 2503.21620.html\n ├── 2503.21776.html\n ├── 2503.22679.html\n ├── 2504.02587.html\n ├── 2504.05599.html\n ├── 2504.07491.html\n ├── 2504.07934.html\n ├── 2504.07954.html\n ├── 2504.11455.html\n ├── 2504.14945.html\n ├── 2504.16656.html\n ├── 2505.00703.html\n └── arxiv_2025.bib",
"stateUrl": "https://storage.mcpmark.ai/filesystem/papers.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/papers/organize_legacy_papers/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Papers Collection Cleanup and Organization Task
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_papers_remain(test_dir: Path) -> bool:
"""Verify that BibTeX and 2024+ papers remain in original directory."""
papers_dir = test_dir
# Check BibTeX file still exists
bib_file = papers_dir / "arxiv_2025.bib"
if not bib_file.exists():
print("❌ BibTeX file arxiv_2025.bib not found")
return False
print("✅ BibTeX file remains in place")
# Check that 2024+ papers remain in original directory
found_2024_plus = False
if papers_dir.exists():
for html_file in papers_dir.glob("*.html"):
arxiv_id = html_file.stem
year_part = arxiv_id[:2] if len(arxiv_id) >= 2 else ""
if year_part.isdigit():
year = int(year_part)
if year >= 24:
found_2024_plus = True
break
if found_2024_plus:
print("✅ 2024+ papers remain in original directory")
else:
print("⚠️ No 2024+ papers found (this may be expected if none existed)")
# Check that pre-2024 papers are NOT in original directory
pre_2024_found = []
if papers_dir.exists():
for html_file in papers_dir.glob("*.html"):
arxiv_id = html_file.stem
year_part = arxiv_id[:2] if len(arxiv_id) >= 2 else ""
if year_part.isdigit():
year = int(year_part)
if year < 24:
pre_2024_found.append(html_file.name)
if pre_2024_found:
print(f"❌ Pre-2024 papers still in original directory: {pre_2024_found[:3]}...")
return False
print("✅ Pre-2024 papers have been moved")
return True
def verify_directory_structure(test_dir: Path) -> bool:
"""Verify the organized directory structure exists."""
organized_dir = test_dir / "organized"
if not organized_dir.exists():
print("❌ organized/ directory not found")
return False
print("✅ organized/ directory exists")
# Expected years based on pre-2024 papers
expected_years = ["2017", "2021", "2022", "2023"]
found_years = []
for year in expected_years:
year_dir = organized_dir / year
if year_dir.exists() and year_dir.is_dir():
found_years.append(year)
if len(found_years) != len(expected_years):
print(f"❌ Expected year directories {expected_years}, found {found_years}")
return False
print(f"✅ All expected year directories exist: {found_years}")
return True
def verify_papers_moved(test_dir: Path) -> bool:
"""Verify papers are correctly moved to year folders."""
organized_dir = test_dir / "organized"
# Expected paper distribution
expected_papers = {
"2017": ["1707.06347.html"],
"2021": ["2105.04165.html"],
"2022": ["2201.11903.html"],
"2023": ["2303.08774.html", "2306.08640.html", "2310.02255.html",
"2310.08446.html", "2312.00849.html", "2312.07533.html",
"2312.11805.html"]
}
all_correct = True
for year, papers in expected_papers.items():
year_dir = organized_dir / year
if not year_dir.exists():
print(f"❌ Year directory {year} doesn't exist")
return False
actual_papers = sorted([f.name for f in year_dir.glob("*.html")])
expected_sorted = sorted(papers)
if actual_papers != expected_sorted:
print(f"❌ Papers in {year}/: expected {expected_sorted}, found {actual_papers}")
all_correct = False
else:
print(f"✅ Correct papers in {year}/: {len(actual_papers)} files")
return all_correct
def verify_index_files(test_dir: Path) -> bool:
"""Verify INDEX.md files exist and have correct format."""
organized_dir = test_dir / "organized"
years = ["2017", "2021", "2022", "2023"]
for year in years:
index_file = organized_dir / year / "INDEX.md"
if not index_file.exists():
print(f"❌ INDEX.md missing in {year}/")
return False
content = index_file.read_text()
# Check for table format
if "ArXiv ID" not in content or "Authors" not in content or "Local Path" not in content:
print(f"❌ INDEX.md in {year}/ missing required columns")
return False
# Check that papers are listed
year_dir = organized_dir / year
html_files = list(year_dir.glob("*.html"))
for html_file in html_files:
arxiv_id = html_file.stem
if arxiv_id not in content:
print(f"❌ INDEX.md in {year}/ missing paper {arxiv_id}")
return False
print(f"✅ INDEX.md in {year}/ has correct format")
return True
def verify_author_extraction(test_dir: Path) -> bool:
"""Verify that authors are correctly extracted from HTML metadata (max 3 authors)."""
organized_dir = test_dir / "organized"
# Check a sample paper's authors
sample_file = organized_dir / "2017" / "1707.06347.html"
if not sample_file.exists():
print("❌ Cannot verify author extraction - sample file missing")
return False
# Read the HTML to get expected authors
html_content = sample_file.read_text()
    author_pattern = r'<meta name="citation_author" content="([^"]+)"'
    all_authors = re.findall(author_pattern, html_content)
    # Locate this paper's row in the 2017 INDEX.md
    index_file = organized_dir / "2017" / "INDEX.md"
    index_content = index_file.read_text()
    found = False
    for line in index_content.split('\n'):
        if "1707.06347" in line and '---' not in line:
            found = True
            if len(all_authors) > 3:
# Should have first 3 authors and "et al."
if "et al." not in line:
print("❌ Missing 'et al.' for paper with >3 authors")
return False
# Check first 3 authors are present
for author in all_authors[:3]:
if author not in line:
print(f"❌ Author '{author}' not found in INDEX.md")
return False
# Check that 4th author is NOT present
if len(all_authors) > 3 and all_authors[3] in line:
print(f"❌ Fourth author '{all_authors[3]}' should not be in INDEX.md")
return False
else:
# Should have all authors, no "et al."
if "et al." in line:
print("❌ Should not have 'et al.' for paper with ≤3 authors")
return False
for author in all_authors:
if author not in line:
print(f"❌ Author '{author}' not found in INDEX.md")
return False
break
if not found:
print("❌ Paper 1707.06347 not found in INDEX.md")
return False
print("✅ Authors correctly extracted (max 3) from HTML metadata")
# Additional check: verify 3-author limit across all papers
print("\nVerifying 3-author limit across all papers...")
years = ["2017", "2021", "2022", "2023"]
for year in years:
year_dir = organized_dir / year
if not year_dir.exists():
continue
index_file = year_dir / "INDEX.md"
if not index_file.exists():
continue
index_content = index_file.read_text()
# Check each HTML file in the year directory
for html_file in year_dir.glob("*.html"):
arxiv_id = html_file.stem
# Get actual authors from HTML
html_content = html_file.read_text()
            authors = re.findall(r'<meta name="citation_author" content="([^"]+)"', html_content)
            # Find this paper's row in INDEX.md and pull out the Authors cell
            # (assumes the documented column order: | ArXiv ID | Authors | Local Path |)
            for line in index_content.split('\n'):
                if arxiv_id not in line or '---' in line:
                    continue
                cells = [cell.strip() for cell in line.split('|')]
                author_parts = cells[2] if len(cells) > 2 else ""
                if len(authors) > 3:
if "et al." not in line:
print(f"❌ {year}/{arxiv_id}: Missing 'et al.' for {len(authors)} authors")
return False
elif "et al." in line:
print(f"❌ {year}/{arxiv_id}: Unexpected 'et al.' for {len(authors)} authors")
return False
# Verify no more than 3 authors are listed
author_count = author_parts.count(',') + 1 if author_parts.strip() else 0
if "et al." in author_parts:
author_count -= 1 # Don't count "et al." as an author
if author_count > 3:
print(f"❌ {year}/{arxiv_id}: More than 3 authors listed")
return False
break
print("✅ All papers respect the 3-author limit")
return True
def verify_summary_file(test_dir: Path) -> bool:
"""Verify SUMMARY.md exists and has correct content."""
summary_file = test_dir / "organized" / "SUMMARY.md"
if not summary_file.exists():
print("❌ SUMMARY.md not found")
return False
content = summary_file.read_text()
# Check for required columns
if "Year" not in content or "Paper Count" not in content or "Index Link" not in content:
print("❌ SUMMARY.md missing required columns")
return False
# Check for year entries
expected_years = ["2017", "2021", "2022", "2023"]
for year in expected_years:
if year not in content:
print(f"❌ SUMMARY.md missing year {year}")
return False
# Check for links to INDEX.md files
expected_links = [
f"{year}/INDEX.md" for year in expected_years
]
for link in expected_links:
if link not in content:
print(f"❌ SUMMARY.md missing link to {link}")
return False
# Check paper counts
expected_counts = {
"2017": 1,
"2021": 1,
"2022": 1,
"2023": 7
}
for year, count in expected_counts.items():
# Look for the row with this year
for line in content.split('\n'):
if f"| {year}" in line or f"|{year}" in line:
if str(count) not in line:
print(f"❌ SUMMARY.md has incorrect paper count for {year}")
return False
break
print("✅ SUMMARY.md has correct format and content")
return True
def verify_sorting(test_dir: Path) -> bool:
"""Verify that entries are sorted correctly."""
organized_dir = test_dir / "organized"
# Check SUMMARY.md year sorting
summary_file = organized_dir / "SUMMARY.md"
content = summary_file.read_text()
# Extract years from table rows
years_in_summary = []
for line in content.split('\n'):
if '|' in line and any(year in line for year in ["2017", "2021", "2022", "2023"]):
# Extract year from the line
for year in ["2017", "2021", "2022", "2023"]:
if year in line:
years_in_summary.append(year)
break
if years_in_summary != sorted(years_in_summary):
print(f"❌ SUMMARY.md years not sorted: {years_in_summary}")
return False
print("✅ SUMMARY.md years sorted correctly")
# Check INDEX.md arxiv ID sorting for one year
index_file = organized_dir / "2023" / "INDEX.md"
if index_file.exists():
content = index_file.read_text()
arxiv_ids = []
for line in content.split('\n'):
if '|' in line and '23' in line and 'ArXiv ID' not in line and '---' not in line:
# Extract arxiv ID
match = re.search(r'23\d{2}\.\d{5}', line)
if match:
arxiv_ids.append(match.group())
if arxiv_ids != sorted(arxiv_ids):
print(f"❌ INDEX.md arxiv IDs not sorted in 2023/")
return False
print("✅ INDEX.md entries sorted by arxiv ID")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Papers Collection Cleanup and Organization...")
# Define verification steps
verification_steps = [
("Papers Remain/Move Verification", verify_papers_remain),
("Directory Structure", verify_directory_structure),
("Papers Moved Correctly", verify_papers_moved),
("Index Files Format", verify_index_files),
("Author Extraction", verify_author_extraction),
("Summary File", verify_summary_file),
("Sorting Verification", verify_sorting),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
try:
if not verify_func(test_dir):
all_passed = False
except Exception as e:
print(f"❌ Error in {step_name}: {e}")
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Papers organized correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/student_database/duplicate_name/description.md
================================================
Please use FileSystem tools to finish the following task:
Please help me identify duplicate names among all 150 students. Do not use Python code. Then generate a `namesake.txt` file to record the results in the following format, with each group written in three lines:
name: xxx
count: xxx
ids: xxx, xxx, ...
Leave one blank line between every two groups. If there are multiple duplicates, just list all corresponding IDs in the third line.
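For illustration, a `namesake.txt` with two hypothetical groups would look like this (the names and IDs below are placeholders, not the answer):

```
name: Jane Doe
count: 2
ids: 20100001, 20100002

name: John Roe
count: 3
ids: 20100003, 20100004, 20100005
```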
================================================
FILE: tasks/filesystem/standard/student_database/duplicate_name/meta.json
================================================
{
"task_id": "duplicate_name",
"task_name": "Duplicate Name",
"category_id": "student_database",
"category_name": "Student Database",
"description": "Identify students with identical names from a 150-student database and generate a formatted namesake grouping report file.",
"author": "Lingjun Chen",
"created_at": "2025-08-10",
"difficulty": "L3",
"tags": [
"pattern analysis",
"data extraction"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "student_database/\n ├── 20101250_Patricia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20101701_Isabella_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20102572_Michael_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104233_Robert_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104498_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104653_Sophia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104675_Michael_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104846_Christopher_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20107487_Mia_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20108742_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109144_Emma_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109803_Oliver_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20111634_Isabella_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20112439_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113368_William_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113603_Robert_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114397_Isabella_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114869_Ethan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115252_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115632_Elizabeth_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115753_Charlotte_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115924_Michael_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20116232_Olivia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20119528_Thomas_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122427_Karen_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122977_Evelyn_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20123376_Joseph_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20125451_Barbara_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126203_Barbara_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126394_Olivia_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126471_Ethan_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20127423_John_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128249_Oliver_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128879_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20129898_Jessica_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131271_Olivia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131518_Sophia_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132026_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132370_James_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132669_Noah_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 
20133527_Mason_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20133697_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20135821_Thomas_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136681_Benjamin_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136890_Benjamin_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20137514_Lucas_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139234_Harper_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139637_Noah_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139647_Patricia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20141421_Linda_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142085_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142383_Amelia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143406_Susan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143830_James_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146035_Christopher_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146277_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146279_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147301_James_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147789_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148681_John_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148778_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20149712_Jessica_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20151012_Harper_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153174_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153412_Charlotte_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153606_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153687_Richard_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154518_John_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154710_Benjamin_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156469_Jennifer_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156522_Jennifer_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156851_Noah_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20157943_Harper_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158266_Sophia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158294_Sophia_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158819_Sarah_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159113_John_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159695_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20161279_William_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162253_Mason_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162542_Mia_Anderson/\n 
│ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20163356_Ava_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164515_Patricia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164801_Noah_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20165511_Mary_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166436_Christopher_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166487_Barbara_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166564_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166998_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168311_Lucas_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168491_Karen_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20169515_Thomas_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171050_Christopher_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171406_Mary_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171613_Ethan_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20172106_Isabella_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173259_Michael_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173492_Richard_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173501_Mary_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173517_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174207_Richard_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174369_Mary_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20175314_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176169_Lucas_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176947_Noah_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20177389_James_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20178687_Isabella_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179461_William_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179690_Linda_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20181056_Sarah_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182020_Patricia_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182390_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183149_David_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183219_Charlotte_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20184489_Jessica_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186154_Charlotte_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186510_James_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187107_David_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187144_Mary_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187892_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187921_Mary_Jones/\n │ ├── basic_info.txt\n │ └── 
recommendation_letter.txt\n ├── 20187967_Sarah_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20188937_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189123_Mary_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189192_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189268_Emma_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189854_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20191265_Joseph_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20192725_Robert_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194054_Michael_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194160_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194164_Sarah_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194525_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195164_Jennifer_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195982_David_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196776_William_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196896_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196961_Joseph_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196998_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20198548_Evelyn_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199036_Benjamin_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199583_Mary_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199735_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199872_Sophia_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199980_James_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201385_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201800_John_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20202548_Robert_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20203855_Mia_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n └── 20204611_Sarah_Wilson/\n ├── basic_info.txt\n └── recommendation_letter.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/student_database/duplicate_name/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Student Database Task: Find Duplicate Names
Simplified version that only checks against expected results without folder validation
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_namesake_file_exists(test_dir: Path) -> bool:
"""Verify that the namesake.txt file exists."""
namesake_file = test_dir / "namesake.txt"
if not namesake_file.exists():
print("❌ File 'namesake.txt' not found")
return False
print("✅ Namesake file found")
return True
def parse_namesake_file(test_dir: Path) -> dict:
"""Parse the namesake.txt file and return structured data."""
namesake_file = test_dir / "namesake.txt"
try:
content = namesake_file.read_text()
lines = content.strip().split('\n')
namesakes = {}
current_line = 0
while current_line < len(lines):
# Skip blank lines
if not lines[current_line].strip():
current_line += 1
continue
# Check if we have enough lines for a complete group
if current_line + 2 >= len(lines):
print(f"❌ Incomplete group at line {current_line + 1}")
return {}
# Parse group
name_line = lines[current_line].strip()
count_line = lines[current_line + 1].strip()
ids_line = lines[current_line + 2].strip()
# Extract name
if not name_line.startswith("name: "):
print(f"❌ Invalid name line format at line {current_line + 1}: {name_line}")
return {}
name = name_line.replace("name: ", "").strip()
# Extract count
if not count_line.startswith("count: "):
print(f"❌ Invalid count line format at line {current_line + 2}: {count_line}")
return {}
count_str = count_line.replace("count: ", "").strip()
try:
count = int(count_str)
except ValueError:
print(f"❌ Invalid count format: {count_str}")
return {}
# Extract IDs
if not ids_line.startswith("ids: "):
print(f"❌ Invalid ids line format at line {current_line + 3}: {ids_line}")
return {}
ids_str = ids_line.replace("ids: ", "").strip()
ids = [id.strip() for id in ids_str.split(",")]
namesakes[name] = {
'count': count,
'ids': ids
}
current_line += 4 # Skip to next group (after blank line)
return namesakes
except Exception as e:
print(f"❌ Error parsing namesake file: {e}")
return {}
def verify_against_expected_results(namesakes: dict) -> bool:
"""Verify that the results match the expected answer.md content exactly."""
# Expected duplicate names from answer.md (hardcoded)
expected_duplicates = {
'Isabella Smith': ['20132026', '20133697'],
'Ava Lopez': ['20166564', '20166998'],
'James Moore': ['20159695', '20188937'],
'William Taylor': ['20175314', '20189854'],
'Ethan Wilson': ['20182390', '20196998'],
'Christopher Taylor': ['20128879', '20187892'],
'William Anderson': ['20142085', '20146277'],
'James Anderson': ['20147789', '20153606'],
'Olivia Jones': ['20189192', '20196896'],
'Mason Johnson': ['20115252', '20199735'],
'Benjamin Jackson': ['20153174', '20194160'],
'John Taylor': ['20194525', '20201385'],
'Susan Anderson': ['20148778', '20173517'],
'Christopher Moore': ['20112439', '20146279'],
'Sarah Wilson': ['20158819', '20204611'],
'Sarah Brown': ['20104498', '20108742']
}
# Check if exactly 16 duplicate names are found
if len(namesakes) != 16:
print(f"❌ Expected exactly 16 duplicate names, but found {len(namesakes)}")
return False
# Check if all expected duplicate names are present
for expected_name in expected_duplicates:
if expected_name not in namesakes:
print(f"❌ Missing expected duplicate name: '{expected_name}'")
return False
# Check if all namesakes in the file are actually duplicates
for name, data in namesakes.items():
if name not in expected_duplicates:
print(f"❌ Unexpected duplicate name found: '{name}' (not in expected list)")
return False
expected_ids = set(expected_duplicates[name])
stated_ids = set(data['ids'])
if expected_ids != stated_ids:
print(f"❌ ID mismatch for '{name}':")
print(f" Expected: {sorted(expected_ids)}")
print(f" Stated: {sorted(stated_ids)}")
return False
# Verify count matches
if data['count'] != 2:
print(f"❌ Count mismatch for '{name}': expected 2, got {data['count']}")
return False
print("✅ All 16 expected duplicate names are correctly identified")
print("✅ All student IDs match expected results")
print("✅ All counts are correct (2 for each duplicate name)")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Student Database Task: Find Duplicate Names...")
# Check if namesake file exists
print("\n--- File Existence Check ---")
if not verify_namesake_file_exists(test_dir):
print("\n❌ Basic verification failed, cannot proceed with content verification")
sys.exit(1)
# Parse the file and run content verification
print("\n--- Content Verification ---")
namesakes = parse_namesake_file(test_dir)
if not namesakes:
print("❌ Failed to parse namesake file")
sys.exit(1)
# Verify against expected results
print("\n--- Results Verification ---")
if not verify_against_expected_results(namesakes):
print("\n❌ Task verification: FAIL")
sys.exit(1)
# Final result
print("\n" + "="*50)
print("✅ Namesake identification completed correctly!")
print(f"🎉 Found exactly {len(namesakes)} duplicate names (16 expected)")
print("🎉 Task verification: PASS")
sys.exit(0)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/student_database/english_talent/description.md
================================================
Please use FileSystem tools to finish the following task:
We are now recruiting students proficient in English to run the school’s English media operations. To contact them, select, from the total of 150 students, those who **meet both of the following criteria**:
1. Rated **S** or **A** grade level in `recommendation_letter.txt` by their teachers.
2. TOEFL score in the basic info is **higher than or equal to 100**.
Please compile all their names, IDs, and emails into a `qualified_students.txt` file, in the following format:
name: xxx
id: xxx
email: xxx
Each person’s information should occupy three lines, with one blank line between each block.
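For instance, one block for a hypothetical qualifying student (placeholder values only, not an actual answer) would look like:
name: Jane Doe
id: 20100000
email: jane.doe@example.com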
================================================
FILE: tasks/filesystem/standard/student_database/english_talent/meta.json
================================================
{
"task_id": "english_talent",
"task_name": "English Talent",
"category_id": "student_database",
"category_name": "Student Database",
"description": "Select qualified students with S/A recommendation grades and TOEFL scores ≥100 for English media operations recruitment opportunities.",
"author": "Lingjun Chen",
"created_at": "2025-08-10",
"difficulty": "L3",
"tags": [
"data extraction",
"cross-referencing",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "student_database/\n ├── 20101250_Patricia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20101701_Isabella_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20102572_Michael_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104233_Robert_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104498_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104653_Sophia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104675_Michael_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104846_Christopher_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20107487_Mia_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20108742_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109144_Emma_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109803_Oliver_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20111634_Isabella_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20112439_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113368_William_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113603_Robert_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114397_Isabella_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114869_Ethan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115252_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115632_Elizabeth_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115753_Charlotte_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115924_Michael_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20116232_Olivia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20119528_Thomas_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122427_Karen_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122977_Evelyn_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20123376_Joseph_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20125451_Barbara_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126203_Barbara_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126394_Olivia_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126471_Ethan_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20127423_John_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128249_Oliver_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128879_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20129898_Jessica_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131271_Olivia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131518_Sophia_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132026_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132370_James_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132669_Noah_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 
20133527_Mason_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20133697_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20135821_Thomas_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136681_Benjamin_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136890_Benjamin_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20137514_Lucas_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139234_Harper_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139637_Noah_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139647_Patricia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20141421_Linda_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142085_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142383_Amelia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143406_Susan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143830_James_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146035_Christopher_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146277_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146279_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147301_James_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147789_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148681_John_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148778_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20149712_Jessica_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20151012_Harper_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153174_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153412_Charlotte_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153606_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153687_Richard_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154518_John_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154710_Benjamin_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156469_Jennifer_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156522_Jennifer_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156851_Noah_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20157943_Harper_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158266_Sophia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158294_Sophia_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158819_Sarah_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159113_John_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159695_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20161279_William_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162253_Mason_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162542_Mia_Anderson/\n 
│ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20163356_Ava_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164515_Patricia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164801_Noah_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20165511_Mary_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166436_Christopher_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166487_Barbara_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166564_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166998_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168311_Lucas_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168491_Karen_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20169515_Thomas_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171050_Christopher_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171406_Mary_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171613_Ethan_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20172106_Isabella_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173259_Michael_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173492_Richard_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173501_Mary_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173517_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174207_Richard_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174369_Mary_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20175314_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176169_Lucas_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176947_Noah_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20177389_James_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20178687_Isabella_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179461_William_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179690_Linda_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20181056_Sarah_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182020_Patricia_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182390_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183149_David_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183219_Charlotte_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20184489_Jessica_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186154_Charlotte_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186510_James_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187107_David_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187144_Mary_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187892_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187921_Mary_Jones/\n │ ├── basic_info.txt\n │ └── 
recommendation_letter.txt\n ├── 20187967_Sarah_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20188937_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189123_Mary_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189192_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189268_Emma_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189854_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20191265_Joseph_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20192725_Robert_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194054_Michael_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194160_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194164_Sarah_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194525_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195164_Jennifer_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195982_David_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196776_William_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196896_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196961_Joseph_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196998_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20198548_Evelyn_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199036_Benjamin_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199583_Mary_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199735_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199872_Sophia_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199980_James_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201385_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201800_John_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20202548_Robert_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20203855_Mia_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n └── 20204611_Sarah_Wilson/\n ├── basic_info.txt\n └── recommendation_letter.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/student_database/english_talent/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Student Database Task: English Talent Recruitment
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_qualified_students_file_exists(test_dir: Path) -> bool:
"""Verify that the qualified_students.txt file exists."""
answer_file = test_dir / "qualified_students.txt"
if not answer_file.exists():
print("❌ File 'qualified_students.txt' not found")
return False
print("✅ Qualified students file found")
return True
def verify_file_format(test_dir: Path) -> bool:
"""Verify that the qualified_students.txt file has the correct format."""
answer_file = test_dir / "qualified_students.txt"
try:
content = answer_file.read_text()
lines = content.strip().split('\n')
if not lines:
print("❌ File is empty")
return False
# Check if content follows the expected pattern
# Each student should have 3 lines: name, id, email
# Students should be separated by blank lines
current_line = 0
student_count = 0
while current_line < len(lines):
# Skip blank lines
if not lines[current_line].strip():
current_line += 1
continue
# Check if we have enough lines for a complete student
if current_line + 2 >= len(lines):
print(f"❌ Incomplete student entry at line {current_line + 1}")
return False
# Verify name line format
if not lines[current_line].strip().startswith("name: "):
print(f"❌ Invalid name line format at line {current_line + 1}: {lines[current_line]}")
return False
# Verify id line format
if not lines[current_line + 1].strip().startswith("id: "):
print(f"❌ Invalid id line format at line {current_line + 2}: {lines[current_line + 1]}")
return False
# Verify email line format
if not lines[current_line + 2].strip().startswith("email: "):
print(f"❌ Invalid email line format at line {current_line + 3}: {lines[current_line + 2]}")
return False
student_count += 1
current_line += 3
# Check for blank line separator (except for the last student)
if current_line < len(lines) and lines[current_line].strip():
print(f"❌ Missing blank line separator after student {student_count}")
return False
current_line += 1
if student_count == 0:
print("❌ No valid student entries found")
return False
print(f"✅ File format is correct with {student_count} students")
return True
except Exception as e:
print(f"❌ Error reading qualified students file: {e}")
return False
def parse_qualified_students_file(test_dir: Path) -> list:
"""Parse the qualified_students.txt file and return structured data."""
answer_file = test_dir / "qualified_students.txt"
try:
content = answer_file.read_text()
lines = content.strip().split('\n')
students = []
current_line = 0
while current_line < len(lines):
# Skip blank lines
if not lines[current_line].strip():
current_line += 1
continue
# Parse student entry
name_line = lines[current_line].strip()
id_line = lines[current_line + 1].strip()
email_line = lines[current_line + 2].strip()
# Extract name
name = name_line.replace("name: ", "").strip()
# Extract id
student_id = id_line.replace("id: ", "").strip()
# Extract email
email = email_line.replace("email: ", "").strip()
students.append({
'name': name,
'id': student_id,
'email': email
})
current_line += 4 # Skip to next student (after blank line)
return students
except Exception as e:
print(f"❌ Error parsing qualified students file: {e}")
return []
def verify_student_count(students: list) -> bool:
"""Verify that exactly 19 students are found."""
expected_count = 19
actual_count = len(students)
if actual_count != expected_count:
print(f"❌ Expected {expected_count} students, but found {actual_count}")
return False
print(f"✅ Found exactly {expected_count} students")
return True
def verify_expected_students(students: list) -> bool:
"""Verify that all expected students are present with correct details."""
# Expected students from answer.md
expected_students = {
'James Smith': {'id': '20177389', 'email': 'james.smith30@outlook.com'},
'Ava Lopez': {'id': '20166998', 'email': 'ava.lopez67@outlook.com'},
'James Anderson': {'id': '20153606', 'email': 'james.anderson71@yahoo.com'},
'Benjamin Anderson': {'id': '20136681', 'email': 'benjamin.anderson37@qq.com'},
'Sarah Wilson': {'id': '20158819', 'email': 'sarah.wilson96@outlook.com'},
'Isabella Davis': {'id': '20101701', 'email': 'isabella.davis89@gmail.com'},
'James Moore': {'id': '20188937', 'email': 'james.moore62@gmail.com'},
'Harper Williams': {'id': '20157943', 'email': 'harper.williams38@163.com'},
'Noah Smith': {'id': '20132669', 'email': 'noah.smith45@163.com'},
'Emma Thomas': {'id': '20109144', 'email': 'emma.thomas68@163.com'},
'Mary Brown': {'id': '20199583', 'email': 'mary.brown27@yahoo.com'},
'John Jones': {'id': '20201800', 'email': 'john.jones46@gmail.com'},
'Mia Anderson': {'id': '20162542', 'email': 'mia.anderson3@outlook.com'},
'Barbara Davis': {'id': '20126203', 'email': 'barbara.davis67@163.com'},
'Thomas Brown': {'id': '20119528', 'email': 'thomas.brown43@163.com'},
'Susan Anderson': {'id': '20148778', 'email': 'susan.anderson16@163.com'},
'Mary Garcia': {'id': '20174369', 'email': 'mary.garcia58@gmail.com'},
'Richard Wilson': {'id': '20174207', 'email': 'richard.wilson39@outlook.com'},
'Joseph Lopez': {'id': '20191265', 'email': 'joseph.lopez93@yahoo.com'}
}
# Check if all expected students are present
found_students = set()
for student in students:
found_students.add(student['name'])
missing_students = set(expected_students.keys()) - found_students
if missing_students:
print(f"❌ Missing expected students: {missing_students}")
return False
# Check if all found students are expected
unexpected_students = found_students - set(expected_students.keys())
if unexpected_students:
print(f"❌ Unexpected students found: {unexpected_students}")
return False
# Check if student details match exactly
for student in students:
expected = expected_students[student['name']]
if student['id'] != expected['id']:
print(f"❌ ID mismatch for {student['name']}: expected {expected['id']}, got {student['id']}")
return False
if student['email'] != expected['email']:
print(f"❌ Email mismatch for {student['name']}: expected {expected['email']}, got {student['email']}")
return False
print("✅ All expected students are present with correct details")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Student Database Task: English Talent Recruitment...")
# Define verification steps
verification_steps = [
("Qualified Students File Exists", verify_qualified_students_file_exists),
("File Format", verify_file_format),
]
# Run basic verification steps first
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
break
if not all_passed:
print("\n❌ Basic verification failed, cannot proceed with content verification")
sys.exit(1)
# Parse the file and run content verification
print("\n--- Content Verification ---")
students = parse_qualified_students_file(test_dir)
if not students:
print("❌ Failed to parse qualified students file")
sys.exit(1)
content_verification_steps = [
("Student Count", lambda: verify_student_count(students)),
("Expected Students", lambda: verify_expected_students(students)),
]
for step_name, verify_func in content_verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func():
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ English talent recruitment completed correctly!")
print(f"🎉 Found exactly {len(students)} qualified students")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/student_database/gradebased_score/description.md
================================================
Please use FileSystem tools to finish the following task:
### Simple Grade Calculation
1. Read Student Data:
* Process all student basic_info.txt files from the database
* Extract scores for Chinese, Math, and English subjects
2. Calculate Basic Grades:
* Use a simple grade scale: A (90+), B (80-89), C (70-79), D (60-69), F (<60); a sketch of this mapping appears at the end of this description
* Apply this same scale to all subjects
### Generate Output Files
1. Create student_grades.csv:
* Columns: student_id, name, chinese_score, chinese_grade, math_score, math_grade, english_score, english_grade
* Must contain an entry for every student in the database
* Exactly one row per student
2. Create grade_summary.txt:
* Total number of students processed
* Number of A's, B's, C's, D's, and F's for each subject
* Simple count of students with passing grades (A, B, C) vs failing grades (D, F) for each subject
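A minimal sketch of the grade mapping described above, for reference only (the function name is illustrative, not required by the task):

```python
def letter_grade(score: float) -> str:
    """Map a numeric score to the simple A-F scale used in this task."""
    if score >= 90:
        return "A"
    if score >= 80:
        return "B"
    if score >= 70:
        return "C"
    if score >= 60:
        return "D"
    return "F"
```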
================================================
FILE: tasks/filesystem/standard/student_database/gradebased_score/meta.json
================================================
{
"task_id": "gradebased_score",
"task_name": "Gradebased Score",
"category_id": "student_database",
"category_name": "Student Database",
"description": "Process student numerical scores to calculate letter grades using A-F scale and produce comprehensive grade distribution analysis reports.",
"author": "Lingjun Chen",
"created_at": "2025-08-10",
"difficulty": "L3",
"tags": [
"data extraction",
"content transformation",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "student_database/\n ├── 20101250_Patricia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20101701_Isabella_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20102572_Michael_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104233_Robert_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104498_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104653_Sophia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104675_Michael_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104846_Christopher_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20107487_Mia_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20108742_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109144_Emma_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109803_Oliver_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20111634_Isabella_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20112439_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113368_William_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113603_Robert_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114397_Isabella_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114869_Ethan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115252_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115632_Elizabeth_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115753_Charlotte_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115924_Michael_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20116232_Olivia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20119528_Thomas_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122427_Karen_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122977_Evelyn_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20123376_Joseph_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20125451_Barbara_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126203_Barbara_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126394_Olivia_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126471_Ethan_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20127423_John_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128249_Oliver_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128879_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20129898_Jessica_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131271_Olivia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131518_Sophia_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132026_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132370_James_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132669_Noah_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 
20133527_Mason_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20133697_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20135821_Thomas_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136681_Benjamin_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136890_Benjamin_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20137514_Lucas_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139234_Harper_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139637_Noah_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139647_Patricia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20141421_Linda_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142085_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142383_Amelia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143406_Susan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143830_James_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146035_Christopher_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146277_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146279_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147301_James_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147789_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148681_John_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148778_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20149712_Jessica_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20151012_Harper_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153174_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153412_Charlotte_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153606_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153687_Richard_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154518_John_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154710_Benjamin_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156469_Jennifer_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156522_Jennifer_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156851_Noah_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20157943_Harper_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158266_Sophia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158294_Sophia_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158819_Sarah_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159113_John_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159695_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20161279_William_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162253_Mason_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162542_Mia_Anderson/\n 
│ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20163356_Ava_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164515_Patricia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164801_Noah_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20165511_Mary_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166436_Christopher_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166487_Barbara_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166564_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166998_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168311_Lucas_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168491_Karen_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20169515_Thomas_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171050_Christopher_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171406_Mary_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171613_Ethan_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20172106_Isabella_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173259_Michael_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173492_Richard_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173501_Mary_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173517_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174207_Richard_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174369_Mary_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20175314_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176169_Lucas_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176947_Noah_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20177389_James_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20178687_Isabella_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179461_William_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179690_Linda_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20181056_Sarah_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182020_Patricia_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182390_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183149_David_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183219_Charlotte_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20184489_Jessica_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186154_Charlotte_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186510_James_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187107_David_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187144_Mary_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187892_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187921_Mary_Jones/\n │ ├── basic_info.txt\n │ └── 
recommendation_letter.txt\n ├── 20187967_Sarah_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20188937_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189123_Mary_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189192_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189268_Emma_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189854_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20191265_Joseph_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20192725_Robert_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194054_Michael_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194160_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194164_Sarah_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194525_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195164_Jennifer_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195982_David_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196776_William_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196896_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196961_Joseph_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196998_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20198548_Evelyn_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199036_Benjamin_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199583_Mary_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199735_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199872_Sophia_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199980_James_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201385_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201800_John_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20202548_Robert_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20203855_Mia_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n └── 20204611_Sarah_Wilson/\n ├── basic_info.txt\n └── recommendation_letter.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/student_database/gradebased_score/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Student Database Grade-Based Score Analysis Task
"""
import sys
from pathlib import Path
import os
import re
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_grade_summary_exists(test_dir: Path) -> bool:
"""Verify that grade_summary.txt file exists."""
grade_summary_file = test_dir / "grade_summary.txt"
if not grade_summary_file.exists():
print("❌ File 'grade_summary.txt' not found")
return False
print("✅ grade_summary.txt file found")
return True
def verify_grade_summary_readable(test_dir: Path) -> bool:
"""Verify that the grade_summary.txt file is readable."""
grade_summary_file = test_dir / "grade_summary.txt"
try:
content = grade_summary_file.read_text()
if not content.strip():
print("❌ grade_summary.txt file is empty")
return False
print("✅ grade_summary.txt file is readable")
return True
except Exception as e:
print(f"❌ Error reading grade_summary.txt file: {e}")
return False
def extract_numbers_from_text(text: str) -> list:
"""Extract all numbers from text."""
numbers = re.findall(r'\d+', text)
return [int(num) for num in numbers]
def verify_three_subjects_present(test_dir: Path) -> bool:
"""Verify that grade_summary.txt contains all three subjects (case insensitive)."""
grade_summary_file = test_dir / "grade_summary.txt"
try:
content = grade_summary_file.read_text()
# Check if all three subjects are mentioned (case insensitive)
subjects = ["chinese", "math", "english"]
missing_subjects = []
for subject in subjects:
if subject.lower() not in content.lower():
missing_subjects.append(subject)
if missing_subjects:
print(f"❌ Missing subjects in grade_summary.txt: {missing_subjects}")
return False
print("✅ All three subjects (Chinese, Math, English) found in grade_summary.txt")
return True
except Exception as e:
print(f"❌ Error checking subjects: {e}")
return False
def verify_grade_summary_content(test_dir: Path) -> bool:
"""Verify that grade_summary.txt contains the correct statistics from answer.md."""
grade_summary_file = test_dir / "grade_summary.txt"
try:
content = grade_summary_file.read_text()
# Extract all numbers from the content
found_numbers = extract_numbers_from_text(content)
if not found_numbers:
print("❌ No numbers found in grade_summary.txt")
return False
# Expected numbers from answer.md
# Format: [total_students, chinese_A, chinese_B, chinese_C, chinese_D, chinese_pass, chinese_fail,
# math_A, math_B, math_C, math_D, math_pass, math_fail,
# english_A, english_B, english_C, english_D, english_F, english_pass, english_fail]
expected_numbers = [
# Total students
150,
# Chinese grades: A(42), B(37), C(43), D(28), Pass(122), Fail(28)
42, 37, 43, 28, 122, 28,
# Math grades: A(31), B(38), C(47), D(34), Pass(116), Fail(34)
31, 38, 47, 34, 116, 34,
# English grades: A(32), B(38), C(38), D(41), F(1), Pass(108), Fail(42)
32, 38, 38, 41, 1, 108, 42
]
# Check if all expected numbers are present in the found numbers
missing_numbers = []
for expected in expected_numbers:
if expected not in found_numbers:
missing_numbers.append(expected)
if missing_numbers:
print(f"❌ Missing expected numbers: {missing_numbers}")
print(f" Found numbers: {found_numbers}")
return False
# Check if the counts match (each number should appear the expected number of times)
for expected in expected_numbers:
expected_count = expected_numbers.count(expected)
found_count = found_numbers.count(expected)
if found_count < expected_count:
print(f"❌ Number {expected} appears {found_count} times, expected {expected_count} times")
return False
print("✅ All expected grade statistics found in grade_summary.txt")
return True
except Exception as e:
print(f"❌ Error verifying grade summary content: {e}")
return False
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying Student Database Grade-Based Score Analysis in: {test_dir}")
# Define verification steps
verification_steps = [
("Grade Summary File Exists", verify_grade_summary_exists),
("File is Readable", verify_grade_summary_readable),
("Three Subjects Present", verify_three_subjects_present),
("Grade Statistics Content", verify_grade_summary_content),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Student grade analysis completed correctly!")
print("🎉 Grade-Based Score Analysis verification: PASS")
sys.exit(0)
else:
print("❌ Grade-Based Score Analysis verification: FAIL")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/threestudio/code_locating/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
ThreeStudio is a comprehensive codebase that implements various diffusion-based text-to-3D models, including a NeRF-based rendering stage and a diffusion guidance stage. Your task is to explore the codebase and identify the specific file that defines the guidance functionality for the Zero123 model.
### Task Objectives
1. **Explore the ThreeStudio codebase** using filesystem MCP tools
2. **Search through the project structure** to understand the codebase organization
3. **Identify the file** that contains the Zero123 guidance implementation
4. **Create an answer file** with the correct file path
### Expected Output
Create a file named `answer.txt` in the test directory root
**Requirements:**
- Only include the file path, no additional text or explanation
- Use forward slashes (/) for path separators
- Include the full relative path from the project root
- Ensure the path points to the actual file that defines Zero123 guidance
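As a rough illustration of how such an answer file could be produced (not the intended solution, which is meant to use the filesystem MCP tools), a minimal name-based search sketch follows; the search heuristic, the shortest-name preference, and the use of the `FILESYSTEM_TEST_DIR` environment variable from the verifier are assumptions for illustration only:

```python
#!/usr/bin/env python3
"""Minimal illustrative sketch (not part of the task): locate a likely
Zero123 guidance file and write its relative path to answer.txt.
The name-based heuristic and shortest-name preference are assumptions."""
import os
from pathlib import Path

test_dir = Path(os.environ["FILESYSTEM_TEST_DIR"])

# Candidate files whose names mention both "zero123" and "guidance".
candidates = [
    p for p in test_dir.rglob("*.py")
    if "zero123" in p.name and "guidance" in p.name
]

if candidates:
    # Prefer zero123_guidance.py over the stable_* / *_unified_* variants.
    candidates.sort(key=lambda p: len(p.name))
    answer = candidates[0].relative_to(test_dir).as_posix()
    (test_dir / "answer.txt").write_text(answer)
```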
================================================
FILE: tasks/filesystem/standard/threestudio/code_locating/meta.json
================================================
{
"task_id": "code_locating",
"task_name": "Code Locating",
"category_id": "threestudio",
"category_name": "Threestudio",
"description": "Navigate the ThreeStudio codebase to locate and identify the specific file that defines Zero123 guidance functionality implementation.",
"author": "Lingjun Chen",
"created_at": "2025-08-05",
"difficulty": "L3",
"tags": [
"code exploration"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "threestudio/\n ├── configs/\n │ ├── debugging/\n │ │ ├── controlnet-canny.yaml\n │ │ ├── controlnet-normal.yaml\n │ │ ├── instructpix2pix.yaml\n │ │ └── stablediffusion.yaml\n │ ├── experimental/\n │ │ ├── unified-guidance/\n │ │ │ ├── dreamfusion-sd.yaml\n │ │ │ ├── hifa.yaml\n │ │ │ ├── prolificdreamer-hifa.yaml\n │ │ │ ├── prolificdreamer.yaml\n │ │ │ └── zero123-simple.yaml\n │ │ ├── co3d-imagecondition.yaml\n │ │ ├── imagecondition.yaml\n │ │ ├── imagecondition_zero123nerf.yaml\n │ │ ├── imagecondition_zero123nerf_refine.yaml\n │ │ ├── prolificdreamer-importance.yaml\n │ │ ├── prolificdreamer-neus-importance.yaml\n │ │ ├── prolificdreamer-propnet.yaml\n │ │ └── textmesh-if-importance.yaml\n │ ├── gradio/\n │ │ ├── dreamfusion-if.yaml\n │ │ ├── dreamfusion-sd.yaml\n │ │ ├── fantasia3d.yaml\n │ │ ├── latentnerf.yaml\n │ │ ├── sjc.yaml\n │ │ └── textmesh-if.yaml\n │ ├── control4d-static.yaml\n │ ├── dreamfusion-if.yaml\n │ ├── dreamfusion-sd-eff.yaml\n │ ├── dreamfusion-sd.yaml\n │ ├── fantasia3d-texture.yaml\n │ ├── fantasia3d.yaml\n │ ├── hifa.yaml\n │ ├── instructnerf2nerf.yaml\n │ ├── latentnerf-refine.yaml\n │ ├── latentnerf.yaml\n │ ├── magic123-coarse-sd.yaml\n │ ├── magic123-hifa-coarse-sd.yaml\n │ ├── magic123-hifa-refine-sd.yaml\n │ ├── magic123-refine-sd.yaml\n │ ├── magic3d-coarse-if.yaml\n │ ├── magic3d-coarse-sd.yaml\n │ ├── magic3d-refine-sd.yaml\n │ ├── prolificdreamer-geometry.yaml\n │ ├── prolificdreamer-hifa.yaml\n │ ├── prolificdreamer-patch.yaml\n │ ├── prolificdreamer-scene-hifa.yaml\n │ ├── prolificdreamer-scene.yaml\n │ ├── prolificdreamer-texture.yaml\n │ ├── prolificdreamer.yaml\n │ ├── sdi.yaml\n │ ├── sjc.yaml\n │ ├── sketchshape-refine.yaml\n │ ├── sketchshape.yaml\n │ ├── stable-zero123.yaml\n │ ├── textmesh-if.yaml\n │ ├── zero123-geometry.yaml\n │ └── zero123.yaml\n ├── custom/\n │ └── put_custom_extensions_here\n ├── docker/\n │ ├── compose.yaml\n │ └── Dockerfile\n ├── docs/\n │ └── installation.md\n ├── extern/\n │ ├── ldm_zero123/\n │ │ ├── models/\n │ │ │ ├── diffusion/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── classifier.py\n │ │ │ │ ├── ddim.py\n │ │ │ │ ├── ddpm.py\n │ │ │ │ ├── plms.py\n │ │ │ │ └── sampling_util.py\n │ │ │ └── autoencoder.py\n │ │ ├── modules/\n │ │ │ ├── diffusionmodules/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── model.py\n │ │ │ │ ├── openaimodel.py\n │ │ │ │ └── util.py\n │ │ │ ├── distributions/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── distributions.py\n │ │ │ ├── encoders/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── modules.py\n │ │ │ ├── evaluate/\n │ │ │ │ ├── adm_evaluator.py\n │ │ │ │ ├── evaluate_perceptualsim.py\n │ │ │ │ ├── frechet_video_distance.py\n │ │ │ │ ├── ssim.py\n │ │ │ │ └── torch_frechet_video_distance.py\n │ │ │ ├── image_degradation/\n │ │ │ │ ├── utils/\n │ │ │ │ │ └── test.png\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── bsrgan.py\n │ │ │ │ ├── bsrgan_light.py\n │ │ │ │ └── utils_image.py\n │ │ │ ├── losses/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── contperceptual.py\n │ │ │ │ └── vqperceptual.py\n │ │ │ ├── attention.py\n │ │ │ ├── ema.py\n │ │ │ └── x_transformer.py\n │ │ ├── thirdp/\n │ │ │ └── psp/\n │ │ │ ├── helpers.py\n │ │ │ ├── id_loss.py\n │ │ │ └── model_irse.py\n │ │ ├── __init__.py\n │ │ ├── extras.py\n │ │ ├── guidance.py\n │ │ ├── lr_scheduler.py\n │ │ └── util.py\n │ ├── __init__.py\n │ └── zero123.py\n ├── load/\n │ ├── images/\n │ │ ├── anya_front.png\n │ │ ├── anya_front_depth.png\n │ │ ├── anya_front_normal.png\n │ │ ├── anya_front_rgba.png\n │ │ ├── baby_phoenix_on_ice.png\n │ │ 
├── baby_phoenix_on_ice_depth.png\n │ │ ├── baby_phoenix_on_ice_normal.png\n │ │ ├── baby_phoenix_on_ice_rgba.png\n │ │ ├── beach_house_1.png\n │ │ ├── beach_house_1_depth.png\n │ │ ├── beach_house_1_normal.png\n │ │ ├── beach_house_1_rgba.png\n │ │ ├── beach_house_2.png\n │ │ ├── beach_house_2_depth.png\n │ │ ├── beach_house_2_normal.png\n │ │ ├── beach_house_2_rgba.png\n │ │ ├── bollywood_actress.png\n │ │ ├── bollywood_actress_depth.png\n │ │ ├── bollywood_actress_normal.png\n │ │ ├── bollywood_actress_rgba.png\n │ │ ├── cactus.png\n │ │ ├── cactus_depth.png\n │ │ ├── cactus_normal.png\n │ │ ├── cactus_rgba.png\n │ │ ├── catstatue.png\n │ │ ├── catstatue_depth.png\n │ │ ├── catstatue_normal.png\n │ │ ├── catstatue_rgba.png\n │ │ ├── church_ruins.png\n │ │ ├── church_ruins_depth.png\n │ │ ├── church_ruins_normal.png\n │ │ ├── church_ruins_rgba.png\n │ │ ├── dog1_rgba.png\n │ │ ├── dragon2_rgba.png\n │ │ ├── firekeeper.jpg\n │ │ ├── firekeeper_depth.png\n │ │ ├── firekeeper_normal.png\n │ │ ├── firekeeper_rgba.png\n │ │ ├── futuristic_car.png\n │ │ ├── futuristic_car_depth.png\n │ │ ├── futuristic_car_normal.png\n │ │ ├── futuristic_car_rgba.png\n │ │ ├── grootplant_rgba.png\n │ │ ├── hamburger.png\n │ │ ├── hamburger_depth.png\n │ │ ├── hamburger_rgba.png\n │ │ ├── mona_lisa.png\n │ │ ├── mona_lisa_depth.png\n │ │ ├── mona_lisa_normal.png\n │ │ ├── mona_lisa_rgba.png\n │ │ ├── robot_rgba.png\n │ │ ├── teddy.png\n │ │ ├── teddy_depth.png\n │ │ ├── teddy_normal.png\n │ │ ├── teddy_rgba.png\n │ │ └── thorhammer_rgba.png\n │ ├── lights/\n │ │ ├── bsdf_256_256.bin\n │ │ ├── LICENSE.txt\n │ │ └── mud_road_puresky_1k.hdr\n │ ├── shapes/\n │ │ ├── animal.obj\n │ │ ├── blub.obj\n │ │ ├── cabin.obj\n │ │ ├── env_sphere.obj\n │ │ ├── hand_prismatic.obj\n │ │ ├── human.obj\n │ │ ├── nascar.obj\n │ │ ├── potion.obj\n │ │ ├── README.md\n │ │ └── teddy.obj\n │ ├── tets/\n │ │ ├── 128_tets.npz\n │ │ ├── 32_tets.npz\n │ │ ├── 64_tets.npz\n │ │ └── generate_tets.py\n │ ├── zero123/\n │ │ ├── download.sh\n │ │ └── sd-objaverse-finetune-c_concat-256.yaml\n │ ├── make_prompt_library.py\n │ └── prompt_library.json\n ├── scripts/\n │ └── convert_zero123_to_diffusers.py\n ├── threestudio/\n │ ├── data/\n │ │ ├── __init__.py\n │ │ ├── co3d.py\n │ │ ├── image.py\n │ │ ├── multiview.py\n │ │ ├── uncond.py\n │ │ └── uncond_eff.py\n │ ├── models/\n │ │ ├── background/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── neural_environment_map_background.py\n │ │ │ ├── solid_color_background.py\n │ │ │ └── textured_background.py\n │ │ ├── exporters/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ └── mesh_exporter.py\n │ │ ├── geometry/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── custom_mesh.py\n │ │ │ ├── implicit_sdf.py\n │ │ │ ├── implicit_volume.py\n │ │ │ ├── tetrahedra_sdf_grid.py\n │ │ │ └── volume_grid.py\n │ │ ├── guidance/\n │ │ │ ├── __init__.py\n │ │ │ ├── controlnet_guidance.py\n │ │ │ ├── deep_floyd_guidance.py\n │ │ │ ├── instructpix2pix_guidance.py\n │ │ │ ├── stable_diffusion_guidance.py\n │ │ │ ├── stable_diffusion_sdi_guidance.py\n │ │ │ ├── stable_diffusion_unified_guidance.py\n │ │ │ ├── stable_diffusion_vsd_guidance.py\n │ │ │ ├── stable_zero123_guidance.py\n │ │ │ ├── zero123_guidance.py\n │ │ │ └── zero123_unified_guidance.py\n │ │ ├── materials/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── diffuse_with_point_light_material.py\n │ │ │ ├── hybrid_rgb_latent_material.py\n │ │ │ ├── neural_radiance_material.py\n │ │ │ ├── no_material.py\n │ │ │ ├── pbr_material.py\n 
│ │ │ └── sd_latent_adapter_material.py\n │ │ ├── prompt_processors/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deepfloyd_prompt_processor.py\n │ │ │ ├── dummy_prompt_processor.py\n │ │ │ └── stable_diffusion_prompt_processor.py\n │ │ ├── renderers/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deferred_volume_renderer.py\n │ │ │ ├── gan_volume_renderer.py\n │ │ │ ├── nerf_volume_renderer.py\n │ │ │ ├── neus_volume_renderer.py\n │ │ │ ├── nvdiff_rasterizer.py\n │ │ │ └── patch_renderer.py\n │ │ ├── __init__.py\n │ │ ├── estimators.py\n │ │ ├── isosurface.py\n │ │ ├── mesh.py\n │ │ └── networks.py\n │ ├── scripts/\n │ │ ├── make_training_vid.py\n │ │ ├── run_zero123.sh\n │ │ ├── run_zero123_comparison.sh\n │ │ ├── run_zero123_phase.sh\n │ │ ├── run_zero123_phase2.sh\n │ │ ├── run_zero123_sbatch.py\n │ │ ├── zero123_demo.py\n │ │ └── zero123_sbatch.sh\n │ ├── systems/\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── control4d_multiview.py\n │ │ ├── dreamfusion.py\n │ │ ├── eff_dreamfusion.py\n │ │ ├── fantasia3d.py\n │ │ ├── imagedreamfusion.py\n │ │ ├── instructnerf2nerf.py\n │ │ ├── latentnerf.py\n │ │ ├── magic123.py\n │ │ ├── magic3d.py\n │ │ ├── optimizers.py\n │ │ ├── prolificdreamer.py\n │ │ ├── sdi.py\n │ │ ├── sjc.py\n │ │ ├── textmesh.py\n │ │ ├── utils.py\n │ │ ├── zero123.py\n │ │ └── zero123_simple.py\n │ ├── utils/\n │ │ ├── GAN/\n │ │ │ ├── __init__.py\n │ │ │ ├── attention.py\n │ │ │ ├── discriminator.py\n │ │ │ ├── distribution.py\n │ │ │ ├── loss.py\n │ │ │ ├── mobilenet.py\n │ │ │ ├── network_util.py\n │ │ │ ├── util.py\n │ │ │ └── vae.py\n │ │ ├── perceptual/\n │ │ │ ├── __init__.py\n │ │ │ ├── perceptual.py\n │ │ │ └── utils.py\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── callbacks.py\n │ │ ├── config.py\n │ │ ├── loss.py\n │ │ ├── misc.py\n │ │ ├── ops.py\n │ │ ├── rasterize.py\n │ │ ├── saving.py\n │ │ └── typing.py\n │ └── __init__.py\n ├── .editorconfig\n ├── .pre-commit-config.yaml\n ├── .pylintrc\n ├── 2dplayground.ipynb\n ├── 2dplayground_SDI_version.ipynb\n ├── CHANGELOG.md\n ├── DOCUMENTATION.md\n ├── gradio_app.py\n ├── launch.py\n ├── LICENSE\n ├── README.md\n ├── requirements-dev.txt\n ├── requirements.txt\n ├── setup.py\n └── threestudio.ipynb",
"stateUrl": "https://storage.mcpmark.ai/filesystem/threestudio.zip",
"stateOriginalUrl": "https://github.com/threestudio-project/threestudio"
}
}
================================================
FILE: tasks/filesystem/standard/threestudio/code_locating/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for ThreeStudio Task 1: Find Zero123 Guidance Implementation
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists."""
answer_file = test_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found")
return False
print("✅ Answer file found")
return True
def verify_answer_format(test_dir: Path) -> bool:
"""Verify that the answer file has the correct format."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Check if content is not empty
if not content:
print("❌ Answer file is empty")
return False
# Check if it contains only the file path (no additional text)
if len(content.split('\n')) > 1:
print("❌ Answer file contains multiple lines or additional text")
return False
# Check if it uses forward slashes
if '\\' in content:
print("❌ Answer uses backslashes instead of forward slashes")
return False
# Check if it's a relative path
if content.startswith('/') or ':' in content:
print("❌ Answer appears to be an absolute path")
return False
print("✅ Answer format is correct")
return True
except Exception as e:
print(f"❌ Error reading answer file: {e}")
return False
def verify_file_path_structure(test_dir: Path) -> bool:
"""Verify that the file path has the expected structure."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Expected path components for Zero123 guidance
# In backup directories, the path is threestudio/models/guidance/zero123_guidance.py
# In test_environments, the path is threestudio/threestudio/models/guidance/zero123_guidance.py
expected_components = ["threestudio", "models", "guidance", "zero123_guidance.py"]
# Check if all expected components are in the path
for component in expected_components:
if component not in content:
print(f"❌ Path missing expected component: {component}")
return False
print("✅ File path structure is correct")
return True
except Exception as e:
print(f"❌ Error verifying file path structure: {e}")
return False
def verify_file_exists(test_dir: Path) -> bool:
"""Verify that the identified file actually exists."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Try the path as provided in the answer file
file_path = test_dir / content
# If that doesn't exist, try with the correct path structure
# The answer file might have threestudio/models/guidance/zero123_guidance.py
# but the actual path is threestudio/threestudio/models/guidance/zero123_guidance.py
if not file_path.exists():
# Try to fix the path by adding the missing threestudio prefix
if content.startswith("threestudio/models/"):
corrected_path = content.replace("threestudio/models/", "threestudio/threestudio/models/")
file_path = test_dir / corrected_path
if file_path.exists():
print(f"✅ File exists with corrected path: {corrected_path}")
return True
if not file_path.exists():
print(f"❌ Identified file does not exist: {content}")
return False
print("✅ Identified file exists")
return True
except Exception as e:
print(f"❌ Error verifying file existence: {e}")
return False
def verify_zero123_guidance_content(test_dir: Path) -> bool:
"""Verify that the identified file actually contains Zero123 guidance implementation."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Try the path as provided in the answer file
file_path = test_dir / content
# If that doesn't exist, try with the correct path structure
if not file_path.exists():
# Try to fix the path by adding the missing threestudio prefix
if content.startswith("threestudio/models/"):
corrected_path = content.replace("threestudio/models/", "threestudio/threestudio/models/")
file_path = test_dir / corrected_path
if not file_path.exists():
print(f"❌ Cannot find file for content verification: {content}")
return False
file_content = file_path.read_text()
# Check for the main Zero123 guidance implementation
# The main implementation should have the class name "Zero123Guidance" and register as "zero123-guidance"
main_zero123_indicators = [
r'class Zero123Guidance', # Main class name
r'@threestudio\.register\("zero123-guidance"\)', # Correct registration
r'BaseObject', # Base class
r'zero123', # General zero123 reference
]
found_indicators = []
for indicator in main_zero123_indicators:
if re.search(indicator, file_content, re.IGNORECASE):
found_indicators.append(indicator)
# Check if this is the main Zero123 guidance implementation
is_main_implementation = (
'class Zero123Guidance' in file_content and
'@threestudio.register("zero123-guidance")' in file_content
)
if not is_main_implementation:
print(f"❌ File is not the main Zero123 guidance implementation")
print(f" Expected: class Zero123Guidance and @threestudio.register('zero123-guidance')")
return False
print(f"✅ File contains main Zero123 guidance implementation indicators: {found_indicators}")
return True
except Exception as e:
print(f"❌ Error verifying file content: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying ThreeStudio Task 1: Find Zero123 Guidance Implementation...")
# Define verification steps
verification_steps = [
("Answer File Exists", verify_answer_file_exists),
("Answer Format", verify_answer_format),
("File Path Structure", verify_file_path_structure),
("File Exists", verify_file_exists),
("Zero123 Guidance Content", verify_zero123_guidance_content),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Zero123 guidance file path identified correctly!")
print("🎉 Task 1 verification: PASS")
sys.exit(0)
else:
print("❌ Task 1 verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/threestudio/output_analysis/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
ThreeStudio is a comprehensive codebase that implements various diffusion-based text-to-3D models, including a NeRF-based rendering stage and a diffusion guidance stage. Your task is to explore the codebase and identify the specific file that defines the guidance functionality for the Zero123 model.
### Task
What is the output of `guidance_out`, returned by the code at line 137 in `threestudio/systems/zero123.py`?
Clearly state its structure and where you found the answer (file and line numbers). Write your answer in a file named `answer.txt` in the test directory root. Do not add extra explanation or formatting beyond what is required by the task.
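A minimal sketch of how one might inspect the call site before answering is shown below; the doubled `threestudio/threestudio/...` prefix mirrors how the verifier expects the archive to unpack inside the test directory and is an assumption about the unpacked layout:

```python
#!/usr/bin/env python3
"""Minimal illustrative sketch (not part of the task): print line 137 of
threestudio/systems/zero123.py with some context, as a starting point for
tracing what guidance_out contains. The path prefix is an assumption about
how the archive unpacks inside the test directory."""
import os
from pathlib import Path

test_dir = Path(os.environ["FILESYSTEM_TEST_DIR"])
target = test_dir / "threestudio" / "threestudio" / "systems" / "zero123.py"

lines = target.read_text().splitlines()
for idx in range(133, 140):  # 0-based indices; line 137 is lines[136]
    print(f"{idx + 1:4d}: {lines[idx]}")
```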
================================================
FILE: tasks/filesystem/standard/threestudio/output_analysis/meta.json
================================================
{
"task_id": "output_analysis",
"task_name": "Output Analysis",
"category_id": "threestudio",
"category_name": "Threestudio",
"description": "Analyze the structure and components of guidance_out object returned by Zero123 guidance code at line 137 for understanding output format.",
"author": "Lingjun Chen",
"created_at": "2025-08-05",
"difficulty": "L3",
"tags": [
"code exploration",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "threestudio/\n ├── configs/\n │ ├── debugging/\n │ │ ├── controlnet-canny.yaml\n │ │ ├── controlnet-normal.yaml\n │ │ ├── instructpix2pix.yaml\n │ │ └── stablediffusion.yaml\n │ ├── experimental/\n │ │ ├── unified-guidance/\n │ │ │ ├── dreamfusion-sd.yaml\n │ │ │ ├── hifa.yaml\n │ │ │ ├── prolificdreamer-hifa.yaml\n │ │ │ ├── prolificdreamer.yaml\n │ │ │ └── zero123-simple.yaml\n │ │ ├── co3d-imagecondition.yaml\n │ │ ├── imagecondition.yaml\n │ │ ├── imagecondition_zero123nerf.yaml\n │ │ ├── imagecondition_zero123nerf_refine.yaml\n │ │ ├── prolificdreamer-importance.yaml\n │ │ ├── prolificdreamer-neus-importance.yaml\n │ │ ├── prolificdreamer-propnet.yaml\n │ │ └── textmesh-if-importance.yaml\n │ ├── gradio/\n │ │ ├── dreamfusion-if.yaml\n │ │ ├── dreamfusion-sd.yaml\n │ │ ├── fantasia3d.yaml\n │ │ ├── latentnerf.yaml\n │ │ ├── sjc.yaml\n │ │ └── textmesh-if.yaml\n │ ├── control4d-static.yaml\n │ ├── dreamfusion-if.yaml\n │ ├── dreamfusion-sd-eff.yaml\n │ ├── dreamfusion-sd.yaml\n │ ├── fantasia3d-texture.yaml\n │ ├── fantasia3d.yaml\n │ ├── hifa.yaml\n │ ├── instructnerf2nerf.yaml\n │ ├── latentnerf-refine.yaml\n │ ├── latentnerf.yaml\n │ ├── magic123-coarse-sd.yaml\n │ ├── magic123-hifa-coarse-sd.yaml\n │ ├── magic123-hifa-refine-sd.yaml\n │ ├── magic123-refine-sd.yaml\n │ ├── magic3d-coarse-if.yaml\n │ ├── magic3d-coarse-sd.yaml\n │ ├── magic3d-refine-sd.yaml\n │ ├── prolificdreamer-geometry.yaml\n │ ├── prolificdreamer-hifa.yaml\n │ ├── prolificdreamer-patch.yaml\n │ ├── prolificdreamer-scene-hifa.yaml\n │ ├── prolificdreamer-scene.yaml\n │ ├── prolificdreamer-texture.yaml\n │ ├── prolificdreamer.yaml\n │ ├── sdi.yaml\n │ ├── sjc.yaml\n │ ├── sketchshape-refine.yaml\n │ ├── sketchshape.yaml\n │ ├── stable-zero123.yaml\n │ ├── textmesh-if.yaml\n │ ├── zero123-geometry.yaml\n │ └── zero123.yaml\n ├── custom/\n │ └── put_custom_extensions_here\n ├── docker/\n │ ├── compose.yaml\n │ └── Dockerfile\n ├── docs/\n │ └── installation.md\n ├── extern/\n │ ├── ldm_zero123/\n │ │ ├── models/\n │ │ │ ├── diffusion/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── classifier.py\n │ │ │ │ ├── ddim.py\n │ │ │ │ ├── ddpm.py\n │ │ │ │ ├── plms.py\n │ │ │ │ └── sampling_util.py\n │ │ │ └── autoencoder.py\n │ │ ├── modules/\n │ │ │ ├── diffusionmodules/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── model.py\n │ │ │ │ ├── openaimodel.py\n │ │ │ │ └── util.py\n │ │ │ ├── distributions/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── distributions.py\n │ │ │ ├── encoders/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── modules.py\n │ │ │ ├── evaluate/\n │ │ │ │ ├── adm_evaluator.py\n │ │ │ │ ├── evaluate_perceptualsim.py\n │ │ │ │ ├── frechet_video_distance.py\n │ │ │ │ ├── ssim.py\n │ │ │ │ └── torch_frechet_video_distance.py\n │ │ │ ├── image_degradation/\n │ │ │ │ ├── utils/\n │ │ │ │ │ └── test.png\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── bsrgan.py\n │ │ │ │ ├── bsrgan_light.py\n │ │ │ │ └── utils_image.py\n │ │ │ ├── losses/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── contperceptual.py\n │ │ │ │ └── vqperceptual.py\n │ │ │ ├── attention.py\n │ │ │ ├── ema.py\n │ │ │ └── x_transformer.py\n │ │ ├── thirdp/\n │ │ │ └── psp/\n │ │ │ ├── helpers.py\n │ │ │ ├── id_loss.py\n │ │ │ └── model_irse.py\n │ │ ├── __init__.py\n │ │ ├── extras.py\n │ │ ├── guidance.py\n │ │ ├── lr_scheduler.py\n │ │ └── util.py\n │ ├── __init__.py\n │ └── zero123.py\n ├── load/\n │ ├── images/\n │ │ ├── anya_front.png\n │ │ ├── anya_front_depth.png\n │ │ ├── anya_front_normal.png\n │ │ ├── anya_front_rgba.png\n │ │ ├── baby_phoenix_on_ice.png\n │ │ 
├── baby_phoenix_on_ice_depth.png\n │ │ ├── baby_phoenix_on_ice_normal.png\n │ │ ├── baby_phoenix_on_ice_rgba.png\n │ │ ├── beach_house_1.png\n │ │ ├── beach_house_1_depth.png\n │ │ ├── beach_house_1_normal.png\n │ │ ├── beach_house_1_rgba.png\n │ │ ├── beach_house_2.png\n │ │ ├── beach_house_2_depth.png\n │ │ ├── beach_house_2_normal.png\n │ │ ├── beach_house_2_rgba.png\n │ │ ├── bollywood_actress.png\n │ │ ├── bollywood_actress_depth.png\n │ │ ├── bollywood_actress_normal.png\n │ │ ├── bollywood_actress_rgba.png\n │ │ ├── cactus.png\n │ │ ├── cactus_depth.png\n │ │ ├── cactus_normal.png\n │ │ ├── cactus_rgba.png\n │ │ ├── catstatue.png\n │ │ ├── catstatue_depth.png\n │ │ ├── catstatue_normal.png\n │ │ ├── catstatue_rgba.png\n │ │ ├── church_ruins.png\n │ │ ├── church_ruins_depth.png\n │ │ ├── church_ruins_normal.png\n │ │ ├── church_ruins_rgba.png\n │ │ ├── dog1_rgba.png\n │ │ ├── dragon2_rgba.png\n │ │ ├── firekeeper.jpg\n │ │ ├── firekeeper_depth.png\n │ │ ├── firekeeper_normal.png\n │ │ ├── firekeeper_rgba.png\n │ │ ├── futuristic_car.png\n │ │ ├── futuristic_car_depth.png\n │ │ ├── futuristic_car_normal.png\n │ │ ├── futuristic_car_rgba.png\n │ │ ├── grootplant_rgba.png\n │ │ ├── hamburger.png\n │ │ ├── hamburger_depth.png\n │ │ ├── hamburger_rgba.png\n │ │ ├── mona_lisa.png\n │ │ ├── mona_lisa_depth.png\n │ │ ├── mona_lisa_normal.png\n │ │ ├── mona_lisa_rgba.png\n │ │ ├── robot_rgba.png\n │ │ ├── teddy.png\n │ │ ├── teddy_depth.png\n │ │ ├── teddy_normal.png\n │ │ ├── teddy_rgba.png\n │ │ └── thorhammer_rgba.png\n │ ├── lights/\n │ │ ├── bsdf_256_256.bin\n │ │ ├── LICENSE.txt\n │ │ └── mud_road_puresky_1k.hdr\n │ ├── shapes/\n │ │ ├── animal.obj\n │ │ ├── blub.obj\n │ │ ├── cabin.obj\n │ │ ├── env_sphere.obj\n │ │ ├── hand_prismatic.obj\n │ │ ├── human.obj\n │ │ ├── nascar.obj\n │ │ ├── potion.obj\n │ │ ├── README.md\n │ │ └── teddy.obj\n │ ├── tets/\n │ │ ├── 128_tets.npz\n │ │ ├── 32_tets.npz\n │ │ ├── 64_tets.npz\n │ │ └── generate_tets.py\n │ ├── zero123/\n │ │ ├── download.sh\n │ │ └── sd-objaverse-finetune-c_concat-256.yaml\n │ ├── make_prompt_library.py\n │ └── prompt_library.json\n ├── scripts/\n │ └── convert_zero123_to_diffusers.py\n ├── threestudio/\n │ ├── data/\n │ │ ├── __init__.py\n │ │ ├── co3d.py\n │ │ ├── image.py\n │ │ ├── multiview.py\n │ │ ├── uncond.py\n │ │ └── uncond_eff.py\n │ ├── models/\n │ │ ├── background/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── neural_environment_map_background.py\n │ │ │ ├── solid_color_background.py\n │ │ │ └── textured_background.py\n │ │ ├── exporters/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ └── mesh_exporter.py\n │ │ ├── geometry/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── custom_mesh.py\n │ │ │ ├── implicit_sdf.py\n │ │ │ ├── implicit_volume.py\n │ │ │ ├── tetrahedra_sdf_grid.py\n │ │ │ └── volume_grid.py\n │ │ ├── guidance/\n │ │ │ ├── __init__.py\n │ │ │ ├── controlnet_guidance.py\n │ │ │ ├── deep_floyd_guidance.py\n │ │ │ ├── instructpix2pix_guidance.py\n │ │ │ ├── stable_diffusion_guidance.py\n │ │ │ ├── stable_diffusion_sdi_guidance.py\n │ │ │ ├── stable_diffusion_unified_guidance.py\n │ │ │ ├── stable_diffusion_vsd_guidance.py\n │ │ │ ├── stable_zero123_guidance.py\n │ │ │ ├── zero123_guidance.py\n │ │ │ └── zero123_unified_guidance.py\n │ │ ├── materials/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── diffuse_with_point_light_material.py\n │ │ │ ├── hybrid_rgb_latent_material.py\n │ │ │ ├── neural_radiance_material.py\n │ │ │ ├── no_material.py\n │ │ │ ├── pbr_material.py\n 
│ │ │ └── sd_latent_adapter_material.py\n │ │ ├── prompt_processors/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deepfloyd_prompt_processor.py\n │ │ │ ├── dummy_prompt_processor.py\n │ │ │ └── stable_diffusion_prompt_processor.py\n │ │ ├── renderers/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deferred_volume_renderer.py\n │ │ │ ├── gan_volume_renderer.py\n │ │ │ ├── nerf_volume_renderer.py\n │ │ │ ├── neus_volume_renderer.py\n │ │ │ ├── nvdiff_rasterizer.py\n │ │ │ └── patch_renderer.py\n │ │ ├── __init__.py\n │ │ ├── estimators.py\n │ │ ├── isosurface.py\n │ │ ├── mesh.py\n │ │ └── networks.py\n │ ├── scripts/\n │ │ ├── make_training_vid.py\n │ │ ├── run_zero123.sh\n │ │ ├── run_zero123_comparison.sh\n │ │ ├── run_zero123_phase.sh\n │ │ ├── run_zero123_phase2.sh\n │ │ ├── run_zero123_sbatch.py\n │ │ ├── zero123_demo.py\n │ │ └── zero123_sbatch.sh\n │ ├── systems/\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── control4d_multiview.py\n │ │ ├── dreamfusion.py\n │ │ ├── eff_dreamfusion.py\n │ │ ├── fantasia3d.py\n │ │ ├── imagedreamfusion.py\n │ │ ├── instructnerf2nerf.py\n │ │ ├── latentnerf.py\n │ │ ├── magic123.py\n │ │ ├── magic3d.py\n │ │ ├── optimizers.py\n │ │ ├── prolificdreamer.py\n │ │ ├── sdi.py\n │ │ ├── sjc.py\n │ │ ├── textmesh.py\n │ │ ├── utils.py\n │ │ ├── zero123.py\n │ │ └── zero123_simple.py\n │ ├── utils/\n │ │ ├── GAN/\n │ │ │ ├── __init__.py\n │ │ │ ├── attention.py\n │ │ │ ├── discriminator.py\n │ │ │ ├── distribution.py\n │ │ │ ├── loss.py\n │ │ │ ├── mobilenet.py\n │ │ │ ├── network_util.py\n │ │ │ ├── util.py\n │ │ │ └── vae.py\n │ │ ├── perceptual/\n │ │ │ ├── __init__.py\n │ │ │ ├── perceptual.py\n │ │ │ └── utils.py\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── callbacks.py\n │ │ ├── config.py\n │ │ ├── loss.py\n │ │ ├── misc.py\n │ │ ├── ops.py\n │ │ ├── rasterize.py\n │ │ ├── saving.py\n │ │ └── typing.py\n │ └── __init__.py\n ├── .editorconfig\n ├── .pre-commit-config.yaml\n ├── .pylintrc\n ├── 2dplayground.ipynb\n ├── 2dplayground_SDI_version.ipynb\n ├── CHANGELOG.md\n ├── DOCUMENTATION.md\n ├── gradio_app.py\n ├── launch.py\n ├── LICENSE\n ├── README.md\n ├── requirements-dev.txt\n ├── requirements.txt\n ├── setup.py\n └── threestudio.ipynb",
"stateUrl": "https://storage.mcpmark.ai/filesystem/threestudio.zip",
"stateOriginalUrl": "https://github.com/threestudio-project/threestudio"
}
}
================================================
FILE: tasks/filesystem/standard/threestudio/output_analysis/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for ThreeStudio Task 2: Analyze Zero123 Guidance Output Structure
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists."""
answer_file = test_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found")
return False
print("✅ Answer file found")
return True
def verify_required_strings(test_dir: Path) -> bool:
"""Verify that the answer contains the four required strings."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text()
# Check for required strings
required_strings = ["loss_sds", "grad_norm", "min_step", "max_step"]
missing_strings = []
for string in required_strings:
if string not in content:
missing_strings.append(string)
if missing_strings:
print(f"❌ Missing required strings: {missing_strings}")
return False
print("✅ All required strings found")
return True
except Exception as e:
print(f"❌ Error reading answer file: {e}")
return False
def verify_line_numbers(test_dir: Path) -> bool:
"""Verify that line numbers contain (323 or 324) AND (327 or 328)."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text()
# Check for first number (323 or 324)
has_first = "323" in content or "324" in content
# Check for second number (327 or 328)
has_second = "327" in content or "328" in content
if not has_first:
print("❌ Missing first line number (323 or 324)")
return False
if not has_second:
print("❌ Missing second line number (327 or 328)")
return False
print("✅ Line numbers found: contains (323 or 324) and (327 or 328)")
return True
except Exception as e:
print(f"❌ Error verifying line numbers: {e}")
return False
def verify_file_path(test_dir: Path) -> bool:
"""Verify that the file path contains the exact expected path string."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text()
# Check for the exact expected file path
expected_path = "threestudio/models/guidance/zero123_guidance.py"
if expected_path not in content:
print(f"❌ Missing expected file path: {expected_path}")
return False
print("✅ File path found: threestudio/models/guidance/zero123_guidance.py")
return True
except Exception as e:
print(f"❌ Error verifying file path: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying ThreeStudio Task 2: Analyze Zero123 Guidance Output Structure...")
# Define verification steps
verification_steps = [
("Answer File Exists", verify_answer_file_exists),
("Required Strings", verify_required_strings),
("Line Numbers Range", verify_line_numbers),
("File Path Components", verify_file_path),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Zero123 guidance output structure analyzed correctly!")
print("🎉 Task 2 verification: PASS")
sys.exit(0)
else:
print("❌ Task 2 verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/threestudio/requirements_completion/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
The `requirements.txt` file in the ThreeStudio project is used to install necessary Python libraries. However, the Zero123-related dependencies were accidentally deleted from the file. Your task is to restore these missing dependencies.
### Task Objectives
1. **Locate the requirements.txt file** in the test environment
2. **Identify the missing Zero123 dependencies** that need to be restored
3. **Add the required dependencies** to the requirements.txt file
4. **Ensure the file format is correct** (one dependency per line)
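A minimal sketch of restoring entries is shown below, assuming the missing packages are the ones the verifier for this task checks for (einops, kornia, a taming-transformers build, and CLIP from the openai GitHub repository); the exact spellings and pins in the upstream file may differ:

```python
#!/usr/bin/env python3
"""Minimal illustrative sketch (not part of the task): append candidate
Zero123 dependencies to requirements.txt, one per line, skipping entries
that are already present. Package spellings below are assumptions; the
authoritative list lives in the upstream ThreeStudio requirements.txt."""
import os
from pathlib import Path

test_dir = Path(os.environ["FILESYSTEM_TEST_DIR"])
req_file = test_dir / "requirements.txt"

candidates = [
    "einops",
    "kornia",
    "taming-transformers-rom1504",             # assumed taming build
    "git+https://github.com/openai/CLIP.git",  # "openai" and "clip" on one line
]

existing = req_file.read_text()
missing = [dep for dep in candidates if dep.lower() not in existing.lower()]
if missing:
    with req_file.open("a") as f:
        if existing and not existing.endswith("\n"):
            f.write("\n")
        f.write("\n".join(missing) + "\n")
```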
================================================
FILE: tasks/filesystem/standard/threestudio/requirements_completion/meta.json
================================================
{
"task_id": "requirements_completion",
"task_name": "Requirements Completion",
"category_id": "threestudio",
"category_name": "Threestudio",
"description": "Restore and complete missing Zero123-related dependencies in the requirements.txt file to ensure proper ThreeStudio project configuration.",
"author": "Lingjun Chen",
"created_at": "2025-08-05",
"difficulty": "L3",
"tags": [
"code exploration",
"cross-referencing"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "threestudio/\n ├── configs/\n │ ├── debugging/\n │ │ ├── controlnet-canny.yaml\n │ │ ├── controlnet-normal.yaml\n │ │ ├── instructpix2pix.yaml\n │ │ └── stablediffusion.yaml\n │ ├── experimental/\n │ │ ├── unified-guidance/\n │ │ │ ├── dreamfusion-sd.yaml\n │ │ │ ├── hifa.yaml\n │ │ │ ├── prolificdreamer-hifa.yaml\n │ │ │ ├── prolificdreamer.yaml\n │ │ │ └── zero123-simple.yaml\n │ │ ├── co3d-imagecondition.yaml\n │ │ ├── imagecondition.yaml\n │ │ ├── imagecondition_zero123nerf.yaml\n │ │ ├── imagecondition_zero123nerf_refine.yaml\n │ │ ├── prolificdreamer-importance.yaml\n │ │ ├── prolificdreamer-neus-importance.yaml\n │ │ ├── prolificdreamer-propnet.yaml\n │ │ └── textmesh-if-importance.yaml\n │ ├── gradio/\n │ │ ├── dreamfusion-if.yaml\n │ │ ├── dreamfusion-sd.yaml\n │ │ ├── fantasia3d.yaml\n │ │ ├── latentnerf.yaml\n │ │ ├── sjc.yaml\n │ │ └── textmesh-if.yaml\n │ ├── control4d-static.yaml\n │ ├── dreamfusion-if.yaml\n │ ├── dreamfusion-sd-eff.yaml\n │ ├── dreamfusion-sd.yaml\n │ ├── fantasia3d-texture.yaml\n │ ├── fantasia3d.yaml\n │ ├── hifa.yaml\n │ ├── instructnerf2nerf.yaml\n │ ├── latentnerf-refine.yaml\n │ ├── latentnerf.yaml\n │ ├── magic123-coarse-sd.yaml\n │ ├── magic123-hifa-coarse-sd.yaml\n │ ├── magic123-hifa-refine-sd.yaml\n │ ├── magic123-refine-sd.yaml\n │ ├── magic3d-coarse-if.yaml\n │ ├── magic3d-coarse-sd.yaml\n │ ├── magic3d-refine-sd.yaml\n │ ├── prolificdreamer-geometry.yaml\n │ ├── prolificdreamer-hifa.yaml\n │ ├── prolificdreamer-patch.yaml\n │ ├── prolificdreamer-scene-hifa.yaml\n │ ├── prolificdreamer-scene.yaml\n │ ├── prolificdreamer-texture.yaml\n │ ├── prolificdreamer.yaml\n │ ├── sdi.yaml\n │ ├── sjc.yaml\n │ ├── sketchshape-refine.yaml\n │ ├── sketchshape.yaml\n │ ├── stable-zero123.yaml\n │ ├── textmesh-if.yaml\n │ ├── zero123-geometry.yaml\n │ └── zero123.yaml\n ├── custom/\n │ └── put_custom_extensions_here\n ├── docker/\n │ ├── compose.yaml\n │ └── Dockerfile\n ├── docs/\n │ └── installation.md\n ├── extern/\n │ ├── ldm_zero123/\n │ │ ├── models/\n │ │ │ ├── diffusion/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── classifier.py\n │ │ │ │ ├── ddim.py\n │ │ │ │ ├── ddpm.py\n │ │ │ │ ├── plms.py\n │ │ │ │ └── sampling_util.py\n │ │ │ └── autoencoder.py\n │ │ ├── modules/\n │ │ │ ├── diffusionmodules/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── model.py\n │ │ │ │ ├── openaimodel.py\n │ │ │ │ └── util.py\n │ │ │ ├── distributions/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── distributions.py\n │ │ │ ├── encoders/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── modules.py\n │ │ │ ├── evaluate/\n │ │ │ │ ├── adm_evaluator.py\n │ │ │ │ ├── evaluate_perceptualsim.py\n │ │ │ │ ├── frechet_video_distance.py\n │ │ │ │ ├── ssim.py\n │ │ │ │ └── torch_frechet_video_distance.py\n │ │ │ ├── image_degradation/\n │ │ │ │ ├── utils/\n │ │ │ │ │ └── test.png\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── bsrgan.py\n │ │ │ │ ├── bsrgan_light.py\n │ │ │ │ └── utils_image.py\n │ │ │ ├── losses/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── contperceptual.py\n │ │ │ │ └── vqperceptual.py\n │ │ │ ├── attention.py\n │ │ │ ├── ema.py\n │ │ │ └── x_transformer.py\n │ │ ├── thirdp/\n │ │ │ └── psp/\n │ │ │ ├── helpers.py\n │ │ │ ├── id_loss.py\n │ │ │ └── model_irse.py\n │ │ ├── __init__.py\n │ │ ├── extras.py\n │ │ ├── guidance.py\n │ │ ├── lr_scheduler.py\n │ │ └── util.py\n │ ├── __init__.py\n │ └── zero123.py\n ├── load/\n │ ├── images/\n │ │ ├── anya_front.png\n │ │ ├── anya_front_depth.png\n │ │ ├── anya_front_normal.png\n │ │ ├── anya_front_rgba.png\n │ │ ├── baby_phoenix_on_ice.png\n │ │ 
├── baby_phoenix_on_ice_depth.png\n │ │ ├── baby_phoenix_on_ice_normal.png\n │ │ ├── baby_phoenix_on_ice_rgba.png\n │ │ ├── beach_house_1.png\n │ │ ├── beach_house_1_depth.png\n │ │ ├── beach_house_1_normal.png\n │ │ ├── beach_house_1_rgba.png\n │ │ ├── beach_house_2.png\n │ │ ├── beach_house_2_depth.png\n │ │ ├── beach_house_2_normal.png\n │ │ ├── beach_house_2_rgba.png\n │ │ ├── bollywood_actress.png\n │ │ ├── bollywood_actress_depth.png\n │ │ ├── bollywood_actress_normal.png\n │ │ ├── bollywood_actress_rgba.png\n │ │ ├── cactus.png\n │ │ ├── cactus_depth.png\n │ │ ├── cactus_normal.png\n │ │ ├── cactus_rgba.png\n │ │ ├── catstatue.png\n │ │ ├── catstatue_depth.png\n │ │ ├── catstatue_normal.png\n │ │ ├── catstatue_rgba.png\n │ │ ├── church_ruins.png\n │ │ ├── church_ruins_depth.png\n │ │ ├── church_ruins_normal.png\n │ │ ├── church_ruins_rgba.png\n │ │ ├── dog1_rgba.png\n │ │ ├── dragon2_rgba.png\n │ │ ├── firekeeper.jpg\n │ │ ├── firekeeper_depth.png\n │ │ ├── firekeeper_normal.png\n │ │ ├── firekeeper_rgba.png\n │ │ ├── futuristic_car.png\n │ │ ├── futuristic_car_depth.png\n │ │ ├── futuristic_car_normal.png\n │ │ ├── futuristic_car_rgba.png\n │ │ ├── grootplant_rgba.png\n │ │ ├── hamburger.png\n │ │ ├── hamburger_depth.png\n │ │ ├── hamburger_rgba.png\n │ │ ├── mona_lisa.png\n │ │ ├── mona_lisa_depth.png\n │ │ ├── mona_lisa_normal.png\n │ │ ├── mona_lisa_rgba.png\n │ │ ├── robot_rgba.png\n │ │ ├── teddy.png\n │ │ ├── teddy_depth.png\n │ │ ├── teddy_normal.png\n │ │ ├── teddy_rgba.png\n │ │ └── thorhammer_rgba.png\n │ ├── lights/\n │ │ ├── bsdf_256_256.bin\n │ │ ├── LICENSE.txt\n │ │ └── mud_road_puresky_1k.hdr\n │ ├── shapes/\n │ │ ├── animal.obj\n │ │ ├── blub.obj\n │ │ ├── cabin.obj\n │ │ ├── env_sphere.obj\n │ │ ├── hand_prismatic.obj\n │ │ ├── human.obj\n │ │ ├── nascar.obj\n │ │ ├── potion.obj\n │ │ ├── README.md\n │ │ └── teddy.obj\n │ ├── tets/\n │ │ ├── 128_tets.npz\n │ │ ├── 32_tets.npz\n │ │ ├── 64_tets.npz\n │ │ └── generate_tets.py\n │ ├── zero123/\n │ │ ├── download.sh\n │ │ └── sd-objaverse-finetune-c_concat-256.yaml\n │ ├── make_prompt_library.py\n │ └── prompt_library.json\n ├── scripts/\n │ └── convert_zero123_to_diffusers.py\n ├── threestudio/\n │ ├── data/\n │ │ ├── __init__.py\n │ │ ├── co3d.py\n │ │ ├── image.py\n │ │ ├── multiview.py\n │ │ ├── uncond.py\n │ │ └── uncond_eff.py\n │ ├── models/\n │ │ ├── background/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── neural_environment_map_background.py\n │ │ │ ├── solid_color_background.py\n │ │ │ └── textured_background.py\n │ │ ├── exporters/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ └── mesh_exporter.py\n │ │ ├── geometry/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── custom_mesh.py\n │ │ │ ├── implicit_sdf.py\n │ │ │ ├── implicit_volume.py\n │ │ │ ├── tetrahedra_sdf_grid.py\n │ │ │ └── volume_grid.py\n │ │ ├── guidance/\n │ │ │ ├── __init__.py\n │ │ │ ├── controlnet_guidance.py\n │ │ │ ├── deep_floyd_guidance.py\n │ │ │ ├── instructpix2pix_guidance.py\n │ │ │ ├── stable_diffusion_guidance.py\n │ │ │ ├── stable_diffusion_sdi_guidance.py\n │ │ │ ├── stable_diffusion_unified_guidance.py\n │ │ │ ├── stable_diffusion_vsd_guidance.py\n │ │ │ ├── stable_zero123_guidance.py\n │ │ │ ├── zero123_guidance.py\n │ │ │ └── zero123_unified_guidance.py\n │ │ ├── materials/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── diffuse_with_point_light_material.py\n │ │ │ ├── hybrid_rgb_latent_material.py\n │ │ │ ├── neural_radiance_material.py\n │ │ │ ├── no_material.py\n │ │ │ ├── pbr_material.py\n 
│ │ │ └── sd_latent_adapter_material.py\n │ │ ├── prompt_processors/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deepfloyd_prompt_processor.py\n │ │ │ ├── dummy_prompt_processor.py\n │ │ │ └── stable_diffusion_prompt_processor.py\n │ │ ├── renderers/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deferred_volume_renderer.py\n │ │ │ ├── gan_volume_renderer.py\n │ │ │ ├── nerf_volume_renderer.py\n │ │ │ ├── neus_volume_renderer.py\n │ │ │ ├── nvdiff_rasterizer.py\n │ │ │ └── patch_renderer.py\n │ │ ├── __init__.py\n │ │ ├── estimators.py\n │ │ ├── isosurface.py\n │ │ ├── mesh.py\n │ │ └── networks.py\n │ ├── scripts/\n │ │ ├── make_training_vid.py\n │ │ ├── run_zero123.sh\n │ │ ├── run_zero123_comparison.sh\n │ │ ├── run_zero123_phase.sh\n │ │ ├── run_zero123_phase2.sh\n │ │ ├── run_zero123_sbatch.py\n │ │ ├── zero123_demo.py\n │ │ └── zero123_sbatch.sh\n │ ├── systems/\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── control4d_multiview.py\n │ │ ├── dreamfusion.py\n │ │ ├── eff_dreamfusion.py\n │ │ ├── fantasia3d.py\n │ │ ├── imagedreamfusion.py\n │ │ ├── instructnerf2nerf.py\n │ │ ├── latentnerf.py\n │ │ ├── magic123.py\n │ │ ├── magic3d.py\n │ │ ├── optimizers.py\n │ │ ├── prolificdreamer.py\n │ │ ├── sdi.py\n │ │ ├── sjc.py\n │ │ ├── textmesh.py\n │ │ ├── utils.py\n │ │ ├── zero123.py\n │ │ └── zero123_simple.py\n │ ├── utils/\n │ │ ├── GAN/\n │ │ │ ├── __init__.py\n │ │ │ ├── attention.py\n │ │ │ ├── discriminator.py\n │ │ │ ├── distribution.py\n │ │ │ ├── loss.py\n │ │ │ ├── mobilenet.py\n │ │ │ ├── network_util.py\n │ │ │ ├── util.py\n │ │ │ └── vae.py\n │ │ ├── perceptual/\n │ │ │ ├── __init__.py\n │ │ │ ├── perceptual.py\n │ │ │ └── utils.py\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── callbacks.py\n │ │ ├── config.py\n │ │ ├── loss.py\n │ │ ├── misc.py\n │ │ ├── ops.py\n │ │ ├── rasterize.py\n │ │ ├── saving.py\n │ │ └── typing.py\n │ └── __init__.py\n ├── .editorconfig\n ├── .pre-commit-config.yaml\n ├── .pylintrc\n ├── 2dplayground.ipynb\n ├── 2dplayground_SDI_version.ipynb\n ├── CHANGELOG.md\n ├── DOCUMENTATION.md\n ├── gradio_app.py\n ├── launch.py\n ├── LICENSE\n ├── README.md\n ├── requirements-dev.txt\n ├── requirements.txt\n ├── setup.py\n └── threestudio.ipynb",
"stateUrl": "https://storage.mcpmark.ai/filesystem/threestudio.zip",
"stateOriginalUrl": "https://github.com/threestudio-project/threestudio"
}
}
================================================
FILE: tasks/filesystem/standard/threestudio/requirements_completion/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for ThreeStudio Task 3: Restore Zero123 Dependencies in Requirements.txt
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_requirements_file_exists(test_dir: Path) -> bool:
"""Verify that the requirements.txt file exists."""
requirements_file = test_dir / "requirements.txt"
if not requirements_file.exists():
print("❌ File 'requirements.txt' not found")
return False
print("✅ Requirements.txt file found")
return True
def verify_requirements_file_readable(test_dir: Path) -> bool:
"""Verify that the requirements.txt file is readable."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
if not content.strip():
print("❌ Requirements.txt file is empty")
return False
print("✅ Requirements.txt file is readable")
return True
except Exception as e:
print(f"❌ Error reading requirements.txt file: {e}")
return False
def verify_required_dependencies_present(test_dir: Path) -> bool:
"""Verify that all required Zero123 dependencies are present."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
# Required dependencies to check for (simplified)
required_deps = [
"einops",
"kornia",
"taming",
"openai",
"clip"
]
missing_deps = []
found_deps = []
for dep in required_deps:
if dep.lower() in content.lower():
found_deps.append(dep)
else:
missing_deps.append(dep)
if missing_deps:
print(f"❌ Missing required dependencies: {missing_deps}")
return False
print(f"✅ All required dependencies found: {found_deps}")
return True
except Exception as e:
print(f"❌ Error checking dependencies: {e}")
return False
def verify_specific_dependency_entries(test_dir: Path) -> bool:
"""Verify that the specific dependency entries are present."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
# Check for specific dependency entries (simplified)
# For taming, we only need to check if "taming" is present, not the full package name
required_checks = [
("einops", "einops"),
("kornia", "kornia"),
("taming", "taming"), # Just check for "taming" substring
]
missing_entries = []
found_entries = []
for check_name, full_entry in required_checks:
if check_name in content.lower():
found_entries.append(check_name)
else:
missing_entries.append(check_name)
# Special check for openai and clip - they should be on the same line
lines = content.split('\n')
openai_clip_found = False
for line in lines:
line_lower = line.lower()
if "openai" in line_lower and "clip" in line_lower:
openai_clip_found = True
break
if openai_clip_found:
found_entries.append("openai+clip")
else:
missing_entries.append("openai+clip")
if missing_entries:
print(f"❌ Missing required dependency checks: {missing_entries}")
return False
print(f"✅ All required dependency checks passed: {found_entries}")
return True
except Exception as e:
print(f"❌ Error checking specific entries: {e}")
return False
def verify_file_format(test_dir: Path) -> bool:
"""Verify that the requirements.txt file has proper format."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
lines = content.split('\n')
# Basic format check - just ensure file is not completely empty
if not content.strip():
print("❌ File is completely empty")
return False
print("✅ File format is acceptable")
return True
except Exception as e:
print(f"❌ Error checking file format: {e}")
return False
def verify_no_duplicate_entries(test_dir: Path) -> bool:
"""Verify that there are no duplicate dependency entries."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
# Simplified duplicate check - just ensure the file is not completely corrupted
if len(content) < 10: # Basic sanity check
print("❌ File seems too short to be valid")
return False
print("✅ File appears to be valid")
return True
except Exception as e:
print(f"❌ Error checking file: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying ThreeStudio Task 3: Restore Zero123 Dependencies in Requirements.txt...")
# Define verification steps
verification_steps = [
("Requirements File Exists", verify_requirements_file_exists),
("File is Readable", verify_requirements_file_readable),
("Required Dependencies Present", verify_required_dependencies_present),
("Specific Entries Present", verify_specific_dependency_entries),
("File Format", verify_file_format),
("File Validity", verify_no_duplicate_entries),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Zero123 dependencies successfully restored in requirements.txt!")
print("🎉 Task 3 verification: PASS")
sys.exit(0)
else:
print("❌ Task 3 verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/votenet/dataset_comparison/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
Analyze the codebase to map ScanNet object categories to SUN RGB-D categories and calculate object counts.
### Task Objectives
1. **Primary Goal**: Use SUN RGB-D's 10-category classification system as the target taxonomy
2. **Mapping Requirement**: Map each ScanNet object category (using the "category" field, not "raw_category") to the corresponding SUN RGB-D category
3. **Calculation**: For each SUN RGB-D category, calculate the total count of ScanNet objects that map to that category (an object counts only if the category (not raw_category) names are exactly the same: night_stand = nightstand)
4. **Output**: Generate an analysis.txt file in the main directory showing the mapping and counts
### Expected Output
Create a file named `analysis.txt` in the test directory root with the following format:
- Each SUN RGB-D category should be represented as a 2-line block
- Line 1: category name
- Line 2: total count
- Each block should be separated by one empty line
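A minimal sketch of the required output format follows; the categories and counts below are placeholders, not the answer, and would typically come from mapping the "category" column of `scannetv2-labels.combined.tsv` onto the SUN RGB-D class list in `sunrgbd/model_util_sunrgbd.py`:

```python
#!/usr/bin/env python3
"""Minimal illustrative sketch (not part of the task): write analysis.txt
as 2-line blocks (category name, then count) separated by one empty line.
All categories and counts shown here are placeholders."""
import os
from pathlib import Path

test_dir = Path(os.environ["FILESYSTEM_TEST_DIR"])

# Placeholder mapping: SUN RGB-D category -> total mapped ScanNet object count.
counts = {
    "bed": 0,    # placeholder count
    "table": 0,  # placeholder count
    "sofa": 0,   # placeholder count
    # ... the remaining SUN RGB-D categories would follow here
}

blocks = [f"{name}\n{count}" for name, count in counts.items()]
(test_dir / "analysis.txt").write_text("\n\n".join(blocks) + "\n")
```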
================================================
FILE: tasks/filesystem/standard/votenet/dataset_comparison/meta.json
================================================
{
"task_id": "dataset_comparison",
"task_name": "Dataset Comparison",
"category_id": "votenet",
"category_name": "Votenet",
"description": "Map ScanNet object categories to their SUN RGB-D equivalents and calculate detailed object counts for each mapped category.",
"author": "Lingjun Chen",
"created_at": "2025-08-13",
"difficulty": "L3",
"tags": [
"cross-referencing",
"data extraction",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "votenet/\n ├── doc/\n │ ├── teaser.jpg\n │ └── tips.md\n ├── models/\n │ ├── ap_helper.py\n │ ├── backbone_module.py\n │ ├── boxnet.py\n │ ├── dump_helper.py\n │ ├── loss_helper.py\n │ ├── loss_helper_boxnet.py\n │ ├── proposal_module.py\n │ ├── votenet.py\n │ └── voting_module.py\n ├── pointnet2/\n │ ├── _ext_src/\n │ │ ├── include/\n │ │ │ ├── ball_query.h\n │ │ │ ├── cuda_utils.h\n │ │ │ ├── group_points.h\n │ │ │ ├── interpolate.h\n │ │ │ ├── sampling.h\n │ │ │ └── utils.h\n │ │ └── src/\n │ │ ├── ball_query.cpp\n │ │ ├── ball_query_gpu.cu\n │ │ ├── bindings.cpp\n │ │ ├── group_points.cpp\n │ │ ├── group_points_gpu.cu\n │ │ ├── interpolate.cpp\n │ │ ├── interpolate_gpu.cu\n │ │ ├── sampling.cpp\n │ │ └── sampling_gpu.cu\n │ ├── pointnet2_modules.py\n │ ├── pointnet2_test.py\n │ ├── pointnet2_utils.py\n │ ├── pytorch_utils.py\n │ └── setup.py\n ├── scannet/\n │ ├── meta_data/\n │ │ ├── scannet_means.npz\n │ │ ├── scannet_train.txt\n │ │ ├── scannetv2-labels.combined.tsv\n │ │ ├── scannetv2_test.txt\n │ │ ├── scannetv2_train.txt\n │ │ └── scannetv2_val.txt\n │ ├── scans/\n │ ├── batch_load_scannet_data.py\n │ ├── data_viz.py\n │ ├── load_scannet_data.py\n │ ├── model_util_scannet.py\n │ ├── README.md\n │ ├── scannet_detection_dataset.py\n │ └── scannet_utils.py\n ├── sunrgbd/\n │ ├── matlab/\n │ │ ├── extract_rgbd_data_v1.m\n │ │ ├── extract_rgbd_data_v2.m\n │ │ └── extract_split.m\n │ ├── OFFICIAL_SUNRGBD/\n │ ├── sunrgbd_trainval/\n │ ├── model_util_sunrgbd.py\n │ ├── README.md\n │ ├── sunrgbd_data.py\n │ ├── sunrgbd_detection_dataset.py\n │ └── sunrgbd_utils.py\n ├── utils/\n │ ├── box_util.py\n │ ├── eval_det.py\n │ ├── metric_util.py\n │ ├── nms.py\n │ ├── nn_distance.py\n │ ├── pc_util.py\n │ ├── tf_logger.py\n │ └── tf_visualizer.py\n ├── CODE_OF_CONDUCT.md\n ├── CONTRIBUTING.md\n ├── demo.py\n ├── eval.py\n ├── LICENSE\n ├── README.md\n └── train.py",
"stateUrl": "https://storage.mcpmark.ai/filesystem/votenet.zip",
"stateOriginalUrl": "https://github.com/facebookresearch/votenet"
}
}
================================================
FILE: tasks/filesystem/standard/votenet/dataset_comparison/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Votenet Dataset Comparison Task
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_analysis_file_exists(test_dir: Path) -> bool:
"""Verify that the analysis.txt file exists."""
analysis_file = test_dir / "analysis.txt"
if not analysis_file.exists():
print("❌ File 'analysis.txt' not found")
return False
print("✅ Analysis file found")
return True
def verify_analysis_format(test_dir: Path) -> bool:
"""Verify that the analysis file has the correct format."""
analysis_file = test_dir / "analysis.txt"
try:
content = analysis_file.read_text()
lines = content.split('\n')
# Check if content is not empty
if not content.strip():
print("❌ Analysis file is empty")
return False
# Check if we have enough lines for at least one category block
if len(lines) < 2:
print("❌ Analysis file doesn't have enough lines for a category block")
return False
# Check if the format follows the 2-line block pattern with empty lines between blocks
# Each block should have: category_name, count, empty_line
line_index = 0
block_count = 0
while line_index < len(lines):
# Skip leading empty lines
while line_index < len(lines) and lines[line_index].strip() == "":
line_index += 1
if line_index >= len(lines):
break
# Check if we have at least 2 lines for a block
if line_index + 1 >= len(lines):
print("❌ Incomplete category block at the end")
return False
# Line 1 should be category name
category_line = lines[line_index].strip()
if not category_line:
print(f"❌ Empty category name at line {line_index + 1}")
return False
# Line 2 should be count
count_line = lines[line_index + 1].strip()
if not count_line:
print(f"❌ Empty count at line {line_index + 2}")
return False
# Check if count line contains a number
if not re.search(r'\d+', count_line):
print(f"❌ Count line doesn't contain a number at line {line_index + 2}: '{count_line}'")
return False
block_count += 1
line_index += 2
# Skip empty line between blocks (if not at the end)
if line_index < len(lines) and lines[line_index].strip() == "":
line_index += 1
if block_count == 0:
print("❌ No valid category blocks found")
return False
print(f"✅ Analysis format is correct with {block_count} category blocks")
return True
except Exception as e:
print(f"❌ Error reading analysis file: {e}")
return False
def verify_required_categories(test_dir: Path) -> bool:
"""Verify that all required SUN RGB-D categories are present."""
analysis_file = test_dir / "analysis.txt"
try:
content = analysis_file.read_text()
lines = content.split('\n')
# Extract category names from the file
categories_found = []
line_index = 0
while line_index < len(lines):
# Skip empty lines
while line_index < len(lines) and lines[line_index].strip() == "":
line_index += 1
if line_index >= len(lines):
break
# Get category name
category_line = lines[line_index].strip()
if category_line:
categories_found.append(category_line.lower())
# Skip to next block
line_index += 2
while line_index < len(lines) and lines[line_index].strip() == "":
line_index += 1
# Required categories
required_categories = {
'chair', 'table', 'bed', 'bookshelf', 'desk',
'toilet', 'dresser', 'bathtub', 'sofa', 'night_stand'
}
# Check if all required categories are present
missing_categories = required_categories - set(categories_found)
if missing_categories:
print(f"❌ Missing required categories: {missing_categories}")
return False
# Check for extra categories
extra_categories = set(categories_found) - required_categories
if extra_categories:
print(f"⚠️ Extra categories found: {extra_categories}")
print(f"✅ All required categories present: {sorted(required_categories)}")
return True
except Exception as e:
print(f"❌ Error verifying required categories: {e}")
return False
def verify_category_counts(test_dir: Path) -> bool:
"""Verify that the category counts match the expected values."""
analysis_file = test_dir / "analysis.txt"
try:
content = analysis_file.read_text()
lines = content.split('\n')
# Expected counts from answer.txt
expected_counts = {
'chair': 4681,
'table': 1170,
'bed': 370,
'bookshelf': 377,
'desk': 680,
'toilet': 256,
'dresser': 213,
'bathtub': 144,
'sofa': 1,
'night_stand': 224
}
# Extract category counts from the file
category_counts = {}
line_index = 0
while line_index < len(lines):
# Skip empty lines
while line_index < len(lines) and lines[line_index].strip() == "":
line_index += 1
if line_index >= len(lines):
break
# Get category name
category_line = lines[line_index].strip()
if not category_line:
line_index += 1
continue
# Get count
if line_index + 1 < len(lines):
count_line = lines[line_index + 1].strip()
if count_line:
# Extract number from count line
count_match = re.search(r'(\d+)', count_line)
if count_match:
category = category_line.lower()
count = int(count_match.group(1))
category_counts[category] = count
# Skip to next block
line_index += 2
while line_index < len(lines) and lines[line_index].strip() == "":
line_index += 1
# Verify counts match expected values
all_counts_correct = True
for category, expected_count in expected_counts.items():
if category in category_counts:
actual_count = category_counts[category]
if actual_count != expected_count:
print(f"❌ Count mismatch for {category}: expected {expected_count}, got {actual_count}")
all_counts_correct = False
else:
print(f"❌ Category {category} not found in analysis")
all_counts_correct = False
if all_counts_correct:
print("✅ All category counts match expected values")
return True
else:
return False
except Exception as e:
print(f"❌ Error verifying category counts: {e}")
return False
def verify_file_structure(test_dir: Path) -> bool:
"""Verify that the analysis.txt file is in the correct location."""
analysis_file = test_dir / "analysis.txt"
if not analysis_file.exists():
print("❌ Analysis file not found in test directory root")
return False
# Check if it's directly in the test directory root, not in a subdirectory
if analysis_file.parent != test_dir:
print("❌ Analysis file should be in the test directory root")
return False
print("✅ Analysis file is in the correct location")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Votenet Dataset Comparison Task...")
# Define verification steps
verification_steps = [
("Analysis File Exists", verify_analysis_file_exists),
("File Location", verify_file_structure),
("File Format", verify_analysis_format),
("Required Categories", verify_required_categories),
("Category Counts", verify_category_counts),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Votenet dataset comparison task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/votenet/debugging/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
There is a bug in the VoteNet backbone module that needs to be identified and fixed.
### Task Objectives
1. **Examine the codebase** using filesystem MCP tools
2. **Identify the bug** anywhere in the whole process
3. **Fix the bug** in the code
4. **Create an answer file** with the bug location
### Expected Output
1. **Fix the bug** in the code file directly
2. **Create `answer.txt`** in the test directory root with the format: `path`
**Requirements:**
- Only include the bug's file path in answer.txt
- No additional text or explanation
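For illustration only, if the bug were in a hypothetical file `utils/pc_util.py`, `answer.txt` would contain exactly one line:
```
utils/pc_util.py
```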
### Hint
**The bug is not in demo.py**; look deeper inside the codebase.
================================================
FILE: tasks/filesystem/standard/votenet/debugging/meta.json
================================================
{
"task_id": "debugging",
"task_name": "Debugging",
"category_id": "votenet",
"category_name": "Votenet",
"description": "Identify and fix bugs in the VoteNet backbone module by examining the codebase and implementing necessary corrections.",
"author": "Lingjun Chen",
"created_at": "2025-08-13",
"difficulty": "L3",
"tags": [
"code exploration"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "votenet/\n ├── doc/\n │ ├── teaser.jpg\n │ └── tips.md\n ├── models/\n │ ├── ap_helper.py\n │ ├── backbone_module.py\n │ ├── boxnet.py\n │ ├── dump_helper.py\n │ ├── loss_helper.py\n │ ├── loss_helper_boxnet.py\n │ ├── proposal_module.py\n │ ├── votenet.py\n │ └── voting_module.py\n ├── pointnet2/\n │ ├── _ext_src/\n │ │ ├── include/\n │ │ │ ├── ball_query.h\n │ │ │ ├── cuda_utils.h\n │ │ │ ├── group_points.h\n │ │ │ ├── interpolate.h\n │ │ │ ├── sampling.h\n │ │ │ └── utils.h\n │ │ └── src/\n │ │ ├── ball_query.cpp\n │ │ ├── ball_query_gpu.cu\n │ │ ├── bindings.cpp\n │ │ ├── group_points.cpp\n │ │ ├── group_points_gpu.cu\n │ │ ├── interpolate.cpp\n │ │ ├── interpolate_gpu.cu\n │ │ ├── sampling.cpp\n │ │ └── sampling_gpu.cu\n │ ├── pointnet2_modules.py\n │ ├── pointnet2_test.py\n │ ├── pointnet2_utils.py\n │ ├── pytorch_utils.py\n │ └── setup.py\n ├── scannet/\n │ ├── meta_data/\n │ │ ├── scannet_means.npz\n │ │ ├── scannet_train.txt\n │ │ ├── scannetv2-labels.combined.tsv\n │ │ ├── scannetv2_test.txt\n │ │ ├── scannetv2_train.txt\n │ │ └── scannetv2_val.txt\n │ ├── scans/\n │ ├── batch_load_scannet_data.py\n │ ├── data_viz.py\n │ ├── load_scannet_data.py\n │ ├── model_util_scannet.py\n │ ├── README.md\n │ ├── scannet_detection_dataset.py\n │ └── scannet_utils.py\n ├── sunrgbd/\n │ ├── matlab/\n │ │ ├── extract_rgbd_data_v1.m\n │ │ ├── extract_rgbd_data_v2.m\n │ │ └── extract_split.m\n │ ├── OFFICIAL_SUNRGBD/\n │ ├── sunrgbd_trainval/\n │ ├── model_util_sunrgbd.py\n │ ├── README.md\n │ ├── sunrgbd_data.py\n │ ├── sunrgbd_detection_dataset.py\n │ └── sunrgbd_utils.py\n ├── utils/\n │ ├── box_util.py\n │ ├── eval_det.py\n │ ├── metric_util.py\n │ ├── nms.py\n │ ├── nn_distance.py\n │ ├── pc_util.py\n │ ├── tf_logger.py\n │ └── tf_visualizer.py\n ├── CODE_OF_CONDUCT.md\n ├── CONTRIBUTING.md\n ├── demo.py\n ├── eval.py\n ├── LICENSE\n ├── README.md\n └── train.py",
"stateUrl": "https://storage.mcpmark.ai/filesystem/votenet.zip",
"stateOriginalUrl": "https://github.com/facebookresearch/votenet"
}
}
================================================
FILE: tasks/filesystem/standard/votenet/debugging/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for VoteNet Task: Debug Backbone Module
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists."""
answer_file = test_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found")
return False
print("✅ Answer file found")
return True
def verify_answer_format(test_dir: Path) -> bool:
"""Verify that the answer file has the correct format."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Check if content is not empty
if not content:
print("❌ Answer file is empty")
return False
# Check if it contains only one line (no additional text)
if len(content.split('\n')) > 1:
print("❌ Answer file contains multiple lines or additional text")
return False
# Check if path contains the expected components
if 'models/backbone_module.py' not in content:
print("❌ Answer should contain 'models/backbone_module.py'")
return False
print("✅ Answer format is correct")
return True
except Exception as e:
print(f"❌ Error reading answer file: {e}")
return False
def verify_file_path_structure(test_dir: Path) -> bool:
"""Verify that the file path has the expected structure."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Expected path components for backbone module
expected_components = ["models", "backbone_module.py"]
# Check if all expected components are in the content
for component in expected_components:
if component not in content:
print(f"❌ Answer missing expected component: {component}")
return False
print("✅ Answer contains expected components")
return True
except Exception as e:
print(f"❌ Error verifying answer structure: {e}")
return False
def verify_file_exists(test_dir: Path) -> bool:
"""Verify that the identified file actually exists."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Try the expected path
file_path = test_dir / "models/backbone_module.py"
if not file_path.exists():
print(f"❌ Expected file does not exist: models/backbone_module.py")
return False
print("✅ Expected file exists")
return True
except Exception as e:
print(f"❌ Error verifying file existence: {e}")
return False
def verify_bug_fix(test_dir: Path) -> bool:
"""Verify that the bug has been fixed in the code."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
file_path = test_dir / "models/backbone_module.py"
if not file_path.exists():
print(f"❌ Cannot find file for bug fix verification: models/backbone_module.py")
return False
# Read the file and search for the specific line containing self.fp2 = PointnetFPModule
file_content = file_path.read_text()
lines = file_content.split('\n')
# Find the line containing self.fp2 = PointnetFPModule
target_line = None
target_line_number = None
for i, line in enumerate(lines):
if "self.fp2 = PointnetFPModule" in line:
target_line = line.strip()
target_line_number = i + 1 # Convert to 1-based line number
break
if target_line is None:
print("❌ Could not find line containing 'self.fp2 = PointnetFPModule'")
return False
# Check if the original buggy line still exists
original_bug = "self.fp2 = PointnetFPModule(mlp=[256,256,256])"
if original_bug in target_line:
print(f"❌ Bug has not been fixed - original line still exists at line {target_line_number}")
print(f" Line {target_line_number} content: {target_line}")
return False
# Check for the correct fix
correct_fixes = [
"self.fp2 = PointnetFPModule(mlp=[256+256,256,256])",
"self.fp2 = PointnetFPModule(mlp=[512,256,256])"
]
fix_found = False
for fix in correct_fixes:
if fix in target_line:
fix_found = True
break
if not fix_found:
print(f"❌ Bug fix not found at line {target_line_number}")
print(f" Line {target_line_number} content: {target_line}")
print(" Expected one of:")
for fix in correct_fixes:
print(f" - {fix}")
return False
print(f"✅ Bug has been fixed correctly at line {target_line_number}")
return True
except Exception as e:
print(f"❌ Error verifying bug fix: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying VoteNet Task: Debug Backbone Module...")
# Define verification steps
verification_steps = [
("Answer File Exists", verify_answer_file_exists),
("Answer Format", verify_answer_format),
("Answer Structure", verify_file_path_structure),
("File Exists", verify_file_exists),
("Bug Fix Applied", verify_bug_fix),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ VoteNet backbone module bug has been correctly identified and fixed!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/votenet/requirements_writing/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
The VoteNet project is a 3D object detection framework for point clouds. Your task is to create a `requirements.txt` file that lists all the necessary Python dependencies for running this codebase.
### Task Objectives
1. **Create a requirements.txt file** in the main directory
2. **Include all essential dependencies** needed to run the VoteNet codebase
3. **Ensure the file format is correct** (one dependency per line)
4. **Save the file as `requirements.txt`** in the current working directory
5. **Not just** the packages installed via pip or conda: your answer should contain **every dependency necessary for the whole VoteNet process**.
### Requirements
The requirements.txt file should contain Python packages that are necessary for:
- 3D point cloud processing
- Deep learning frameworks
- Computer vision libraries
- Data visualization
- 3D mesh processing
- Network/graph operations
### Note
- You can examine the codebase structure and README to understand what packages are needed
- The file should be saved as `requirements.txt` in the current directory
- Each dependency should be on a separate line
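For illustration of the format only (the package names below are placeholders, not the expected dependency list):
```
numpy
scipy
torch
```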
================================================
FILE: tasks/filesystem/standard/votenet/requirements_writing/meta.json
================================================
{
"task_id": "requirements_writing",
"task_name": "Requirements Writing",
"category_id": "votenet",
"category_name": "VoteNet",
"description": "Generate a complete requirements.txt file containing all necessary Python dependencies for running the VoteNet codebase successfully.",
"author": "Lingjun Chen",
"created_at": "2025-08-13",
"difficulty": "L3",
"tags": [
"code exploration",
"cross-referencing"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "votenet/\n ├── doc/\n │ ├── teaser.jpg\n │ └── tips.md\n ├── models/\n │ ├── ap_helper.py\n │ ├── backbone_module.py\n │ ├── boxnet.py\n │ ├── dump_helper.py\n │ ├── loss_helper.py\n │ ├── loss_helper_boxnet.py\n │ ├── proposal_module.py\n │ ├── votenet.py\n │ └── voting_module.py\n ├── pointnet2/\n │ ├── _ext_src/\n │ │ ├── include/\n │ │ │ ├── ball_query.h\n │ │ │ ├── cuda_utils.h\n │ │ │ ├── group_points.h\n │ │ │ ├── interpolate.h\n │ │ │ ├── sampling.h\n │ │ │ └── utils.h\n │ │ └── src/\n │ │ ├── ball_query.cpp\n │ │ ├── ball_query_gpu.cu\n │ │ ├── bindings.cpp\n │ │ ├── group_points.cpp\n │ │ ├── group_points_gpu.cu\n │ │ ├── interpolate.cpp\n │ │ ├── interpolate_gpu.cu\n │ │ ├── sampling.cpp\n │ │ └── sampling_gpu.cu\n │ ├── pointnet2_modules.py\n │ ├── pointnet2_test.py\n │ ├── pointnet2_utils.py\n │ ├── pytorch_utils.py\n │ └── setup.py\n ├── scannet/\n │ ├── meta_data/\n │ │ ├── scannet_means.npz\n │ │ ├── scannet_train.txt\n │ │ ├── scannetv2-labels.combined.tsv\n │ │ ├── scannetv2_test.txt\n │ │ ├── scannetv2_train.txt\n │ │ └── scannetv2_val.txt\n │ ├── scans/\n │ ├── batch_load_scannet_data.py\n │ ├── data_viz.py\n │ ├── load_scannet_data.py\n │ ├── model_util_scannet.py\n │ ├── README.md\n │ ├── scannet_detection_dataset.py\n │ └── scannet_utils.py\n ├── sunrgbd/\n │ ├── matlab/\n │ │ ├── extract_rgbd_data_v1.m\n │ │ ├── extract_rgbd_data_v2.m\n │ │ └── extract_split.m\n │ ├── OFFICIAL_SUNRGBD/\n │ ├── sunrgbd_trainval/\n │ ├── model_util_sunrgbd.py\n │ ├── README.md\n │ ├── sunrgbd_data.py\n │ ├── sunrgbd_detection_dataset.py\n │ └── sunrgbd_utils.py\n ├── utils/\n │ ├── box_util.py\n │ ├── eval_det.py\n │ ├── metric_util.py\n │ ├── nms.py\n │ ├── nn_distance.py\n │ ├── pc_util.py\n │ ├── tf_logger.py\n │ └── tf_visualizer.py\n ├── CODE_OF_CONDUCT.md\n ├── CONTRIBUTING.md\n ├── demo.py\n ├── eval.py\n ├── LICENSE\n ├── README.md\n └── train.py",
"stateUrl": "https://storage.mcpmark.ai/filesystem/votenet.zip",
"stateOriginalUrl": "https://github.com/facebookresearch/votenet"
}
}
================================================
FILE: tasks/filesystem/standard/votenet/requirements_writing/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for VoteNet Task: Create Requirements.txt File
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_requirements_file_exists(test_dir: Path) -> bool:
"""Verify that the requirements.txt file exists."""
requirements_file = test_dir / "requirements.txt"
if not requirements_file.exists():
print("❌ File 'requirements.txt' not found")
return False
print("✅ Requirements.txt file found")
return True
def verify_requirements_file_readable(test_dir: Path) -> bool:
"""Verify that the requirements.txt file is readable."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
if not content.strip():
print("❌ Requirements.txt file is empty")
return False
print("✅ Requirements.txt file is readable")
return True
except Exception as e:
print(f"❌ Error reading requirements.txt file: {e}")
return False
def verify_required_dependencies_present(test_dir: Path) -> bool:
"""Verify that all required dependencies are present."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
# Required dependencies from answer.txt
required_deps = [
"matplotlib",
"opencv",
"plyfile",
"trimesh",
"pointnet2",
"networkx"
]
missing_deps = []
found_deps = []
for dep in required_deps:
if dep.lower() in content.lower():
found_deps.append(dep)
else:
missing_deps.append(dep)
if missing_deps:
print(f"❌ Missing required dependencies: {missing_deps}")
return False
print(f"✅ All required dependencies found: {found_deps}")
return True
except Exception as e:
print(f"❌ Error checking dependencies: {e}")
return False
def verify_file_format(test_dir: Path) -> bool:
"""Verify that the requirements.txt file has proper format."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
lines = content.split('\n')
# Check if file has content and proper line structure
if not content.strip():
print("❌ File is completely empty")
return False
# Check if there are multiple lines (indicating multiple dependencies)
non_empty_lines = [line.strip() for line in lines if line.strip()]
if len(non_empty_lines) < 3: # Should have at least 3 dependencies
print("❌ File seems to have too few dependencies")
return False
print("✅ File format is acceptable")
return True
except Exception as e:
print(f"❌ Error checking file format: {e}")
return False
def verify_no_duplicate_entries(test_dir: Path) -> bool:
"""Verify that there are no duplicate dependency entries."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
lines = [line.strip().lower() for line in content.split('\n') if line.strip()]
# Check for duplicates
if len(lines) != len(set(lines)):
print("❌ File contains duplicate entries")
return False
print("✅ No duplicate entries found")
return True
except Exception as e:
print(f"❌ Error checking for duplicates: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying VoteNet Task: Create Requirements.txt File...")
# Define verification steps
verification_steps = [
("Requirements File Exists", verify_requirements_file_exists),
("File is Readable", verify_requirements_file_readable),
("Required Dependencies Present", verify_required_dependencies_present),
("File Format", verify_file_format),
("No Duplicate Entries", verify_no_duplicate_entries),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Requirements.txt file successfully created with all required dependencies!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/github/easy/build-your-own-x/close_commented_issues/description.md
================================================
Use the GitHub MCP tools to close every issue in `mcpmark-eval/build-your-own-x` that already has at least one comment. Leave all other issues unchanged.
================================================
FILE: tasks/github/easy/build-your-own-x/close_commented_issues/meta.json
================================================
{
"task_id": "close_commented_issues",
"task_name": "Close Commented Issues",
"category_id": "build-your-own-x",
"category_name": "Build Your Own X (Easy)",
"description": "Use GitHub MCP tools to close every issue with comments in build-your-own-x and leave everything else alone.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"issue management"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/build-your-own-x",
"stateOriginalUrl": "https://github.com/codecrafters-io/build-your-own-x"
}
}
================================================
FILE: tasks/github/easy/build-your-own-x/close_commented_issues/verify.py
================================================
import os
import sys
from typing import Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "build-your-own-x"
TARGET_ISSUES = [23, 25]
def _fetch_issue(org: str, token: str, number: int) -> Optional[dict]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/issues/{number}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc:
print(f"Request error for issue #{number}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching issue #{number}",
file=sys.stderr,
)
return None
try:
return response.json()
except Exception as exc:
print(f"Unable to parse issue #{number}: {exc}", file=sys.stderr)
return None
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
print("Checking issue states in remote repository...")
success = True
for issue_number in TARGET_ISSUES:
data = _fetch_issue(org, token, issue_number)
if data is None:
success = False
continue
state = data.get("state", "").lower()
if state != "closed":
print(
f"Issue #{issue_number} is '{state}' but must be closed.",
file=sys.stderr,
)
success = False
else:
print(f"Issue #{issue_number} is closed as expected.")
return success
if __name__ == "__main__":
sys.exit(0 if verify() else 1)
================================================
FILE: tasks/github/easy/build-your-own-x/record_recent_commits/description.md
================================================
Use the GitHub MCP tools to work in the `mcpmark-eval/build-your-own-x` repository.
1. Retrieve the newest five commits on the default branch.
2. Open a new issue titled exactly `Latest 5 Commit Snapshot`.
3. Set the issue body to exactly this format (newest commit first):
```
Latest 5 commits (newest first)
1. <sha> | <author name> | <commit subject>
2. <sha> | <author name> | <commit subject>
3. <sha> | <author name> | <commit subject>
4. <sha> | <author name> | <commit subject>
5. <sha> | <author name> | <commit subject>
```
Use the full 40-character SHA for `<sha>` and only the first line of each commit message for `<commit subject>`. The `<author name>` must come from the commit metadata's author name field (not the GitHub username/login). Leave the issue open and do not touch other issues.
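For example, a single entry with purely hypothetical values would read:
```
1. 0123456789abcdef0123456789abcdef01234567 | Jane Doe | Update README
```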
================================================
FILE: tasks/github/easy/build-your-own-x/record_recent_commits/meta.json
================================================
{
"task_id": "record_recent_commits",
"task_name": "Record Recent Commits",
"category_id": "build-your-own-x",
"category_name": "Build Your Own X (Easy)",
"description": "Summarize the latest five commits by opening an issue with their SHAs, authors, and subjects.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"commits",
"issue"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/build-your-own-x",
"stateOriginalUrl": "https://github.com/codecrafters-io/build-your-own-x"
}
}
================================================
FILE: tasks/github/easy/build-your-own-x/record_recent_commits/verify.py
================================================
import os
import sys
from typing import List, Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "build-your-own-x"
BRANCH = "master"
ISSUE_TITLE = "Latest 5 Commit Snapshot"
EXPECTED_HEADER = "latest 5 commits (newest first)"
def _request(url: str, token: str) -> Optional[requests.Response]:
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc: # pragma: no cover - network errors
print(f"Request error for {url}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} for {url}",
file=sys.stderr,
)
return None
return response
def _fetch_commits(org: str, token: str) -> Optional[List[dict]]:
url = (
f"https://api.github.com/repos/{org}/{REPO_NAME}/commits"
f"?per_page=5&sha={BRANCH}"
)
response = _request(url, token)
if response is None:
return None
try:
return response.json()
except Exception as exc:
print(f"Unable to parse commits: {exc}", file=sys.stderr)
return None
def _find_issue(org: str, token: str) -> Optional[dict]:
page = 1
while True:
url = (
f"https://api.github.com/repos/{org}/{REPO_NAME}/issues"
f"?state=open&per_page=100&page={page}"
)
response = _request(url, token)
if response is None:
return None
try:
issues = response.json()
except Exception as exc:
print(f"Unable to parse issues: {exc}", file=sys.stderr)
return None
if not issues:
break
for issue in issues:
if issue.get("title") == ISSUE_TITLE:
# Exclude pull requests
if "pull_request" in issue:
continue
return issue
page += 1
print(
f"No open issue titled '{ISSUE_TITLE}' was found.",
file=sys.stderr,
)
return None
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
commits = _fetch_commits(org, token)
if commits is None:
return False
if len(commits) < 5:
print("Less than five commits returned; cannot verify.", file=sys.stderr)
return False
issue = _find_issue(org, token)
if issue is None:
return False
if issue.get("title") != ISSUE_TITLE:
print(
f"Found issue title '{issue.get('title')}', expected '{ISSUE_TITLE}'.",
file=sys.stderr,
)
return False
if (issue.get("state") or "").lower() != "open":
print("Issue must remain open.", file=sys.stderr)
return False
body = issue.get("body") or ""
if not body.strip():
print("Issue body is empty.", file=sys.stderr)
return False
lines = [line.strip() for line in body.splitlines() if line.strip()]
if not lines:
print("Issue body contains no content.", file=sys.stderr)
return False
header = lines[0].lower()
if header != EXPECTED_HEADER:
print(
"Issue body must start with 'Latest 5 commits (newest first)'.",
file=sys.stderr,
)
return False
entries = lines[1:]
if len(entries) != 5:
print("Issue body must list exactly five commit entries.", file=sys.stderr)
return False
for idx in range(5):
commit = commits[idx]
sha = commit.get("sha", "")
subject = (commit.get("commit", {}).get("message", "").splitlines()[0]).strip()
author = commit.get("commit", {}).get("author", {}).get("name", "")
expected_line = f"{idx + 1}. {sha} | {author} | {subject}"
actual_line = entries[idx]
if actual_line != expected_line:
print(
f"Entry {idx + 1} mismatch.\nExpected: {expected_line}\nFound: {actual_line}",
file=sys.stderr,
)
return False
print("Issue contains the expected latest five commits.")
return True
if __name__ == "__main__":
sys.exit(0 if verify() else 1)
================================================
FILE: tasks/github/easy/claude-code/add_terminal_shortcuts_doc/description.md
================================================
Use the GitHub MCP tools to edit the `mcpmark-eval/claude-code` repository.
1. On the `main` branch, add a new file `docs/TERMINAL_SHORTCUTS.md` containing exactly:
```
# Terminal Shortcuts
- `claude plan`: Outline the next steps before making edits.
- `claude apply`: Run the plan and apply the queued changes.
- `claude check`: Re-run relevant tests or linters to validate the edits.
```
2. Commit with the message `docs: add terminal shortcuts reference` and push directly to `main`.
================================================
FILE: tasks/github/easy/claude-code/add_terminal_shortcuts_doc/meta.json
================================================
{
"task_id": "add_terminal_shortcuts_doc",
"task_name": "Add Terminal Shortcuts Doc",
"category_id": "claude-code",
"category_name": "Claude Code (Easy)",
"description": "Add a simple terminal shortcuts reference file to docs/TERMINAL_SHORTCUTS.md and push it to main.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"docs update",
"content creation"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/easy/claude-code/add_terminal_shortcuts_doc/verify.py
================================================
import base64
import os
import sys
from typing import Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "claude-code"
TARGET_FILE = "docs/TERMINAL_SHORTCUTS.md"
BRANCH = "main"
EXPECTED_CONTENT = """# Terminal Shortcuts
- `claude plan`: Outline the next steps before making edits.
- `claude apply`: Run the plan and apply the queued changes.
- `claude check`: Re-run relevant tests or linters to validate the edits.
""".strip()
def _download_file(org: str, token: str) -> Optional[str]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{TARGET_FILE}?ref={BRANCH}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc:
print(f"Request error for {TARGET_FILE}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching {TARGET_FILE}",
file=sys.stderr,
)
return None
data = response.json()
try:
content = base64.b64decode(data.get("content", "")).decode("utf-8").strip()
except Exception as exc:
print(f"Unable to decode {TARGET_FILE}: {exc}", file=sys.stderr)
return None
return content
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
print(f"Checking {TARGET_FILE} in remote repository...")
content = _download_file(org, token)
if content is None:
return False
normalized = content.strip()
if normalized != EXPECTED_CONTENT:
print("TERMINAL_SHORTCUTS.md does not match the expected content.", file=sys.stderr)
print("Expected:")
print(EXPECTED_CONTENT)
print("Found:")
print(content)
return False
print("All checks passed! docs/TERMINAL_SHORTCUTS.md contains the expected text.")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/easy/claude-code/thank_docker_pr_author/description.md
================================================
Use the GitHub MCP tools to comment on the pull request in `mcpmark-eval/claude-code` that proposes automating Docker image builds with GitHub Actions.
1. Skim the PR description so you understand it’s the Docker workflow automation proposal.
2. Add a new comment on that PR that thanks the author and contains all of these keywords: `Docker workflow`, `automation`, `review`.
================================================
FILE: tasks/github/easy/claude-code/thank_docker_pr_author/meta.json
================================================
{
"task_id": "thank_docker_pr_author",
"task_name": "Thank Docker PR Author",
"category_id": "claude-code",
"category_name": "Claude Code (Easy)",
"description": "Leave a thank-you comment on the Docker automation PR mentioning the workflow automation review keywords.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"pull request",
"comment"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/easy/claude-code/thank_docker_pr_author/verify.py
================================================
import os
import sys
from typing import Optional, Union
import requests
from dotenv import load_dotenv
REPO_NAME = "claude-code"
PR_NUMBER = 53
KEYWORDS = ["docker workflow", "automation", "review"]
def _github_get(org: str, token: str, path: str) -> Optional[Union[list, dict]]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/{path}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc:
print(f"Request error for {path}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} for {path}",
file=sys.stderr,
)
return None
return response.json()
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
comments = _github_get(org, token, f"issues/{PR_NUMBER}/comments?per_page=100")
if comments is None:
return False
for comment in comments:
body = comment.get("body", "").strip()
lowered = body.lower()
if not body:
continue
if not any(thank_word in lowered for thank_word in ("thanks", "thank you")):
continue
if all(keyword in lowered for keyword in KEYWORDS):
print("All checks passed! Keyword-rich thank-you comment found on PR #53.")
return True
print(
"Did not find a thank-you comment containing all required keywords on PR #53.",
file=sys.stderr,
)
return False
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/easy/claude-code/triage_missing_tool_result_issue/description.md
================================================
Use the GitHub MCP tools to triage issue #24 in the `mcpmark-eval/claude-code` repository.
1. Read the issue details to understand the reported API error.
2. Add a triage comment on the issue that explicitly includes all of the following keywords: `invalid_request_error`, `toolu_01Kjp7i9iF3xJ3z9aH4pSaRw`, `tool_result`, `tool_use`. Use them while confirming the API error and asking for the missing result block.
3. Remove the `area:packaging` label from issue #24.
================================================
FILE: tasks/github/easy/claude-code/triage_missing_tool_result_issue/meta.json
================================================
{
"task_id": "triage_missing_tool_result_issue",
"task_name": "Triage Missing Tool Result Issue",
"category_id": "claude-code",
"category_name": "Claude Code (Easy)",
"description": "Leave a predefined triage comment on issue #24 and remove the area:packaging label.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"issue triage",
"github"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/easy/claude-code/triage_missing_tool_result_issue/verify.py
================================================
import os
import sys
from typing import Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "claude-code"
ISSUE_NUMBER = 24
KEYWORDS = [
"invalid_request_error",
"toolu_01kjp7i9if3xj3z9ah4psarw",
"tool_result",
"tool_use",
]
REMOVED_LABEL = "area:packaging"
def _github_get(org: str, token: str, path: str) -> Optional[dict]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/{path}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc:
print(f"Request error for {path}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} for {path}",
file=sys.stderr,
)
return None
return response.json()
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
issue = _github_get(org, token, f"issues/{ISSUE_NUMBER}")
if issue is None:
return False
label_names = {label.get("name", "") for label in issue.get("labels", [])}
if REMOVED_LABEL in label_names:
print(f"Label '{REMOVED_LABEL}' is still present on issue #{ISSUE_NUMBER}.", file=sys.stderr)
return False
comments = _github_get(org, token, f"issues/{ISSUE_NUMBER}/comments?per_page=100")
if comments is None:
return False
found = False
for comment in comments:
body = comment.get("body", "").strip().lower()
if all(keyword in body for keyword in KEYWORDS):
found = True
break
if not found:
print(
"Did not find a triage comment containing all required keywords.",
file=sys.stderr,
)
return False
print("All checks passed! Comment added and label removed.")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/easy/mcpmark-cicd/basic_ci_checks/description.md
================================================
Use the GitHub MCP tools to update the `mcpmark-eval/mcpmark-cicd` repository with a very small CI workflow.
## Goal
Add a GitHub Actions workflow named **Basic CI Checks** that automatically runs linting and unit tests any time work is pushed to or proposed for the `main` branch.
## Requirements
1. Create a branch called `basic-ci-checks` from `main`.
2. Add `.github/workflows/basic-ci.yml` with the following characteristics:
- Workflow name: `Basic CI Checks`.
- Trigger on both `push` and `pull_request`, limited to the `main` branch.
- Single job called `quality-checks` that runs on `ubuntu-latest` and uses Node.js 18 (`actions/setup-node`).
- Steps must include `actions/checkout`, `npm ci`, `npm run lint`, and `npm test` in that order after Node is configured.
3. Commit the workflow to your branch, open a pull request titled `Add basic CI checks`, and merge it so the workflow exists on `main`.
That's it—no caching, matrix builds, or issue automation required. Keep it lightweight and focused on verifying the existing lint/test scripts.
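As a reference only, a minimal sketch of a workflow that satisfies these requirements might look like the following (the `@v4` action versions are illustrative assumptions, not part of the requirements):
```
name: Basic CI Checks

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  quality-checks:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository, then configure Node.js 18
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 18
      # Install dependencies, then run lint and tests in order
      - run: npm ci
      - run: npm run lint
      - run: npm test
```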
================================================
FILE: tasks/github/easy/mcpmark-cicd/basic_ci_checks/meta.json
================================================
{
"task_id": "basic_ci_checks",
"task_name": "Basic CI Checks",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD (Easy)",
"description": "Add a lightweight GitHub Actions workflow that runs npm ci, npm run lint, and npm test whenever main is updated or receives a pull request.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"ci/cd",
"github actions",
"workflow basics"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/easy/mcpmark-cicd/basic_ci_checks/verify.py
================================================
import base64
import os
import sys
from typing import List, Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "mcpmark-cicd"
WORKFLOW_PATH = ".github/workflows/basic-ci.yml"
BRANCH = "main"
def _download_file(org: str, token: str, path: str) -> Optional[str]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{path}?ref={BRANCH}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc: # pragma: no cover - network failure
print(f"Request error for {path}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching {path}",
file=sys.stderr,
)
return None
data = response.json()
try:
content = base64.b64decode(data.get("content", "")).decode("utf-8")
except Exception as exc:
print(f"Unable to decode {path}: {exc}", file=sys.stderr)
return None
return content
def _line_index(lines: List[str], needle: str) -> int:
for idx, line in enumerate(lines):
if needle in line:
return idx
return -1
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
content = _download_file(org, token, WORKFLOW_PATH)
if content is None:
print(
"Workflow file .github/workflows/basic-ci.yml was not found on main",
file=sys.stderr,
)
return False
normalized = content.lower()
normalized_lines = [line.strip().lower() for line in content.splitlines()]
errors = []
required_snippets = {
"workflow name": "name: basic ci checks",
"job name": "quality-checks",
"checkout step": "actions/checkout",
"setup-node step": "actions/setup-node",
"node version": "node-version: 18",
"ubuntu runner": "runs-on: ubuntu-latest",
"push trigger": "push:",
"pull_request trigger": "pull_request:",
}
for label, snippet in required_snippets.items():
if snippet not in normalized:
errors.append(f"Missing {label} ({snippet}) in workflow")
branch_limited = "- main" in normalized or "[main]" in normalized
if not branch_limited:
errors.append("Workflow triggers must be limited to the main branch")
for command in ["npm ci", "npm run lint", "npm test"]:
if command not in normalized:
errors.append(f"Missing '{command}' step")
# Ensure npm commands happen in the expected order
ci_index = _line_index(normalized_lines, "npm ci")
lint_index = _line_index(normalized_lines, "npm run lint")
test_index = _line_index(normalized_lines, "npm test")
if ci_index == -1 or lint_index == -1 or test_index == -1:
errors.append("Could not find all npm commands to validate ordering")
else:
if not (ci_index < lint_index < test_index):
errors.append("npm commands must run in order: ci -> lint -> test")
if errors:
print("Verification failed:")
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✅ basic-ci workflow found with required steps and triggers")
return True
if __name__ == "__main__":
sys.exit(0 if verify() else 1)
================================================
FILE: tasks/github/easy/mcpmark-cicd/issue_lint_guard/description.md
================================================
Use the GitHub MCP tools to wire up a tiny issue-triggered lint check for `mcpmark-eval/mcpmark-cicd`.
## Goal
Whenever a maintainer opens the tracking issue **Lint workflow check**, the repo should automatically run `npm run lint` via GitHub Actions. Keep it simple—just prove the workflow fires for issue events.
## Requirements
1. Create a branch called `issue-lint-workflow` from `main`.
2. Add `.github/workflows/issue-lint.yml` with:
- Workflow name **Issue Lint Guard**.
- Trigger: `issues` with `types: [opened]` (no push/PR triggers).
- Single job `lint` on `ubuntu-latest` using Node.js 18 via `actions/setup-node`.
- Steps in order: `actions/checkout`, `npm ci`, `npm run lint`.
3. Open a pull request titled `Add issue lint workflow` and get it merged so the workflow exists on `main`.
4. After the merge, open a new issue titled **Lint workflow check** to trigger the workflow and wait until the matching run finishes successfully. Leave the issue open; we only care that the run went green.
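As a reference only, a minimal sketch of such a workflow might look like this (the `@v4` action versions are illustrative assumptions):
```
name: Issue Lint Guard

on:
  issues:
    types: [opened]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository and configure Node.js 18
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 18
      # Install dependencies, then run the lint script
      - run: npm ci
      - run: npm run lint
```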
================================================
FILE: tasks/github/easy/mcpmark-cicd/issue_lint_guard/meta.json
================================================
{
"task_id": "issue_lint_guard",
"task_name": "Issue Lint Guard",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD (Easy)",
"description": "Add an issue-triggered lint workflow and prove it runs when the tracking issue is opened.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"ci/cd",
"github actions",
"issues"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/easy/mcpmark-cicd/issue_lint_guard/verify.py
================================================
import base64
import os
import sys
import time
from typing import List, Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "mcpmark-cicd"
WORKFLOW_PATH = ".github/workflows/issue-lint.yml"
WORKFLOW_FILE = "issue-lint.yml"
TARGET_BRANCH = "main"
TRACKING_ISSUE_TITLE = "Lint workflow check"
MAX_POLL_ATTEMPTS = 12
POLL_INTERVAL_SECONDS = 10
def _download_file(org: str, token: str, path: str) -> Optional[str]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{path}?ref={TARGET_BRANCH}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc: # pragma: no cover - network error handling
print(f"Request error for {path}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching {path}",
file=sys.stderr,
)
return None
data = response.json()
try:
content = base64.b64decode(data.get("content", "")).decode("utf-8")
except Exception as exc: # pragma: no cover - decode error
print(f"Unable to decode {path}: {exc}", file=sys.stderr)
return None
return content
def _line_index(lines: List[str], needle: str) -> int:
for idx, line in enumerate(lines):
if needle in line:
return idx
return -1
def _list_workflow_runs(org: str, token: str) -> Optional[List[dict]]:
url = (
f"https://api.github.com/repos/{org}/{REPO_NAME}/actions/workflows/{WORKFLOW_FILE}/runs"
f"?event=issues&per_page=15"
)
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc: # pragma: no cover - network error handling
print(f"Request error when listing workflow runs: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when listing workflow runs",
file=sys.stderr,
)
return None
data = response.json()
return data.get("workflow_runs", [])
def _wait_for_tracking_issue_run(org: str, token: str) -> bool:
for attempt in range(1, MAX_POLL_ATTEMPTS + 1):
runs = _list_workflow_runs(org, token)
if runs is None:
return False
relevant = [
run
for run in runs
if run.get("display_title") == TRACKING_ISSUE_TITLE
]
if not relevant:
print(
f"[{attempt}/{MAX_POLL_ATTEMPTS}] No Issue Lint Guard run for '{TRACKING_ISSUE_TITLE}' yet; waiting..."
)
time.sleep(POLL_INTERVAL_SECONDS)
continue
latest = relevant[0]
status = latest.get("status")
conclusion = latest.get("conclusion")
html_url = latest.get("html_url")
if status != "completed":
print(
f"[{attempt}/{MAX_POLL_ATTEMPTS}] Latest run is '{status}'; waiting for completion..."
)
time.sleep(POLL_INTERVAL_SECONDS)
continue
if conclusion != "success":
print(
"Latest Issue Lint Guard run finished without success.",
file=sys.stderr,
)
print(f"Status: {status}, Conclusion: {conclusion}", file=sys.stderr)
if html_url:
print(f"Run URL: {html_url}", file=sys.stderr)
return False
if html_url:
print(f"✅ Latest Issue Lint Guard run succeeded: {html_url}")
else:
print("✅ Latest Issue Lint Guard run succeeded")
return True
print(
f"Timed out waiting for a successful Issue Lint Guard run for '{TRACKING_ISSUE_TITLE}'",
file=sys.stderr,
)
return False
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
content = _download_file(org, token, WORKFLOW_PATH)
if content is None:
print(
"Workflow file .github/workflows/issue-lint.yml was not found on main",
file=sys.stderr,
)
return False
normalized = content.lower()
normalized_lines = [line.strip().lower() for line in content.splitlines()]
errors = []
required_snippets = {
"workflow name": "name: issue lint guard",
"issues trigger": "issues:",
"types opened": "types:",
"job name": "lint:",
"runner": "runs-on: ubuntu-latest",
"checkout": "actions/checkout",
"setup-node": "actions/setup-node",
"node version": "node-version: 18",
"npm ci": "npm ci",
"npm run lint": "npm run lint",
}
for label, snippet in required_snippets.items():
if snippet not in normalized:
errors.append(f"Missing {label} ({snippet}) in workflow")
types_line = next(
(line for line in normalized_lines if "types" in line and "opened" in line),
None,
)
if types_line is None:
errors.append("issues trigger must limit types to include 'opened'")
checkout_idx = _line_index(normalized_lines, "actions/checkout")
setup_idx = _line_index(normalized_lines, "actions/setup-node")
ci_idx = _line_index(normalized_lines, "npm ci")
lint_idx = _line_index(normalized_lines, "npm run lint")
if -1 in [checkout_idx, setup_idx, ci_idx, lint_idx]:
errors.append("Could not determine workflow step ordering")
else:
if not (checkout_idx < setup_idx < ci_idx < lint_idx):
errors.append(
"Steps must run in order: checkout -> setup-node -> npm ci -> npm run lint"
)
if errors:
print("Workflow validation failed:")
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✅ issue-lint workflow file looks correct")
return _wait_for_tracking_issue_run(org, token)
if __name__ == "__main__":
sys.exit(0 if verify() else 1)
================================================
FILE: tasks/github/easy/mcpmark-cicd/nightly_health_check/description.md
================================================
Use the GitHub MCP tools to add a tiny bit of automation to `mcpmark-eval/mcpmark-cicd`.
Goal: every night the repo should run the existing health check script.
Do the usual branch/PR flow with a branch named `nightly-health` and a PR titled `Add nightly health check`.
Create `.github/workflows/nightly-health.yml` with:
- workflow name `Nightly Health Check`
- triggers: `workflow_dispatch` plus a cron schedule `0 2 * * *`
- one job called `health-check` on `ubuntu-latest`
- use Node.js 18 via `actions/setup-node`
- steps in order: checkout, npm ci, `npm run health-check`
Merge the PR so the workflow lives on `main`.
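As a reference only, a minimal sketch of such a workflow might look like this (the `@v4` action versions are illustrative assumptions):
```
name: Nightly Health Check

on:
  workflow_dispatch:
  schedule:
    - cron: '0 2 * * *'

jobs:
  health-check:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository and configure Node.js 18
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 18
      # Install dependencies, then run the existing health check script
      - run: npm ci
      - run: npm run health-check
```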
================================================
FILE: tasks/github/easy/mcpmark-cicd/nightly_health_check/meta.json
================================================
{
"task_id": "nightly_health_check",
"task_name": "Nightly Health Check",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD (Easy)",
"description": "Add a scheduled workflow that runs the npm health check script every night.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"ci/cd",
"github actions",
"scheduling"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/easy/mcpmark-cicd/nightly_health_check/verify.py
================================================
import base64
import os
import sys
from typing import List, Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "mcpmark-cicd"
WORKFLOW_PATH = ".github/workflows/nightly-health.yml"
BRANCH = "main"
def _download_file(org: str, token: str, path: str) -> Optional[str]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{path}?ref={BRANCH}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc: # pragma: no cover
print(f"Request error for {path}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching {path}",
file=sys.stderr,
)
return None
data = response.json()
try:
content = base64.b64decode(data.get("content", "")).decode("utf-8")
except Exception as exc:
print(f"Unable to decode {path}: {exc}", file=sys.stderr)
return None
return content
def _line_index(lines: List[str], needle: str) -> int:
for idx, line in enumerate(lines):
if needle in line:
return idx
return -1
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
content = _download_file(org, token, WORKFLOW_PATH)
if content is None:
print(
"Workflow file .github/workflows/nightly-health.yml was not found on main",
file=sys.stderr,
)
return False
normalized = content.lower()
normalized_lines = [line.strip().lower() for line in content.splitlines()]
errors = []
required_bits = {
"workflow name": "name: nightly health check",
"workflow_dispatch trigger": "workflow_dispatch:",
"schedule": "schedule:",
"cron": "0 2 * * *",
"job name": "health-check:",
"runner": "runs-on: ubuntu-latest",
"checkout": "actions/checkout",
"setup-node": "actions/setup-node",
"node version": "node-version: 18",
"npm ci": "npm ci",
"health script": "npm run health-check",
}
for label, snippet in required_bits.items():
if snippet not in normalized:
errors.append(f"Missing {label} ({snippet}) in workflow")
schedule_index = _line_index(normalized_lines, "schedule:")
cron_index = _line_index(normalized_lines, "- cron: '0 2 * * *'")
if cron_index == -1:
cron_index = _line_index(normalized_lines, "cron: '0 2 * * *'")
if cron_index == -1:
cron_index = _line_index(normalized_lines, 'cron: "0 2 * * *"')
if schedule_index == -1 or cron_index == -1 or cron_index < schedule_index:
errors.append("Cron expression must appear under schedule trigger")
ci_index = _line_index(normalized_lines, "npm ci")
health_index = _line_index(normalized_lines, "npm run health-check")
if ci_index == -1 or health_index == -1:
errors.append("npm ci and npm run health-check must both appear")
else:
if not ci_index < health_index:
errors.append("npm ci must run before npm run health-check")
if errors:
print("Verification failed:")
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✅ nightly-health workflow found with required schedule and steps")
return True
if __name__ == "__main__":
sys.exit(0 if verify() else 1)
================================================
FILE: tasks/github/easy/missing-semester/count_translations/description.md
================================================
Use the GitHub MCP tools to inspect the `mcpmark-eval/missing-semester` repository.
1. Navigate the repository to find the list of community translations that appears on the site's home page.
2. Determine how many translation links are currently listed.
3. Record both the count and the specific file you used as evidence by creating an `ANSWER.md` file in the repository root that contains exactly:
```
Translation Count:
Source:
```
4. Commit the new file and push the change to `master`.
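As a rough sketch of the counting step, one might pull the home-page source via the contents API and count the links in its translations list (the file path `index.md` and the link regex below are assumptions to confirm while exploring):
```python
import base64
import os
import re

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}

# Fetch the home-page source file (path is an assumption).
url = f"https://api.github.com/repos/{ORG}/missing-semester/contents/index.md?ref=master"
text = base64.b64decode(requests.get(url, headers=HEADERS, timeout=30).json()["content"]).decode("utf-8")

# Count markdown list-item links (pattern is illustrative; restrict it to the
# translations section before trusting the number).
links = re.findall(r"^\s*[-*]\s*\[[^\]]+\]\([^)]+\)", text, flags=re.MULTILINE)
print(f"{len(links)} list-item links found")
```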
================================================
FILE: tasks/github/easy/missing-semester/count_translations/meta.json
================================================
{
"task_id": "count_translations",
"task_name": "Count Translations",
"category_id": "missing-semester",
"category_name": "Missing Semester (Easy)",
"description": "Use GitHub MCP to count the translations listed on the home page, record the value in ANSWER.md, and push the change to master.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content search",
"answer file"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/missing-semester",
"stateOriginalUrl": "https://github.com/missing-semester/missing-semester"
}
}
================================================
FILE: tasks/github/easy/missing-semester/count_translations/verify.py
================================================
import base64
import os
import sys
from typing import Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "missing-semester"
TARGET_FILE = "ANSWER.md"
BRANCH = "master"
EXPECTED_COUNT = "translation count: 14"
EXPECTED_SOURCE = "source: index.md"
def _download_file(org: str, token: str, path: str) -> Optional[str]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{path}?ref={BRANCH}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc:
print(f"Request error for {path}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching {path}",
file=sys.stderr,
)
return None
data = response.json()
try:
content = base64.b64decode(data.get("content", "")).decode("utf-8").strip()
except Exception as exc:
print(f"Unable to decode {path}: {exc}", file=sys.stderr)
return None
return content
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
print("Checking ANSWER.md in remote repository...")
answer_content = _download_file(org, token, TARGET_FILE)
if answer_content is None:
return False
normalized = " ".join(answer_content.lower().split())
if EXPECTED_COUNT not in normalized:
print(
"ANSWER.md must include 'Translation Count: 14' (spacing/casing ignored).",
file=sys.stderr,
)
print("Found:")
print(answer_content)
return False
if EXPECTED_SOURCE not in normalized:
print(
"ANSWER.md must include 'Source: index.md' (spacing/casing ignored).",
file=sys.stderr,
)
print("Found:")
print(answer_content)
return False
print("All checks passed! ANSWER.md contains the expected count and source.")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/easy/missing-semester/find_ga_tracking_id/description.md
================================================
Use the GitHub MCP tools to inspect the `mcpmark-eval/missing-semester` repository.
1. Determine the Analytics tracking ID that the Missing Semester site declares in its configuration.
2. Create an `ANSWER.md` file in the repository root that contains exactly:
```
Analytics Tracking ID:
```
3. Commit the new file and push the change to `master`.
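One hedged way to locate the setting is the code search API scoped to this repository; the query term below assumes a Jekyll-style configuration and may need adjusting:
```python
import os

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}

# Search the repo for the analytics setting (query term is an assumption).
resp = requests.get(
    "https://api.github.com/search/code",
    headers=HEADERS,
    params={"q": f"google_analytics repo:{ORG}/missing-semester"},
    timeout=30,
)
for item in resp.json().get("items", []):
    print(item["path"])  # candidate config files to open and read the ID from
```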
================================================
FILE: tasks/github/easy/missing-semester/find_ga_tracking_id/meta.json
================================================
{
"task_id": "find_ga_tracking_id",
"task_name": "Find GA Tracking ID",
"category_id": "missing-semester",
"category_name": "Missing Semester (Easy)",
"description": "Use GitHub MCP to discover the single Google Analytics tracking ID declared in the site configuration, write it to ANSWER.md, and push the change to master.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"config search",
"analytics",
"answer file"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/missing-semester",
"stateOriginalUrl": "https://github.com/missing-semester/missing-semester"
}
}
================================================
FILE: tasks/github/easy/missing-semester/find_ga_tracking_id/verify.py
================================================
import base64
import os
import sys
from typing import Optional
import requests
from dotenv import load_dotenv
# Accept either wording, regardless of casing
EXPECTED_VARIANTS = {
"google analytics tracking id: g-p7wvhd84d1",
"analytics tracking id: g-p7wvhd84d1",
}
REPO_NAME = "missing-semester"
TARGET_FILE = "ANSWER.md"
BRANCH = "master"
def _download_file(org: str, token: str) -> Optional[str]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{TARGET_FILE}?ref={BRANCH}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc:
print(f"Request error for {TARGET_FILE}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching {TARGET_FILE}",
file=sys.stderr,
)
return None
data = response.json()
try:
content = base64.b64decode(data.get("content", "")).decode("utf-8").strip()
except Exception as exc:
print(f"Unable to decode {TARGET_FILE}: {exc}", file=sys.stderr)
return None
return content
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
print("Checking ANSWER.md in remote repository...")
answer_content = _download_file(org, token)
if answer_content is None:
return False
normalized = answer_content.strip().lower()
if normalized not in EXPECTED_VARIANTS:
print("ANSWER.md does not contain an accepted tracking ID format", file=sys.stderr)
print("Accepted variants:", file=sys.stderr)
for variant in EXPECTED_VARIANTS:
print(f" - {variant}", file=sys.stderr)
print(f"Found: {answer_content}", file=sys.stderr)
return False
print("All checks passed! ANSWER.md matches an accepted content variant.")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/build_your_own_x/find_commit_date/description.md
================================================
Find out when Daniel Stefanovic first added the entries in the Voxel Engine section. Then create an ANSWER.md file in the repository whose content is that date in YYYY-MM-DD format (e.g., 2000-06-02).
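A minimal sketch of the history lookup, filtering the commits API by file path and scanning messages for the section name (the `README.md` path and the keyword filter are assumptions):
```python
import os

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}
API = f"https://api.github.com/repos/{ORG}/build-your-own-x"

# List commits that touched the main list file (newest first; paginate for older history).
commits = requests.get(f"{API}/commits", headers=HEADERS, timeout=30,
                       params={"path": "README.md", "per_page": 100}).json()
for c in commits:
    if "voxel" in c["commit"]["message"].lower():
        author = c["commit"]["author"]
        print(author["date"][:10], author["name"], "-", c["commit"]["message"].splitlines()[0])
```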
================================================
FILE: tasks/github/standard/build_your_own_x/find_commit_date/meta.json
================================================
{
"task_id": "find_commit_date",
"task_name": "Find Commit Date",
"category_id": "build_your_own_x",
"category_name": "Build Your Own X",
"description": "Find when Voxel Engine entries were first created by Daniel Stefanovic and document the date.",
"author": "Xiangyan Liu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/build-your-own-x",
"stateOriginalUrl": "https://github.com/codecrafters-io/build-your-own-x"
}
}
================================================
FILE: tasks/github/standard/build_your_own_x/find_commit_date/verify.py
================================================
import sys
import os
import requests
from typing import Dict, Optional, Tuple
import base64
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "build-your-own-x"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "build-your-own-x",
ref: str = "master",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def verify_task() -> bool:
"""Verify the find commit data task for Voxel Engine entries."""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying Voxel Engine commit date task...")
# 1. Check if ANSWER.md exists in the repository
print("1. Checking if ANSWER.md exists...")
content = _get_file_content("ANSWER.md", headers, github_org)
if not content:
print("Error: ANSWER.md not found in repository", file=sys.stderr)
return False
print("✓ ANSWER.md found")
# 2. Check the content format
print("2. Checking content format...")
content = content.strip()
# The expected date when Daniel Stefanovic added Voxel Engine entries
# Based on historical records, this should be 2018-07-07
expected_date = "2018-07-07"
# Check if the content matches the expected date format (YYYY-MM-DD)
import re
date_pattern = r'^\d{4}-\d{2}-\d{2}$'
if not re.match(date_pattern, content):
print(f"Error: Invalid date format. Expected YYYY-MM-DD, got: {content}", file=sys.stderr)
return False
print("✓ Date format is correct")
# 3. Verify the date is correct
print("3. Verifying the date...")
if content != expected_date:
print(f"Error: Incorrect date. Expected {expected_date}, got: {content}", file=sys.stderr)
return False
print(f"✓ Date is correct: {content}")
# 4. Verify README.md contains Voxel Engine section
print("4. Checking if README.md contains Voxel Engine section...")
readme_content = _get_file_content("README.md", headers, github_org)
if not readme_content:
print("Error: README.md not found in repository", file=sys.stderr)
return False
if "Voxel Engine" not in readme_content:
print("Error: Voxel Engine section not found in README.md", file=sys.stderr)
return False
# Check for specific Voxel Engine entries
voxel_entries = [
"Let's Make a Voxel Engine",
"Java Voxel Engine Tutorial"
]
for entry in voxel_entries:
if entry not in readme_content:
print(f"Warning: Voxel Engine entry '{entry}' not found in README.md", file=sys.stderr)
print("✓ Voxel Engine section found in README.md")
print("\n✅ All verification checks passed!")
print("Task completed successfully:")
print(f" - ANSWER.md created with date: {content}")
print(" - Date format is correct (YYYY-MM-DD)")
print(" - Date matches expected creation date for Voxel Engine entries by Daniel Stefanovic")
print(" - Voxel Engine section exists in README.md")
return True
if __name__ == "__main__":
success = verify_task()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/build_your_own_x/find_rag_commit/description.md
================================================
Find the SHA of the commit that added an entry about "RAG for Document Search". Then create an ANSWER.md file in the repository whose content is that commit SHA (e.g., 023dfa35694db2709057488ad338afdbc89fb226).
Hint: It should be in an "AI model" section I think.
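A hedged sketch of one way to hunt for the commit via the commit search API (the quoted query phrase is an assumption; falling back to walking the README history also works):
```python
import os

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}

# Search commit messages for the entry name; if nothing matches, walk the
# file history instead, since the message may not mention the entry verbatim.
resp = requests.get(
    "https://api.github.com/search/commits",
    headers=HEADERS,
    params={"q": f'repo:{ORG}/build-your-own-x "RAG for Document Search"'},
    timeout=30,
)
for item in resp.json().get("items", []):
    print(item["sha"], "-", item["commit"]["message"].splitlines()[0])
```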
================================================
FILE: tasks/github/standard/build_your_own_x/find_rag_commit/meta.json
================================================
{
"task_id": "find_rag_commit",
"task_name": "Find Rag Commit",
"category_id": "build_your_own_x",
"category_name": "Build Your Own X",
"description": "Identify the specific commit SHA that added the RAG for Document Search entry to the repository.",
"author": "Xiangyan Liu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/build-your-own-x",
"stateOriginalUrl": "https://github.com/codecrafters-io/build-your-own-x"
}
}
================================================
FILE: tasks/github/standard/build_your_own_x/find_rag_commit/verify.py
================================================
import sys
import os
import requests
from typing import Dict, Optional, Tuple
import base64
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "build-your-own-x"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "build-your-own-x",
ref: str = "master",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def verify_task() -> bool:
"""Verify the find RAG commit SHA task."""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying RAG commit SHA task...")
# Expected commit SHA for RAG for Document Search
expected_sha = "048cd3b3de70e4b429057891576ea394a50cdf48"
# 1. Check if ANSWER.md exists in the repository
print("1. Checking if ANSWER.md exists...")
content = _get_file_content("ANSWER.md", headers, github_org)
if not content:
print("Error: ANSWER.md not found in repository", file=sys.stderr)
return False
print("✓ ANSWER.md found")
# 2. Check the content matches expected SHA
print("2. Checking commit SHA...")
content = content.strip()
if content != expected_sha:
print(f"Error: Incorrect commit SHA. Expected {expected_sha}, got: {content}", file=sys.stderr)
return False
print("✓ Commit SHA is correct")
# 3. Verify the commit exists
print("3. Verifying the commit exists...")
success, commit_data = _get_github_api(f"commits/{content}", headers, github_org)
if not success or not commit_data:
print(f"Error: Commit {content} not found in repository", file=sys.stderr)
return False
print(f"✓ Commit {content} exists")
print("\n✅ All verification checks passed!")
print("Task completed successfully:")
print(f" - ANSWER.md created with correct commit SHA: {content}")
print(f" - Commit exists in the repository")
print(f" - Commit message: {commit_data.get('commit', {}).get('message', '')}")
return True
if __name__ == "__main__":
success = verify_task()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/claude-code/automated_changelog_generation/description.md
================================================
I need you to analyze all recently closed issues and open pull requests in the repository, then generate comprehensive documentation and organize them properly.
**Step 1: Create Documentation Branch**
Create a new branch called 'docs/changelog-and-migration' from the main branch.
**Step 2: Generate Changelog from Closed Issues**
Find all closed issues in the repository and create the file `CHANGELOG-GENERATED.md` on your branch with:
- A heading "# Changelog - Recent Fixes"
- A "### 🐛 Bug Fixes" section listing all closed issues with bug label, formatted as: "- **#[NUMBER]**: [Title] ([labels])"
- A "### 📚 Documentation" section for closed issues with documentation label
- A "### 🔄 Duplicates" section for issues marked as duplicate
- A "### 📊 Statistics" section with:
- Total number of closed issues
- Distribution by platform labels (platform:macos, platform:linux, etc.)
- Distribution by area labels (area:core, area:tools, etc.)
**Step 3: Create Migration Guide for Open PRs**
Analyze all open pull requests and create the file `docs/MIGRATION_GUIDE.md` with:
- A heading "# Migration Guide for Pending Features"
- For each open PR, create a section with:
- PR title and number
- Summary of changes based on the PR description
- Any new configuration or environment variables mentioned
- Installation or usage instructions if applicable
**Step 4: Create Issue Analysis Report**
Create the file `reports/ISSUE_ANALYSIS.md` with:
- A heading "# Issue Analysis Report"
- A "## Closed Issues by Category" section grouping closed issues by their primary label
- A "## Resolution Patterns" section identifying common themes
- A "## Platform Impact Analysis" section showing which platforms were most affected
- Include references to specific issues that had cross-project impact or memory-related problems
**Step 5: Create PR Integration Plan**
Create the file `reports/PR_INTEGRATION_PLAN.md` with:
- A heading "# Pull Request Integration Strategy"
- A "## Open PRs Overview" section listing each open PR with a technical summary
- A "## Dependencies and Conflicts" section analyzing potential conflicts between PRs
- A "## Recommended Merge Order" section with reasoning
- A "## Risk Assessment" section linking any risks to previously closed issues
**Step 6: Create Documentation PR**
Create a pull request from 'docs/changelog-and-migration' to 'main' with:
- Title: "docs: Generated changelog and migration documentation"
- Body including:
- A "## Summary" section describing what was generated
- A "## Files Created" section listing all new documentation
- A "## Issues Processed" section mentioning the number of closed issues analyzed
- A "## PRs Analyzed" section mentioning the open PRs reviewed
**Step 7: Merge Documentation PR**
Merge the documentation pull request using the "squash" merge method.
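For the changelog step, a minimal sketch of gathering closed issues and bucketing them by label; this is a rough outline that assumes one page of results covers the repository, not the full report generation:
```python
import os
from collections import defaultdict

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}
API = f"https://api.github.com/repos/{ORG}/claude-code"

buckets = defaultdict(list)
issues = requests.get(f"{API}/issues", headers=HEADERS, timeout=30,
                      params={"state": "closed", "per_page": 100}).json()
for issue in issues:
    if "pull_request" in issue:  # the issues endpoint also returns PRs; skip them
        continue
    labels = [lbl["name"] for lbl in issue["labels"]]
    line = f"- **#{issue['number']}**: {issue['title']} ({', '.join(labels)})"
    for name in labels:
        buckets[name].append(line)

print(sum(len(v) for v in buckets.values()), "labelled entries collected")
```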
================================================
FILE: tasks/github/standard/claude-code/automated_changelog_generation/meta.json
================================================
{
"task_id": "automated_changelog_generation",
"task_name": "Automated Changelog Generation",
"category_id": "claude-code",
"category_name": "Claude Code",
"description": "Analyze closed issues and open PRs to generate comprehensive documentation including changelog, migration guide, and analysis reports.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"release coordination",
"workflow automation"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/standard/claude-code/automated_changelog_generation/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
ref: str = "main",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _find_pr_by_title_keyword(
keyword: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Optional[Dict]:
"""Find a PR by title keyword and return the PR data."""
for state in ["open", "closed"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, org, repo
)
if success and prs:
for pr in prs:
if keyword.lower() in pr.get("title", "").lower():
return pr
return None
def _get_pr_merge_commit(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Optional[Dict]:
"""Get the merge commit for a PR to check merge method."""
success, pr = _get_github_api(f"pulls/{pr_number}", headers, org, repo)
if success and pr:
merge_commit_sha = pr.get("merge_commit_sha")
if merge_commit_sha:
success, commit = _get_github_api(
f"commits/{merge_commit_sha}", headers, org, repo
)
if success:
return commit
return None
def _check_file_sections(content: str, required_sections: List[str]) -> bool:
"""Check if file content contains required sections."""
if not content:
return False
return all(section in content for section in required_sections)
def _check_issue_references(text: str, issue_numbers: List[int]) -> int:
"""Count how many of the specified issue numbers are referenced in the text."""
if not text:
return 0
count = 0
for num in issue_numbers:
if f"#{num}" in text:
count += 1
return count
def _check_pr_references(text: str, pr_numbers: List[int]) -> int:
"""Count how many of the specified PR numbers are referenced in the text."""
if not text:
return 0
count = 0
for num in pr_numbers:
if f"#{num}" in text or f"PR #{num}" in text:
count += 1
return count
def verify() -> bool:
"""
Programmatically verify that the changelog and migration documentation workflow
meets the requirements described in description.md.
"""
# Configuration constants - these are known to us but not explicitly told to the model
DOCS_BRANCH_NAME = "docs/changelog-and-migration"
DOCS_PR_KEYWORD = "Generated changelog and migration"
# Known issue and PR numbers for verification
EXPECTED_BUG_ISSUES = [12, 13, 15, 21, 22, 23, 25, 37, 39, 48, 50]
EXPECTED_OPEN_PRS = [51, 52, 53]
# Expected file sections
CHANGELOG_SECTIONS = [
"# Changelog - Recent Fixes",
"### 🐛 Bug Fixes",
"### 📚 Documentation",
"### 🔄 Duplicates",
"### 📊 Statistics",
]
MIGRATION_GUIDE_SECTIONS = ["# Migration Guide for Pending Features"]
ISSUE_ANALYSIS_SECTIONS = [
"# Issue Analysis Report",
"## Closed Issues by Category",
"## Resolution Patterns",
"## Platform Impact Analysis",
]
PR_INTEGRATION_SECTIONS = [
"# Pull Request Integration Strategy",
"## Open PRs Overview",
"## Dependencies and Conflicts",
"## Recommended Merge Order",
"## Risk Assessment",
]
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying changelog and migration documentation workflow...")
# 1. Check that documentation branch exists
print("1. Verifying documentation branch exists...")
if not _check_branch_exists(DOCS_BRANCH_NAME, headers, github_org):
print(f"Error: Branch '{DOCS_BRANCH_NAME}' not found", file=sys.stderr)
return False
print("✓ Documentation branch created")
# 2. Check changelog file
print("2. Verifying CHANGELOG-GENERATED.md...")
changelog_content = _get_file_content(
"CHANGELOG-GENERATED.md", headers, github_org, "claude-code", DOCS_BRANCH_NAME
)
if not changelog_content:
print("Error: CHANGELOG-GENERATED.md not found", file=sys.stderr)
return False
if not _check_file_sections(changelog_content, CHANGELOG_SECTIONS):
print(
"Error: CHANGELOG-GENERATED.md missing required sections", file=sys.stderr
)
return False
# Check that bug issues are referenced
bug_refs = _check_issue_references(changelog_content, EXPECTED_BUG_ISSUES)
if bug_refs < 8: # At least 8 of the bug issues
print(
f"Error: CHANGELOG-GENERATED.md only references {bug_refs} bug issues, expected at least 8",
file=sys.stderr,
)
return False
# Check for platform and area statistics
if (
"platform:" not in changelog_content.lower()
or "area:" not in changelog_content.lower()
):
print(
"Error: CHANGELOG-GENERATED.md missing platform or area distribution",
file=sys.stderr,
)
return False
print("✓ Changelog created with proper content")
# 3. Check migration guide
print("3. Verifying MIGRATION_GUIDE.md...")
migration_content = _get_file_content(
"docs/MIGRATION_GUIDE.md", headers, github_org, "claude-code", DOCS_BRANCH_NAME
)
if not migration_content:
print("Error: docs/MIGRATION_GUIDE.md not found", file=sys.stderr)
return False
if not _check_file_sections(migration_content, MIGRATION_GUIDE_SECTIONS):
print("Error: MIGRATION_GUIDE.md missing required sections", file=sys.stderr)
return False
# Check that all expected open PRs are mentioned
pr_refs = _check_pr_references(migration_content, EXPECTED_OPEN_PRS)
if pr_refs < 3:
print(
f"Error: MIGRATION_GUIDE.md only references {pr_refs}/3 open PRs",
file=sys.stderr,
)
return False
print("✓ Migration guide created with proper content")
# 4. Check issue analysis report
print("4. Verifying ISSUE_ANALYSIS.md...")
issue_analysis_content = _get_file_content(
"reports/ISSUE_ANALYSIS.md",
headers,
github_org,
"claude-code",
DOCS_BRANCH_NAME,
)
if not issue_analysis_content:
print("Error: reports/ISSUE_ANALYSIS.md not found", file=sys.stderr)
return False
if not _check_file_sections(issue_analysis_content, ISSUE_ANALYSIS_SECTIONS):
print("Error: ISSUE_ANALYSIS.md missing required sections", file=sys.stderr)
return False
# Check for cross-project and memory issue mentions
if "#50" not in issue_analysis_content and "#48" not in issue_analysis_content:
print(
"Warning: ISSUE_ANALYSIS.md may be missing cross-project issue references",
file=sys.stderr,
)
print("✓ Issue analysis report created")
# 5. Check PR integration plan
print("5. Verifying PR_INTEGRATION_PLAN.md...")
pr_plan_content = _get_file_content(
"reports/PR_INTEGRATION_PLAN.md",
headers,
github_org,
"claude-code",
DOCS_BRANCH_NAME,
)
if not pr_plan_content:
print("Error: reports/PR_INTEGRATION_PLAN.md not found", file=sys.stderr)
return False
if not _check_file_sections(pr_plan_content, PR_INTEGRATION_SECTIONS):
print(
"Error: PR_INTEGRATION_PLAN.md missing required sections", file=sys.stderr
)
return False
# Check that all open PRs are analyzed
pr_refs_in_plan = _check_pr_references(pr_plan_content, EXPECTED_OPEN_PRS)
if pr_refs_in_plan < 3:
print(
f"Error: PR_INTEGRATION_PLAN.md only references {pr_refs_in_plan}/3 open PRs",
file=sys.stderr,
)
return False
print("✓ PR integration plan created")
# 6. Find and verify the documentation PR
print("6. Verifying documentation pull request...")
docs_pr = _find_pr_by_title_keyword(DOCS_PR_KEYWORD, headers, github_org)
if not docs_pr:
# Try alternative keyword
docs_pr = _find_pr_by_title_keyword(
"changelog and migration", headers, github_org
)
if not docs_pr:
print("Error: Documentation PR not found", file=sys.stderr)
return False
pr_body = docs_pr.get("body", "")
pr_number = docs_pr.get("number")
# Check PR body sections
required_sections = [
"## Summary",
"## Files Created",
"## Issues Processed",
"## PRs Analyzed",
]
missing_sections = []
for section in required_sections:
if section not in pr_body:
missing_sections.append(section)
if len(missing_sections) > 1: # Allow 1 missing section for flexibility
print(
f"Error: Documentation PR missing sections: {missing_sections}",
file=sys.stderr,
)
return False
print("✓ Documentation PR created")
# 7. Check that the documentation PR has been merged with squash method
print("7. Verifying documentation PR merge with squash method...")
if docs_pr.get("state") != "closed" or not docs_pr.get("merged_at"):
print("Error: Documentation PR has not been merged", file=sys.stderr)
return False
# Check merge method was squash by examining the merge commit
merge_commit = _get_pr_merge_commit(pr_number, headers, github_org)
if merge_commit:
# Squash merges typically have only one parent (the base branch)
parents = merge_commit.get("parents", [])
if len(parents) != 1:
print(
f"Warning: Merge commit has {len(parents)} parents, may not be squash merge",
file=sys.stderr,
)
# Check commit message pattern typical of squash merges
commit_message = merge_commit.get("commit", {}).get("message", "")
if f"#{pr_number}" not in commit_message:
print(
"Warning: Merge commit message may not follow squash merge pattern",
file=sys.stderr,
)
else:
print("Warning: Could not retrieve merge commit details", file=sys.stderr)
merged_at = docs_pr.get("merged_at")
if not merged_at:
print("Error: Documentation PR merge timestamp not found", file=sys.stderr)
return False
print("✓ Documentation PR merged successfully")
print("\n✅ All verification checks passed!")
print("Changelog and migration documentation completed successfully:")
print(f" - Documentation PR #{pr_number} (merged)")
print(f" - Branch: {DOCS_BRANCH_NAME}")
print(" - Files created: 4 documentation files")
print(f" - Bug issues referenced: {bug_refs}/{len(EXPECTED_BUG_ISSUES)}")
print(f" - Open PRs analyzed: {pr_refs}/{len(EXPECTED_OPEN_PRS)}")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/claude-code/claude_collaboration_analysis/description.md
================================================
I need you to analyze the collaboration patterns between human developers and Claude (the AI assistant) in the repository by examining all available commit history, then create a comprehensive analysis report and submit it as a new file to the repository.
**Step 1: Commit History Analysis**
Analyze ALL commits in the repository to identify:
1. **Claude Co-Authored Commits**: Find all commits that were co-authored by Claude (look for "Co-Authored-By: Claude <noreply@anthropic.com>" in commit messages)
2. **Top Claude Collaborators**: Identify the top 3 human developers who most frequently collaborated with Claude
**Step 2: Create Collaboration Analysis Report**
Create a file called `CLAUDE_COLLABORATION_ANALYSIS.md` in the repository root with:
- A "# Claude AI Collaboration Analysis" title
- A "## Summary Statistics" section with these exact format requirements:
- "Total commits analyzed: [NUMBER]"
- "Number of Claude co-authored commits found: [NUMBER]"
- "Percentage of commits with Claude collaboration: [NUMBER]%"
- "Number of unique human collaborators who worked with Claude: [NUMBER]"
- A "## Top Claude Collaborators" section with this exact table format:
```markdown
| Developer | GitHub Username | Claude Collaborations |
|-----------|----------------|----------------------|
```
Include the top 3 developers by number of Claude collaborations.
**Step 3: Commit Analysis to Repository**
Commit the `CLAUDE_COLLABORATION_ANALYSIS.md` file to the main branch with:
- Commit message: "Add Claude AI collaboration analysis report"
- Ensure all statistics are accurate based on actual commit data
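A minimal sketch of the tallying step, paginating the commits API and counting the co-author trailer per human author (the login fallback for commits without a linked GitHub account is an assumption):
```python
import os
from collections import Counter

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}
API = f"https://api.github.com/repos/{ORG}/claude-code"

totals, claude_commits, by_author = 0, 0, Counter()
page = 1
while True:
    batch = requests.get(f"{API}/commits", headers=HEADERS, timeout=30,
                         params={"per_page": 100, "page": page}).json()
    if not batch:
        break
    for c in batch:
        totals += 1
        if "Co-Authored-By: Claude" in c["commit"]["message"]:
            claude_commits += 1
            login = (c.get("author") or {}).get("login") or c["commit"]["author"]["name"]
            by_author[login] += 1
    page += 1

print(f"{claude_commits}/{totals} commits co-authored with Claude")
print(by_author.most_common(3))
```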
================================================
FILE: tasks/github/standard/claude-code/claude_collaboration_analysis/meta.json
================================================
{
"task_id": "claude_collaboration_analysis",
"task_name": "Claude Collaboration Analysis",
"category_id": "claude-code",
"category_name": "Claude Code",
"description": "Analyze Claude AI collaboration patterns in commit history and create a comprehensive report of co-authored commits and top collaborators.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/standard/claude-code/claude_collaboration_analysis/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
import re
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
ref: str = "main",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _parse_summary_statistics(content: str) -> Dict:
"""Parse the summary statistics section from the report."""
stats = {}
lines = content.split("\n")
in_summary = False
for line in lines:
if "## Summary Statistics" in line:
in_summary = True
continue
if in_summary:
if "##" in line and "Summary Statistics" not in line:
break
# Parse statistics lines
if "Total commits analyzed" in line:
match = re.search(r"(\d+)", line)
if match:
stats["total_analyzed"] = int(match.group(1))
elif "Number of Claude co-authored commits" in line:
match = re.search(r"(\d+)", line)
if match:
stats["claude_commits"] = int(match.group(1))
elif "Percentage of commits with Claude collaboration" in line:
match = re.search(r"([\d.]+)%", line)
if match:
stats["percentage"] = float(match.group(1))
elif "Number of unique human collaborators" in line:
match = re.search(r"(\d+)", line)
if match:
stats["unique_collaborators"] = int(match.group(1))
return stats
def _parse_collaborators_table(content: str) -> List[Dict]:
"""Parse the top collaborators table from the report."""
collaborators = []
lines = content.split("\n")
in_table = False
for line in lines:
if "| Developer | GitHub Username | Claude Collaborations |" in line:
in_table = True
continue
if in_table and line.startswith("|---"):
continue
if in_table and line.startswith("|"):
parts = [p.strip() for p in line.split("|")]
if len(parts) >= 4: # Should have 3 columns plus empty parts
developer = parts[1].strip()
username = parts[2].strip()
collaborations = parts[3].strip()
if developer and username and collaborations:
try:
collaborators.append(
{
"developer": developer,
"username": username,
"collaborations": int(collaborations),
}
)
except ValueError:
pass
if in_table and line and not line.startswith("|") and "##" in line:
break
return collaborators
def verify_task() -> bool:
"""Verify the Claude collaboration analysis task."""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Pre-computed expected values based on repository analysis
# These are the correct answers the agent should find
EXPECTED_TOP_COLLABORATORS = [
{
"username": "bcherny",
"min_collaborations": 14,
}, # Boris Cherny has many Claude collaborations
{"username": "ashwin-ant", "min_collaborations": 5}, # Ashwin Bhat has some
{"username": "ant-kurt", "min_collaborations": 3}, # Kurt Carpenter has several
]
# Expected exact values for summary statistics
EXPECTED_STATS = {
"total_analyzed": 158,
"claude_commits": 25,
"percentage": 15.82,
"unique_collaborators": 6,
}
print("Verifying Claude collaboration analysis task...")
# 1. Check if CLAUDE_COLLABORATION_ANALYSIS.md exists in main branch
print("1. Checking if CLAUDE_COLLABORATION_ANALYSIS.md exists...")
content = _get_file_content("CLAUDE_COLLABORATION_ANALYSIS.md", headers, github_org)
if not content:
print(
"Error: CLAUDE_COLLABORATION_ANALYSIS.md not found in main branch",
file=sys.stderr,
)
return False
print("✓ CLAUDE_COLLABORATION_ANALYSIS.md found")
# 2. Check required sections exist
print("2. Checking required sections...")
required_sections = [
"# Claude AI Collaboration Analysis",
"## Summary Statistics",
"## Top Claude Collaborators",
]
for section in required_sections:
if section not in content:
print(f"Error: Missing required section '{section}'", file=sys.stderr)
return False
print("✓ All required sections present")
# 3. Parse and validate summary statistics
print("3. Validating summary statistics...")
stats = _parse_summary_statistics(content)
if "total_analyzed" not in stats:
print("Error: Total commits analyzed not found", file=sys.stderr)
return False
# Check exact values against expected statistics
if stats.get("total_analyzed") != EXPECTED_STATS["total_analyzed"]:
print(
f"Error: Total analyzed should be {EXPECTED_STATS['total_analyzed']}, found {stats.get('total_analyzed')}",
file=sys.stderr,
)
return False
if stats.get("claude_commits") != EXPECTED_STATS["claude_commits"]:
print(
f"Error: Claude commits should be {EXPECTED_STATS['claude_commits']}, found {stats.get('claude_commits')}",
file=sys.stderr,
)
return False
# Allow 0.1% tolerance for percentage
expected_percentage = EXPECTED_STATS["percentage"]
actual_percentage = stats.get("percentage", 0)
if abs(actual_percentage - expected_percentage) > 0.1:
print(
f"Error: Percentage should be around {expected_percentage}% (±0.1%), found {actual_percentage}%",
file=sys.stderr,
)
return False
if stats.get("unique_collaborators") != EXPECTED_STATS["unique_collaborators"]:
print(
f"Error: Unique collaborators should be {EXPECTED_STATS['unique_collaborators']}, found {stats.get('unique_collaborators')}",
file=sys.stderr,
)
return False
print("✓ Summary statistics validated")
# 4. Validate top collaborators table
print("4. Validating top collaborators...")
collaborators = _parse_collaborators_table(content)
if len(collaborators) < 3:
print(
f"Error: Expected 3 top collaborators, found {len(collaborators)}",
file=sys.stderr,
)
return False
# Check that expected top collaborators are present
found_usernames = [c["username"] for c in collaborators]
# The top 3 should include at least 2 of our expected collaborators
expected_found = 0
for expected in EXPECTED_TOP_COLLABORATORS:
if expected["username"] in found_usernames[:3]:
expected_found += 1
# Also check they have reasonable collaboration counts
for collab in collaborators:
if collab["username"] == expected["username"]:
if collab["collaborations"] < expected["min_collaborations"]:
print(
f"Error: {expected['username']} should have at least {expected['min_collaborations']} collaborations, found {collab['collaborations']}",
file=sys.stderr,
)
return False
if expected_found < 2:
print(
f"Error: Expected to find at least 2 of the known top collaborators in top 3, found {expected_found}",
file=sys.stderr,
)
print(
f"Expected to see at least 2 of: {[e['username'] for e in EXPECTED_TOP_COLLABORATORS]}",
file=sys.stderr,
)
print(f"Found: {found_usernames[:3]}", file=sys.stderr)
return False
print("✓ Top collaborators validated")
# 5. Check commit message verification
print("5. Verifying commit message...")
success, latest_commits = _get_github_api(
"commits?per_page=10", headers, github_org
)
if not success:
print("Error: Failed to fetch recent commits", file=sys.stderr)
return False
# Look for commit with expected message
expected_commit_message = "Add Claude AI collaboration analysis report"
commit_found = False
for commit in latest_commits:
if commit["commit"]["message"].startswith(expected_commit_message):
commit_found = True
break
if not commit_found:
print(
f"Error: Expected commit message '{expected_commit_message}' not found in recent commits",
file=sys.stderr,
)
return False
print("✓ Commit message verified")
# 6. Additional validation: Check unique collaborators count
print("6. Final validation complete...")
print("✓ All statistics match expected values")
print("\n✅ All verification checks passed!")
print("Claude collaboration analysis completed successfully:")
print(" - File: CLAUDE_COLLABORATION_ANALYSIS.md created in main branch")
print(f" - Commits analyzed: {stats.get('total_analyzed', 'N/A')}")
print(f" - Claude collaborations found: {stats.get('claude_commits', 'N/A')}")
print(f" - Top collaborators identified: {len(collaborators)}")
print(" - All statistics verified")
print(" - Commit message verified")
return True
if __name__ == "__main__":
success = verify_task()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/claude-code/critical_issue_hotfix_workflow/description.md
================================================
I need you to implement a comprehensive critical issue hotfix workflow for the repository that demonstrates advanced PR management, selective merging, and issue resolution tracking.
**Step 1: Create Critical Bug Tracking Issue**
Create a new issue with:
- Title: "CRITICAL: Memory and Context Management Issues - Hotfix Tracking"
- Body must include:
- A "## Critical Issues" heading listing issues #49 and #46
- A "## Impact Assessment" heading describing user impact
- A "## Resolution Strategy" heading with planned approach
- References to existing issues #49, #46, and #47 using "#" notation
- Keywords: "memory exhaustion", "context auto-compact", "JavaScript heap", "hotfix priority"
**Step 2: Create Memory Optimization Hotfix Branch**
Create a new branch called 'hotfix/memory-optimization-v1.0.72' from the main branch.
**Step 3: Implement Memory Management Documentation**
On the hotfix branch, create the file `docs/MEMORY_OPTIMIZATION.md` with this exact content:
````markdown
# Memory Optimization Guide for Claude Code v1.0.72
## Overview
This document addresses critical memory issues identified in issues #49 and #46.
## Memory Management Issues
### Context Auto-Compact Problem (Issue #49)
- **Root Cause**: Context management stuck at 0% completion
- **Impact**: Tool becomes unusable on macOS platforms
- **Solution**: Implement progressive context cleanup with configurable thresholds
### JavaScript Heap Exhaustion (Issue #46)
- **Root Cause**: Memory allocation failure during large MCP operations
- **Impact**: Complete Claude Code crash requiring restart
- **Solution**: Add streaming data processing and garbage collection optimization
## Optimization Strategies
### Immediate Fixes
1. **Context Buffer Management**
- Implement 10MB default context buffer limit
- Add automatic context pruning at 80% threshold
- Enable manual context reset via `/memory-reset` command
2. **MCP Operation Streaming**
- Process large datasets in 1MB chunks
- Implement backpressure for MongoDB operations
- Add memory usage monitoring and alerts
### Configuration Options
```json
{
"memory": {
"contextBufferLimit": "10MB",
"autoCompactThreshold": 0.8,
"streamingChunkSize": "1MB",
"gcOptimization": true
}
}
```
## Related Issues
- Fixes issue #49: Context auto-compact functionality
- Addresses issue #46: JavaScript heap out of memory crashes
- Related to issue #47: Cross-project hook execution problems
````
**Step 4: Create Pull Request with Issue Cross-References**
Create a pull request from 'hotfix/memory-optimization-v1.0.72' to 'main' with:
- Title: "HOTFIX: Critical memory optimization for issues #49 and #46"
- Body must include:
- A "## Summary" heading describing the memory fixes
- A "## Critical Issues Addressed" heading listing specific problems
- A "## Documentation Changes" heading describing the new guide
- "Addresses #49" and "Addresses #46" pattern linking to existing issues
- Reference to your tracking issue using "Tracked in #[ISSUE_NUMBER]"
- Keywords: "memory optimization", "context management", "heap exhaustion", "v1.0.72 hotfix"
**Step 5: Update and Merge PR #51 (Statsig Logging)**
For the existing PR #51:
- Update the PR description to include technical implementation details
- Add a "## Technical Implementation" section mentioning "event logging integration"
- Add keywords: "workflow enhancement", "issue management automation", "logging consistency"
- Merge the PR using the squash merge method
**Step 6: Add Implementation Comment to Tracking Issue**
Add a comment to your original tracking issue with:
- Reference to your hotfix PR using "PR #[NUMBER]" pattern
- Reference to actions taken on PR #51
- Technical details about the memory optimization approach
- Keywords: "context buffer management", "streaming optimization", "progressive cleanup"
- Mention of configuration options and thresholds
**Step 7: Close Tracking Issue with Resolution Summary**
Close your tracking issue by updating its state to 'closed' with:
- A final comment summarizing completed actions
- Reference to merged PR #51 and pending hotfix PR
- Keywords: "hotfix deployment", "memory issues resolved", "documentation updated"
================================================
FILE: tasks/github/standard/claude-code/critical_issue_hotfix_workflow/meta.json
================================================
{
"task_id": "critical_issue_hotfix_workflow",
"task_name": "Critical Issue Hotfix Workflow",
"category_id": "claude-code",
"category_name": "Claude Code",
"description": "Implement a critical issue hotfix workflow for memory and context management issues with proper PR management and issue tracking.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"pr workflows"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/standard/claude-code/critical_issue_hotfix_workflow/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
ref: str = "main",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _find_issue_by_title_keyword(
keyword: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Optional[Dict]:
"""Find an issue by title keyword and return the issue data."""
# Check both open and closed issues
for state in ["open", "closed"]:
success, issues = _get_github_api(
f"issues?state={state}&per_page=100", headers, org, repo
)
if success and issues:
for issue in issues:
if keyword.lower() in issue.get("title", "").lower():
return issue
return None
def _find_pr_by_title_keyword(
keyword: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Optional[Dict]:
"""Find a PR by title keyword and return the PR data."""
# Check both open and closed PRs
for state in ["open", "closed"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, org, repo
)
if success and prs:
for pr in prs:
if keyword.lower() in pr.get("title", "").lower():
return pr
return None
def _get_pr_by_number(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Optional[Dict]:
"""Get a specific PR by number."""
success, pr = _get_github_api(f"pulls/{pr_number}", headers, org, repo)
if success:
return pr
return None
def _check_issue_references(text: str, reference_numbers: List[str]) -> bool:
"""Check if text contains references to specified issue numbers."""
if not text:
return False
return all(f"#{ref}" in text for ref in reference_numbers)
def _check_addresses_pattern(pr_body: str, issue_numbers: List[str]) -> bool:
"""Check if PR body contains 'Addresses #X' pattern for specified issues."""
if not pr_body:
return False
return all(
f"Addresses #{num}" in pr_body or f"addresses #{num}" in pr_body
for num in issue_numbers
)
def _get_issue_comments(
issue_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> List[Dict]:
"""Get all comments for an issue."""
success, comments = _get_github_api(
f"issues/{issue_number}/comments", headers, org, repo
)
if success and comments:
return comments
return []
def _get_pr_reviews(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> List[Dict]:
"""Get all reviews for a PR."""
success, reviews = _get_github_api(f"pulls/{pr_number}/reviews", headers, org, repo)
if success and reviews:
return reviews
return []
def _check_title_keywords(title: str, required_keywords: List[str]) -> bool:
"""Check if title contains all required keywords."""
return all(keyword.lower() in title.lower() for keyword in required_keywords)
def _check_headings_and_keywords(
body: str, headings: List[str], keywords: List[str]
) -> bool:
"""Check if body contains required headings and keywords."""
has_headings = all(heading in body for heading in headings)
has_keywords = all(keyword.lower() in body.lower() for keyword in keywords)
return has_headings and has_keywords
def _check_exact_file_content(content: str, expected_sections: List[str]) -> bool:
"""Check if file content contains expected sections."""
return all(section in content for section in expected_sections)
def verify() -> bool:
"""
Programmatically verify that the critical issue hotfix workflow meets the
requirements described in description.md.
"""
# Configuration constants
HOTFIX_BRANCH_NAME = "hotfix/memory-optimization-v1.0.72"
TRACKING_ISSUE_KEYWORD = "Memory and Context Management Issues"
HOTFIX_PR_KEYWORD = "HOTFIX: Critical memory optimization"
# Expected file content sections
MEMORY_DOC_SECTIONS = [
"# Memory Optimization Guide for Claude Code v1.0.72",
"## Overview",
"### Context Auto-Compact Problem (Issue #49)",
"### JavaScript Heap Exhaustion (Issue #46)",
"## Optimization Strategies",
"### Immediate Fixes",
"### Configuration Options",
"## Related Issues",
]
# Issue content requirements
TRACKING_ISSUE_TITLE_KEYWORDS = [
"CRITICAL",
"Memory",
"Context Management",
"Hotfix Tracking",
]
TRACKING_ISSUE_REFERENCE_NUMBERS = ["49", "46", "47"]
TRACKING_ISSUE_HEADINGS = [
"## Critical Issues",
"## Impact Assessment",
"## Resolution Strategy",
]
TRACKING_ISSUE_KEYWORDS = [
"memory exhaustion",
"context auto-compact",
"JavaScript heap",
"hotfix priority",
]
# PR content requirements
HOTFIX_PR_TITLE_KEYWORDS = [
"HOTFIX",
"Critical memory optimization",
"issues #49",
"#46",
]
HOTFIX_PR_ADDRESSES_NUMBERS = ["49", "46"]
HOTFIX_PR_HEADINGS = [
"## Summary",
"## Critical Issues Addressed",
"## Documentation Changes",
]
HOTFIX_PR_KEYWORDS = [
"memory optimization",
"context management",
"heap exhaustion",
"v1.0.72 hotfix",
]
# PR #51 update requirements
PR51_UPDATE_KEYWORDS = [
"Technical Implementation",
"event logging integration",
"workflow enhancement",
]
# Issue comment requirements
ISSUE_COMMENT_KEYWORDS = [
"context buffer management",
"streaming optimization",
"progressive cleanup",
]
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying critical issue hotfix workflow completion...")
# 1. Check that hotfix branch exists
print("1. Verifying hotfix branch exists...")
if not _check_branch_exists(HOTFIX_BRANCH_NAME, headers, github_org):
print(f"Error: Branch '{HOTFIX_BRANCH_NAME}' not found", file=sys.stderr)
return False
print("✓ Hotfix branch created")
# 2. Check that the memory optimization documentation exists with exact content
print("2. Verifying MEMORY_OPTIMIZATION.md documentation...")
memory_doc_content = _get_file_content(
"docs/MEMORY_OPTIMIZATION.md",
headers,
github_org,
"claude-code",
HOTFIX_BRANCH_NAME,
)
if not memory_doc_content:
print(
"Error: docs/MEMORY_OPTIMIZATION.md not found in hotfix branch",
file=sys.stderr,
)
return False
if not _check_exact_file_content(memory_doc_content, MEMORY_DOC_SECTIONS):
print(
"Error: MEMORY_OPTIMIZATION.md missing required sections or content",
file=sys.stderr,
)
return False
print("✓ Memory optimization documentation created with correct content")
# 3. Find and verify the tracking issue
print("3. Verifying tracking issue creation and content...")
tracking_issue = _find_issue_by_title_keyword(
TRACKING_ISSUE_KEYWORD, headers, github_org
)
if not tracking_issue:
print(
f"Error: Tracking issue with keyword '{TRACKING_ISSUE_KEYWORD}' not found",
file=sys.stderr,
)
return False
tracking_issue_number = tracking_issue.get("number")
tracking_issue_title = tracking_issue.get("title", "")
tracking_issue_body = tracking_issue.get("body", "")
# Check tracking issue title keywords
if not _check_title_keywords(tracking_issue_title, TRACKING_ISSUE_TITLE_KEYWORDS):
print("Error: Tracking issue title missing required keywords", file=sys.stderr)
return False
# Check tracking issue headings, content and references
if not _check_headings_and_keywords(
tracking_issue_body, TRACKING_ISSUE_HEADINGS, TRACKING_ISSUE_KEYWORDS
):
print(
"Error: Tracking issue missing required headings or keywords",
file=sys.stderr,
)
return False
if not _check_issue_references(
tracking_issue_body, TRACKING_ISSUE_REFERENCE_NUMBERS
):
print(
"Error: Tracking issue does not reference required issues #49, #46, #47",
file=sys.stderr,
)
return False
print("✓ Tracking issue created with correct content and references")
# 4. Find and verify the hotfix PR
print("4. Verifying hotfix pull request creation and content...")
hotfix_pr = _find_pr_by_title_keyword(HOTFIX_PR_KEYWORD, headers, github_org)
if not hotfix_pr:
print(
f"Error: Hotfix PR with keyword '{HOTFIX_PR_KEYWORD}' not found",
file=sys.stderr,
)
return False
hotfix_pr_number = hotfix_pr.get("number")
hotfix_pr_title = hotfix_pr.get("title", "")
hotfix_pr_body = hotfix_pr.get("body", "")
# Check hotfix PR title keywords
if not _check_title_keywords(hotfix_pr_title, HOTFIX_PR_TITLE_KEYWORDS):
print("Error: Hotfix PR title missing required keywords", file=sys.stderr)
return False
# Check hotfix PR headings and content
if not _check_headings_and_keywords(
hotfix_pr_body, HOTFIX_PR_HEADINGS, HOTFIX_PR_KEYWORDS
):
print("Error: Hotfix PR missing required headings or keywords", file=sys.stderr)
return False
# Check hotfix PR addresses pattern
if not _check_addresses_pattern(hotfix_pr_body, HOTFIX_PR_ADDRESSES_NUMBERS):
print(
"Error: Hotfix PR does not properly address issues #49 and #46",
file=sys.stderr,
)
return False
# Check reference to tracking issue
if f"#{tracking_issue_number}" not in hotfix_pr_body:
print(
f"Error: Hotfix PR does not reference tracking issue #{tracking_issue_number}",
file=sys.stderr,
)
return False
print("✓ Hotfix PR created with correct content and references")
# 5. Check PR #51 has been updated and merged
print("5. Verifying PR #51 update and merge...")
pr51 = _get_pr_by_number(51, headers, github_org)
if not pr51:
print("Error: PR #51 not found", file=sys.stderr)
return False
pr51_body = pr51.get("body", "")
pr51_state = pr51.get("state", "")
# Check PR #51 has been updated with required content
if not _check_headings_and_keywords(
pr51_body, ["## Technical Implementation"], PR51_UPDATE_KEYWORDS
):
print(
"Error: PR #51 missing updated technical implementation section",
file=sys.stderr,
)
return False
# Check PR #51 has been merged
if pr51_state != "closed" or not pr51.get("merged_at"):
print("Error: PR #51 has not been merged", file=sys.stderr)
return False
print("✓ PR #51 updated and merged successfully")
# 6. Check tracking issue has implementation comment
print("6. Verifying tracking issue implementation comment...")
tracking_issue_comments = _get_issue_comments(
tracking_issue_number, headers, github_org
)
has_implementation_comment = False
for comment in tracking_issue_comments:
body = comment.get("body", "")
has_pr_ref = f"PR #{hotfix_pr_number}" in body
has_pr51_ref = "PR #51" in body
has_keywords = all(
keyword.lower() in body.lower() for keyword in ISSUE_COMMENT_KEYWORDS
)
if has_pr_ref and has_pr51_ref and has_keywords:
has_implementation_comment = True
break
if not has_implementation_comment:
print(
f"Error: Tracking issue #{tracking_issue_number} missing implementation comment with required references and keywords",
file=sys.stderr,
)
return False
print("✓ Tracking issue has implementation comment with PR references")
# 7. Check tracking issue is closed
print("7. Verifying tracking issue closure...")
if tracking_issue.get("state") != "closed":
print(
f"Error: Tracking issue #{tracking_issue_number} is not closed",
file=sys.stderr,
)
return False
print("✓ Tracking issue closed successfully")
print("\n✅ All verification checks passed!")
print("Critical issue hotfix workflow completed successfully:")
print(f" - Tracking Issue #{tracking_issue_number}: {tracking_issue.get('title')}")
print(f" - Hotfix PR #{hotfix_pr_number}: {hotfix_pr.get('title')}")
print(f" - Branch: {HOTFIX_BRANCH_NAME}")
print(" - PR #51 merged: ✓")
print(" - Memory optimization documentation: ✓")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/claude-code/feature_commit_tracking/description.md
================================================
I need you to research the development history of the repository across multiple branches and commits, then create a comprehensive feature tracking document and submit it as a new file to the repository.
**Step 1: Multi-Branch Feature Investigation**
Research and identify the exact commit SHAs where these specific features were introduced by analyzing commits across different branches:
1. **Shell Completion Scripts**: Find when shell completion functionality was first added to the repository
2. **CHANGELOG Version 1.0.65**: Find when the changelog was updated to include version 1.0.65
3. **Rust Extraction Improvements**: Find when workflow improvements for Rust code extraction were implemented
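As an illustrative starting point only, the sketch below (assuming a placeholder `your-org/claude-code` repository and a token in `MCP_GITHUB_TOKEN`) enumerates the branches and pages through each branch's commits so candidate SHAs can be inspected by message and date:
```python
# Illustrative sketch: enumerate branches, then list commits on each branch so
# the introducing commit for a feature can be located by message and date.
import os
import requests

ORG, REPO = "your-org", "claude-code"  # placeholder values
HEADERS = {
    "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
BASE = f"https://api.github.com/repos/{ORG}/{REPO}"
branches = requests.get(f"{BASE}/branches", headers=HEADERS, params={"per_page": 100})
branches.raise_for_status()
for branch in branches.json():
    commits = requests.get(
        f"{BASE}/commits",
        headers=HEADERS,
        params={"sha": branch["name"], "per_page": 100},
    )
    commits.raise_for_status()
    for c in commits.json():
        # The message and date live under the nested "commit" object.
        print(branch["name"], c["sha"], c["commit"]["message"].split("\n")[0])
```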
**Step 2: Create Feature Tracking Documentation**
Create a file called `FEATURE_COMMITS.md` in the repository root with:
- A "# Feature Development Tracking" title
- A "## Overview" section explaining this tracks major feature additions across repository branches
- A "## Feature Commit History" section with this exact table format:
```markdown
| Feature Name | Commit SHA | Author | Branch | Date | Files Changed | Commit Message |
|-------------|------------|---------|---------|------|---------------|----------------|
```
For each feature, populate the table with:
- Exact commit SHA (full 40-character hash)
- GitHub username of the commit author
- Branch where the commit was made
- Commit date in YYYY-MM-DD format
- Number of files changed in that commit
- First line of the commit message
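All of these fields are available from a single commit lookup; the following minimal sketch uses the same placeholder repository and token, with a hypothetical `SHA` standing in for the commit you identify:
```python
# Illustrative sketch: fetch one commit and pull out the table fields
# (author login, date, number of files changed, first line of the message).
import os
import requests

ORG, REPO = "your-org", "claude-code"   # placeholder values
SHA = "<full-40-character-commit-sha>"  # placeholder for the SHA you found
resp = requests.get(
    f"https://api.github.com/repos/{ORG}/{REPO}/commits/{SHA}",
    headers={
        "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
        "Accept": "application/vnd.github.v3+json",
    },
)
resp.raise_for_status()
data = resp.json()
author_login = (data.get("author") or {}).get("login", "")  # null for unlinked emails
commit_date = data["commit"]["author"]["date"][:10]         # ISO timestamp -> YYYY-MM-DD
files_changed = len(data.get("files", []))
first_line = data["commit"]["message"].split("\n")[0]
print(author_login, commit_date, files_changed, first_line)
```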
**Step 3: Commit Documentation to Repository**
Commit the `FEATURE_COMMITS.md` file to the main branch with:
- Commit message: "Add feature development tracking documentation"
- Ensure the file is properly formatted markdown
- Verify all commit SHAs in the table are accurate and verifiable
The verification process will check that your table contains the correct commit SHAs for each specific feature, along with accurate author, branch, and date information.
================================================
FILE: tasks/github/standard/claude-code/feature_commit_tracking/meta.json
================================================
{
"task_id": "feature_commit_tracking",
"task_name": "Feature Commit Tracking",
"category_id": "claude-code",
"category_name": "Claude Code",
"description": "Research development history across branches to track when specific features were introduced and create comprehensive documentation.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis",
"release coordination"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/standard/claude-code/feature_commit_tracking/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
import re
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
ref: str = "main",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _verify_commit_exists(
commit_sha: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Tuple[bool, Optional[Dict]]:
"""Verify that a commit exists and return its details."""
success, commit_data = _get_github_api(f"commits/{commit_sha}", headers, org, repo)
return success, commit_data
def _parse_feature_table(content: str) -> List[Dict]:
"""Parse the feature commit table from markdown content."""
features = []
lines = content.split("\n")
in_table = False
for line in lines:
# Look for table header
if (
"| Feature Name | Commit SHA | Author | Branch | Date | Files Changed | Commit Message |"
in line
):
in_table = True
continue
if in_table and line.startswith("|---"):
continue
# Parse table rows
if in_table and line.startswith("|"):
parts = [p.strip() for p in line.split("|")]
if len(parts) >= 8: # Should have 7 columns plus empty parts at start/end
feature_name = parts[1].strip()
commit_sha = parts[2].strip()
author = parts[3].strip()
branch = parts[4].strip()
date = parts[5].strip()
files_changed = parts[6].strip()
commit_message = parts[7].strip()
if feature_name and commit_sha and author and branch and date:
features.append(
{
"name": feature_name,
"sha": commit_sha,
"author": author,
"branch": branch,
"date": date,
"files_changed": files_changed,
"commit_message": commit_message,
}
)
# Stop at end of table section
if in_table and line and not line.startswith("|") and "##" in line:
break
return features
def verify_task() -> bool:
"""Verify the feature commit tracking task."""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Expected feature commits based on exploration
expected_features = {
"Shell Completion Scripts": "8a0febdd09bda32f38c351c0881784460d69997d",
"CHANGELOG Version 1.0.65": "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0",
"Rust Extraction Improvements": "50e58affdf1bfc7d875202bc040ebe0dcfb7d332",
}
# Expected authors for each commit
expected_authors = {
"8a0febdd09bda32f38c351c0881784460d69997d": "gitmpr",
"94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "QwertyJack",
"50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "alokdangre",
}
# Expected commit messages for each commit
expected_messages = {
"8a0febdd09bda32f38c351c0881784460d69997d": "feat: add shell completions (bash, zsh, fish)",
"94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "Merge branch 'anthropics:main' into main",
"50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "Enhance Rust extraction and output handling in workflows",
}
# Expected dates for each commit (YYYY-MM-DD format)
expected_dates = {
"8a0febdd09bda32f38c351c0881784460d69997d": "2025-08-01",
"94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "2025-08-02",
"50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "2025-08-09",
}
print("Verifying feature commit tracking task...")
# 1. Check if FEATURE_COMMITS.md exists in main branch
print("1. Checking if FEATURE_COMMITS.md exists...")
content = _get_file_content("FEATURE_COMMITS.md", headers, github_org)
if not content:
print("Error: FEATURE_COMMITS.md not found in main branch", file=sys.stderr)
return False
print("✓ FEATURE_COMMITS.md found")
# 2. Check required sections exist
print("2. Checking required sections...")
required_sections = [
"# Feature Development Tracking",
"## Overview",
"## Feature Commit History",
]
for section in required_sections:
if section not in content:
print(f"Error: Missing required section '{section}'", file=sys.stderr)
return False
print("✓ All required sections present")
# 3. Parse and validate feature table
print("3. Parsing and validating feature table...")
features = _parse_feature_table(content)
if len(features) < 3:
print(
f"Error: Expected at least 3 features, found {len(features)}",
file=sys.stderr,
)
return False
# 4. Verify each expected feature is present with correct commit SHA
print("4. Verifying feature commit SHAs...")
found_features = {}
for feature in features:
found_features[feature["name"]] = feature["sha"]
for feature_name, expected_sha in expected_features.items():
if feature_name not in found_features:
print(
f"Error: Feature '{feature_name}' not found in table", file=sys.stderr
)
return False
actual_sha = found_features[feature_name]
if actual_sha != expected_sha:
print(
f"Error: Wrong SHA for '{feature_name}'. Expected: {expected_sha}, Got: {actual_sha}",
file=sys.stderr,
)
return False
print("✓ All feature commit SHAs are correct")
# 5. Verify each commit exists and has correct author
print("5. Verifying commit details...")
for feature in features:
if feature["sha"] in expected_features.values():
success, commit_data = _verify_commit_exists(
feature["sha"], headers, github_org
)
if not success:
print(f"Error: Commit {feature['sha']} not found", file=sys.stderr)
return False
# Check author
expected_author = expected_authors.get(feature["sha"])
if expected_author:
actual_author = commit_data.get("author", {}).get("login", "")
if actual_author != expected_author:
print(
f"Error: Wrong author for {feature['sha']}. Expected: {expected_author}, Got: {actual_author}",
file=sys.stderr,
)
return False
# Check commit message (compare with table entry)
expected_message = expected_messages.get(feature["sha"])
if expected_message and "commit_message" in feature:
if feature["commit_message"] != expected_message:
print(
f"Error: Wrong commit message in table for {feature['sha']}. Expected: '{expected_message}', Got: '{feature['commit_message']}'",
file=sys.stderr,
)
return False
# Also verify against actual commit data
if expected_message:
actual_message = (
commit_data.get("commit", {}).get("message", "").split("\n")[0]
) # First line only
if actual_message != expected_message:
print(
f"Error: Wrong commit message for {feature['sha']}. Expected: '{expected_message}', Got: '{actual_message}'",
file=sys.stderr,
)
return False
# Check date format (YYYY-MM-DD)
if not re.match(r"^\d{4}-\d{2}-\d{2}$", feature["date"]):
print(
f"Error: Invalid date format for {feature['name']}: {feature['date']}",
file=sys.stderr,
)
return False
# Check actual date matches expected
expected_date = expected_dates.get(feature["sha"])
if expected_date:
if feature["date"] != expected_date:
print(
f"Error: Wrong date for {feature['sha']}. Expected: {expected_date}, Got: {feature['date']}",
file=sys.stderr,
)
return False
print("✓ All commit details verified")
# 6. Verify the table format is correct
print("6. Verifying table format...")
table_header = "| Feature Name | Commit SHA | Author | Branch | Date | Files Changed | Commit Message |"
if table_header not in content:
print("Error: Table header format is incorrect", file=sys.stderr)
return False
# Check that all features have complete information
for feature in features:
if not all(
[
feature["name"],
feature["sha"],
feature["author"],
feature["branch"],
feature["date"],
feature.get("commit_message", ""),
]
):
print(
f"Error: Incomplete information for feature: {feature['name']}",
file=sys.stderr,
)
return False
print("✓ Table format is correct and complete")
print("\n✅ All verification checks passed!")
print("Feature commit tracking completed successfully:")
print(" - File: FEATURE_COMMITS.md created in main branch")
print(f" - Features tracked: {len(features)}")
print(" - All expected commit SHAs verified")
print(" - All commit authors verified")
print(" - Analysis summary complete")
return True
if __name__ == "__main__":
success = verify_task()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/claude-code/label_color_standardization/description.md
================================================
I need you to implement a comprehensive label documentation and organization workflow for the repository.
**Step 1: Create Label Documentation Issue**
Create a new issue with:
- Title containing: "Document label organization for better visual organization" and "label guide"
- Body must include:
- A "## Problem" heading describing the need for better label documentation
- A "## Proposed Solution" heading about creating a comprehensive label guide for different label categories
- A "## Benefits" heading listing improved visual organization and easier issue triage
- Keywords: "label documentation", "visual organization", "label guide", "organization"
- Labels: Initially add "enhancement" and "documentation" labels to the issue
**Step 2: Create Feature Branch**
Create a new branch called 'feat/label-color-guide' from main.
**Step 3: Create Label Documentation**
On the feature branch, create the file `docs/LABEL_COLORS.md` with:
- A "# Label Organization Guide" title
- A "## Label Categories" section with a table that MUST follow this exact format:
```markdown
| Label Name | Category | Description |
|------------|----------|-------------|
```
The table must include ALL existing labels in the repository. For each label:
- Group labels by category (e.g., issue-type, platform, area, status, performance)
- Include a description for each label
The file must also contain a "## Usage Guidelines" section explaining when to use each label category.
**Step 4: Apply ALL Labels to the Documentation Issue**
Update the issue you created in Step 1 by adding ALL existing labels from the repository. This serves as a visual demonstration of the label organization. The issue should have every single label that exists in the repository applied to it.
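One hedged way to do this, with `ORG`, `REPO`, and `ISSUE_NUMBER` as placeholders and a token assumed in `MCP_GITHUB_TOKEN`, is to list every label defined in the repository and add the full set to the issue in one call:
```python
# Illustrative sketch: list every label in the repository, then add the full
# set to the documentation issue. ORG, REPO, and ISSUE_NUMBER are placeholders.
import os
import requests

ORG, REPO, ISSUE_NUMBER = "your-org", "claude-code", 0
HEADERS = {
    "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
labels = requests.get(
    f"https://api.github.com/repos/{ORG}/{REPO}/labels",
    headers=HEADERS,
    params={"per_page": 100},
)
labels.raise_for_status()
label_names = [label["name"] for label in labels.json()]
# POST to the issue's labels endpoint appends labels without removing existing ones.
resp = requests.post(
    f"https://api.github.com/repos/{ORG}/{REPO}/issues/{ISSUE_NUMBER}/labels",
    headers=HEADERS,
    json={"labels": label_names},
)
resp.raise_for_status()
```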
**Step 5: Create Pull Request**
Create a pull request from 'feat/label-color-guide' to 'main' with:
- Title containing: "Add label organization guide" and "visual organization"
- Body must include:
- A "## Summary" heading explaining the label organization documentation
- A "## Changes" heading with a bullet list of what was added
- "Fixes #[ISSUE_NUMBER]" pattern linking to your created issue
- A "## Verification" section stating that all labels have been documented
- Keywords: "label documentation", "organization guide", "visual improvement", "documentation"
- Labels: Add a reasonable subset of labels to the PR (at least 5-10 labels from different categories)
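A minimal sketch of opening such a pull request through the REST API, using the same placeholder values, could look like this:
```python
# Illustrative sketch: open the pull request from the feature branch into main.
# ORG, REPO, and ISSUE_NUMBER are placeholders; the body text is abbreviated.
import os
import requests

ORG, REPO, ISSUE_NUMBER = "your-org", "claude-code", 0
resp = requests.post(
    f"https://api.github.com/repos/{ORG}/{REPO}/pulls",
    headers={
        "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
        "Accept": "application/vnd.github.v3+json",
    },
    json={
        "title": "Add label organization guide for visual organization",
        "head": "feat/label-color-guide",
        "base": "main",
        "body": (
            "## Summary\n...\n"
            "## Changes\n- ...\n"
            f"Fixes #{ISSUE_NUMBER}\n"
            "## Verification\nAll labels have been documented.\n"
        ),
    },
)
resp.raise_for_status()
print("Created PR", resp.json()["number"])
```
Labels can then be attached to the pull request through the same issues labels endpoint shown in Step 4, since a pull request shares its number with an issue.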
**Step 6: Document Changes in Issue**
Add a comment to the original issue with:
- Confirmation that the label documentation has been created
- Total count of labels documented
- Reference to the PR using "PR #[NUMBER]" pattern
- Keywords: "documentation created", "label guide complete", "organization complete"
================================================
FILE: tasks/github/standard/claude-code/label_color_standardization/meta.json
================================================
{
"task_id": "label_color_standardization",
"task_name": "Label Color Standardization",
"category_id": "claude-code",
"category_name": "Claude Code",
"description": "Standardize label colors from default gray to a comprehensive color scheme for better visual organization and issue triage.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"workflow automation"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/standard/claude-code/label_color_standardization/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _check_file_content(
branch: str,
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
) -> Optional[str]:
"""Get file content from a branch."""
import base64
success, result = _get_github_api(
f"contents/{file_path}?ref={branch}", headers, org, repo
)
if not success or not result:
return None
if result.get("content"):
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
return None
def _parse_label_table(content: str) -> List[str]:
"""Parse the label table from markdown content and return label names."""
documented_labels = []
# Find the table in the content
lines = content.split("\n")
in_table = False
for line in lines:
# Skip header and separator lines
if "| Label Name | Category |" in line:
in_table = True
continue
if in_table and line.startswith("|---"):
continue
# Parse table rows
if in_table and line.startswith("|"):
parts = [p.strip() for p in line.split("|")]
if len(parts) >= 3: # Should have at least label, category
label_name = parts[1].strip()
if label_name:
documented_labels.append(label_name)
# Stop at end of table
if in_table and line and not line.startswith("|"):
break
return documented_labels
def _find_issue_by_title_keywords(
title_keywords: List[str],
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
) -> Optional[Dict]:
"""Find an issue by title keywords and return the issue data."""
for state in ["open", "closed"]:
success, issues = _get_github_api(
f"issues?state={state}&per_page=100", headers, org, repo
)
if success and issues:
for issue in issues:
# Skip pull requests
if "pull_request" in issue:
continue
title = issue.get("title", "").lower()
if all(keyword.lower() in title for keyword in title_keywords):
return issue
return None
def _find_pr_by_title_keywords(
title_keywords: List[str],
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
) -> Optional[Dict]:
"""Find a PR by title keywords and return the PR data."""
for state in ["open", "closed"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, org, repo
)
if success and prs:
for pr in prs:
title = pr.get("title", "").lower()
if all(keyword.lower() in title for keyword in title_keywords):
return pr
return None
def _get_issue_comments(
issue_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> List[Dict]:
"""Get all comments for an issue."""
success, comments = _get_github_api(
f"issues/{issue_number}/comments", headers, org, repo
)
if success and comments:
return comments
return []
def verify() -> bool:
"""
Programmatically verify that the label color standardization workflow meets the
requirements described in description.md.
"""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
# Configuration constants
BRANCH_NAME = "feat/label-color-guide"
# Issue requirements
ISSUE_TITLE_KEYWORDS = ["Document label organization", "label guide"]
ISSUE_KEYWORDS = [
"label documentation",
"visual organization",
"label guide",
"organization",
]
# PR requirements
PR_TITLE_KEYWORDS = ["label organization guide", "visual organization"]
PR_KEYWORDS = [
"label documentation",
"organization guide",
"visual improvement",
"documentation",
]
# All expected labels in the repository that are actually used/discoverable via MCP tools
# Note: Excludes 'wontfix', 'invalid', 'good first issue', 'help wanted' as they exist
# in the repository but are not used by any issues (not discoverable via MCP search)
ALL_EXPECTED_LABELS = [
"bug",
"enhancement",
"duplicate",
"question",
"documentation",
"platform:macos",
"platform:linux",
"platform:windows",
"area:core",
"area:tools",
"area:tui",
"area:ide",
"area:mcp",
"area:api",
"area:security",
"area:model",
"area:auth",
"area:packaging",
"has repro",
"memory",
"perf:memory",
"external",
]
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying label color standardization workflow completion...")
# 1. Check that feature branch exists
print("1. Verifying feature branch exists...")
if not _check_branch_exists(BRANCH_NAME, headers, github_org):
print(f"Error: Branch '{BRANCH_NAME}' not found", file=sys.stderr)
return False
# 2. Check documentation file exists and has correct format
print("2. Verifying label documentation file...")
doc_content = _check_file_content(
BRANCH_NAME, "docs/LABEL_COLORS.md", headers, github_org
)
if not doc_content:
print("Error: docs/LABEL_COLORS.md not found", file=sys.stderr)
return False
# Parse the label table from documentation
documented_labels = _parse_label_table(doc_content)
if len(documented_labels) < 20:
print(
f"Error: Documentation table incomplete, found only {len(documented_labels)} labels",
file=sys.stderr,
)
return False
# 3. Verify labels are documented
print("3. Verifying expected labels are documented...")
print(f" ✓ {len(ALL_EXPECTED_LABELS)} expected labels defined for verification")
# 4. Find the created issue
print("4. Verifying issue creation...")
issue = _find_issue_by_title_keywords(ISSUE_TITLE_KEYWORDS, headers, github_org)
if not issue:
print(
"Error: Issue with title containing required keywords not found",
file=sys.stderr,
)
return False
issue_number = issue.get("number")
issue_body = issue.get("body", "")
# Check issue content has required sections and keywords
issue_required_sections = ["## Problem", "## Proposed Solution", "## Benefits"]
for section in issue_required_sections:
if section not in issue_body:
print(f"Error: Issue body missing required section: {section}", file=sys.stderr)
return False
# Check issue has required keywords
if not all(keyword.lower() in issue_body.lower() for keyword in ISSUE_KEYWORDS):
missing_keywords = [kw for kw in ISSUE_KEYWORDS if kw.lower() not in issue_body.lower()]
print(f"Error: Issue body missing required keywords: {missing_keywords}", file=sys.stderr)
return False
# Check issue has initial required labels (enhancement and documentation)
issue_label_names = [label["name"] for label in issue.get("labels", [])]
initial_required_labels = ["enhancement", "documentation"]
for required_label in initial_required_labels:
if required_label not in issue_label_names:
print(f"Error: Issue missing initial required label: {required_label}", file=sys.stderr)
return False
# 5. Find the created PR
print("5. Verifying pull request creation...")
pr = _find_pr_by_title_keywords(PR_TITLE_KEYWORDS, headers, github_org)
if not pr:
print(
"Error: PR with title containing required keywords not found",
file=sys.stderr,
)
return False
pr_number = pr.get("number")
pr_body = pr.get("body", "")
pr_labels = pr.get("labels", [])
# Check PR references issue with correct pattern
if f"Fixes #{issue_number}" not in pr_body and f"fixes #{issue_number}" not in pr_body:
print(f"Error: PR does not contain 'Fixes #{issue_number}' pattern", file=sys.stderr)
return False
# Check PR body has required sections and keywords
pr_required_sections = ["## Summary", "## Changes", "## Verification"]
for section in pr_required_sections:
if section not in pr_body:
print(f"Error: PR body missing required section: {section}", file=sys.stderr)
return False
# Check PR has required keywords
if not all(keyword.lower() in pr_body.lower() for keyword in PR_KEYWORDS):
missing_keywords = [kw for kw in PR_KEYWORDS if kw.lower() not in pr_body.lower()]
print(f"Error: PR body missing required keywords: {missing_keywords}", file=sys.stderr)
return False
# Check PR has sufficient labels (at least 5 from different categories)
if len(pr_labels) < 5:
print(f"Error: PR has only {len(pr_labels)} labels, needs at least 5", file=sys.stderr)
return False
# 6. Verify issue has ALL expected/usable labels applied (demonstrates organization)
print("6. Verifying issue has all expected labels applied...")
issue_label_names = [label["name"] for label in issue.get("labels", [])]
# Use our expected labels list instead of all repo labels (excludes unused labels)
expected_labels_to_check = ALL_EXPECTED_LABELS
missing_labels = []
for expected_label in expected_labels_to_check:
if expected_label not in issue_label_names:
missing_labels.append(expected_label)
if missing_labels:
print(
f"Error: Issue missing {len(missing_labels)} expected labels: {missing_labels[:5]}...",
file=sys.stderr,
)
return False
print(f" ✓ Issue has all {len(expected_labels_to_check)} expected labels applied")
# 7. Verify issue has comment documenting changes
print("7. Verifying issue comment with documentation...")
issue_comments = _get_issue_comments(issue_number, headers, github_org)
found_update_comment = False
comment_required_keywords = ["documentation created", "label guide complete", "organization complete"]
for comment in issue_comments:
body = comment.get("body", "")
# Check for PR reference and required keywords
if (f"PR #{pr_number}" in body and
any(keyword.lower() in body.lower() for keyword in comment_required_keywords) and
"total" in body.lower() and "labels" in body.lower()):
found_update_comment = True
break
if not found_update_comment:
print("Error: Issue missing comment documenting changes with required content", file=sys.stderr)
print(" Comment should include: PR reference, label count, and completion keywords", file=sys.stderr)
return False
# 8. Final verification of complete workflow
print("8. Final verification of workflow completion...")
# Skip repository label existence check - we trust that our expected labels
# are the ones actually discoverable/usable via MCP tools
# Ensure expected labels are documented (not all repo labels, since some are unused)
documented_label_count = len(documented_labels)
expected_label_count = len(ALL_EXPECTED_LABELS)
if documented_label_count < expected_label_count:
print(
f"Error: Documentation incomplete - {documented_label_count} documented vs {expected_label_count} expected",
file=sys.stderr,
)
return False
# Check that all expected labels are documented
missing_documented_labels = []
for expected_label in ALL_EXPECTED_LABELS:
if expected_label not in documented_labels:
missing_documented_labels.append(expected_label)
if missing_documented_labels:
print(
f"Error: Documentation missing expected labels: {missing_documented_labels}",
file=sys.stderr,
)
return False
print(f" ✓ All {expected_label_count} expected labels documented")
print(f" ✓ All {len(ALL_EXPECTED_LABELS)} expected labels present and documented")
print("\n✓ All verification checks passed!")
print("Label documentation workflow completed successfully:")
print(
f" - Issue #{issue_number}: {issue.get('title')} (with all {len(issue_label_names)} labels)"
)
print(f" - PR #{pr_number}: {pr.get('title')}")
print(f" - Branch: {BRANCH_NAME}")
print(" - Documentation: docs/LABEL_COLORS.md")
print(f" - {expected_label_count} labels documented for better organization")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/easyr1/advanced_branch_strategy/description.md
================================================
The EasyR1 repository has a critical production issue: all development happens directly on the `main` branch, which is extremely risky for a project with 25 active issues. A recent commit `098931530606d22f867fd121b1dcb3225a43661f` introduced protocol changes that need to be properly managed through a structured branching workflow. I need you to implement a complete GitFlow strategy by working through a realistic development scenario.
**The Scenario:** You're preparing for the v1.0.0 release while simultaneously handling a critical protocol serialization bug that was introduced in the recent data proto changes.
**Step 1: Initialize GitFlow Structure**
Create a `develop` branch from `main` as the new integration branch. Then create a `release/v1.0.0` branch from `develop` to prepare for the upcoming release.
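For illustration only, a minimal sketch of creating the `develop` branch through the Git references API is shown below (placeholder `your-org`, token assumed in `MCP_GITHUB_TOKEN`); the same two calls cover `release/v1.0.0` and the feature branch by changing the base ref and the new ref name:
```python
# Illustrative sketch: create the develop branch as a new ref pointing at the
# commit that main currently resolves to.
import os
import requests

ORG, REPO = "your-org", "EasyR1"  # placeholder values
HEADERS = {
    "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
BASE = f"https://api.github.com/repos/{ORG}/{REPO}"
main_ref = requests.get(f"{BASE}/git/ref/heads/main", headers=HEADERS)
main_ref.raise_for_status()
main_sha = main_ref.json()["object"]["sha"]
resp = requests.post(
    f"{BASE}/git/refs",
    headers=HEADERS,
    json={"ref": "refs/heads/develop", "sha": main_sha},
)
resp.raise_for_status()
```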
**Step 2: Address the Critical Bug**
Create a `feature/protocol-serialization-fix` branch from `develop`. In this branch, create a new file called `PROTOCOL_FIXES.md` with the exact content:
```
# Protocol Serialization Fixes
## Critical Fix for Data Proto Issue
- Enhanced serialization safety check implemented
- Addresses issue from commit 098931530606d22f867fd121b1dcb3225a43661f
- Status: Ready for integration testing
```
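A hedged sketch of creating this file on the feature branch via the contents API follows (placeholder `your-org`, token assumed in `MCP_GITHUB_TOKEN`; the commit message is an assumption):
```python
# Illustrative sketch: create PROTOCOL_FIXES.md on the feature branch with the
# contents API; the file body mirrors the block above.
import base64
import os
import requests

ORG, REPO = "your-org", "EasyR1"  # placeholder values
CONTENT = """# Protocol Serialization Fixes
## Critical Fix for Data Proto Issue
- Enhanced serialization safety check implemented
- Addresses issue from commit 098931530606d22f867fd121b1dcb3225a43661f
- Status: Ready for integration testing
"""
resp = requests.put(
    f"https://api.github.com/repos/{ORG}/{REPO}/contents/PROTOCOL_FIXES.md",
    headers={
        "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
        "Accept": "application/vnd.github.v3+json",
    },
    json={
        "message": "Add protocol serialization fix notes",  # assumed commit message
        "content": base64.b64encode(CONTENT.encode("utf-8")).decode("ascii"),
        "branch": "feature/protocol-serialization-fix",
    },
)
resp.raise_for_status()
```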
**Step 3: Integrate the Fix Through Proper Workflow**
Create a pull request from `feature/protocol-serialization-fix` to `develop` to integrate the fix documentation. This demonstrates the feature → develop integration pattern.
**Step 4: Update Release Branch and CI/CD**
Merge the develop branch changes into the `release/v1.0.0` branch to include the critical fix in the release.
**Step 5: Document the New Process**
Create an issue titled `Implement Advanced Branch Protection Strategy` with exactly these 3 checkboxes in the body:
- [ ] All development flows through develop branch
- [ ] Release preparation happens in release/v1.0.0 branch
- [ ] Feature integration uses PR workflow
Add the label `process-implementation` to this issue to track the process implementation.
================================================
FILE: tasks/github/standard/easyr1/advanced_branch_strategy/meta.json
================================================
{
"task_id": "advanced_branch_strategy",
"task_name": "Advanced Branch Strategy",
"category_id": "easyr1",
"category_name": "EasyR1",
"description": "Implement GitFlow branching strategy with develop, release, and feature branches to replace risky direct-to-main development.",
"author": "Xiangyan Liu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"pr workflows",
"release coordination"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/EasyR1",
"stateOriginalUrl": "https://github.com/hiyouga/EasyR1"
}
}
================================================
FILE: tasks/github/standard/easyr1/advanced_branch_strategy/verify.py
================================================
import sys
import os
import requests
from typing import Dict, Optional, Tuple
from dotenv import load_dotenv
load_dotenv(".mcp_env")
def _get_github_api(
endpoint: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
github_org = os.environ.get("GITHUB_EVAL_ORG")
url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_gitflow_branches(headers: Dict[str, str]) -> bool:
"""Check if GitFlow branches are properly created from correct base branches."""
success, branches_data = _get_github_api("branches", headers)
if not success or not branches_data:
print("Error: Could not fetch branches", file=sys.stderr)
return False
existing_branches = [branch.get("name", "") for branch in branches_data]
required_branches = [
"develop",
"release/v1.0.0",
"feature/protocol-serialization-fix",
]
for branch in required_branches:
if branch not in existing_branches:
print(f"Error: Required branch '{branch}' not found", file=sys.stderr)
return False
return True
def _check_protocol_fixes_file(headers: Dict[str, str]) -> bool:
"""Check if PROTOCOL_FIXES.md file exists in feature branch with correct content."""
success, file_data = _get_github_api(
"contents/PROTOCOL_FIXES.md?ref=feature/protocol-serialization-fix", headers
)
if not success or not file_data:
print("Error: PROTOCOL_FIXES.md not found in feature branch", file=sys.stderr)
return False
# Decode base64 content
import base64
content = base64.b64decode(file_data.get("content", "")).decode("utf-8")
# Check for required content elements
required_elements = [
"# Protocol Serialization Fixes",
"## Critical Fix for Data Proto Issue",
"Enhanced serialization safety check implemented",
"098931530606d22f867fd121b1dcb3225a43661f",
"Status: Ready for integration testing",
]
for element in required_elements:
if element not in content:
print(
f"Error: PROTOCOL_FIXES.md missing required content: {element}",
file=sys.stderr,
)
return False
return True
def _check_integration_workflow(headers: Dict[str, str]) -> Optional[Dict]:
"""Verify the feature → develop integration pull request exists."""
# Check both open and closed PRs since the workflow may have completed
success, prs = _get_github_api("pulls?state=all", headers)
if not success or not prs:
print("Error: Could not fetch pull requests", file=sys.stderr)
return None
for pr in prs:
head_ref = pr.get("head", {}).get("ref", "")
base_ref = pr.get("base", {}).get("ref", "")
if head_ref == "feature/protocol-serialization-fix" and base_ref == "develop":
return pr
print(
"Error: Integration PR from feature/protocol-serialization-fix to develop not found",
file=sys.stderr,
)
return None
def _check_release_branch_updated(headers: Dict[str, str]) -> bool:
"""Check if release branch contains the develop branch changes."""
# Check if PROTOCOL_FIXES.md exists in release branch
success, file_data = _get_github_api(
"contents/PROTOCOL_FIXES.md?ref=release/v1.0.0", headers
)
if not success or not file_data:
print(
"Error: PROTOCOL_FIXES.md not found in release branch - develop changes not merged",
file=sys.stderr,
)
return False
return True
def _check_process_documentation(headers: Dict[str, str]) -> Optional[Dict]:
"""Check if process is properly documented in an issue."""
success, issues = _get_github_api("issues", headers)
if not success or not issues:
print("Error: Could not fetch issues for documentation check", file=sys.stderr)
return None
expected_title = "Implement Advanced Branch Protection Strategy"
expected_checkboxes = [
"All development flows through develop branch",
"Release preparation happens in release/v1.0.0 branch",
"Feature integration uses PR workflow",
]
for issue in issues:
title = issue.get("title", "")
if title == expected_title:
body = issue.get("body", "")
# Check for exactly 3 checkboxes with specific content
checkbox_count = body.count("- [ ]") + body.count("- [x]")
if checkbox_count != 3:
print(
f"Error: Documentation issue should have 3 checkboxes, found {checkbox_count}",
file=sys.stderr,
)
return None
# Check for specific checkbox content
for expected_text in expected_checkboxes:
if expected_text not in body:
print(
f"Error: Documentation issue missing required checkbox: {expected_text}",
file=sys.stderr,
)
return None
# Check label assignment
labels = issue.get("labels", [])
label_names = [label.get("name") for label in labels]
if "process-implementation" not in label_names:
print(
"Error: Documentation issue not labeled with 'process-implementation'",
file=sys.stderr,
)
return None
return issue
print("Error: Process documentation issue not found", file=sys.stderr)
return None
def verify() -> bool:
"""
Verify the complete GitFlow implementation following the integrated workflow
described in description.md.
"""
# Get GitHub token
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying integrated GitFlow workflow implementation...")
# 1. Verify GitFlow structure initialization
print("1. Checking GitFlow branch structure...")
if not _check_gitflow_branches(headers):
return False
# 2. Verify critical bug fix implementation via new file
print("2. Checking protocol serialization fix documentation...")
if not _check_protocol_fixes_file(headers):
return False
# 3. Verify integration workflow (feature → develop PR)
print("3. Checking feature integration workflow...")
integration_pr = _check_integration_workflow(headers)
if not integration_pr:
return False
    # 4. Verify release branch updated with the develop changes
    print("4. Checking release branch sync...")
if not _check_release_branch_updated(headers):
return False
# 5. Verify process documentation
print("5. Checking process documentation...")
doc_issue = _check_process_documentation(headers)
if not doc_issue:
return False
print("\n✓ Integrated GitFlow workflow successfully implemented!")
print("✓ GitFlow structure: main → develop → release/v1.0.0 branches created")
print("✓ Critical fix: Protocol fix documented in PROTOCOL_FIXES.md file")
print(
f"✓ Integration: PR #{integration_pr.get('number')} demonstrates feature → develop workflow"
)
    print("✓ Release prep: Release branch contains develop changes")
print(
f"✓ Documentation: Process documented in issue #{doc_issue.get('number')} with proper checkboxes"
)
print(
"\nThe repository now has a structured GitFlow workflow ready for implementation!"
)
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/easyr1/config_parameter_audit/description.md
================================================
I need you to perform a deep investigation into recent configuration changes in our EasyR1 repository that may be causing training instability issues.
## Task Requirements
### 1. Deep Commit Analysis
Find the exact commit SHA where the `micro_batch_size_per_device_for_update` parameter was changed from `4` to `1` in the `examples/config.yaml` file. Use GitHub API to:
- Examine recent commits that modified `examples/config.yaml`
- Get the specific commit diff showing this parameter change
- Identify the commit author and timestamp
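A minimal sketch of this lookup, assuming a placeholder `your-org` organization and a token in `MCP_GITHUB_TOKEN`, filters the commit list by path and scans each commit's patch for the parameter name:
```python
# Illustrative sketch: walk commits that touched examples/config.yaml and scan
# each commit's patch for the micro_batch_size_per_device_for_update change.
import os
import requests

ORG, REPO = "your-org", "EasyR1"  # placeholder values
HEADERS = {
    "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
BASE = f"https://api.github.com/repos/{ORG}/{REPO}"
commits = requests.get(
    f"{BASE}/commits",
    headers=HEADERS,
    params={"path": "examples/config.yaml", "per_page": 100},
)
commits.raise_for_status()
for entry in commits.json():
    detail = requests.get(f"{BASE}/commits/{entry['sha']}", headers=HEADERS).json()
    for changed in detail.get("files", []):
        if changed.get("filename") == "examples/config.yaml":
            patch = changed.get("patch", "")
            if "micro_batch_size_per_device_for_update" in patch:
                # Author login, commit date, and the raw diff hunk are all in this response.
                print(entry["sha"], detail["commit"]["author"]["date"][:10])
```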
### 2. Related Parameter Investigation
In the same commit you found above, identify what value the `micro_batch_size_per_device_for_experience` parameter was changed to. Document:
- The before value for this parameter
- The after value for this parameter
- The specific line numbers in the diff where these changes occurred
### 3. Issue Search and Verification
Search through all GitHub issues (both open and closed) to find issues that contain specific keywords. Identify all issue numbers where the issue title or body text contains any of these exact terms:
- "OOM" (case insensitive)
- "memory" (case insensitive)
- "batch" (case insensitive)
- "显存" (GPU memory in Chinese)
You must find and list ALL issues that contain any of these keywords in their titles or bodies, regardless of whether you think they're related to the parameter changes.
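One hedged way to run this search, under the same placeholder assumptions, is to paginate the issues endpoint and test each title and body against the keyword list (note that this endpoint also returns pull requests):
```python
# Illustrative sketch: paginate all issues (open and closed) and collect the
# numbers whose title or body mentions any of the target keywords.
import os
import requests

ORG, REPO = "your-org", "EasyR1"  # placeholder values
HEADERS = {
    "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
KEYWORDS = ["oom", "memory", "batch", "显存"]
matches, page = set(), 1
while True:
    resp = requests.get(
        f"https://api.github.com/repos/{ORG}/{REPO}/issues",
        headers=HEADERS,
        params={"state": "all", "per_page": 100, "page": page},
    )
    resp.raise_for_status()
    issues = resp.json()
    if not issues:
        break
    for issue in issues:
        text = (issue.get("title", "") + " " + (issue.get("body") or "")).lower()
        if any(keyword in text for keyword in KEYWORDS):
            matches.add(issue["number"])
    page += 1
print(sorted(matches))
```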
### 4. File Creation and Results
Create a file named exactly `ANALYSIS_RESULTS.json` in the repository root with this exact structure:
```json
{
"target_commit_sha": "full-40-character-commit-sha",
"commit_author": "author-username",
"commit_date": "YYYY-MM-DD",
"parameter_changes": {
"micro_batch_size_per_device_for_update": {
"before": 4,
"after": 1,
"line_number": 123
},
"micro_batch_size_per_device_for_experience": {
"before": 16,
"after": 2,
"line_number": 124
}
},
"related_issue_number_list": [9, 46]
}
```
### 5. Verification Requirements
- The commit SHA must be exactly 40 hexadecimal characters
- The parameter values must match the actual repository changes
- Every issue number in `related_issue_number_list` must reference a real issue in the repository
- All data must be obtained through GitHub API analysis, not guesswork
================================================
FILE: tasks/github/standard/easyr1/config_parameter_audit/meta.json
================================================
{
"task_id": "config_parameter_audit",
"task_name": "Config Parameter Audit",
"category_id": "easyr1",
"category_name": "EasyR1",
"description": "Investigate configuration changes causing training instability by analyzing commits and identifying related memory issues.",
"author": "Xiangyan Liu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis",
"issue management"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/EasyR1",
"stateOriginalUrl": "https://github.com/hiyouga/EasyR1"
}
}
================================================
FILE: tasks/github/standard/easyr1/config_parameter_audit/verify.py
================================================
import sys
import os
import json
import requests
import re
from typing import Dict, Optional, Tuple
from dotenv import load_dotenv
load_dotenv(".mcp_env")
def _get_github_api(
endpoint: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
github_org = os.environ.get("GITHUB_EVAL_ORG")
url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_analysis_results(headers: Dict[str, str]) -> Optional[Dict]:
"""Get ANALYSIS_RESULTS.json file content."""
success, file_data = _get_github_api("contents/ANALYSIS_RESULTS.json", headers)
if not success:
return None
# Decode base64 content
import base64
content = file_data.get("content", "")
if content:
try:
decoded_content = base64.b64decode(content).decode("utf-8")
return json.loads(decoded_content)
except Exception as e:
print(f"Error parsing JSON: {e}", file=sys.stderr)
return None
return None
def _verify_commit_data(results: Dict, headers: Dict[str, str]) -> bool:
"""Verify the commit data is accurate."""
commit_sha = results.get("target_commit_sha")
# Validate SHA format
if not re.match(r"^[a-f0-9]{40}$", commit_sha, re.IGNORECASE):
print(f"Error: Invalid commit SHA format: {commit_sha}", file=sys.stderr)
return False
# Get commit details
success, commit_data = _get_github_api(f"commits/{commit_sha}", headers)
if not success:
print(f"Error: Commit {commit_sha} not found in repository", file=sys.stderr)
return False
# Verify author
expected_author = results.get("commit_author")
actual_author = commit_data.get("author", {}).get("login")
if expected_author != actual_author:
print(
f"Error: Commit author mismatch. Expected: {expected_author}, Actual: {actual_author}",
file=sys.stderr,
)
return False
# Verify date format
commit_date = results.get("commit_date")
if not re.match(r"^\d{4}-\d{2}-\d{2}$", commit_date):
print(
f"Error: Invalid date format: {commit_date}. Expected YYYY-MM-DD",
file=sys.stderr,
)
return False
return True
def _verify_parameter_changes(results: Dict, headers: Dict[str, str]) -> bool:
"""Verify the parameter changes are accurate."""
param_changes = results.get("parameter_changes", {})
# Check required parameters exist
required_params = [
"micro_batch_size_per_device_for_update",
"micro_batch_size_per_device_for_experience",
]
for param in required_params:
if param not in param_changes:
print(f"Error: Missing parameter change data for: {param}", file=sys.stderr)
return False
change_data = param_changes[param]
if not all(key in change_data for key in ["before", "after", "line_number"]):
print(
f"Error: Incomplete change data for parameter: {param}", file=sys.stderr
)
return False
# Verify specific expected values based on known repository state
update_param = param_changes.get("micro_batch_size_per_device_for_update", {})
if update_param.get("before") != 4 or update_param.get("after") != 1:
print(
"Error: Incorrect values for micro_batch_size_per_device_for_update",
file=sys.stderr,
)
return False
experience_param = param_changes.get(
"micro_batch_size_per_device_for_experience", {}
)
if experience_param.get("before") != 16 or experience_param.get("after") != 2:
print(
"Error: Incorrect values for micro_batch_size_per_device_for_experience",
file=sys.stderr,
)
return False
return True
def _get_all_issues_with_keywords(headers: Dict[str, str]) -> set:
"""Find all issues in repository that contain the required keywords."""
required_keywords = ["oom", "memory", "batch", "显存"]
keyword_issues = set()
# Get all issues from repository (both open and closed)
page = 1
while True:
success, issues = _get_github_api(
f"issues?state=all&per_page=100&page={page}", headers
)
if not success or not issues:
break
for issue in issues:
issue_number = issue.get("number")
title = issue.get("title", "").lower()
body = issue.get("body", "").lower() if issue.get("body") else ""
issue_text = title + " " + body
# Check if any keyword appears in title or body
for keyword in required_keywords:
if keyword.lower() in issue_text:
keyword_issues.add(issue_number)
break
# If we got less than 100 issues, we're done
if len(issues) < 100:
break
page += 1
return keyword_issues
def _verify_issue_references(results: Dict, headers: Dict[str, str]) -> bool:
"""Verify the issue references contain the required keywords."""
issue_number_list = results.get("related_issue_number_list")
if not isinstance(issue_number_list, list) or len(issue_number_list) == 0:
print(
"Error: related_issue_number_list must be a non-empty list",
file=sys.stderr,
)
return False
# Required keywords to search for (case insensitive)
required_keywords = ["oom", "memory", "batch", "显存"]
# First, dynamically find all issues that contain the required keywords
expected_issues = _get_all_issues_with_keywords(headers)
    print(f"Issues containing required keywords: {sorted(expected_issues)}")
provided_issues = set(issue_number_list)
# Verify each provided issue contains at least one of the required keywords
for issue_number in issue_number_list:
if not isinstance(issue_number, int) or issue_number <= 0:
print(
f"Error: Invalid issue number format: {issue_number}", file=sys.stderr
)
return False
# Get issue details
success, issue_data = _get_github_api(f"issues/{issue_number}", headers)
if not success:
print(
f"Error: Issue #{issue_number} not found in repository", file=sys.stderr
)
return False
# Check if issue title or body contains any required keywords
title = issue_data.get("title", "").lower()
body = issue_data.get("body", "").lower() if issue_data.get("body") else ""
issue_text = title + " " + body
issue_has_keyword = False
for keyword in required_keywords:
if keyword.lower() in issue_text:
issue_has_keyword = True
break
if not issue_has_keyword:
print(
f"Error: Issue #{issue_number} does not contain any required keywords: {required_keywords}",
file=sys.stderr,
)
return False
# Verify agent found exactly the same issues as our dynamic search
if provided_issues != expected_issues:
missing = expected_issues - provided_issues
extra = provided_issues - expected_issues
if missing:
print(
f"Error: Missing issues that contain required keywords: {missing}",
file=sys.stderr,
)
if extra:
print(
f"Error: Extra issues that don't contain required keywords: {extra}",
file=sys.stderr,
)
return False
print(
f"✓ Found all {len(issue_number_list)} issues containing required keywords: {issue_number_list}"
)
return True
def verify() -> bool:
"""
Programmatically verify that the deep commit analysis meets the requirements.
"""
# Get GitHub token
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying deep commit analysis completion...")
# 1. Check ANALYSIS_RESULTS.json exists and is valid JSON
print("1. Checking ANALYSIS_RESULTS.json exists and is valid...")
results = _get_analysis_results(headers)
if not results:
print("Error: ANALYSIS_RESULTS.json not found or invalid JSON", file=sys.stderr)
return False
print("✓ Found valid ANALYSIS_RESULTS.json")
# 2. Verify commit data accuracy
print("2. Verifying commit data accuracy...")
if not _verify_commit_data(results, headers):
return False
print("✓ Commit SHA, author, and date verified")
# 3. Verify parameter changes accuracy
print("3. Verifying parameter changes accuracy...")
if not _verify_parameter_changes(results, headers):
return False
print("✓ Parameter changes verified with correct before/after values")
# 4. Verify issue references
print("4. Verifying issue references...")
if not _verify_issue_references(results, headers):
return False
print("\n✓ Task completed successfully!")
print("Deep commit analysis results verified:")
print(f"- Found target commit: {results.get('target_commit_sha')}")
print(
"- Verified parameter changes: micro_batch_size_per_device_for_update (4→1), micro_batch_size_per_device_for_experience (16→2)"
)
print(
f"- Verified memory/performance issue correlations: {results.get('related_issue_number_list')}"
)
print("- All data obtained through accurate GitHub API analysis")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/easyr1/performance_regression_investigation/description.md
================================================
In the EasyR1 repo, I've discovered that the recent commit `098931530606d22f867fd121b1dcb3225a43661f` (which fixed data proto) may have introduced performance regressions based on user reports in issues #39 and #41. I need you to create a systematic investigation workflow:
**Step 1: Create Main Tracking Issue**
Create a main issue with the exact title "Performance Regression Analysis: Data Protocol Changes" and add these 3 labels: "bug", "performance", "investigation".
**Step 2: Create Investigation Branches**
Create exactly 3 feature branches from main for different investigation tracks:
- `investigate-protocol-changes` - for testing protocol-related performance issues
- `investigate-batch-processing` - for testing batch processing performance issues
- `investigate-memory-usage` - for testing memory utilization performance issues
**Step 3: Create Sub-Issues**
Create 3 sub-issues and link them to the main tracking issue using sub-issue functionality:
- "Test Performance Impact: fix multi modal data oom"
- "Test Performance Impact: upgrade vllm to 0.10"
- "Test Performance Impact: non blocking false by default"
**Step 4: Document Changes**
Add at least 2 comments to the main tracking issue documenting the specific file changes from commit `098931530606d22f867fd121b1dcb3225a43661f`. Reference the exact files `verl/protocol.py` and `examples/config.yaml` with their commit SHA.
**Step 5: Create Analysis PR**
Create a pull request from the `investigate-protocol-changes` branch to main with the exact title "Performance Analysis: Protocol Changes Investigation".
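For reference, a minimal sketch of Steps 1 and 2 against the GitHub REST API. This is illustrative only: `OWNER` is a placeholder for the evaluation org, the token variable mirrors the benchmark's `MCP_GITHUB_TOKEN`, and an actual run is expected to perform these operations through the GitHub MCP server rather than raw HTTP calls.
```python
import os
import requests

OWNER, REPO = "OWNER", "EasyR1"  # illustrative placeholder
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}

# Step 1: create the main tracking issue with the three required labels.
requests.post(
    f"{API}/issues",
    headers=headers,
    json={
        "title": "Performance Regression Analysis: Data Protocol Changes",
        "body": "Tracking issue for the data protocol performance investigation.",
        "labels": ["bug", "performance", "investigation"],
    },
).raise_for_status()

# Step 2: create each investigation branch from the tip of main.
main_sha = requests.get(f"{API}/git/ref/heads/main", headers=headers).json()["object"]["sha"]
for branch in (
    "investigate-protocol-changes",
    "investigate-batch-processing",
    "investigate-memory-usage",
):
    requests.post(
        f"{API}/git/refs",
        headers=headers,
        json={"ref": f"refs/heads/{branch}", "sha": main_sha},
    ).raise_for_status()
```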
================================================
FILE: tasks/github/standard/easyr1/performance_regression_investigation/meta.json
================================================
{
"task_id": "performance_regression_investigation",
"task_name": "Performance Regression Investigation",
"category_id": "easyr1",
"category_name": "EasyR1",
"description": "Create systematic investigation workflow for performance regressions with tracking issues, investigation branches, and sub-issues.",
"author": "Xiangyan Liu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"repository analysis"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/EasyR1",
"stateOriginalUrl": "https://github.com/hiyouga/EasyR1"
}
}
================================================
FILE: tasks/github/standard/easyr1/performance_regression_investigation/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
load_dotenv(".mcp_env")
def _get_github_api(
endpoint: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
github_org = os.environ.get("GITHUB_EVAL_ORG")
url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _find_main_tracking_issue(headers: Dict[str, str]) -> Optional[Dict]:
"""Find the main tracking issue with exact title and required labels."""
success, issues = _get_github_api("issues?state=open&per_page=50", headers)
if not success or not issues:
return None
for issue in issues:
title = issue.get("title", "")
if title == "Performance Regression Analysis: Data Protocol Changes":
# Check labels
labels = [label.get("name", "") for label in issue.get("labels", [])]
required_labels = {"bug", "performance", "investigation"}
if required_labels.issubset(set(labels)):
return issue
return None
def _check_branches_exist(branch_names: List[str], headers: Dict[str, str]) -> bool:
"""Check if all required branches exist."""
for branch_name in branch_names:
success, _ = _get_github_api(f"branches/{branch_name}", headers)
if not success:
print(f"Error: Branch '{branch_name}' not found", file=sys.stderr)
return False
return True
def _check_sub_issues(
main_issue_number: int, expected_titles: List[str], headers: Dict[str, str]
) -> bool:
"""Check if sub-issues are created and linked to main issue."""
success, sub_issues = _get_github_api(
f"issues/{main_issue_number}/sub_issues", headers
)
if not success:
# If sub_issues endpoint doesn't exist, check for issues mentioning the main issue
success, all_issues = _get_github_api("issues?state=open&per_page=100", headers)
if not success:
return False
sub_issues = []
for issue in all_issues:
body = issue.get("body", "")
title = issue.get("title", "")
# Check if issue references main issue or has expected title pattern
if f"#{main_issue_number}" in body or any(
expected_title in title for expected_title in expected_titles
):
sub_issues.append(issue)
if not sub_issues or len(sub_issues) < 3:
print(
f"Error: Expected 3 sub-issues linked to main issue #{main_issue_number}",
file=sys.stderr,
)
return False
# Check if sub-issues have expected titles
found_titles = [issue.get("title", "") for issue in sub_issues]
for expected_title in expected_titles:
if not any(expected_title in title for title in found_titles):
print(
f"Error: Sub-issue with title containing '{expected_title}' not found",
file=sys.stderr,
)
return False
return True
def _check_issue_comments(issue_number: int, headers: Dict[str, str]) -> bool:
"""Check if main issue has at least 2 comments with file references."""
success, comments = _get_github_api(f"issues/{issue_number}/comments", headers)
if not success or not comments:
print(f"Error: No comments found on issue #{issue_number}", file=sys.stderr)
return False
if len(comments) < 2:
print(
f"Error: Expected at least 2 comments on issue #{issue_number}",
file=sys.stderr,
)
return False
# Check if comments reference specific files and commit
required_refs = [
"verl/protocol.py",
"examples/config.yaml",
"0989315",
]
comment_text = " ".join([comment.get("body", "") for comment in comments])
for ref in required_refs:
if ref not in comment_text:
print(f"Error: Comments missing reference to '{ref}'", file=sys.stderr)
return False
return True
def _find_analysis_pr(headers: Dict[str, str]) -> Optional[Dict]:
"""Find the analysis PR with exact title from specific branch."""
success, prs = _get_github_api("pulls?state=open&per_page=50", headers)
if not success or not prs:
return None
expected_title = "Performance Analysis: Protocol Changes Investigation"
expected_head = "investigate-protocol-changes"
for pr in prs:
title = pr.get("title", "")
head_ref = pr.get("head", {}).get("ref", "")
if title == expected_title and head_ref == expected_head:
return pr
return None
def verify() -> bool:
"""
Programmatically verify that the performance regression investigation workflow meets the
requirements described in description.md.
"""
# Get GitHub token
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying performance regression investigation workflow completion...")
# 1. Check main tracking issue exists with exact title and labels
print("1. Checking main tracking issue with required title and labels...")
main_issue = _find_main_tracking_issue(headers)
if not main_issue:
print(
"Error: Main tracking issue not found with exact title 'Performance Regression Analysis: Data Protocol Changes' and labels 'bug', 'performance', 'investigation'",
file=sys.stderr,
)
return False
main_issue_number = main_issue.get("number")
print(f"Found main tracking issue #{main_issue_number}")
# 2. Check that all 3 investigation branches exist
print("2. Checking investigation branches exist...")
required_branches = [
"investigate-protocol-changes",
"investigate-batch-processing",
"investigate-memory-usage",
]
if not _check_branches_exist(required_branches, headers):
return False
# 3. Check sub-issues are created and linked
print("3. Checking sub-issues are created and linked...")
expected_sub_titles = [
"Test Performance Impact: fix multi modal data oom",
"Test Performance Impact: upgrade vllm to 0.10",
"Test Performance Impact: non blocking false by default",
]
if not _check_sub_issues(main_issue_number, expected_sub_titles, headers):
return False
# 4. Check issue comments document file changes
print("4. Checking issue comments document file changes...")
if not _check_issue_comments(main_issue_number, headers):
return False
# 5. Check analysis PR exists with exact title from correct branch
print("5. Checking analysis PR exists with exact title and branch...")
analysis_pr = _find_analysis_pr(headers)
if not analysis_pr:
print(
"Error: Analysis PR not found with title 'Performance Analysis: Protocol Changes Investigation' from branch 'investigate-protocol-changes'",
file=sys.stderr,
)
return False
print(f"Found analysis PR #{analysis_pr.get('number')}")
print("\n✓ Task completed successfully!")
print(
f"Main tracking issue #{main_issue_number} created with proper labels and documentation"
)
print("All 3 investigation branches created for different investigation tracks")
print("3 sub-issues created and linked to main tracking issue")
print("Issue comments document file changes with commit SHA references")
print(f"Analysis PR #{analysis_pr.get('number')} created from correct branch")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/easyr1/qwen3_issue_management/description.md
================================================
The EasyR1 repository has several Qwen3-related issues that were closed but need to be reopened for further investigation. Qwen3 is an important model variant that requires continued attention. I need you to find and reopen all closed issues related to Qwen3 and properly tag them for tracking.
**Step 1: Find All Closed Qwen3 Issues**
Search for ALL closed issues that mention 'qwen3' (case-insensitive) in either the title or body. Make note of each issue number and title.
**Step 2: Reopen Each Qwen3 Issue**
For every closed issue that contains 'qwen3' (regardless of when it was closed or any other factors), reopen it by changing its state from closed to open.
**Step 3: Add Tracking Label**
After reopening each issue, add the label `qwen3-related` to it. This will help track all Qwen3-related issues in the future.
**Step 4: Create Summary Issue**
Create a new issue titled "Reopened Qwen3 Issues Summary" with the following content in the body:
```
# Qwen3 Issues Reopened
The following closed issues containing 'qwen3' have been reopened:
[List each reopened issue as: - #NUMBER: TITLE]
Total issues reopened: [NUMBER]
All reopened issues have been tagged with the `qwen3-related` label for easy tracking.
```
Add the label `qwen3-related` to this summary issue as well.
This straightforward workflow ensures all Qwen3-related closed issues are reopened and properly tagged for visibility.
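A minimal sketch of Steps 1-3 using the GitHub REST API directly (assumptions: `OWNER` is a placeholder for the evaluation org, the Search API is used to find closed issues, and a real run would go through the GitHub MCP server):
```python
import os
import requests

OWNER, REPO = "OWNER", "EasyR1"  # illustrative placeholder
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}

# Step 1: find closed issues mentioning qwen3 (GitHub search is case-insensitive).
search = requests.get(
    "https://api.github.com/search/issues",
    headers=headers,
    params={"q": f"repo:{OWNER}/{REPO} qwen3 is:issue state:closed", "per_page": 100},
).json()

for item in search.get("items", []):
    number = item["number"]
    # Step 2: reopen the issue.
    requests.patch(f"{API}/issues/{number}", headers=headers, json={"state": "open"}).raise_for_status()
    # Step 3: add the tracking label.
    requests.post(
        f"{API}/issues/{number}/labels", headers=headers, json={"labels": ["qwen3-related"]}
    ).raise_for_status()
```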
================================================
FILE: tasks/github/standard/easyr1/qwen3_issue_management/meta.json
================================================
{
"task_id": "qwen3_issue_management",
"task_name": "Qwen3 Issue Management",
"category_id": "easyr1",
"category_name": "EasyR1",
"description": "Find and reopen all closed Qwen3-related issues with proper tagging for continued tracking and investigation.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/EasyR1",
"stateOriginalUrl": "https://github.com/hiyouga/EasyR1"
}
}
================================================
FILE: tasks/github/standard/easyr1/qwen3_issue_management/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
load_dotenv(".mcp_env")
def _get_github_api(
endpoint: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
github_org = os.environ.get("GITHUB_EVAL_ORG")
url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _search_github_issues(
query: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[List]]:
"""Search GitHub issues using the search API."""
url = f"https://api.github.com/search/issues?q={query}&per_page=100"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
data = response.json()
return True, data.get("items", [])
else:
print(f"Search API error: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Search exception: {e}", file=sys.stderr)
return False, None
def _check_qwen3_issues_reopened(headers: Dict[str, str]) -> Tuple[bool, List]:
"""Check if all Qwen3 issues have been reopened and tagged."""
# Search for all issues mentioning qwen3 (both open and closed)
github_org = os.environ.get("GITHUB_EVAL_ORG")
success, all_qwen3_issues = _search_github_issues(
f"repo:{github_org}/EasyR1 qwen3", headers
)
if not success or not all_qwen3_issues:
print("Error: Could not search for Qwen3 issues", file=sys.stderr)
return False, []
reopened_issues = []
issues_not_reopened = []
issues_not_tagged = []
for issue in all_qwen3_issues:
issue_number = issue.get("number")
issue_state = issue.get("state")
issue_title = issue.get("title", "")
# Check if the issue is open (should be reopened)
if issue_state == "closed":
issues_not_reopened.append(f"#{issue_number}: {issue_title}")
continue
# Check if issue has qwen3-related label
labels = [label.get("name") for label in issue.get("labels", [])]
if "qwen3-related" not in labels:
issues_not_tagged.append(f"#{issue_number}: {issue_title}")
else:
reopened_issues.append(issue)
# Report any issues not properly processed
if issues_not_reopened:
print("Error: The following Qwen3 issues are still closed:", file=sys.stderr)
for issue in issues_not_reopened:
print(f" - {issue}", file=sys.stderr)
return False, []
if issues_not_tagged:
print(
"Error: The following reopened issues are missing 'qwen3-related' label:",
file=sys.stderr,
)
for issue in issues_not_tagged:
print(f" - {issue}", file=sys.stderr)
return False, reopened_issues
return True, reopened_issues
def _check_summary_issue(
headers: Dict[str, str], reopened_issues: List
) -> Optional[Dict]:
"""Check if the summary issue exists with proper content."""
    success, issues = _get_github_api("issues?state=all&per_page=100", headers)
if not success or not issues:
print("Error: Could not fetch issues for summary check", file=sys.stderr)
return None
expected_title = "Reopened Qwen3 Issues Summary"
for issue in issues:
title = issue.get("title", "")
if title == expected_title:
body = issue.get("body", "")
# Check for required content
if "# Qwen3 Issues Reopened" not in body:
print("Error: Summary issue missing header", file=sys.stderr)
return None
if (
"The following closed issues containing 'qwen3' have been reopened:"
not in body
):
print("Error: Summary issue missing description", file=sys.stderr)
return None
if "Total issues reopened:" not in body:
print("Error: Summary issue missing total count", file=sys.stderr)
return None
if (
"All reopened issues have been tagged with the `qwen3-related` label"
not in body
):
print("Error: Summary issue missing tagging note", file=sys.stderr)
return None
# Check if all reopened issues are listed
for reopened_issue in reopened_issues:
issue_num = reopened_issue.get("number")
if f"#{issue_num}" not in body:
print(
f"Error: Summary issue missing reference to issue #{issue_num}",
file=sys.stderr,
)
return None
# Check if summary issue has the label
labels = [label.get("name") for label in issue.get("labels", [])]
if "qwen3-related" not in labels:
print(
"Error: Summary issue missing 'qwen3-related' label",
file=sys.stderr,
)
return None
return issue
print(
"Error: Summary issue 'Reopened Qwen3 Issues Summary' not found",
file=sys.stderr,
)
return None
def verify() -> bool:
"""
Verify that all Qwen3-related closed issues have been reopened and tagged.
"""
# Get GitHub token
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying Qwen3 issue reopening workflow...")
# 1. Check if all Qwen3 issues have been reopened and tagged
print("1. Checking if Qwen3 issues are reopened and tagged...")
all_reopened, reopened_issues = _check_qwen3_issues_reopened(headers)
if not all_reopened:
return False
if not reopened_issues:
print("Error: No Qwen3 issues found or reopened", file=sys.stderr)
return False
# 2. Check if summary issue exists
print("2. Checking summary issue...")
summary_issue = _check_summary_issue(headers, reopened_issues)
if not summary_issue:
return False
print("\n✓ Qwen3 issue reopening workflow successfully completed!")
print(f"✓ Reopened Issues: {len(reopened_issues)} Qwen3-related issues reopened")
print("✓ Tagging: All reopened issues tagged with 'qwen3-related' label")
print(
f"✓ Summary: Issue #{summary_issue.get('number')} created with complete list of reopened issues"
)
print("\nAll Qwen3-related closed issues have been reopened and properly tagged!")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/harmony/fix_conflict/description.md
================================================
I have some pull requests that won't merge due to conflicts. Can you help me fix the merge conflicts by creating the missing infrastructure?
**Step 1: Find Conflicted PR**
Look through the open pull requests and find the one that has `mergeable: false` and `mergeable_state: "dirty"`. Check which file it is trying to modify; the conflict exists because main is missing a file that the PR is trying to add or modify.
**Step 2: Create Infrastructure PR**
Create a new branch and PR to add the missing file that the conflicted PR needs. The PR must have:
- **Title**: Must contain "Add CI infrastructure" and "resolve conflicts"
- **Body**: Must include:
- Reference to the conflicted PR using "Fixes #[PR_NUMBER]" or "Resolves #[PR_NUMBER]"
- Explanation that this "prepares infrastructure" for the other PR
- Mention of "missing .github directory" and "workflow conflicts"
- **File Content**: Extract the complete file content from the conflicted PR's changes and add it to main. This ensures the conflicted PR can merge cleanly without conflicts.
**Step 3: Merge Infrastructure PR**
Merge the infrastructure PR to main.
**Step 4: Add Comment to Original PR**
Add a comment to the original conflicted PR that references the infrastructure PR you just created and merged. The comment must mention the infrastructure PR number using "PR #[NUMBER]" format.
**Step 5: Merge Original PR**
Now merge the original conflicted PR since it should be able to merge cleanly.
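A minimal sketch of locating the conflicted PR (Step 1) and merging a PR by number (Steps 3 and 5) via the REST API. `OWNER` and the merge method are illustrative assumptions; note that `mergeable` is only populated on the single-PR endpoint, not the list endpoint.
```python
import os
import requests

OWNER, REPO = "OWNER", "harmony"  # illustrative placeholder
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}

# Step 1: fetch each open PR individually to read its mergeable state.
conflicted = None
for pr in requests.get(f"{API}/pulls", headers=headers, params={"state": "open"}).json():
    detail = requests.get(f"{API}/pulls/{pr['number']}", headers=headers).json()
    if detail.get("mergeable") is False and detail.get("mergeable_state") == "dirty":
        conflicted = detail
        break

# Steps 3 and 5 both boil down to merging a PR by number; the conflicted PR
# only merges cleanly once the infrastructure PR has landed on main.
def merge_pr(number: int) -> None:
    requests.put(
        f"{API}/pulls/{number}/merge", headers=headers, json={"merge_method": "merge"}
    ).raise_for_status()
```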
================================================
FILE: tasks/github/standard/harmony/fix_conflict/meta.json
================================================
{
"task_id": "fix_conflict",
"task_name": "Fix Conflict",
"category_id": "harmony",
"category_name": "Harmony",
"description": "Resolve merge conflicts by creating missing infrastructure and ensuring conflicted PRs can merge cleanly.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"pr workflows"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/harmony",
"stateOriginalUrl": "https://github.com/openai/harmony"
}
}
================================================
FILE: tasks/github/standard/harmony/fix_conflict/verify.py
================================================
import sys
import os
import requests
from typing import Dict, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_ci_file_exists(
file_path: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> bool:
"""Check if CI file exists in main branch."""
success, _ = _get_github_api(f"contents/{file_path}?ref=main", headers, org, repo)
return success
def _check_pr_comments(
pr_number: int,
infra_pr_number: int,
headers: Dict[str, str],
org: str,
repo: str = "harmony",
) -> bool:
"""Check if PR has a comment linking to the infrastructure PR using 'PR #[NUMBER]' format."""
success, comments = _get_github_api(
f"issues/{pr_number}/comments", headers, org, repo
)
if not success or not comments:
return False
# Look for "PR #123" pattern (case insensitive)
import re
for comment in comments:
body = comment.get("body", "")
if re.search(rf"PR\s*#{infra_pr_number}", body, re.IGNORECASE):
return True
return False
def _find_infrastructure_pr(
headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Optional[Dict]:
"""Find the infrastructure PR by checking title and body content."""
success, prs = _get_github_api("pulls?state=all&per_page=50", headers, org, repo)
if success and prs:
for pr in prs:
title = pr.get("title", "").lower()
body = pr.get("body", "").lower()
# Check title contains required keywords
title_ok = "add ci infrastructure" in title and "resolve conflicts" in title
# Check body contains required elements
has_reference = "fixes #" in body or "resolves #" in body
has_prep_text = "prepares infrastructure" in body
has_github_text = "missing .github directory" in body
has_workflow_text = "workflow conflicts" in body
body_ok = (
has_reference
and has_prep_text
and has_github_text
and has_workflow_text
)
if title_ok and body_ok:
return pr
return None
def verify() -> bool:
"""
Programmatically verify that the merge conflict resolution workflow meets the
requirements described in description.md.
"""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying merge conflict resolution workflow completion...")
# 1. Check that CI infrastructure file exists in main (extracted from conflicted PR)
print("1. Checking CI infrastructure was added to main...")
# Check for both CI.yml and ci.yml (case-insensitive)
ci_exists = _check_ci_file_exists(".github/workflows/CI.yml", headers, github_org)
if not ci_exists:
ci_exists = _check_ci_file_exists(".github/workflows/ci.yml", headers, github_org)
if not ci_exists:
print("Error: Neither .github/workflows/CI.yml nor .github/workflows/ci.yml found in main", file=sys.stderr)
return False
# 2. Find infrastructure PR with required title and body content
print("2. Finding infrastructure PR with required content...")
infra_pr = _find_infrastructure_pr(headers, github_org)
if not infra_pr:
print(
"Error: No infrastructure PR found with required title and body content",
file=sys.stderr,
)
print(
"Required title: 'Add CI infrastructure' and 'resolve conflicts'",
file=sys.stderr,
)
print(
"Required body: reference with 'Fixes #' or 'Resolves #', 'prepares infrastructure', 'missing .github directory', 'workflow conflicts'",
file=sys.stderr,
)
return False
print(f"Found infrastructure PR #{infra_pr.get('number')}: {infra_pr.get('title')}")
# 3. Check that infrastructure PR is merged
if not infra_pr.get("merged_at"):
print(
f"Error: Infrastructure PR #{infra_pr.get('number')} not merged yet",
file=sys.stderr,
)
return False
# 4. Check that PR #24 is merged
print("3. Checking that PR #24 is merged...")
success, pr24 = _get_github_api("pulls/24", headers, github_org)
if not success or not pr24:
print("Error: PR #24 not found", file=sys.stderr)
return False
if not pr24.get("merged_at"):
print("Error: PR #24 is not merged yet", file=sys.stderr)
return False
# 5. Check that PR #24 has a comment linking to the infrastructure PR
print("4. Checking that PR #24 has comment linking to infrastructure PR...")
if not _check_pr_comments(24, infra_pr.get("number"), headers, github_org):
print(
f"Error: PR #24 missing comment linking to infrastructure PR #{infra_pr.get('number')}",
file=sys.stderr,
)
return False
print("\n✓ Task completed successfully!")
print(
f"Infrastructure PR #{infra_pr.get('number')} extracted content from PR #24 and resolved conflicts"
)
print(
"PR #24 is now merged cleanly and has a comment linking to the infrastructure PR"
)
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/harmony/issue_pr_commit_workflow/description.md
================================================
I need you to implement a complete bug tracking and resolution workflow that demonstrates proper cross-referencing between issues, pull requests, and commits. Here's what you need to do:
**Step 1: Create Issue for Race Condition Bug**
Create a new issue with:
- Title containing: 'race condition', 'HarmonyEncoding', 'concurrent access'
- Body must include:
- A "## Problem" heading describing threading issues
- A "## Root Cause" heading about file locking
- A "## Expected Solution" heading with bullet points
- References to issues #6 and #1
- Keywords: "multiple threads", "tokenizer file downloads", "mutex-based file locking"
**Step 2: Create Feature Branch**
Create a new branch called 'fix/race-condition-tokenizer-loading' from main.
**Step 3: Implement Thread-Safe Loading**
On the feature branch, create/update the file `src/concurrent_loading.rs` with:
```rust
use std::sync::Mutex;
use std::sync::OnceLock;
// Thread-safe tokenizer loading with file locks
static DOWNLOAD_MUTEX: OnceLock<Mutex<()>> = OnceLock::new();
pub fn load_harmony_encoding_safe(name: &str) -> Result {
let _guard = DOWNLOAD_MUTEX.get_or_init(|| Mutex::new(())).lock().unwrap();
// Implementation for thread-safe loading
// Addresses race condition from issue #6
Ok(HarmonyEncoding::new())
}
pub fn load_harmony_encoding_from_file(path: &str) -> Result {
// Offline loading API as requested in issue #1
HarmonyEncoding::from_file(path)
}
```
**Step 4: Create Pull Request with Cross-References**
Create a pull request from 'fix/race-condition-tokenizer-loading' to 'main' with:
- Title containing: 'Fix race condition', 'tokenizer loading', 'threading issues'
- Body must include:
- A "## Summary" heading explaining the fix
- A "## Changes" heading with bullet points about mutex implementation
- A "## Testing" heading mentioning related issues
- "Closes #[ISSUE_NUMBER]" pattern linking to your created issue
- References to #1 and #6
- Keywords: "thread-safe", "concurrent downloads", "offline loading API"
**Step 5: Add PR Review Comments**
Create a pending review and add a review comment to the PR with:
- Technical analysis of the implementation approach
- Discussion of thread safety mechanisms
- Keywords that must be included: "OnceLock", "mutex", "thread safety", "concurrent access"
- Reference to issue #1 and the offline loading capability
- Explanation of how the solution prevents race conditions
Then submit the review as a COMMENT type review.
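A minimal sketch of this pending-review flow via the REST API (as I understand the endpoints, creating a review without an `event` field leaves it pending, and a second call submits it). `OWNER` and `PR_NUMBER` are illustrative placeholders, and the review body is only an example of text containing the required keywords.
```python
import os
import requests

OWNER, REPO, PR_NUMBER = "OWNER", "harmony", 123  # illustrative placeholders
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}

# Omitting "event" creates the review in PENDING state.
review = requests.post(
    f"{API}/pulls/{PR_NUMBER}/reviews",
    headers=headers,
    json={
        "body": "Technical analysis: the OnceLock-guarded mutex provides thread safety for "
        "concurrent access and prevents duplicate downloads; issue #1's offline loading "
        "capability is covered by load_harmony_encoding_from_file."
    },
).json()

# Submit the pending review as a COMMENT-type review.
requests.post(
    f"{API}/pulls/{PR_NUMBER}/reviews/{review['id']}/events",
    headers=headers,
    json={"event": "COMMENT"},
).raise_for_status()
```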
**Step 6: Update Issue with Implementation Details**
Add a comment to the original issue you created with:
- Reference to the PR number using "PR #[NUMBER]" pattern
- Technical details about the mutex-based solution
- Keywords: "std::sync::Mutex", "OnceLock", "thread-safe initialization"
- Mention of key implementation changes (DOWNLOAD_MUTEX, offline loading)
- Reference back to issue #1 for offline loading requirement
**Step 7: Close the Issue**
Close the issue you created by updating its state to 'closed' with state_reason 'completed'.
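For Step 7, a minimal sketch of closing the issue with a state reason (`OWNER` and `ISSUE_NUMBER` are illustrative placeholders):
```python
import os
import requests

OWNER, REPO, ISSUE_NUMBER = "OWNER", "harmony", 123  # illustrative placeholders
requests.patch(
    f"https://api.github.com/repos/{OWNER}/{REPO}/issues/{ISSUE_NUMBER}",
    headers={
        "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
        "Accept": "application/vnd.github.v3+json",
    },
    json={"state": "closed", "state_reason": "completed"},
).raise_for_status()
```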
================================================
FILE: tasks/github/standard/harmony/issue_pr_commit_workflow/meta.json
================================================
{
"task_id": "issue_pr_commit_workflow",
"task_name": "Issue Pr Commit Workflow",
"category_id": "harmony",
"category_name": "Harmony",
"description": "Implement complete bug tracking workflow demonstrating proper cross-referencing between issues, PRs, and commits for race condition fixes.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"pr workflows"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/harmony",
"stateOriginalUrl": "https://github.com/openai/harmony"
}
}
================================================
FILE: tasks/github/standard/harmony/issue_pr_commit_workflow/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _check_file_content(
branch: str,
file_path: str,
keywords: List[str],
headers: Dict[str, str],
org: str,
repo: str = "harmony",
) -> bool:
"""Verify that a file exists in branch and contains required keywords."""
success, result = _get_github_api(
f"contents/{file_path}?ref={branch}", headers, org, repo
)
if not success or not result:
return False
if keywords and result.get("content"):
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return all(keyword in content for keyword in keywords)
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return False
return True
def _find_issue_by_title(
title_substring: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Optional[Dict]:
"""Find an issue by title substring and return the issue data."""
# Check both open and closed issues
for state in ["open", "closed"]:
success, issues = _get_github_api(
f"issues?state={state}&per_page=100", headers, org, repo
)
if success and issues:
for issue in issues:
if title_substring.lower() in issue.get("title", "").lower():
return issue
return None
def _find_pr_by_title(
title_substring: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Optional[Dict]:
"""Find a PR by title substring and return the PR data."""
# Check both open and closed PRs
for state in ["open", "closed"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, org, repo
)
if success and prs:
for pr in prs:
if title_substring.lower() in pr.get("title", "").lower():
return pr
return None
def _check_issue_references(issue_body: str, reference_numbers: List[str]) -> bool:
"""Check if issue body contains references to specified issue numbers."""
if not issue_body:
return False
return all(f"#{ref}" in issue_body for ref in reference_numbers)
def _check_pr_references(
pr_body: str, issue_number: int, reference_numbers: List[str]
) -> bool:
"""Check if PR body contains proper references."""
if not pr_body:
return False
# Check for "Closes #X" pattern
closes_pattern = (
f"Closes #{issue_number}" in pr_body or f"closes #{issue_number}" in pr_body
)
# Check for other references
refs_present = all(f"#{ref}" in pr_body for ref in reference_numbers)
return closes_pattern and refs_present
def _get_issue_comments(
issue_number: int, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> List[Dict]:
"""Get all comments for an issue."""
success, comments = _get_github_api(
f"issues/{issue_number}/comments", headers, org, repo
)
if success and comments:
return comments
return []
def _get_pr_reviews(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> List[Dict]:
"""Get all reviews for a PR."""
success, reviews = _get_github_api(f"pulls/{pr_number}/reviews", headers, org, repo)
if success and reviews:
return reviews
return []
def _check_issue_comment_references(
comments: List[Dict], pr_number: int, keywords: List[str]
) -> bool:
"""Check if issue has a comment referencing the PR number with required technical keywords."""
for comment in comments:
body = comment.get("body", "")
has_pr_ref = (
f"PR #{pr_number}" in body
or f"PR#{pr_number}" in body
or f"pr #{pr_number}" in body.lower()
)
has_keywords = all(keyword.lower() in body.lower() for keyword in keywords)
if has_pr_ref and has_keywords:
return True
return False
def _check_title_keywords(title: str, required_keywords: List[str]) -> bool:
"""Check if title contains all required keywords."""
return all(keyword.lower() in title.lower() for keyword in required_keywords)
def _check_headings_and_content(
body: str, headings: List[str], keywords: List[str]
) -> bool:
"""Check if body contains required headings and keywords."""
has_headings = all(heading in body for heading in headings)
has_keywords = all(keyword.lower() in body.lower() for keyword in keywords)
return has_headings and has_keywords
def _check_pr_review_content(reviews: List[Dict], keywords: List[str]) -> bool:
"""Check if PR has review comments containing required keywords."""
for review in reviews:
body = review.get("body", "")
if body and all(keyword.lower() in body.lower() for keyword in keywords):
return True
return False
def verify() -> bool:
"""
Programmatically verify that the issue-PR-commit workflow meets the
requirements described in description.md.
"""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
# Configuration constants
BRANCH_NAME = "fix/race-condition-tokenizer-loading"
ISSUE_TITLE_SUBSTRING = "race condition in HarmonyEncoding"
PR_TITLE_SUBSTRING = "Fix race condition in tokenizer loading"
# File content checks
RUST_FILE_KEYWORDS = [
"DOWNLOAD_MUTEX",
"OnceLock>",
"load_harmony_encoding_safe",
"load_harmony_encoding_from_file",
"Thread-safe tokenizer loading",
]
# Issue content requirements
ISSUE_TITLE_KEYWORDS = ["race condition", "HarmonyEncoding", "concurrent access"]
ISSUE_REFERENCE_NUMBERS = ["6", "1"]
ISSUE_HEADINGS = ["## Problem", "## Root Cause", "## Expected Solution"]
ISSUE_KEYWORDS = [
"multiple threads",
"tokenizer file downloads",
"mutex-based file locking",
]
# PR content requirements
PR_TITLE_KEYWORDS = ["Fix race condition", "tokenizer loading", "threading issues"]
PR_REFERENCE_NUMBERS = ["1", "6"]
PR_HEADINGS = ["## Summary", "## Changes", "## Testing"]
PR_KEYWORDS = ["thread-safe", "concurrent downloads", "offline loading API"]
# Review comment requirements
REVIEW_KEYWORDS = ["OnceLock", "mutex", "thread safety", "concurrent access"]
# Issue comment requirements
ISSUE_COMMENT_KEYWORDS = [
"std::sync::Mutex",
"OnceLock",
"thread-safe initialization",
"DOWNLOAD_MUTEX",
]
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying GitHub issue-PR-commit workflow completion...")
# 1. Check that feature branch exists
print("1. Verifying feature branch exists...")
if not _check_branch_exists(BRANCH_NAME, headers, github_org):
print(f"Error: Branch '{BRANCH_NAME}' not found", file=sys.stderr)
return False
# 2. Check that the Rust implementation file exists with required content
print("2. Verifying concurrent_loading.rs implementation...")
if not _check_file_content(
BRANCH_NAME,
"src/concurrent_loading.rs",
RUST_FILE_KEYWORDS,
headers,
github_org,
):
print(
"Error: src/concurrent_loading.rs not found or missing required content",
file=sys.stderr,
)
return False
# 3. Find the created issue
print("3. Verifying issue creation and content...")
issue = _find_issue_by_title(ISSUE_TITLE_SUBSTRING, headers, github_org)
if not issue:
print(
f"Error: Issue with title containing '{ISSUE_TITLE_SUBSTRING}' not found",
file=sys.stderr,
)
return False
issue_number = issue.get("number")
issue_title = issue.get("title", "")
issue_body = issue.get("body", "")
# Check issue title keywords
if not _check_title_keywords(issue_title, ISSUE_TITLE_KEYWORDS):
print("Error: Issue title missing required keywords", file=sys.stderr)
return False
# Check issue headings, content and references
if not _check_headings_and_content(issue_body, ISSUE_HEADINGS, ISSUE_KEYWORDS):
print("Error: Issue missing required headings or keywords", file=sys.stderr)
return False
if not _check_issue_references(issue_body, ISSUE_REFERENCE_NUMBERS):
print(
"Error: Issue does not reference required issues #6 and #1", file=sys.stderr
)
return False
# 4. Find the created PR
print("4. Verifying pull request creation and content...")
pr = _find_pr_by_title(PR_TITLE_SUBSTRING, headers, github_org)
if not pr:
print(
f"Error: PR with title containing '{PR_TITLE_SUBSTRING}' not found",
file=sys.stderr,
)
return False
pr_number = pr.get("number")
pr_title = pr.get("title", "")
pr_body = pr.get("body", "")
# Check PR title keywords
if not _check_title_keywords(pr_title, PR_TITLE_KEYWORDS):
print("Error: PR title missing required keywords", file=sys.stderr)
return False
# Check PR headings and content
if not _check_headings_and_content(pr_body, PR_HEADINGS, PR_KEYWORDS):
print("Error: PR missing required headings or keywords", file=sys.stderr)
return False
# Check PR references
if not _check_pr_references(pr_body, issue_number, PR_REFERENCE_NUMBERS):
print(
f"Error: PR does not properly reference issue #{issue_number} or issues #1, #6",
file=sys.stderr,
)
return False
# 5. Check PR review comments
print("5. Verifying PR review comments...")
reviews = _get_pr_reviews(pr_number, headers, github_org)
if not _check_pr_review_content(reviews, REVIEW_KEYWORDS):
print(
"Error: PR missing review comment with required technical keywords",
file=sys.stderr,
)
return False
# 6. Check issue comments for PR reference with technical keywords
print("6. Verifying issue comment referencing PR...")
issue_comments = _get_issue_comments(issue_number, headers, github_org)
if not _check_issue_comment_references(
issue_comments, pr_number, ISSUE_COMMENT_KEYWORDS
):
print(
f"Error: Issue #{issue_number} missing comment referencing PR #{pr_number} with required technical keywords",
file=sys.stderr,
)
return False
# 7. Check issue is closed
print("7. Verifying issue closure...")
if issue.get("state") != "closed":
print(f"Error: Issue #{issue_number} is not closed", file=sys.stderr)
return False
print("\n✓ All verification checks passed!")
print("Issue-PR-commit workflow completed successfully:")
print(f" - Issue #{issue_number}: {issue.get('title')}")
print(f" - PR #{pr_number}: {pr.get('title')}")
print(f" - Branch: {BRANCH_NAME}")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/harmony/issue_tagging_pr_closure/description.md
================================================
I need you to simulate a realistic development workflow where an enhancement issue is created, implementation is attempted via a pull request, but then the PR must be closed without merging due to technical constraints discovered during the implementation process.
**Step 1: Create Enhancement Issue**
Create a new issue with:
- Title containing: "Upgrade JavaScript demo to use ESM imports" and "modern module system"
- Body must include:
- A "## Problem" heading describing CommonJS limitations
- A "## Proposed Solution" heading about ESM migration
- A "## Benefits" heading listing advantages
- Reference to issue #26 (which is about JavaScript demo issues)
- Keywords: "CommonJS", "ESM imports", "module bundling", "modern JavaScript"
- Labels: Add "enhancement" label to the issue
**Step 2: Create Feature Branch**
Create a new branch called 'feat/esm-migration-attempt' from main.
**Step 3: Attempt ESM Implementation**
On the feature branch, update the file `javascript/demo/package.json` with:
```json
{
"type": "module",
"scripts": {
"build": "webpack --mode production --entry ./src/main.js"
},
"dependencies": {
"@openai/harmony": "^0.1.0",
"webpack": "^5.0.0"
}
}
```
Also create `javascript/demo/src/main.js` with:
```javascript
// ESM import attempt - fails due to harmony core requirements
import { HarmonyEncoding } from '@openai/harmony';
// This breaks the existing CommonJS integration
// harmony core requires specific CommonJS patterns
export const initHarmony = () => {
throw new Error("ESM migration incompatible with harmony core");
};
```
**Step 4: Create Pull Request**
Create a pull request from 'feat/esm-migration-attempt' to 'main' with:
- Title containing: "Upgrade JavaScript demo to ESM imports" and "modern modules"
- Body must include:
- A "## Summary" heading explaining the attempted migration
- A "## Changes" heading with bullet points about ESM implementation
- A "## Issues Discovered" heading describing technical problems found
- "Addresses #[ISSUE_NUMBER]" pattern linking to your created issue
- Keywords: "ESM migration", "webpack configuration", "module compatibility", "breaking changes"
- Labels: Add "enhancement" and "needs-investigation" labels to the PR
**Step 5: Investigate and Document Problems**
Add a comment to the PR explaining the technical barriers discovered. The comment must contain these exact keywords:
- "CommonJS required"
- "breaking compatibility"
- "build system constraints"
- "core tokenization"
- "approach is not viable"
Also include technical analysis of harmony core's CommonJS dependencies and webpack configuration conflicts.
**Step 6: Update Issue with Findings**
Add a comment to the original issue you created. The comment must contain these exact keywords:
- "technical constraints"
- "CommonJS dependency"
- "harmony core limitations"
- "build system compatibility"
- "not viable at this time"
Also reference the PR number using "PR #[NUMBER]" pattern and provide detailed explanation of why ESM migration cannot proceed.
**Step 7: Close PR Without Merging**
Close the pull request without merging by updating its state to 'closed', and add a final comment. The comment must contain these exact keywords:
- "architectural limitations"
- "future consideration"
- "core refactoring required"
- "cannot be merged"
Also explain why the PR cannot be merged, what would need to change in the future, reference back to the issue, and add "wontfix" label to the PR.
**Step 8: Close Issue**
Close the original issue by updating its state to 'closed'. Add a final comment to the issue that must contain these exact keywords:
- "closing as not planned"
- "architectural constraints"
- "future implementation blocked"
- "requires core redesign"
================================================
FILE: tasks/github/standard/harmony/issue_tagging_pr_closure/meta.json
================================================
{
"task_id": "issue_tagging_pr_closure",
"task_name": "Issue Tagging Pr Closure",
"category_id": "harmony",
"category_name": "Harmony",
"description": "Simulate development workflow where enhancement PR is closed without merging due to technical constraints discovered during implementation.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"pr workflows"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/harmony",
"stateOriginalUrl": "https://github.com/openai/harmony"
}
}
================================================
FILE: tasks/github/standard/harmony/issue_tagging_pr_closure/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _check_file_content(
branch: str,
file_path: str,
keywords: List[str],
headers: Dict[str, str],
org: str,
repo: str = "harmony",
) -> bool:
"""Verify that a file exists in branch and contains required keywords."""
import base64
success, result = _get_github_api(
f"contents/{file_path}?ref={branch}", headers, org, repo
)
if not success or not result:
return False
if keywords and result.get("content"):
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return all(keyword in content for keyword in keywords)
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return False
return True
def _find_issue_by_title_keywords(
title_keywords: List[str], headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Optional[Dict]:
"""Find an issue by title keywords and return the issue data."""
for state in ["open", "closed"]:
success, issues = _get_github_api(
f"issues?state={state}&per_page=100", headers, org, repo
)
if success and issues:
for issue in issues:
title = issue.get("title", "").lower()
if all(keyword.lower() in title for keyword in title_keywords):
return issue
return None
def _find_pr_by_title_keywords(
title_keywords: List[str], headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Optional[Dict]:
"""Find a PR by title keywords and return the PR data."""
for state in ["open", "closed"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, org, repo
)
if success and prs:
for pr in prs:
title = pr.get("title", "").lower()
if all(keyword.lower() in title for keyword in title_keywords):
return pr
return None
def _check_labels(labels: List[Dict], required_labels: List[str]) -> bool:
"""Check if required labels are present."""
label_names = [label.get("name", "").lower() for label in labels]
return all(req_label.lower() in label_names for req_label in required_labels)
def _check_headings_and_keywords(
body: str, headings: List[str], keywords: List[str]
) -> bool:
"""Check if body contains required headings and keywords."""
if not body:
return False
has_headings = all(heading in body for heading in headings)
has_keywords = all(keyword.lower() in body.lower() for keyword in keywords)
return has_headings and has_keywords
def _check_issue_reference(body: str, issue_number: int) -> bool:
"""Check if body contains reference to the issue."""
if not body:
return False
return f"#{issue_number}" in body or f"Addresses #{issue_number}" in body
def _get_issue_comments(
issue_number: int, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> List[Dict]:
"""Get all comments for an issue."""
success, comments = _get_github_api(
f"issues/{issue_number}/comments", headers, org, repo
)
if success and comments:
return comments
return []
def _get_pr_comments(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> List[Dict]:
"""Get all comments for a PR."""
success, comments = _get_github_api(
f"issues/{pr_number}/comments", headers, org, repo
)
if success and comments:
return comments
return []
def _check_pr_technical_comment(comments: List[Dict], keywords: List[str]) -> bool:
"""Check if PR has a comment with technical analysis containing required keywords."""
for comment in comments:
body = comment.get("body", "")
if body and all(keyword.lower() in body.lower() for keyword in keywords):
return True
return False
def _check_issue_comment_with_pr_ref(
comments: List[Dict], pr_number: int, keywords: List[str]
) -> bool:
"""Check if issue has a comment referencing the PR with required keywords."""
for comment in comments:
body = comment.get("body", "")
has_pr_ref = (
f"PR #{pr_number}" in body
or f"PR#{pr_number}" in body
or f"pr #{pr_number}" in body.lower()
)
has_keywords = all(keyword.lower() in body.lower() for keyword in keywords)
if has_pr_ref and has_keywords:
return True
return False
def verify() -> bool:
"""
Programmatically verify that the issue tagging and PR closure workflow meets the
requirements described in description.md.
"""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
# Configuration constants
BRANCH_NAME = "feat/esm-migration-attempt"
# Issue requirements
ISSUE_TITLE_KEYWORDS = [
"Upgrade JavaScript demo to use ESM imports",
"modern module system",
]
ISSUE_HEADINGS = ["## Problem", "## Proposed Solution", "## Benefits"]
ISSUE_KEYWORDS = ["CommonJS", "ESM imports", "module bundling", "modern JavaScript"]
ISSUE_LABELS = ["enhancement"]
# PR requirements
PR_TITLE_KEYWORDS = ["Upgrade JavaScript demo to ESM imports", "modern modules"]
PR_HEADINGS = ["## Summary", "## Changes", "## Issues Discovered"]
PR_KEYWORDS = [
"ESM migration",
"webpack configuration",
"module compatibility",
"breaking changes",
]
PR_LABELS = ["enhancement", "needs-investigation", "wontfix"]
# File content requirements
PACKAGE_JSON_KEYWORDS = ['"type": "module"', "webpack", "@openai/harmony"]
MAIN_JS_KEYWORDS = [
"import { HarmonyEncoding }",
"ESM import attempt",
"harmony core",
]
# Comment requirements
PR_TECHNICAL_KEYWORDS = [
"CommonJS required",
"breaking compatibility",
"build system constraints",
"core tokenization",
"approach is not viable",
]
ISSUE_COMMENT_KEYWORDS = [
"technical constraints",
"CommonJS dependency",
"harmony core limitations",
"build system compatibility",
"not viable at this time",
]
PR_CLOSURE_KEYWORDS = [
"architectural limitations",
"future consideration",
"core refactoring required",
"cannot be merged",
]
ISSUE_CLOSURE_KEYWORDS = [
"closing as not planned",
"architectural constraints",
"future implementation blocked",
"requires core redesign",
]
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying issue tagging and PR closure workflow completion...")
# 1. Check that feature branch exists
print("1. Verifying feature branch exists...")
if not _check_branch_exists(BRANCH_NAME, headers, github_org):
print(f"Error: Branch '{BRANCH_NAME}' not found", file=sys.stderr)
return False
# 2. Check that implementation files exist with required content
print("2. Verifying ESM implementation files...")
if not _check_file_content(
BRANCH_NAME,
"javascript/demo/package.json",
PACKAGE_JSON_KEYWORDS,
headers,
github_org,
):
print(
"Error: javascript/demo/package.json not found or missing required content",
file=sys.stderr,
)
return False
if not _check_file_content(
BRANCH_NAME,
"javascript/demo/src/main.js",
MAIN_JS_KEYWORDS,
headers,
github_org,
):
print(
"Error: javascript/demo/src/main.js not found or missing required content",
file=sys.stderr,
)
return False
# 3. Find the created issue
print("3. Verifying issue creation and content...")
issue = _find_issue_by_title_keywords(ISSUE_TITLE_KEYWORDS, headers, github_org)
if not issue:
print(
"Error: Issue with title containing required keywords not found",
file=sys.stderr,
)
return False
issue_number = issue.get("number")
issue_body = issue.get("body", "")
issue_labels = issue.get("labels", [])
# Check issue content
if not _check_headings_and_keywords(issue_body, ISSUE_HEADINGS, ISSUE_KEYWORDS):
print("Error: Issue missing required headings or keywords", file=sys.stderr)
return False
# Check issue references #26
if "#26" not in issue_body:
print("Error: Issue does not reference issue #26", file=sys.stderr)
return False
# Check issue labels
if not _check_labels(issue_labels, ISSUE_LABELS):
print(f"Error: Issue missing required labels: {ISSUE_LABELS}", file=sys.stderr)
return False
# 4. Find the created PR
print("4. Verifying pull request creation and content...")
pr = _find_pr_by_title_keywords(PR_TITLE_KEYWORDS, headers, github_org)
if not pr:
print(
"Error: PR with title containing required keywords not found",
file=sys.stderr,
)
return False
pr_number = pr.get("number")
pr_body = pr.get("body", "")
pr_labels = pr.get("labels", [])
pr_state = pr.get("state")
# Check PR content
if not _check_headings_and_keywords(pr_body, PR_HEADINGS, PR_KEYWORDS):
print("Error: PR missing required headings or keywords", file=sys.stderr)
return False
# Check PR references issue
if not _check_issue_reference(pr_body, issue_number):
print(f"Error: PR does not reference issue #{issue_number}", file=sys.stderr)
return False
# Check PR labels
if not _check_labels(pr_labels, PR_LABELS):
print(f"Error: PR missing required labels: {PR_LABELS}", file=sys.stderr)
return False
# 5. Check PR is closed (not merged)
print("5. Verifying PR is closed without merging...")
if pr_state != "closed":
print(f"Error: PR #{pr_number} is not closed", file=sys.stderr)
return False
if pr.get("merged_at"):
print(
f"Error: PR #{pr_number} was merged (should be closed without merging)",
file=sys.stderr,
)
return False
# 6. Check PR technical analysis comment
print("6. Verifying PR technical analysis comment...")
pr_comments = _get_pr_comments(pr_number, headers, github_org)
if not _check_pr_technical_comment(pr_comments, PR_TECHNICAL_KEYWORDS):
print(
"Error: PR missing technical analysis comment with required keywords",
file=sys.stderr,
)
return False
# 7. Check issue comment with PR reference
print("7. Verifying issue comment referencing PR...")
issue_comments = _get_issue_comments(issue_number, headers, github_org)
if not _check_issue_comment_with_pr_ref(
issue_comments, pr_number, ISSUE_COMMENT_KEYWORDS
):
print(
f"Error: Issue #{issue_number} missing comment referencing PR #{pr_number} with required keywords",
file=sys.stderr,
)
return False
# 8. Check PR closure comment with required keywords
print("8. Verifying PR closure comment...")
pr_closure_comment_found = False
for comment in pr_comments:
body = comment.get("body", "")
if body and all(
keyword.lower() in body.lower() for keyword in PR_CLOSURE_KEYWORDS
):
pr_closure_comment_found = True
break
if not pr_closure_comment_found:
print(
"Error: PR missing closure comment with required keywords", file=sys.stderr
)
return False
# 9. Verify issue is closed
print("9. Verifying issue is closed...")
if issue.get("state") != "closed":
print(f"Error: Issue #{issue_number} should be closed", file=sys.stderr)
return False
# 10. Check issue closure comment with required keywords
print("10. Verifying issue closure comment...")
issue_closure_comment_found = False
for comment in issue_comments:
body = comment.get("body", "")
if body and all(
keyword.lower() in body.lower() for keyword in ISSUE_CLOSURE_KEYWORDS
):
issue_closure_comment_found = True
break
if not issue_closure_comment_found:
print(
"Error: Issue missing closure comment with required keywords",
file=sys.stderr,
)
return False
print("\n✓ All verification checks passed!")
print("Issue tagging and PR closure workflow completed successfully:")
print(f" - Issue #{issue_number}: {issue.get('title')} (closed)")
print(f" - PR #{pr_number}: {pr.get('title')} (closed without merging)")
print(f" - Branch: {BRANCH_NAME}")
print(" - All comments contain required keywords")
print(" - Technical constraints properly documented and communicated")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/harmony/multi_branch_commit_aggregation/description.md
================================================
I need you to create a comprehensive commit history report by aggregating changes from multiple branches. Here's what you need to do:
**Step 1: Create Analysis Branch**
Create a new branch called 'history-report-2025' from the main branch.
**Step 2: Generate Branch Commits Report**
In the 'history-report-2025' branch, create a file called `BRANCH_COMMITS.json` that contains a JSON object with the following structure:
- For each of these branches: ['pr/45-googlefan256-main', 'pr/25-neuralsorcerer-patch-1', 'pr/41-amirhosseinghanipour-fix-race-conditions-and-offline-api']
- List the 3 most recent commits for each branch
- Each commit must include: SHA, GitHub username, commit message, and files changed count
- The JSON structure should be:
```json
{
"pr/45-googlefan256-main": [
{
"sha": "commit_sha",
"author": "github_username",
"message": "commit message",
"files_changed": number
}
],
"pr/25-neuralsorcerer-patch-1": [...],
"pr/41-amirhosseinghanipour-fix-race-conditions-and-offline-api": [...]
}
```
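For orientation only (not part of the deliverable), the data above can be assembled with two GitHub REST calls per branch: one to list the latest commits and one per commit to obtain its changed-file count, since the list endpoint does not return file details. A minimal sketch, assuming placeholder org/token values:
```python
# Minimal sketch (not part of the task): gather the 3 most recent commits per branch.
# ORG and TOKEN are placeholders for the evaluation org and a GitHub token.
import json
from typing import Dict, List

import requests

ORG, REPO, TOKEN = "<eval-org>", "harmony", "<github-token>"
API = f"https://api.github.com/repos/{ORG}/{REPO}"
HEADERS = {"Authorization": f"token {TOKEN}", "Accept": "application/vnd.github.v3+json"}

def recent_commits(branch: str, count: int = 3) -> List[Dict]:
    listing = requests.get(
        f"{API}/commits", headers=HEADERS, params={"sha": branch, "per_page": count}
    ).json()
    commits = []
    for item in listing:
        # The list endpoint omits file details, so fetch each commit individually
        # to count the files it changed.
        detail = requests.get(f"{API}/commits/{item['sha']}", headers=HEADERS).json()
        author = (item.get("author") or {}).get("login") or item["commit"]["author"]["name"]
        commits.append(
            {
                "sha": item["sha"],
                "author": author,
                "message": item["commit"]["message"],
                "files_changed": len(detail.get("files", [])),
            }
        )
    return commits

branches = [
    "pr/45-googlefan256-main",
    "pr/25-neuralsorcerer-patch-1",
    "pr/41-amirhosseinghanipour-fix-race-conditions-and-offline-api",
]
print(json.dumps({b: recent_commits(b) for b in branches}, indent=2))
```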
**Step 3: Create Cross-Branch Analysis**
Create a file `CROSS_BRANCH_ANALYSIS.md` that contains:
- A section "## Top Contributors" listing the 3 contributors with the most commits on the main branch, sorted by commit count (format: "github_username: X commits")
- Must include the keyword: "contributors"
**Step 4: Generate Merge Timeline**
Create a file `MERGE_TIMELINE.txt` that lists the 10 most recent merge commits from the main branch:
- Format: `DATE | MERGE_COMMIT_MESSAGE | COMMIT_SHA`
- List in reverse chronological order (newest first)
- Only include actual merge commits (commits that have exactly 2 parent commits)
- Note: While the commit messages reference PR numbers, those PRs no longer exist in the repository
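A merge commit is simply a commit with two parents, so the timeline above can be derived by walking the main branch history and keeping only such commits. A rough sketch under the same placeholder assumptions as before:
```python
# Minimal sketch (not part of the task): newest-first merge commits on main.
from typing import List

import requests

ORG, REPO, TOKEN = "<eval-org>", "harmony", "<github-token>"
API = f"https://api.github.com/repos/{ORG}/{REPO}"
HEADERS = {"Authorization": f"token {TOKEN}", "Accept": "application/vnd.github.v3+json"}

def merge_timeline(limit: int = 10) -> List[str]:
    lines: List[str] = []
    page = 1
    while len(lines) < limit:
        batch = requests.get(
            f"{API}/commits",
            headers=HEADERS,
            params={"sha": "main", "per_page": 100, "page": page},
        ).json()
        if not batch:
            break
        for commit in batch:
            if len(commit.get("parents", [])) != 2:  # keep only true merge commits
                continue
            date = commit["commit"]["committer"]["date"][:10]  # YYYY-MM-DD
            message = commit["commit"]["message"].splitlines()[0]
            lines.append(f"{date} | {message} | {commit['sha']}")
            if len(lines) == limit:
                break
        page += 1
    return lines

print("\n".join(merge_timeline()))
```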
================================================
FILE: tasks/github/standard/harmony/multi_branch_commit_aggregation/meta.json
================================================
{
"task_id": "multi_branch_commit_aggregation",
"task_name": "Multi Branch Commit Aggregation",
"category_id": "harmony",
"category_name": "Harmony",
"description": "Generate comprehensive commit history report by aggregating changes from multiple branches with contributor analysis and merge timeline.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis",
"release coordination"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/harmony",
"stateOriginalUrl": "https://github.com/openai/harmony"
}
}
================================================
FILE: tasks/github/standard/harmony/multi_branch_commit_aggregation/verify.py
================================================
import sys
import os
import requests
from typing import Dict, Optional, Tuple
import base64
import json
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/harmony/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(branch_name: str, headers: Dict[str, str], org: str) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org)
return success
def _get_file_content(
branch: str, file_path: str, headers: Dict[str, str], org: str
) -> Optional[str]:
"""Get the content of a file from a specific branch."""
success, result = _get_github_api(f"contents/{file_path}?ref={branch}", headers, org)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _check_branch_commits_json(content: str) -> bool:
"""Verify BRANCH_COMMITS.json has correct structure and expected data."""
expected_data = {
"pr/45-googlefan256-main": [
{
"sha": "9fa3f54cf2a2501c7dcbf554d5fbdd0de619fdda",
"author": "googlefan256",
"message": "Update format.md",
"files_changed": 1,
},
{
"sha": "3efbf742533a375fc148d75513597e139329578b",
"author": "scott-oai",
"message": "Merge pull request #29 from axion66/improve-readme-and-checks",
"files_changed": 1,
},
{
"sha": "9d653a4c7382abc42d115014d195d9354e7ad357",
"author": "scott-oai",
"message": "Merge pull request #30 from Yuan-ManX/harmony-format",
"files_changed": 1,
},
],
"pr/25-neuralsorcerer-patch-1": [
{
"sha": "c505a03e9c9a388a511b6125756097eee523742a",
"author": "neuralsorcerer",
"message": "fix: `meta_sep` token and add to registry",
"files_changed": 1,
},
{
"sha": "c044bf33f7e835ca6a723ccc97848de25dba5164",
"author": "neuralsorcerer",
"message": "fix: `meta_sep` token in `encoding.rs`",
"files_changed": 1,
},
{
"sha": "b255cbeb6274adbea774f26fd9590922ce8874ed",
"author": "scott-oai",
"message": "Merge pull request #18 from openai/dev/scl/better-ci",
"files_changed": 6,
},
],
"pr/41-amirhosseinghanipour-fix-race-conditions-and-offline-api": [
{
"sha": "1dca6392934bf4e3c403b2ecc2104e8ff3f67f45",
"author": "amirhosseinghanipour",
"message": "fix race conditions and add offline tokenizer loading api",
"files_changed": 8,
},
{
"sha": "9528c7b4a00a3307fd9685fc1328aee11c3d9c90",
"author": "scott-oai",
"message": "version bump",
"files_changed": 2,
},
{
"sha": "82b3afb9eb043343f322c937262cc50405e892c3",
"author": "scott-oai",
"message": "Merge pull request #26 from jordan-wu-97/jordan/fix-function-call-atomic-bool",
"files_changed": 6,
},
],
}
try:
data = json.loads(content)
# Check if all required branches are present
for branch in expected_data.keys():
if branch not in data:
print(
f"Missing branch {branch} in BRANCH_COMMITS.json", file=sys.stderr
)
return False
# Verify the exact content matches expected data
for branch, expected_commits in expected_data.items():
actual_commits = data.get(branch, [])
if len(actual_commits) != 3:
print(
f"Branch {branch} should have exactly 3 commits, found {len(actual_commits)}",
file=sys.stderr,
)
return False
for i, expected_commit in enumerate(expected_commits):
if i >= len(actual_commits):
print(
f"Missing commit {i + 1} for branch {branch}", file=sys.stderr
)
return False
actual_commit = actual_commits[i]
for field in ["sha", "author", "files_changed"]:
if actual_commit.get(field) != expected_commit.get(field):
print(
f"Mismatch in {field} for commit {i + 1} in branch {branch}",
file=sys.stderr,
)
print(
f"Expected: {expected_commit.get(field)}, Got: {actual_commit.get(field)}",
file=sys.stderr,
)
return False
# For message field, use substring matching to be more flexible
expected_message = expected_commit.get("message", "")
actual_message = actual_commit.get("message", "")
if expected_message not in actual_message:
print(
f"Mismatch in message for commit {i + 1} in branch {branch}",
file=sys.stderr,
)
print(
f"Expected: {expected_message}, Got: {actual_message}",
file=sys.stderr,
)
return False
return True
except json.JSONDecodeError as e:
print(f"Invalid JSON in BRANCH_COMMITS.json: {e}", file=sys.stderr)
return False
except Exception as e:
print(f"Error checking BRANCH_COMMITS.json: {e}", file=sys.stderr)
return False
def _check_cross_branch_analysis(content: str) -> bool:
"""Verify CROSS_BRANCH_ANALYSIS.md contains required sections and data."""
# Check for required section header
if "## Top Contributors" not in content:
print(
"Missing section '## Top Contributors' in CROSS_BRANCH_ANALYSIS.md",
file=sys.stderr,
)
return False
# Check for required keyword
if "contributors" not in content.lower():
print(
"Missing keyword 'contributors' in CROSS_BRANCH_ANALYSIS.md",
file=sys.stderr,
)
return False
# Verify the top 3 contributors with correct counts from main branch (order matters)
expected_contributors = [
"scott-oai: 35 commits",
"egorsmkv: 4 commits",
"axion66: 2 commits",
]
for contributor in expected_contributors:
if contributor not in content:
print(
f"Missing or incorrect contributor entry: {contributor}",
file=sys.stderr,
)
return False
return True
def _check_merge_timeline(content: str) -> bool:
"""Verify MERGE_TIMELINE.txt has correct format and expected merge commits."""
expected_timeline = [
"2025-08-06 | Merge pull request #29 from axion66/improve-readme-and-checks | 3efbf742533a375fc148d75513597e139329578b",
"2025-08-06 | Merge pull request #30 from Yuan-ManX/harmony-format | 9d653a4c7382abc42d115014d195d9354e7ad357",
"2025-08-06 | Merge pull request #28 from dkqjrm/fix-typo-format-md | 161e5fe2a57c63e9f8353c4c5b8faa3c3854bb5f",
"2025-08-05 | Merge pull request #26 from jordan-wu-97/jordan/fix-function-call-atomic-bool | 82b3afb9eb043343f322c937262cc50405e892c3",
"2025-08-05 | Merge pull request #18 from openai/dev/scl/better-ci | b255cbeb6274adbea774f26fd9590922ce8874ed",
"2025-08-05 | Merge pull request #21 from Tialo/main | 058ef3257c24fb099aac7960c10ce51c8e55d9fe",
"2025-08-05 | Merge branch 'main' into dev/scl/better-ci | 6375a15ea1b0a486cbb1468964cf8f5800ff5a5c",
"2025-08-05 | Merge pull request #8 from RustedBytes/main | f6179119ca894eda4124c86d408c01fdbf5281f0",
"2025-08-05 | Merge branch 'main' into main | eb86106b6980790b94f5702dc510483c66027277",
"2025-08-05 | Merge pull request #17 from openai/dev/scl/add-docs-to-cargo | 64bca4cf327ebeafa0bbd0345650d86e2d02142f",
]
# Verify each expected timeline entry exists in the content
for i, expected_line in enumerate(expected_timeline):
if expected_line not in content:
print(f"Missing expected timeline entry {i + 1} in MERGE_TIMELINE.txt", file=sys.stderr)
print(f"Expected: {expected_line}", file=sys.stderr)
return False
return True
def verify_task() -> bool:
"""Verify the multi-branch commit aggregation task."""
# Get GitHub token from environment
load_dotenv(".mcp_env")
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
# Get GitHub organization from environment
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# 1. Check if branch 'history-report-2025' exists
if not _check_branch_exists("history-report-2025", headers, github_org):
print("Branch 'history-report-2025' does not exist", file=sys.stderr)
return False
print("✓ Branch 'history-report-2025' exists")
# 2. Check BRANCH_COMMITS.json
content = _get_file_content("history-report-2025", "BRANCH_COMMITS.json", headers, github_org)
if not content:
print(
"File 'BRANCH_COMMITS.json' not found in 'history-report-2025' branch",
file=sys.stderr,
)
return False
if not _check_branch_commits_json(content):
return False
print("✓ BRANCH_COMMITS.json has correct structure and data")
# 3. Check CROSS_BRANCH_ANALYSIS.md
content = _get_file_content(
"history-report-2025", "CROSS_BRANCH_ANALYSIS.md", headers, github_org
)
if not content:
print(
"File 'CROSS_BRANCH_ANALYSIS.md' not found in 'history-report-2025' branch",
file=sys.stderr,
)
return False
if not _check_cross_branch_analysis(content):
return False
print("✓ CROSS_BRANCH_ANALYSIS.md contains required sections and data")
# 4. Check MERGE_TIMELINE.txt
content = _get_file_content("history-report-2025", "MERGE_TIMELINE.txt", headers, github_org)
if not content:
print(
"File 'MERGE_TIMELINE.txt' not found in 'history-report-2025' branch",
file=sys.stderr,
)
return False
if not _check_merge_timeline(content):
return False
print("✓ MERGE_TIMELINE.txt has correct format and data")
print("\nAll verification checks passed! ✅")
return True
if __name__ == "__main__":
success = verify_task()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/harmony/release_management_workflow/description.md
================================================
I need help implementing a comprehensive release management workflow for this harmony repository. Here's what I need you to do:
**Step 1: Analyze Current State**
First, analyze the current open pull requests to understand what changes they contain and their impact on the codebase.
**Step 2: Create Release Branch**
Create a release preparation branch called 'release-v1.1.0' from the current main branch.
**Step 3: Apply Critical Bug Fixes**
On the release branch, apply the MetaSep token fix from PR #25 by creating/updating the file `src/encoding.rs` with the corrected content where FormattingToken::MetaSep maps to "<|meta_sep|>" instead of "<|channel|>".
Also create/update `src/registry.rs` to include the missing MetaSep and MetaEnd token registrations:
```rust
(FormattingToken::MetaSep, "<|meta_sep|>"),
(FormattingToken::MetaEnd, "<|meta_end|>"),
```
**Step 4: Add Missing Utility File**
From PR #26, create the missing shadcn utils file `demo/harmony-demo/src/lib/utils.ts` with content:
```typescript
import { clsx, type ClassValue } from "clsx"
import { twMerge } from "tailwind-merge"
export function cn(...inputs: ClassValue[]) {
return twMerge(clsx(inputs))
}
```
And create/update `.gitignore` to add:
```
# Avoid ignoring shadcn utils
!demo/harmony-demo/src/lib
```
**Step 5: Version Update**
Update the version number in `Cargo.toml`: Change the `version` field in the `[package]` section to `version = "1.1.0"`.
**Step 6: Create Comprehensive Changelog**
Create a `CHANGELOG.md` file in the release branch with the following content:
```markdown
# Changelog
## [1.1.0] - 2025-08-07
### Added
- Added missing shadcn utils.ts file for demo application
- Enhanced gitignore rules to preserve shadcn utilities
### Fixed
- Fixed MetaSep token mapping bug (was incorrectly mapped to channel token)
- Added missing MetaSep and MetaEnd token registrations in registry
- Improved tokenizer registry functionality for meta formatting tokens
### Changed
- Updated version to 1.1.0 for new release cycle
### Technical Details
- MetaSep token now correctly maps to `<|meta_sep|>` instead of `<|channel|>`
- Registry now properly recognizes MetaSep and MetaEnd formatting tokens
- Demo application now includes required utility functions for UI components
```
**Step 7: Create Release Pull Request**
Create a pull request from 'release-v1.1.0' to 'main' with title "Release v1.1.0 - Bug fixes and utility additions" and a detailed description explaining all the integrated changes.
**Step 8: Merge the Pull Request**
After creating the PR, merge it into the main branch using the "squash and merge" method.
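If the merge is performed through the REST API rather than the web UI, the method is selected with the `merge_method` field. A minimal sketch (the PR number and credentials are placeholders):
```python
# Minimal sketch (not part of the task): squash-merge a pull request via the REST API.
import requests

ORG, REPO, TOKEN = "<eval-org>", "harmony", "<github-token>"
PR_NUMBER = 1  # placeholder: the release PR created in Step 7
HEADERS = {"Authorization": f"token {TOKEN}", "Accept": "application/vnd.github.v3+json"}

response = requests.put(
    f"https://api.github.com/repos/{ORG}/{REPO}/pulls/{PR_NUMBER}/merge",
    headers=HEADERS,
    json={"merge_method": "squash"},
)
print(response.status_code, response.json().get("merged"))
```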
**Step 9: Verification**
Ensure the release branch contains at least 4 distinct commits before merging:
1. MetaSep token fix commit
2. Utility file addition commit
3. Version update commit
4. Changelog addition commit
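One way to confirm this before merging is to compare the release branch against main; the compare endpoint reports how many commits the branch is ahead by. A minimal sketch with placeholder credentials:
```python
# Minimal sketch (not part of the task): confirm release-v1.1.0 has >= 4 commits ahead of main.
import requests

ORG, REPO, TOKEN = "<eval-org>", "harmony", "<github-token>"
HEADERS = {"Authorization": f"token {TOKEN}", "Accept": "application/vnd.github.v3+json"}

compare = requests.get(
    f"https://api.github.com/repos/{ORG}/{REPO}/compare/main...release-v1.1.0",
    headers=HEADERS,
).json()

ahead_by = compare.get("ahead_by", 0)
print(f"release-v1.1.0 is {ahead_by} commit(s) ahead of main")
for commit in compare.get("commits", []):
    print(commit["sha"][:7], commit["commit"]["message"].splitlines()[0])
assert ahead_by >= 4, "release branch should contain at least 4 distinct commits"
```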
================================================
FILE: tasks/github/standard/harmony/release_management_workflow/meta.json
================================================
{
"task_id": "release_management_workflow",
"task_name": "Release Management Workflow",
"category_id": "harmony",
"category_name": "Harmony",
"description": "Implement comprehensive release management workflow including bug fixes, version updates, changelog creation, and PR merging.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"release coordination",
"pr workflows"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/harmony",
"stateOriginalUrl": "https://github.com/openai/harmony"
}
}
================================================
FILE: tasks/github/standard/harmony/release_management_workflow/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _check_file_content(
branch: str,
file_path: str,
keywords: List[str],
headers: Dict[str, str],
org: str,
repo: str = "harmony",
) -> bool:
"""Verify that a file exists in branch and contains required keywords."""
success, result = _get_github_api(
f"contents/{file_path}?ref={branch}", headers, org, repo
)
if not success or not result:
return False
if keywords and result.get("content"):
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return all(keyword in content for keyword in keywords)
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return False
return True
def _check_specific_file_content(
branch: str,
file_path: str,
expected_content: str,
headers: Dict[str, str],
org: str,
repo: str = "harmony",
min_length: int = 100,
) -> bool:
"""Verify that a file contains specific exact content and has reasonable size."""
success, result = _get_github_api(
f"contents/{file_path}?ref={branch}", headers, org, repo
)
if not success or not result:
return False
if result.get("content"):
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
# Check both that expected content exists and file has reasonable content
return expected_content in content and len(content) >= min_length
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return False
return False
def _check_pr_merged(
title_substring: str,
base_branch: str,
headers: Dict[str, str],
org: str,
repo: str = "harmony",
) -> Tuple[bool, Optional[int]]:
"""Check if a PR with specified title was merged into base branch and return PR number."""
# Check closed PRs to find merged ones
success, prs = _get_github_api(
"pulls?state=closed&per_page=100", headers, org, repo
)
if not success or not prs:
return False, None
for pr in prs:
title_match = title_substring.lower() in pr.get("title", "").lower()
base_match = pr.get("base", {}).get("ref") == base_branch
is_merged = pr.get("merged_at") is not None
if title_match and base_match and is_merged:
return True, pr.get("number")
return False, None
def _check_pr_squash_merged(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> bool:
"""Check if a PR was merged using squash and merge method."""
# Get the PR details
success, pr = _get_github_api(f"pulls/{pr_number}", headers, org, repo)
if not success or not pr:
return False
if not pr.get("merged_at"):
return False
merge_commit_sha = pr.get("merge_commit_sha")
if not merge_commit_sha:
return False
# Get the merge commit details
success, commit = _get_github_api(f"commits/{merge_commit_sha}", headers, org, repo)
if not success or not commit:
return False
# For squash and merge, the commit will have exactly one parent
# and the commit message typically includes the PR number
parents = commit.get("parents", [])
commit_message = commit.get("commit", {}).get("message", "")
# Squash and merge commits have exactly 1 parent (the base branch)
# Regular merge commits have 2 parents (base and head branches)
if len(parents) == 1 and f"#{pr_number}" in commit_message:
return True
return False
def verify() -> bool:
"""
Programmatically verify that the release management workflow meets the
requirements described in description.md.
"""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
# Configuration constants
RELEASE_BRANCH = "release-v1.1.0"
# Expected content checks with minimum file sizes to ensure files aren't just stubs
METASEP_FIX = 'FormattingToken::MetaSep => "<|meta_sep|>"'
REGISTRY_FIX = '(FormattingToken::MetaSep, "<|meta_sep|>")'
METAEND_FIX = '(FormattingToken::MetaEnd, "<|meta_end|>")'
UTILS_CONTENT = "export function cn(...inputs: ClassValue[])"
GITIGNORE_ADDITION = "!demo/harmony-demo/src/lib"
VERSION_110 = 'version = "1.1.0"'
CHANGELOG_KEYWORDS = [
"## [1.1.0] - 2025-08-07",
"MetaSep token mapping bug",
"shadcn utils.ts file",
"Fixed MetaSep token",
"Registry now properly recognizes",
]
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying GitHub release management workflow completion...")
# 1. Check release branch exists
print("1. Verifying release branch exists...")
if not _check_branch_exists(RELEASE_BRANCH, headers, github_org):
print(f"Error: Branch '{RELEASE_BRANCH}' not found", file=sys.stderr)
return False
# 2. Check MetaSep fix in encoding.rs (with min content length to ensure file wasn't gutted)
print("2. Verifying MetaSep token fix in encoding.rs...")
if not _check_specific_file_content(
"main", "src/encoding.rs", METASEP_FIX, headers, github_org, min_length=500
):
print(
"Error: MetaSep token fix not found in src/encoding.rs or file is too small",
file=sys.stderr,
)
return False
# 3. Check registry updates (both MetaSep and MetaEnd)
print("3. Verifying MetaSep and MetaEnd registry additions...")
if not _check_specific_file_content(
"main", "src/registry.rs", REGISTRY_FIX, headers, github_org, min_length=500
):
print(
"Error: MetaSep registry fix not found in src/registry.rs or file is too small",
file=sys.stderr,
)
return False
if not _check_specific_file_content(
"main", "src/registry.rs", METAEND_FIX, headers, github_org, min_length=500
):
print(
"Error: MetaEnd registry fix not found in src/registry.rs", file=sys.stderr
)
return False
# 4. Check utils.ts file exists with correct content
print("4. Verifying shadcn utils.ts file...")
if not _check_specific_file_content(
"main",
"demo/harmony-demo/src/lib/utils.ts",
UTILS_CONTENT,
headers,
github_org,
min_length=50,
):
print("Error: utils.ts file not found or incorrect content", file=sys.stderr)
return False
# 5. Check .gitignore update
print("5. Verifying .gitignore update...")
if not _check_specific_file_content(
"main", ".gitignore", GITIGNORE_ADDITION, headers, github_org, min_length=100
):
print("Error: .gitignore update not found", file=sys.stderr)
return False
# 6. Check version update in Cargo.toml only (pyproject.toml uses dynamic versioning)
print("6. Verifying version update in Cargo.toml...")
if not _check_specific_file_content(
"main", "Cargo.toml", VERSION_110, headers, github_org, min_length=200
):
print("Error: Version 1.1.0 not found in Cargo.toml", file=sys.stderr)
return False
# 7. Check CHANGELOG.md exists with required content
print("7. Verifying CHANGELOG.md...")
if not _check_file_content(
"main", "CHANGELOG.md", CHANGELOG_KEYWORDS, headers, github_org
):
print(
"Error: CHANGELOG.md not found or missing required content", file=sys.stderr
)
return False
# 8. Check release PR was merged and get PR number
print("8. Verifying release pull request was merged...")
pr_merged, pr_number = _check_pr_merged(
"Release v1.1.0", "main", headers, github_org
)
if not pr_merged:
print("Error: Release pull request not found or not merged", file=sys.stderr)
return False
# 9. Check PR was merged using squash and merge
print("9. Verifying pull request was merged using 'squash and merge' method...")
if pr_number and not _check_pr_squash_merged(pr_number, headers, github_org):
print(
f"Error: Pull request #{pr_number} was not merged using 'squash and merge' method",
file=sys.stderr,
)
return False
print("\n✓ All verification checks passed!")
print("Release management workflow completed successfully.")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/mcpmark-cicd/deployment_status_workflow/description.md
================================================
I need you to create a Deployment Status workflow for this Node.js project. The project currently has no GitHub Actions workflows, so you'll be building a deployment-focused CI/CD workflow from scratch that responds to push events on the main branch. Here's what needs to be implemented:
## Deployment Status Workflow
Create `.github/workflows/deployment-status.yml` that triggers on `push` to `main` branch with these sequential jobs:
### 1. **pre-deployment** job (name: `pre-deployment`):
- Runs basic quality checks (lint and test)
- Creates deployment tracking issue with title: "Deployment Tracking - [commit-sha]"
- Adds labels: `deployment`, `in-progress`
- Captures previous commit SHA and package version information
- Posts comment containing "Pre-deployment checks completed"
### 2. **rollback-preparation** job (name: `rollback-preparation`):
- Depends on: pre-deployment
- Creates comprehensive rollback artifacts including:
* Executable rollback script with proper error handling
* Configuration backups (package.json, package-lock.json, environment templates)
* Dependency verification script for compatibility checking
* Detailed rollback documentation with step-by-step instructions
* Compressed rollback package with SHA256 checksums
- Uploads rollback artifacts to GitHub Actions with 30-day retention
- Posts comment on deployment issue that MUST contain the following verifiable elements:
* Title: "🔄 Rollback Plan Ready"
* Previous commit SHA (format: "Previous Commit: [sha]")
* Current commit SHA (format: "Current Commit: [sha]")
* Package version (format: "Package Version: [version]")
* Artifact name (format: "Artifact: rollback-package-[commit-sha]")
* At least 5 checkmarks (✅) indicating completed rollback components
* Quick rollback command section with bash code block
* Script verification status: "Rollback script created: true"
* Backup verification status: "Configuration backup: true"
* Artifact checksum (format: "SHA256: [checksum-value]")
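The checksum mentioned above is just the SHA-256 digest of the compressed rollback package. In the workflow this would typically come from a shell step (e.g. `sha256sum`); the Python sketch below shows the equivalent computation for a hypothetical package path:
```python
# Minimal sketch (not part of the task): compute the SHA-256 checksum of a rollback package.
import hashlib
from pathlib import Path

package = Path("rollback-package-abc1234.tar.gz")  # hypothetical artifact name

digest = hashlib.sha256()
with package.open("rb") as fh:
    for chunk in iter(lambda: fh.read(8192), b""):
        digest.update(chunk)

print(f"SHA256: {digest.hexdigest()}")  # format expected in the rollback comment
```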
### 3. **post-deployment** job (name: `post-deployment`):
- Depends on: rollback-preparation
- Removes `in-progress` label and adds `completed` label
- Posts final comment containing "Deployment Completed Successfully" with rollback artifact details
- Closes the deployment tracking issue
## Implementation Requirements:
**Step 1: Create Feature Branch**
Create a new branch called `deployment-status-workflow` from main.
**Step 2: Implement the Workflow**
Create `.github/workflows/deployment-status.yml` with proper YAML syntax:
- Trigger only on push to main branch
- Sequential job execution: pre-deployment → rollback-preparation → post-deployment
- Use github-script actions for issue management
- Avoid identifier conflicts in github-script actions (don't redeclare 'github')
- Include proper error handling and script validation
- Implement comprehensive rollback artifact creation and verification
- Use proper fetch-depth for accessing commit history
- Include artifact upload/download capabilities with checksums
**Step 3: Create and Merge Pull Request**
Create a comprehensive pull request and merge it to main:
- Title: "Implement Deployment Status Workflow"
- Detailed description of the workflow and its purpose
- Merge the pull request to main branch to trigger the deployment workflow
================================================
FILE: tasks/github/standard/mcpmark-cicd/deployment_status_workflow/meta.json
================================================
{
"task_id": "deployment_status_workflow",
"task_name": "Deployment Status Workflow",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD",
"description": "Create deployment status workflow with pre-deployment checks, rollback preparation, and comprehensive issue tracking for deployments.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"ci/cd automation",
"workflow automation"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/standard/mcpmark-cicd/deployment_status_workflow/verify.py
================================================
import sys
import os
import re
import requests
import time
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _search_github_issues(
query: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[List]]:
"""Search GitHub issues using the search API."""
url = f"https://api.github.com/search/issues?q={query}&per_page=100"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
data = response.json()
return True, data.get("items", [])
else:
print(f"Search API error: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Search exception: {e}", file=sys.stderr)
return False, None
def _wait_for_workflow_completion(
headers: Dict[str, str], owner: str, repo: str, max_wait: int = 90
) -> bool:
"""Wait for GitHub Actions workflows to complete processing."""
print("⏳ Waiting for deployment status workflows to complete...")
start_time = time.time()
no_workflow_check_count = 0
while time.time() - start_time < max_wait:
try:
# Check workflow runs for deployment-status.yml
success, response = _get_github_api(
"actions/workflows/deployment-status.yml/runs?per_page=10",
headers,
owner,
repo,
)
if success and response:
runs = response.get("workflow_runs", [])
if len(runs) > 0:
# Check status of recent runs
running_count = 0
completed_count = 0
failed_count = 0
for run in runs[:3]: # Check recent runs
status = run["status"]
conclusion = run.get("conclusion")
if status == "completed":
completed_count += 1
if conclusion == "failure":
failed_count += 1
elif status in ["in_progress", "queued"]:
running_count += 1
print(
f" Status: {completed_count} completed, {running_count} running/queued"
)
# Wait until NO workflows are running
if running_count == 0:
if failed_count > 0:
print(
f"⚠️ Warning: {failed_count} workflow runs failed, but continuing verification..."
)
print(
f"✅ All workflows completed. Found {completed_count} completed runs."
)
# Additional wait to ensure all processing is done
print(
"⏳ Additional wait for deployment processing to complete..."
)
time.sleep(5)
return True
else:
# No workflow runs found
no_workflow_check_count += 1
if no_workflow_check_count == 1:
print(
" No workflow runs found yet, waiting 5 seconds and checking once more..."
)
time.sleep(5)
continue
elif no_workflow_check_count >= 2:
print(
"⚠️ No workflow runs detected after 2 checks. Workflow may not have been triggered."
)
print(" Continuing with verification...")
return False
print(f"⏳ Still waiting... ({int(time.time() - start_time)}s elapsed)")
time.sleep(5)
except Exception as e:
print(f"⚠️ Error checking workflow status: {e}")
time.sleep(5)
print(f"⚠️ Workflow completion wait timed out after {max_wait}s")
return False
def _verify_workflow_runs(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str], Optional[Dict]]:
"""Verify that the deployment status workflow runs have the correct jobs."""
print("\n⚙️ Verifying deployment status workflow runs...")
errors = []
# Get the most recent workflow run
success, response = _get_github_api(
"actions/workflows/deployment-status.yml/runs?per_page=5",
headers,
owner,
repo,
)
if not success or not response:
return False, ["Failed to fetch workflow runs"], None
runs = response.get("workflow_runs", [])
if not runs:
return False, ["No workflow runs found for deployment-status.yml"], None
# Find the most recent successful run
latest_successful_run = None
for run in runs:
if run.get("conclusion") == "success":
latest_successful_run = run
break
if not latest_successful_run:
return False, ["No successful workflow runs found"], None
run_id = latest_successful_run["id"]
print(f" Found successful workflow run #{run_id}")
# Get jobs for this run
success, jobs_response = _get_github_api(
f"actions/runs/{run_id}/jobs", headers, owner, repo
)
if not success:
return False, ["Failed to fetch workflow jobs"], None
jobs = jobs_response.get("jobs", [])
expected_jobs = ["pre-deployment", "rollback-preparation", "post-deployment"]
found_jobs = [job["name"] for job in jobs]
missing_jobs = [job for job in expected_jobs if job not in found_jobs]
if missing_jobs:
errors.append(f"Missing jobs: {missing_jobs}. Found: {found_jobs}")
else:
print(f" ✅ All 3 required jobs found: {found_jobs}")
# Verify all jobs succeeded
failed_jobs = [job["name"] for job in jobs if job["conclusion"] != "success"]
if failed_jobs:
errors.append(f"Failed jobs: {failed_jobs}")
else:
print(" ✅ All jobs completed successfully")
# Verify sequential execution (each job should start after the previous one)
if len(jobs) >= 3:
job_times = {}
for job in jobs:
if job["name"] in expected_jobs and job["started_at"]:
job_times[job["name"]] = job["started_at"]
if len(job_times) >= 3:
# Check that jobs ran in correct sequence
import datetime
times = {
name: datetime.datetime.fromisoformat(time.replace("Z", "+00:00"))
for name, time in job_times.items()
}
# pre-deployment should start first
# rollback-preparation should start after pre-deployment
# post-deployment should start after rollback-preparation
if all(job in times for job in expected_jobs):
if (
times["rollback-preparation"] <= times["pre-deployment"]
or times["post-deployment"] <= times["rollback-preparation"]
):
errors.append("Jobs did not run in correct sequential order")
else:
print(" ✅ Jobs ran in correct sequential order")
else:
errors.append(
"Not enough job timing data to verify sequential execution"
)
return len(errors) == 0, errors, latest_successful_run
def _verify_deployment_issue(
run_data: Dict, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify that a deployment tracking issue was created and closed properly."""
print("\n📋 Verifying deployment tracking issue...")
errors = []
# Extract commit SHA from the workflow run
head_sha = run_data.get("head_sha")
if not head_sha:
return False, ["Could not determine head SHA from workflow run"]
short_sha = head_sha[:7]
expected_title = f"Deployment Tracking - {short_sha}"
# Search for the deployment tracking issue
success, issues = _search_github_issues(
f'repo:{owner}/{repo} "{expected_title}" is:issue', headers
)
if not success:
return False, ["Failed to search for deployment tracking issue"]
# Find the exact issue
deployment_issue = None
for issue in issues:
if issue.get("title") == expected_title:
deployment_issue = issue
break
if not deployment_issue:
return False, [f"Deployment tracking issue '{expected_title}' not found"]
issue_number = deployment_issue["number"]
print(f" Found deployment tracking issue #{issue_number}: {expected_title}")
# Check that issue is closed
if deployment_issue.get("state") != "closed":
errors.append(
f"Deployment issue #{issue_number} is not closed (state: {deployment_issue.get('state')})"
)
else:
print(f" ✅ Deployment issue #{issue_number} is closed")
# Check required labels
expected_labels = ["deployment", "completed"]
actual_labels = [label["name"] for label in deployment_issue.get("labels", [])]
missing_labels = [label for label in expected_labels if label not in actual_labels]
if missing_labels:
errors.append(
f"Missing labels on deployment issue: {missing_labels}. Found: {actual_labels}"
)
else:
print(f" ✅ Required labels found: {expected_labels}")
# Get issue comments to verify GitHub Actions bot comments
success, comments = _get_github_api(
f"issues/{issue_number}/comments", headers, owner, repo
)
if not success:
errors.append("Failed to get deployment issue comments")
return len(errors) == 0, errors
# Filter for GitHub Actions bot comments only
bot_comments = [
comment
for comment in comments
if comment.get("user", {}).get("login") == "github-actions[bot]"
]
if not bot_comments:
errors.append("No comments found from GitHub Actions bot")
return len(errors) == 0, errors
print(f" Found {len(bot_comments)} comment(s) from GitHub Actions bot")
# Get all bot comment bodies
bot_comment_bodies = [comment.get("body", "") for comment in bot_comments]
all_bot_comments = " ".join(bot_comment_bodies)
# Check for required GitHub Actions bot comment indicators
required_comment_indicators = [
"Pre-deployment checks completed",
"🔄 Rollback Plan Ready",
"Deployment Completed Successfully",
]
for indicator in required_comment_indicators:
if indicator not in all_bot_comments:
errors.append(
f"Missing required GitHub Actions bot comment indicator: '{indicator}'"
)
else:
print(f" ✅ Found GitHub Actions bot comment indicator: '{indicator}'")
# Find and verify the rollback plan comment from GitHub Actions bot
rollback_comment = None
for comment in bot_comments:
if "🔄 Rollback Plan Ready" in comment.get("body", ""):
rollback_comment = comment.get("body", "")
break
if rollback_comment:
print(" ✅ Found rollback plan comment from GitHub Actions bot")
# Check for required rollback plan elements
required_elements = [
"**Previous Commit**:",
"**Current Commit**:",
"**Package Version**:",
"✅ Executable rollback script created",
"✅ Configuration backups saved",
"✅ Dependency verification script prepared",
"✅ Comprehensive rollback documentation generated",
"✅ Compressed rollback package created",
"**SHA256**:",
"**Artifact**:",
"Quick Rollback Commands",
]
for element in required_elements:
if element not in rollback_comment:
errors.append(f"Missing element in rollback plan: '{element}'")
else:
print(f" ✅ Found rollback plan element: '{element}'")
# Verify commit SHAs in rollback comment
if f"**Current Commit**: {head_sha}" in rollback_comment:
print(f" ✅ Current commit SHA verified: {head_sha}")
else:
errors.append(
f"Current commit SHA {head_sha} not found in rollback comment"
)
# Extract and verify previous commit SHA
if "**Previous Commit**:" in rollback_comment:
prev_sha_match = re.search(
r"\*\*Previous Commit\*\*:\s*([a-f0-9]{40})", rollback_comment
)
if prev_sha_match:
prev_sha = prev_sha_match.group(1)
print(f" ✅ Previous commit SHA found: {prev_sha}")
# Verify it's a valid 40-character SHA
if len(prev_sha) != 40:
errors.append(
f"Previous commit SHA has invalid length: {len(prev_sha)}"
)
else:
errors.append(
"Previous commit SHA format not found in rollback comment"
)
else:
errors.append("Previous commit SHA not found in rollback comment")
# Verify SHA256 checksum is present
sha256_match = re.search(r"\*\*SHA256\*\*:\s*([a-f0-9]{64})", rollback_comment)
if sha256_match:
sha256_value = sha256_match.group(1)
print(f" ✅ SHA256 checksum found: {sha256_value[:16]}...")
else:
errors.append(
"SHA256 checksum not found or invalid format in rollback comment"
)
else:
errors.append("Rollback plan comment not found from GitHub Actions bot")
return len(errors) == 0, errors
def verify() -> bool:
"""
Verify that the deployment status workflow automation is working correctly.
"""
# Load environment variables
load_dotenv(".mcp_env")
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
# Get GitHub organization
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
# Repository configuration
owner = github_org
repo = "mcpmark-cicd"
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("🔍 Starting Deployment Status Workflow Verification")
print("=" * 60)
# Wait for workflows to complete
workflows_completed = _wait_for_workflow_completion(headers, owner, repo)
if not workflows_completed:
print(
"⚠️ Warning: Workflows may still be running. Continuing with verification..."
)
# Verify workflow runs and jobs
all_passed = True
# 1. Verify workflow runs have correct jobs
runs_ok, runs_errors, run_data = _verify_workflow_runs(headers, owner, repo)
if not runs_ok:
all_passed = False
print("❌ Workflow Runs Verification Failed:")
for error in runs_errors:
print(f" - {error}")
else:
print("✅ Workflow Runs Verification Passed")
# 2. Verify deployment issue if workflow runs passed
if run_data:
issue_ok, issue_errors = _verify_deployment_issue(
run_data, headers, owner, repo
)
if not issue_ok:
all_passed = False
print("❌ Deployment Issue Verification Failed:")
for error in issue_errors:
print(f" - {error}")
else:
print("✅ Deployment Issue Verification Passed")
print("\n" + "=" * 60)
if all_passed:
print("🎉 All Deployment Status Workflow verifications PASSED!")
print("\n📋 Summary:")
print(
" ✅ Workflow runs with correct 3 sequential jobs: pre-deployment, rollback-preparation, post-deployment"
)
print(" ✅ Deployment tracking issue created and closed with proper labels")
print(" ✅ Issue contains rollback plan with all required elements")
print(" ✅ Previous and current commit SHAs are correctly tracked")
print(" ✅ All workflow automation comments are present")
print(
"\n🤖 The GitHub Actions deployment status workflow is working correctly!"
)
else:
print("❌ Deployment Status Workflow verification FAILED!")
print(" Some components did not meet the expected automation requirements.")
return all_passed
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/mcpmark-cicd/issue_management_workflow/description.md
================================================
I need you to create an intelligent Issue Management automation workflow for this Node.js project. The project currently has no GitHub Actions workflows, so you'll be building an issue-focused automation system from scratch that responds to issue events. Here's what needs to be implemented:
## Issue Management Workflow
Create `.github/workflows/issue-automation.yml` that triggers on `issues` events (opened, labeled) with these jobs:
### 1. **issue-triage** job:
- Auto-assigns category labels based on keywords in **issue title** (case-insensitive):
- Title contains "bug" → adds `bug` label
- Title contains "epic" → adds `epic` label
- Title contains "maintenance" → adds `maintenance` label
- Auto-assigns priority labels based on keywords in **issue title OR body** (case-insensitive, highest priority wins if multiple keywords found):
- "critical", "urgent", "production", "outage" → `priority-critical`
- "important", "high", "blocking" → `priority-high`
- "medium", "normal" → `priority-medium` (default if no priority keywords found)
- "low", "nice-to-have", "minor" → `priority-low`
- All issues get `needs-triage` label initially
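The triage rules above boil down to a keyword lookup in which the most severe match wins. The workflow itself would run this logic in a github-script step; the intent can be sketched in a few lines of Python (illustrative only, with a hypothetical example issue):
```python
# Illustrative sketch of the triage rules (the workflow would implement this in github-script).
CATEGORY_KEYWORDS = {"bug": "bug", "epic": "epic", "maintenance": "maintenance"}
PRIORITY_KEYWORDS = [  # ordered from highest to lowest priority; first match wins
    ("priority-critical", ["critical", "urgent", "production", "outage"]),
    ("priority-high", ["important", "high", "blocking"]),
    ("priority-medium", ["medium", "normal"]),
    ("priority-low", ["low", "nice-to-have", "minor"]),
]

def triage_labels(title: str, body: str) -> list:
    labels = ["needs-triage"]
    title_lower, text = title.lower(), f"{title} {body}".lower()
    labels += [label for keyword, label in CATEGORY_KEYWORDS.items() if keyword in title_lower]
    priority = next(
        (label for label, keywords in PRIORITY_KEYWORDS if any(k in text for k in keywords)),
        "priority-medium",  # default when no priority keywords are found
    )
    return labels + [priority]

# Hypothetical issue body; prints ['needs-triage', 'bug', 'priority-high']
print(triage_labels("Bug: Login form validation not working", "This is blocking the release"))
```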
### 2. **task-breakdown** job:
- For issues with a title containing "Epic", create exactly 4 sub-issues with the pattern: "[SUBTASK] [Original Title] - Task N: [Task Name]"
- Task names: 1. Requirements Analysis, 2. Design and Architecture, 3. Implementation, 4. Testing and Documentation
- Links sub-issues to parent using "Related to #[parent-number]" in sub-issue body
- Updates parent issue body with "## Epic Tasks" checklist linking to sub-issue numbers
- All sub-issues get `enhancement` and `needs-review` labels
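Each sub-issue is an ordinary issue created through the REST API with the parent reference in its body; a minimal sketch (org, token, and parent number are placeholders):
```python
# Minimal sketch (not part of the task): create the four epic sub-issues via the REST API.
import requests

ORG, REPO, TOKEN = "<eval-org>", "mcpmark-cicd", "<github-token>"
HEADERS = {"Authorization": f"token {TOKEN}", "Accept": "application/vnd.github.v3+json"}

parent_number = 42  # placeholder: the epic issue number
parent_title = "Epic: Redesign user dashboard interface"
task_names = [
    "Requirements Analysis",
    "Design and Architecture",
    "Implementation",
    "Testing and Documentation",
]

for n, task in enumerate(task_names, start=1):
    requests.post(
        f"https://api.github.com/repos/{ORG}/{REPO}/issues",
        headers=HEADERS,
        json={
            "title": f"[SUBTASK] {parent_title} - Task {n}: {task}",
            "body": f"Related to #{parent_number}",
            "labels": ["enhancement", "needs-review"],
        },
    )
```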
### 3. **auto-response** job:
- Checks if the issue author is creating their first issue in this repository (not first on GitHub globally, but first in this specific repo)
- If first issue in repo: adds `first-time-contributor` label and posts welcome message
- Posts different responses based on issue type:
- `bug` issues: comment must contain "Bug Report Guidelines"
- `epic` issues: comment must contain "Feature Request Process"
- `maintenance` issues: comment must contain "Maintenance Guidelines"
- Sets milestone "v1.0.0" for `priority-high` and `priority-critical` issues
- Changes status from `needs-triage` to `needs-review` after response
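Checking "first issue in this repository" can be done by listing the repository's issues filtered by creator and counting anything other than the one just opened; note that pull requests come back from the same endpoint and need to be excluded. A minimal sketch with placeholder values:
```python
# Minimal sketch (not part of the task): is this the author's first issue in this repository?
import requests

ORG, REPO, TOKEN = "<eval-org>", "mcpmark-cicd", "<github-token>"
HEADERS = {"Authorization": f"token {TOKEN}", "Accept": "application/vnd.github.v3+json"}

def is_first_issue(author: str, current_issue_number: int) -> bool:
    items = requests.get(
        f"https://api.github.com/repos/{ORG}/{REPO}/issues",
        headers=HEADERS,
        params={"creator": author, "state": "all", "per_page": 100},
    ).json()
    # The issues endpoint also returns pull requests; keep real issues only.
    previous = [
        item for item in items
        if "pull_request" not in item and item["number"] != current_issue_number
    ]
    return len(previous) == 0

print(is_first_issue("octocat", 7))  # placeholder author and issue number
```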
## Label Management Requirements
The system must create and manage these specific labels:
### Category Labels:
- `bug` - Something isn't working
- `enhancement` - New feature or request
- `epic` - Large feature requiring multiple sub-tasks
- `maintenance` - Maintenance and housekeeping tasks
### Priority Labels:
- `priority-critical` - Critical priority issue
- `priority-high` - High priority issue
- `priority-medium` - Medium priority issue
- `priority-low` - Low priority issue
### Status Labels:
- `needs-triage` - Needs to be reviewed by maintainers
- `needs-review` - Awaiting review from maintainers
- `first-time-contributor` - Issue created by first-time contributor
## Implementation Requirements:
**Step 1: Create Feature Branch**
Create a new branch called `issue-management-workflow` from main.
**Step 2: Create Supporting Files**
Create these additional files on the new branch:
- `.github/ISSUE_TEMPLATE/bug_report.md` - Bug report template
- `.github/ISSUE_TEMPLATE/feature_request.md` - Feature request template
- `.github/ISSUE_TEMPLATE/maintenance_report.md` - Maintenance report template
**Step 3: Implement the Workflow**
Create `.github/workflows/issue-automation.yml` with proper YAML syntax.
Include:
- Appropriate triggers for issues events
- Job dependencies where needed
- Error handling and graceful fallbacks
- Avoid identifier conflicts in github-script actions (don't redeclare 'github')
**Step 4: Create and Merge Pull Request**
Create a comprehensive pull request and merge it to main:
- Title: "Implement Issue Management Automation Workflow"
- Detailed description of the workflow and its purpose
- Include all workflow files and templates created
- Merge the pull request to main branch
**Step 5: Test the Workflow**
Create test issues to demonstrate the issue automation workflow:
1. **Bug Issue**: "Bug: Login form validation not working"
- Expected: `bug`, `priority-high`, `needs-triage`→`needs-review`, milestone "v1.0.0"
- Auto-response comment must contain "Bug Report Guidelines"
2. **Epic Issue**: "Epic: Redesign user dashboard interface"
- Expected: `epic`, `priority-high`, `needs-triage`→`needs-review`, milestone "v1.0.0"
- Must create 4 sub-issues with `enhancement` and `needs-review` labels
- Parent updated with "## Epic Tasks" checklist, sub-issues linked with "Related to #[parent-number]"
- Auto-response comment must contain "Feature Request Process"
3. **Maintenance Issue**: "Weekly maintenance cleanup and refactor"
- Expected: `maintenance`, `priority-medium`, `needs-triage`→`needs-review`, no milestone
- Auto-response comment must contain "Maintenance Guidelines"
================================================
FILE: tasks/github/standard/mcpmark-cicd/issue_management_workflow/meta.json
================================================
{
"task_id": "issue_management_workflow",
"task_name": "Issue Management Workflow",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD",
"description": "Build intelligent issue management automation with auto-triage, task breakdown for epics, and first-time contributor handling.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"workflow automation"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/standard/mcpmark-cicd/issue_management_workflow/verify.py
================================================
import sys
import os
import requests
import time
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _search_github_issues(
query: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[List]]:
"""Search GitHub issues using the search API."""
url = f"https://api.github.com/search/issues?q={query}&per_page=100"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
data = response.json()
return True, data.get("items", [])
else:
print(f"Search API error: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Search exception: {e}", file=sys.stderr)
return False, None
def _wait_for_workflow_completion(
headers: Dict[str, str], owner: str, repo: str, max_wait: int = 90
) -> bool:
"""Wait for GitHub Actions workflows to complete processing."""
print("⏳ Waiting for GitHub Actions workflows to complete...")
start_time = time.time()
expected_runs = 3 # We created 3 test issues
no_workflow_check_count = 0
while time.time() - start_time < max_wait:
try:
# Check workflow runs
success, response = _get_github_api(
"actions/workflows/issue-automation.yml/runs?per_page=20",
headers,
owner,
repo,
)
if success and response:
runs = response.get("workflow_runs", [])
if len(runs) >= expected_runs:
# Check status of recent runs
recent_runs = runs[:expected_runs]
running_count = 0
completed_count = 0
failed_count = 0
for run in recent_runs:
status = run["status"]
conclusion = run.get("conclusion")
if status == "completed":
completed_count += 1
if conclusion == "failure":
failed_count += 1
elif status in ["in_progress", "queued"]:
running_count += 1
print(
f" Status: {completed_count} completed, {running_count} running/queued"
)
# Wait until NO workflows are running and we have enough completed runs
if running_count == 0 and completed_count >= expected_runs:
if failed_count > 0:
print(
f"⚠️ Warning: {failed_count} workflow runs failed, but continuing verification..."
)
print(
f"✅ All workflows completed. Found {completed_count} completed runs."
)
# Additional wait to ensure all issue processing is done
print("⏳ Additional wait for issue processing to complete...")
time.sleep(5)
return True
elif len(runs) == 0:
# No workflow runs found
no_workflow_check_count += 1
if no_workflow_check_count == 1:
print(
" No workflow runs found yet, waiting 5 seconds and checking once more..."
)
time.sleep(5)
continue
elif no_workflow_check_count >= 2:
print(
"⚠️ No workflow runs detected after 2 checks. Workflow may not have been triggered."
)
print(" Continuing with verification...")
return False
else:
print(
f" Waiting for workflow runs... Found {len(runs)}, expected {expected_runs}"
)
print(f"⏳ Still waiting... ({int(time.time() - start_time)}s elapsed)")
time.sleep(5)
except Exception as e:
print(f"⚠️ Error checking workflow status: {e}")
time.sleep(5)
print(f"⚠️ Workflow completion wait timed out after {max_wait}s")
return False
def _find_issue_by_title(
title: str, headers: Dict[str, str], owner: str, repo: str
) -> Optional[Dict]:
"""Find an issue by exact title match."""
success, issues = _search_github_issues(
f'repo:{owner}/{repo} "{title}" is:issue', headers
)
if success and issues:
for issue in issues:
if issue.get("title") == title:
return issue
return None
def _check_issue_labels(
issue: Dict, expected_labels: List[str]
) -> Tuple[bool, List[str]]:
"""Check if issue has the expected labels."""
actual_labels = [label["name"] for label in issue.get("labels", [])]
missing_labels = [label for label in expected_labels if label not in actual_labels]
if missing_labels:
return False, [f"Missing labels: {missing_labels}. Found: {actual_labels}"]
return True, []
def _check_issue_milestone(
    issue: Dict, expected_milestone: Optional[str]
) -> Tuple[bool, List[str]]:
"""Check if issue has the expected milestone."""
milestone = issue.get("milestone")
if not milestone:
if expected_milestone:
return False, [f"No milestone found. Expected: {expected_milestone}"]
return True, []
if milestone.get("title") != expected_milestone:
return False, [
f"Wrong milestone: {milestone.get('title')}. Expected: {expected_milestone}"
]
return True, []
def _check_issue_comments(
issue_number: int,
expected_content: str,
headers: Dict[str, str],
owner: str,
repo: str,
) -> Tuple[bool, List[str]]:
"""Check if issue has a comment containing expected content."""
success, comments = _get_github_api(
f"issues/{issue_number}/comments", headers, owner, repo
)
if not success:
return False, ["Failed to get issue comments"]
if not comments:
return False, [f"No comments found. Expected comment with: {expected_content}"]
for comment in comments:
if expected_content in comment.get("body", ""):
return True, []
return False, [f"Expected content '{expected_content}' not found in comments"]
def _find_epic_sub_issues(
parent_issue_number: int, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[List[Dict], List[str]]:
"""Find sub-issues created for an epic."""
# Search for each expected sub-task by exact title
expected_subtasks = [
"[SUBTASK] Epic: Redesign user dashboard interface - Task 1: Requirements Analysis",
"[SUBTASK] Epic: Redesign user dashboard interface - Task 2: Design and Architecture",
"[SUBTASK] Epic: Redesign user dashboard interface - Task 3: Implementation",
"[SUBTASK] Epic: Redesign user dashboard interface - Task 4: Testing and Documentation",
]
subtasks = []
errors = []
for expected_title in expected_subtasks:
# Search for exact title
success, issues = _search_github_issues(
f'repo:{owner}/{repo} "{expected_title}" is:issue', headers
)
if not success:
errors.append(f"Failed to search for sub-issue: {expected_title}")
continue
# Find exact match
found = False
for issue in issues:
if issue.get("title") == expected_title:
# Verify it references the parent issue
body = issue.get("body", "")
if (
f"#{parent_issue_number}" in body
or f"Related to #{parent_issue_number}" in body
):
subtasks.append(issue)
found = True
break
if not found:
errors.append(
f"Sub-issue not found or doesn't reference parent: {expected_title}"
)
return subtasks, errors
def _check_epic_checklist(
issue: Dict, subtask_numbers: List[int]
) -> Tuple[bool, List[str]]:
"""Check if epic issue has the Epic Tasks checklist with correct issue references."""
body = issue.get("body", "")
errors = []
if "## Epic Tasks" not in body:
return False, ["Epic Tasks section not found in issue body"]
# Check that all subtask issue numbers are referenced in checkbox format
for number in subtask_numbers:
# Check for checkbox format: - [ ] #number
if f"- [ ] #{number}" not in body:
errors.append(
f"Sub-issue #{number} not found in Epic Tasks checklist format (expected: '- [ ] #{number}')"
)
# Also verify the expected task names are present
expected_tasks = [
"Requirements Analysis",
"Design and Architecture",
"Implementation",
"Testing and Documentation",
]
for task in expected_tasks:
if task not in body:
errors.append(f"Task name '{task}' not found in Epic Tasks section")
if errors:
return False, errors
return True, []
def _verify_bug_issue(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify the bug issue requirements."""
print("\n🐛 Verifying Bug Issue...")
errors = []
# Find bug issue
bug_issue = _find_issue_by_title(
"Bug: Login form validation not working", headers, owner, repo
)
if not bug_issue:
return False, ["Bug issue 'Bug: Login form validation not working' not found"]
issue_number = bug_issue["number"]
print(f" Found bug issue #{issue_number}")
# Check labels (including first-time-contributor since it's the first issue)
expected_labels = ["bug", "priority-high", "needs-review", "first-time-contributor"]
labels_ok, label_errors = _check_issue_labels(bug_issue, expected_labels)
if not labels_ok:
errors.extend(label_errors)
else:
print(f" ✅ Labels verified: {expected_labels}")
# Check milestone
milestone_ok, milestone_errors = _check_issue_milestone(bug_issue, "v1.0.0")
if not milestone_ok:
errors.extend(milestone_errors)
else:
print(" ✅ Milestone verified: v1.0.0")
# Check comment
comment_ok, comment_errors = _check_issue_comments(
issue_number, "Bug Report Guidelines", headers, owner, repo
)
if not comment_ok:
errors.extend(comment_errors)
else:
print(" ✅ Bug Report Guidelines comment found")
return len(errors) == 0, errors
def _verify_epic_issue(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify the epic issue requirements."""
print("\n🚀 Verifying Epic Issue...")
errors = []
# Find epic issue
epic_issue = _find_issue_by_title(
"Epic: Redesign user dashboard interface", headers, owner, repo
)
if not epic_issue:
return False, ["Epic issue 'Epic: Redesign user dashboard interface' not found"]
issue_number = epic_issue["number"]
print(f" Found epic issue #{issue_number}")
# Check labels
expected_labels = ["epic", "priority-high", "needs-review"]
labels_ok, label_errors = _check_issue_labels(epic_issue, expected_labels)
if not labels_ok:
errors.extend(label_errors)
else:
print(f" ✅ Labels verified: {expected_labels}")
# Check milestone
milestone_ok, milestone_errors = _check_issue_milestone(epic_issue, "v1.0.0")
if not milestone_ok:
errors.extend(milestone_errors)
else:
print(" ✅ Milestone verified: v1.0.0")
# Check comment
comment_ok, comment_errors = _check_issue_comments(
issue_number, "Feature Request Process", headers, owner, repo
)
if not comment_ok:
errors.extend(comment_errors)
else:
print(" ✅ Feature Request Process comment found")
# Find and verify sub-issues
sub_issues, sub_errors = _find_epic_sub_issues(issue_number, headers, owner, repo)
if sub_errors:
errors.extend(sub_errors)
elif len(sub_issues) != 4:
errors.append(f"Expected 4 sub-issues, found {len(sub_issues)}")
else:
print(f" ✅ Found {len(sub_issues)} sub-issues")
# Collect sub-issue numbers for checklist verification
subtask_numbers = []
# Verify each sub-issue has correct labels and link to parent
for sub_issue in sub_issues:
sub_number = sub_issue["number"]
subtask_numbers.append(sub_number)
# Check labels
sub_labels = [label["name"] for label in sub_issue.get("labels", [])]
expected_sub_labels = ["enhancement", "needs-review"]
missing_sub_labels = [
label for label in expected_sub_labels if label not in sub_labels
]
if missing_sub_labels:
errors.append(
f"Sub-issue #{sub_number} missing labels: {missing_sub_labels}"
)
# Verify parent reference in body
sub_body = sub_issue.get("body", "")
if (
f"#{issue_number}" not in sub_body
and f"Related to #{issue_number}" not in sub_body
):
errors.append(
f"Sub-issue #{sub_number} doesn't reference parent issue #{issue_number}"
)
if not errors:
print(
" ✅ All 4 sub-tasks created with correct labels and parent references"
)
# Check Epic Tasks checklist with correct issue numbers
checklist_ok, checklist_errors = _check_epic_checklist(
epic_issue, subtask_numbers
)
if not checklist_ok:
errors.extend(checklist_errors)
else:
print(
f" ✅ Epic Tasks checklist verified with correct issue references: {subtask_numbers}"
)
return len(errors) == 0, errors
def _verify_maintenance_issue(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify the maintenance issue requirements."""
print("\n🔧 Verifying Maintenance Issue...")
errors = []
# Find maintenance issue
maintenance_issue = _find_issue_by_title(
"Weekly maintenance cleanup and refactor", headers, owner, repo
)
if not maintenance_issue:
return False, [
"Maintenance issue 'Weekly maintenance cleanup and refactor' not found"
]
issue_number = maintenance_issue["number"]
print(f" Found maintenance issue #{issue_number}")
# Check labels
expected_labels = ["maintenance", "priority-medium", "needs-review"]
labels_ok, label_errors = _check_issue_labels(maintenance_issue, expected_labels)
if not labels_ok:
errors.extend(label_errors)
else:
print(f" ✅ Labels verified: {expected_labels}")
# Check NO milestone (maintenance issues shouldn't get v1.0.0)
milestone_ok, milestone_errors = _check_issue_milestone(maintenance_issue, None)
if not milestone_ok:
errors.extend(milestone_errors)
else:
print(" ✅ No milestone assigned (correct for maintenance issue)")
# Check comment
comment_ok, comment_errors = _check_issue_comments(
issue_number, "Maintenance Guidelines", headers, owner, repo
)
if not comment_ok:
errors.extend(comment_errors)
else:
print(" ✅ Maintenance Guidelines comment found")
return len(errors) == 0, errors
def verify() -> bool:
"""
Verify that the issue management workflow automation is working correctly.
"""
# Load environment variables
load_dotenv(".mcp_env")
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
# Get GitHub organization
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
# Repository configuration
owner = github_org
repo = "mcpmark-cicd"
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("🔍 Starting Issue Management Workflow Verification")
print("=" * 60)
# Wait for workflows to complete
workflows_completed = _wait_for_workflow_completion(headers, owner, repo)
if not workflows_completed:
print(
"⚠️ Warning: Workflows may still be running. Continuing with verification..."
)
# Verify each test issue
all_passed = True
# 1. Verify bug issue
bug_ok, bug_errors = _verify_bug_issue(headers, owner, repo)
if not bug_ok:
all_passed = False
print("❌ Bug Issue Verification Failed:")
for error in bug_errors:
print(f" - {error}")
else:
print("✅ Bug Issue Verification Passed")
# 2. Verify epic issue
epic_ok, epic_errors = _verify_epic_issue(headers, owner, repo)
if not epic_ok:
all_passed = False
print("❌ Epic Issue Verification Failed:")
for error in epic_errors:
print(f" - {error}")
else:
print("✅ Epic Issue Verification Passed")
# 3. Verify maintenance issue
maintenance_ok, maintenance_errors = _verify_maintenance_issue(headers, owner, repo)
if not maintenance_ok:
all_passed = False
print("❌ Maintenance Issue Verification Failed:")
for error in maintenance_errors:
print(f" - {error}")
else:
print("✅ Maintenance Issue Verification Passed")
print("\n" + "=" * 60)
if all_passed:
print("🎉 All Issue Management Workflow verifications PASSED!")
print("\n📋 Summary:")
print(
" ✅ Bug issue: labels (including first-time-contributor), milestone, and auto-response verified"
)
print(
" ✅ Epic issue: labels, milestone, 4 sub-issues with checklist, and correct issue references verified"
)
print(
" ✅ Maintenance issue: labels, no milestone, and auto-response verified"
)
print("\n🤖 The GitHub Actions workflow automation is working correctly!")
else:
print("❌ Issue Management Workflow verification FAILED!")
print(" Some issues did not meet the expected automation requirements.")
return all_passed
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/mcpmark-cicd/linting_ci_workflow/description.md
================================================
I need you to set up a proper linting workflow for our CI pipeline to ensure code quality standards are enforced on all pull requests. Here's what you need to do:
**Step 1: Create Linting Configuration Branch**
Create a new branch called 'ci/add-eslint-workflow' from the main branch.
**Step 2: Create ESLint Configuration**
On the new branch, create the file `.eslintrc.json` in the repository root with:
```json
{
"env": {
"browser": true,
"es2021": true,
"node": true
},
"extends": [
"eslint:recommended"
],
"parserOptions": {
"ecmaVersion": 12,
"sourceType": "module"
},
"rules": {
"no-unused-vars": "error",
"no-console": "warn",
"semi": ["error", "always"],
"quotes": ["error", "single"]
},
"ignorePatterns": ["node_modules/", "dist/", "build/"]
}
```
**Step 3: Create GitHub Actions Linting Workflow**
Create the file `.github/workflows/lint.yml` with:
- Workflow name: "Code Linting"
- Triggers on: push to main, pull_request events
- Uses ubuntu-latest runner
- Sets up Node.js version 18 using actions/setup-node
- Installs dependencies with npm ci
- Installs ESLint globally
- Runs ESLint on all JavaScript files in src/ directories
- Fails the workflow if linting errors are found
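Before opening the PR, you can sanity-check the workflow file locally. The snippet below is an illustrative sketch only (it assumes PyYAML is installed and that the file exists at the path above); the task itself is still performed through the GitHub tooling.
```python
# Illustrative local sanity check for .github/workflows/lint.yml (assumes PyYAML is installed).
import yaml

with open(".github/workflows/lint.yml") as f:
    workflow = yaml.safe_load(f)

assert workflow.get("name") == "Code Linting", "workflow must be named 'Code Linting'"

# PyYAML parses the bare `on` key as the boolean True, so accept either spelling.
triggers = workflow.get("on", workflow.get(True, {}))
assert "push" in triggers and "pull_request" in triggers, "missing push/pull_request triggers"

jobs = workflow.get("jobs", {})
assert jobs, "workflow must define at least one job"
for job in jobs.values():
    assert job.get("runs-on") == "ubuntu-latest", "jobs should use the ubuntu-latest runner"

print("lint.yml looks structurally correct")
```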
**Step 4: Create a File That Will Fail Linting**
Create the file `src/example.js` with intentional linting violations that will cause the CI check to fail.
**Step 5: Create Pull Request**
Commit all the changes (ESLint config, workflow file, and example file with linting errors) in a single commit, then create a pull request from 'ci/add-eslint-workflow' to 'main' with:
- Title: "Add ESLint workflow for code quality enforcement"
- Body must include:
- A "## Summary" heading describing the linting setup
- A "## Changes" heading listing the files added
- A "## Testing" heading explaining how to test the workflow
- Mention that the PR intentionally includes linting errors to demonstrate the workflow
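In this benchmark the PR is normally opened through the GitHub MCP tools, but for reference the equivalent REST call looks roughly like the sketch below (the org placeholder and the body text are assumptions; only the title, branch names, and required headings come from the steps above).
```python
# Illustrative sketch of Step 5 via the GitHub REST API (placeholders are assumptions).
import os
import requests

owner, repo = "<your-org>", "mcpmark-cicd"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
body = (
    "## Summary\nAdds an ESLint configuration and a Code Linting workflow.\n\n"
    "## Changes\n- .eslintrc.json\n- .github/workflows/lint.yml\n- src/example.js\n\n"
    "## Testing\nOpen the PR and watch the Code Linting check run on the changed files.\n\n"
    "Note: this PR intentionally includes linting errors to demonstrate the workflow."
)
resp = requests.post(
    f"https://api.github.com/repos/{owner}/{repo}/pulls",
    headers=headers,
    json={
        "title": "Add ESLint workflow for code quality enforcement",
        "head": "ci/add-eslint-workflow",
        "base": "main",
        "body": body,
    },
)
resp.raise_for_status()
print("Created PR", resp.json()["number"])
```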
**Step 6: Fix Linting Errors and Update PR**
Fix the linting errors in `src/example.js` and commit the changes in a single commit to update the PR so that the CI check passes.
================================================
FILE: tasks/github/standard/mcpmark-cicd/linting_ci_workflow/meta.json
================================================
{
"task_id": "linting_ci_workflow",
"task_name": "Linting Ci Workflow",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD",
"description": "Set up ESLint workflow for code quality enforcement on all pull requests with proper CI integration.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"ci/cd automation",
"pr workflows"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/standard/mcpmark-cicd/linting_ci_workflow/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
from dotenv import load_dotenv
import time
import json
def _get_github_api(
    endpoint: str, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
ref: str = "main",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _find_pr_by_title_keyword(
keyword: str, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd"
) -> Optional[Dict]:
"""Find a PR by title keyword and return the PR data."""
for state in ["open", "closed"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, org, repo
)
if success and prs:
for pr in prs:
if keyword.lower() in pr.get("title", "").lower():
return pr
return None
def _get_workflow_runs_for_pr(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd"
) -> List[Dict]:
"""Get workflow runs for a specific PR."""
success, runs = _get_github_api(
"actions/runs?event=pull_request&per_page=100", headers, org, repo
)
if not success or not runs:
return []
pr_runs = []
for run in runs.get("workflow_runs", []):
# Check if this run is associated with our PR
for pr in run.get("pull_requests", []):
if pr.get("number") == pr_number:
pr_runs.append(run)
break
return pr_runs
def _get_pr_commits(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd"
) -> List[Dict]:
"""Get commits for a specific PR."""
success, commits = _get_github_api(f"pulls/{pr_number}/commits", headers, org, repo)
if not success or not commits:
return []
return commits
def _get_workflow_runs_for_commit(
commit_sha: str, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd"
) -> List[Dict]:
"""Get workflow runs for a specific commit."""
success, runs = _get_github_api(
f"actions/runs?head_sha={commit_sha}&per_page=100", headers, org, repo
)
if not success or not runs:
return []
return runs.get("workflow_runs", [])
def verify() -> bool:
"""
Programmatically verify that the ESLint CI workflow setup
meets the requirements described in description.md.
"""
# Configuration constants
BRANCH_NAME = "ci/add-eslint-workflow"
PR_KEYWORD = "eslint workflow"
# Expected files and their content checks
ESLINT_CONFIG_PATH = ".eslintrc.json"
WORKFLOW_PATH = ".github/workflows/lint.yml"
EXAMPLE_FILE_PATH = "src/example.js"
# Expected workflow content keywords
WORKFLOW_KEYWORDS = [
"Code Linting",
"ubuntu-latest",
"actions/setup-node",
"npm ci",
"eslint",
"src/",
]
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying ESLint CI workflow setup...")
# 1. Check that branch exists
print("1. Verifying CI branch exists...")
if not _check_branch_exists(BRANCH_NAME, headers, github_org):
print(f"Error: Branch '{BRANCH_NAME}' not found", file=sys.stderr)
return False
print("✓ CI branch created")
# 2. Check ESLint configuration file
print("2. Verifying .eslintrc.json...")
eslint_content = _get_file_content(
ESLINT_CONFIG_PATH, headers, github_org, "mcpmark-cicd", BRANCH_NAME
)
if not eslint_content:
print("Error: .eslintrc.json not found", file=sys.stderr)
return False
# Validate ESLint config is valid JSON and contains required rules
try:
eslint_config = json.loads(eslint_content)
rules = eslint_config.get("rules", {})
required_rules = ["no-unused-vars", "semi", "quotes"]
missing_rules = [rule for rule in required_rules if rule not in rules]
if missing_rules:
print(
f"Error: .eslintrc.json missing rules: {missing_rules}", file=sys.stderr
)
return False
except json.JSONDecodeError:
print("Error: .eslintrc.json is not valid JSON", file=sys.stderr)
return False
print("✓ ESLint configuration created with proper rules")
# 3. Check GitHub Actions workflow file
print("3. Verifying .github/workflows/lint.yml...")
workflow_content = _get_file_content(
WORKFLOW_PATH, headers, github_org, "mcpmark-cicd", BRANCH_NAME
)
if not workflow_content:
print("Error: .github/workflows/lint.yml not found", file=sys.stderr)
return False
# Check workflow contains required keywords
missing_keywords = [kw for kw in WORKFLOW_KEYWORDS if kw not in workflow_content]
if missing_keywords:
print(f"Error: Workflow missing keywords: {missing_keywords}", file=sys.stderr)
return False
# Check trigger configuration
if "pull_request" not in workflow_content or "push" not in workflow_content:
print("Error: Workflow missing proper triggers", file=sys.stderr)
return False
print("✓ GitHub Actions workflow created with proper configuration")
# 4. Check example file with linting errors initially exists
print("4. Verifying src/example.js...")
example_content = _get_file_content(
EXAMPLE_FILE_PATH, headers, github_org, "mcpmark-cicd", BRANCH_NAME
)
if not example_content:
print("Error: src/example.js not found", file=sys.stderr)
return False
print("✓ Example file created")
# 5. Find and verify the linting PR
print("5. Verifying linting pull request...")
lint_pr = _find_pr_by_title_keyword(PR_KEYWORD, headers, github_org)
if not lint_pr:
# Try alternative keywords
lint_pr = _find_pr_by_title_keyword("eslint", headers, github_org)
if not lint_pr:
print("Error: Linting PR not found", file=sys.stderr)
return False
pr_body = lint_pr.get("body", "")
pr_number = lint_pr.get("number")
# Check PR body sections
required_sections = ["## Summary", "## Changes", "## Testing"]
missing_sections = [
section for section in required_sections if section not in pr_body
]
if missing_sections:
print(
f"Error: Linting PR missing sections: {missing_sections}", file=sys.stderr
)
return False
print("✓ Linting PR created with proper structure")
# 6. Check workflow runs and status changes
print("6. Verifying workflow execution and status...")
# First get the commits for this PR
commits = _get_pr_commits(pr_number, headers, github_org)
if len(commits) != 2:
print(
f"Error: Expected exactly 2 commits, found {len(commits)}", file=sys.stderr
)
return False
print("✓ Found exactly 2 commits as expected")
# Sort commits chronologically (oldest first)
commits.sort(key=lambda x: x.get("commit", {}).get("author", {}).get("date", ""))
first_commit_sha = commits[0].get("sha")
second_commit_sha = commits[1].get("sha")
print(f"First commit (should fail): {first_commit_sha[:7]}")
print(f"Second commit (should pass): {second_commit_sha[:7]}")
# Wait for workflows on both commits to complete
print("Waiting for workflow completion on first commit...")
first_commit_runs = []
second_commit_runs = []
start_time = time.time()
timeout = 90
no_workflow_check_count = 0
while time.time() - start_time < timeout:
first_commit_runs = _get_workflow_runs_for_commit(
first_commit_sha, headers, github_org
)
second_commit_runs = _get_workflow_runs_for_commit(
second_commit_sha, headers, github_org
)
# Check if any workflows exist
if not first_commit_runs and not second_commit_runs:
no_workflow_check_count += 1
if no_workflow_check_count == 1:
print(
"No workflow runs found yet, waiting 5 seconds and checking once more..."
)
time.sleep(5)
continue
elif no_workflow_check_count >= 2:
print(
"⚠️ No workflow runs detected after 2 checks. Workflows may not have been triggered."
)
print(" Continuing with verification...")
break
# Check if workflows are completed
first_completed = any(
run.get("status") == "completed" for run in first_commit_runs
)
second_completed = any(
run.get("status") == "completed" for run in second_commit_runs
)
if first_completed and second_completed:
break
print("Waiting for workflows to complete...")
time.sleep(10)
# Verify first commit workflow failed
first_commit_status = None
for run in first_commit_runs:
if run.get("status") == "completed":
conclusion = run.get("conclusion")
if conclusion in ["failure", "cancelled"]:
first_commit_status = "failed"
print("✓ First commit workflow failed as expected")
break
elif conclusion == "success":
first_commit_status = "passed"
break
if first_commit_status != "failed":
print(
"Error: First commit workflow should have failed due to linting errors",
file=sys.stderr,
)
return False
# Verify second commit workflow succeeded
second_commit_status = None
for run in second_commit_runs:
if run.get("status") == "completed":
conclusion = run.get("conclusion")
if conclusion == "success":
second_commit_status = "passed"
print("✓ Second commit workflow passed as expected")
break
elif conclusion in ["failure", "cancelled"]:
second_commit_status = "failed"
break
if second_commit_status != "passed":
print(
"Error: Second commit workflow should have passed after fixing linting errors",
file=sys.stderr,
)
return False
print(
"✓ Workflow status sequence verified: first commit failed → second commit passed"
)
# 7. Verify the final state shows clean code
print("7. Verifying final file state...")
final_example_content = _get_file_content(
EXAMPLE_FILE_PATH, headers, github_org, "mcpmark-cicd", BRANCH_NAME
)
if final_example_content:
# Check that obvious linting errors are fixed
if (
"unusedVariable" in final_example_content
or 'console.log("Hello World")' in final_example_content
):
print(
"Warning: Example file may still contain linting errors",
file=sys.stderr,
)
else:
print("✓ Linting errors appear to be fixed")
print("\n✅ All verification checks passed!")
print("ESLint CI workflow setup completed successfully:")
print(f" - Linting PR #{pr_number}")
print(f" - Branch: {BRANCH_NAME}")
print(
" - Files created: .eslintrc.json, .github/workflows/lint.yml, src/example.js"
)
print(" - Workflow configured for pull_request and push triggers")
print(
f" - Total workflow runs found: {len(first_commit_runs) + len(second_commit_runs)}"
)
print(
f" - First commit runs: {len(first_commit_runs)}, Second commit runs: {len(second_commit_runs)}"
)
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/mcpmark-cicd/pr_automation_workflow/description.md
================================================
I need you to create a comprehensive Pull Request automation workflow for this Node.js project. The project currently has no GitHub Actions workflows, so you'll be building a PR-focused CI/CD workflow from scratch that responds to pull request events. Here's what needs to be implemented:
## Pull Request Automation Workflow
Create `.github/workflows/pr-automation.yml` that triggers on `pull_request` events (opened, synchronize, reopened) with these jobs:
### 1. **code-quality** job (name: `code-quality`):
- Runs ESLint checks using `npm run lint`
- Runs Prettier formatting checks
- Posts code quality results as PR comment (must include keywords: "Code Quality Report", "ESLint", "Prettier")
### 2. **testing-suite** job (name: `testing-suite`):
- Runs full test suite with `npm test`
- Generates test coverage report
- Posts coverage summary as PR comment (must include keywords: "Test Coverage Report")
- Uploads coverage artifacts
### 3. **security-scan** job (name: `security-scan`):
- Runs dependency vulnerability checks
- Scans for secrets in code changes
- Creates security report as PR comment (must include keywords: "Security Scan Report", "Vulnerabilities", "Dependencies")
### 4. **build-validation** job (name: `build-validation`):
- Attempts to build the application
- Validates all endpoints are accessible
- Creates deployment preview artifacts
- Posts build status as PR comment (must include keywords: "Build Validation")
**IMPORTANT: All four jobs must run in parallel.**
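As a rough way to confirm parallelism after the fact, you can compare each job's `started_at` timestamp through the Actions jobs API. The sketch below is illustrative only; the run id and org placeholder are assumptions.
```python
# Sketch: check that the jobs of a given run started close together (run_id and org are placeholders).
import os
from datetime import datetime
import requests

owner, repo, run_id = "<your-org>", "mcpmark-cicd", 123456789
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
resp = requests.get(
    f"https://api.github.com/repos/{owner}/{repo}/actions/runs/{run_id}/jobs",
    headers=headers,
)
resp.raise_for_status()
starts = [
    datetime.fromisoformat(job["started_at"].replace("Z", "+00:00"))
    for job in resp.json()["jobs"]
    if job.get("started_at")
]
span = (max(starts) - min(starts)).total_seconds()
print(f"{len(starts)} jobs started within {span:.0f} seconds of each other")
```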
## Implementation Requirements:
**Step 1: Create Feature Branch**
Create a new branch called `pr-automation-workflow` from main.
**Step 2: Create the Workflow**
Create `.github/workflows/pr-automation.yml` with proper YAML syntax:
- Appropriate triggers for pull_request events
- All four jobs configured to run in parallel
- Avoid identifier conflicts in github-script actions
**Step 3: Create and Merge Pull Request**
Create a comprehensive pull request and merge it to main:
- Title: "Implement Pull Request Automation Workflow"
- Detailed description of the workflow and its purpose
- Merge the pull request to main branch
## Important Notes:
- **All jobs MUST run in parallel**
- Ensure your PR satisfies ALL required checks
- The workflow should handle edge cases, have proper error recovery, and provide clear logging
================================================
FILE: tasks/github/standard/mcpmark-cicd/pr_automation_workflow/meta.json
================================================
{
"task_id": "pr_automation_workflow",
"task_name": "Pr Automation Workflow",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD",
"description": "Create comprehensive PR automation with parallel jobs for code quality, testing, security scanning, and build validation.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"pr workflows",
"ci/cd automation",
"workflow automation"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/standard/mcpmark-cicd/pr_automation_workflow/verify.py
================================================
import sys
import os
import requests
import time
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
import base64
def _get_github_api(
endpoint: str, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _post_github_api(
endpoint: str, headers: Dict[str, str], owner: str, repo: str, data: Dict
) -> Tuple[bool, Optional[Dict]]:
"""Make a POST request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}"
try:
response = requests.post(url, headers=headers, json=data)
if response.status_code in [200, 201]:
return True, response.json()
else:
print(
f"API error for {endpoint}: {response.status_code} - {response.text}",
file=sys.stderr,
)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _patch_github_api(
endpoint: str, headers: Dict[str, str], owner: str, repo: str, data: Dict
) -> Tuple[bool, Optional[Dict]]:
"""Make a PATCH request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}"
try:
response = requests.patch(url, headers=headers, json=data)
if response.status_code == 200:
return True, response.json()
else:
print(
f"API error for {endpoint}: {response.status_code} - {response.text}",
file=sys.stderr,
)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
owner: str,
repo: str,
ref: str = "main",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, owner, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _find_pr_by_title(
title: str, headers: Dict[str, str], owner: str, repo: str
) -> Optional[Dict]:
"""Find a PR by exact title match."""
for state in ["closed", "open"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, owner, repo
)
if success and prs:
for pr in prs:
if pr.get("title") == title:
return pr
return None
def _wait_for_workflow_completion(
headers: Dict[str, str],
owner: str,
repo: str,
workflow_file: str,
max_wait: int = 90,
) -> bool:
"""Wait for GitHub Actions workflows to complete processing."""
print(f"⏳ Waiting for {workflow_file} workflows to complete...")
start_time = time.time()
no_workflow_check_count = 0
while time.time() - start_time < max_wait:
try:
success, response = _get_github_api(
f"actions/workflows/{workflow_file}/runs?per_page=10",
headers,
owner,
repo,
)
if success and response:
runs = response.get("workflow_runs", [])
if len(runs) > 0:
running_count = 0
completed_count = 0
for run in runs[:5]: # Check recent runs
status = run["status"]
if status == "completed":
completed_count += 1
elif status in ["in_progress", "queued"]:
running_count += 1
print(
f" Status: {completed_count} completed, {running_count} running/queued"
)
if running_count == 0:
print(f"✅ All {workflow_file} workflows completed.")
return True
else:
# No workflow runs found
no_workflow_check_count += 1
if no_workflow_check_count == 1:
print(
" No workflow runs found yet, waiting 5 seconds and checking once more..."
)
time.sleep(5)
continue
elif no_workflow_check_count >= 2:
print(
f"⚠️ No workflow runs detected after 2 checks. {workflow_file} may not have been triggered."
)
print(" Continuing with verification...")
return False
print(f"⏳ Still waiting... ({int(time.time() - start_time)}s elapsed)")
time.sleep(10)
except Exception as e:
print(f"⚠️ Error checking workflow status: {e}")
time.sleep(10)
print(f"⚠️ Workflow completion wait timed out after {max_wait}s")
return False
def _verify_workflow_file(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify that the workflow file exists and has correct content."""
print("\n📄 Verifying workflow file...")
errors = []
workflow_content = _get_file_content(
".github/workflows/pr-automation.yml", headers, owner, repo
)
if not workflow_content:
return False, [
"Workflow file .github/workflows/pr-automation.yml not found in main branch"
]
print(" ✅ Workflow file exists in main branch")
# Verify required components
required_events = ["opened", "synchronize", "reopened"]
required_jobs = [
"code-quality",
"testing-suite",
"security-scan",
"build-validation",
]
if "pull_request:" not in workflow_content:
errors.append("Workflow missing pull_request trigger")
else:
print(" ✅ Pull request trigger found")
for event in required_events:
if event not in workflow_content:
errors.append(f"Missing event trigger: {event}")
if not errors:
print(f" ✅ Required events found: {required_events}")
for job in required_jobs:
if f"{job}:" not in workflow_content:
errors.append(f"Missing job: {job}")
if not errors:
print(f" ✅ All 4 required jobs found: {required_jobs}")
return len(errors) == 0, errors
def _verify_main_pr_merged(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str], Optional[Dict]]:
"""Verify that the main PR implementing the workflow was merged."""
print("\n🔍 Verifying main PR was merged...")
errors = []
pr = _find_pr_by_title(
"Implement Pull Request Automation Workflow", headers, owner, repo
)
if not pr:
return (
False,
["Main PR 'Implement Pull Request Automation Workflow' not found"],
None,
)
pr_number = pr["number"]
print(f" Found PR #{pr_number}")
if not pr.get("merged_at", False):
errors.append(f"PR #{pr_number} was not merged")
else:
print(f" ✅ PR #{pr_number} was merged")
if pr.get("head", {}).get("ref") != "pr-automation-workflow":
errors.append(f"PR #{pr_number} was not from pr-automation-workflow branch")
else:
print(" ✅ PR was from pr-automation-workflow branch")
if pr.get("base", {}).get("ref") != "main":
errors.append(f"PR #{pr_number} was not merged to main branch")
else:
print(" ✅ PR was merged to main branch")
return len(errors) == 0, errors, pr
def _verify_workflow_runs(
pr_data: Dict, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify that workflow runs occurred for the PR and all 4 jobs ran in parallel."""
print("\n⚙️ Verifying workflow runs...")
errors = []
pr_number = pr_data["number"]
# Get workflow runs for the PR
success, runs_response = _get_github_api(
"actions/runs?event=pull_request&per_page=50", headers, owner, repo
)
if not success:
return False, ["Failed to fetch workflow runs"]
pr_runs = []
pr_head_sha = pr_data.get("head", {}).get("sha")
for run in runs_response.get("workflow_runs", []):
# Method 1: Check if this run is associated with the PR's head SHA
if pr_head_sha and run.get("head_sha") == pr_head_sha:
pr_runs.append(run)
continue
# Method 2: Check pull_requests field (may be empty for merged PRs)
for pr in run.get("pull_requests", []):
if pr.get("number") == pr_number:
pr_runs.append(run)
break
if not pr_runs:
# Try alternative approach: get runs by head branch
pr_head_ref = pr_data.get("head", {}).get("ref")
if pr_head_ref:
success, branch_runs = _get_github_api(
f"actions/runs?branch={pr_head_ref}&per_page=50", headers, owner, repo
)
if success:
pr_runs = branch_runs.get("workflow_runs", [])
if not pr_runs:
return False, [
f"No workflow runs found for PR #{pr_number} (head_sha: {pr_head_sha})"
]
print(f" Found {len(pr_runs)} workflow run(s) for PR #{pr_number}")
# Check the most recent run
latest_run = pr_runs[0] # GitHub returns runs in descending order by creation time
run_id = latest_run["id"]
if latest_run["conclusion"] != "success":
errors.append(
f"Latest workflow run {run_id} did not succeed (conclusion: {latest_run['conclusion']})"
)
else:
print(f" ✅ Latest workflow run {run_id} succeeded")
# Get jobs for this run
success, jobs_response = _get_github_api(
f"actions/runs/{run_id}/jobs", headers, owner, repo
)
if not success:
return False, ["Failed to fetch workflow jobs"]
jobs = jobs_response.get("jobs", [])
expected_jobs = [
"code-quality",
"testing-suite",
"security-scan",
"build-validation",
]
found_jobs = [job["name"] for job in jobs]
missing_jobs = [job for job in expected_jobs if job not in found_jobs]
if missing_jobs:
errors.append(f"Missing jobs: {missing_jobs}. Found: {found_jobs}")
else:
print(f" ✅ All 4 required jobs found: {found_jobs}")
# Verify all jobs succeeded
failed_jobs = [job["name"] for job in jobs if job["conclusion"] != "success"]
if failed_jobs:
errors.append(f"Failed jobs: {failed_jobs}")
else:
print(" ✅ All jobs completed successfully")
# Verify jobs ran in parallel (started around the same time)
if len(jobs) >= 4:
start_times = [job["started_at"] for job in jobs if job["started_at"]]
if len(start_times) >= 4:
# Check if all jobs started within 2 minutes of each other
import datetime
start_dt = [
datetime.datetime.fromisoformat(t.replace("Z", "+00:00"))
for t in start_times
]
time_diff = max(start_dt) - min(start_dt)
if time_diff.total_seconds() > 120: # 2 minutes
errors.append(
f"Jobs did not run in parallel (time span: {time_diff.total_seconds()}s)"
)
else:
print(" ✅ Jobs ran in parallel")
else:
errors.append("Not enough job start times to verify parallel execution")
return len(errors) == 0, errors
def _verify_pr_comments(
pr_data: Dict, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify that PR has required automation comments from GitHub Actions bot."""
print("\n💬 Verifying PR comments...")
errors = []
pr_number = pr_data["number"]
success, comments = _get_github_api(
f"issues/{pr_number}/comments", headers, owner, repo
)
if not success:
return False, ["Failed to fetch PR comments"]
# Filter for GitHub Actions bot comments only
bot_comments = [
comment
for comment in comments
if comment.get("user", {}).get("login") == "github-actions[bot]"
]
if not bot_comments:
return False, ["No comments found from GitHub Actions bot"]
print(f" Found {len(bot_comments)} comment(s) from GitHub Actions bot")
# Get all bot comment bodies
bot_comment_bodies = [comment.get("body", "") for comment in bot_comments]
# Define required automation reports with their keywords
required_reports = [
{
"name": "Code Quality Report",
"main_keywords": ["Code Quality Report"],
"sub_keywords": ["ESLint", "Prettier"],
"found": False,
},
{
"name": "Test Coverage Report",
"main_keywords": ["Test Coverage Report"],
"sub_keywords": [],
"found": False,
},
{
"name": "Security Scan Report",
"main_keywords": ["Security Scan Report"],
"sub_keywords": ["Vulnerabilities", "Dependencies"],
"found": False,
},
{
"name": "Build Validation Report",
"main_keywords": ["Build Validation"],
"sub_keywords": [],
"found": False,
},
]
# Check each bot comment for the required reports
for comment_body in bot_comment_bodies:
for report in required_reports:
# Check if this comment contains any of the main keywords for this report
if any(keyword in comment_body for keyword in report["main_keywords"]):
if not report["found"]: # Only mark as found once
report["found"] = True
print(f" ✅ Found {report['name']}")
# Verify sub-keywords are present in this specific comment
for sub_keyword in report["sub_keywords"]:
if sub_keyword not in comment_body:
errors.append(
f"Missing sub-keyword '{sub_keyword}' in {report['name']}"
)
else:
print(
f" ✅ Found sub-keyword '{sub_keyword}' in {report['name']}"
)
# Check if all required reports were found
for report in required_reports:
if not report["found"]:
errors.append(f"Missing {report['name']} from GitHub Actions bot")
# Verify we have exactly 4 automation reports
found_reports = sum(1 for report in required_reports if report["found"])
if found_reports != 4:
errors.append(f"Expected 4 automation reports, but found {found_reports}")
else:
print(" ✅ All 4 required automation reports found from GitHub Actions bot")
return len(errors) == 0, errors
def _create_test_pr(
title: str,
branch: str,
content: str,
file_path: str,
headers: Dict[str, str],
owner: str,
repo: str,
) -> Optional[int]:
"""Create a test PR with specific content designed to fail a check."""
print(f" Creating test PR: {title}")
# Create branch
success, main_ref = _get_github_api("git/ref/heads/main", headers, owner, repo)
if not success:
print(" ❌ Failed to get main branch reference")
return None
main_sha = main_ref["object"]["sha"]
branch_data = {"ref": f"refs/heads/{branch}", "sha": main_sha}
success, _ = _post_github_api("git/refs", headers, owner, repo, branch_data)
if not success:
# Branch might already exist, try to delete and recreate
print(f" Branch {branch} already exists, trying to delete and recreate...")
import requests
# Force delete existing branch
delete_url = (
f"https://api.github.com/repos/{owner}/{repo}/git/refs/heads/{branch}"
)
delete_response = requests.delete(delete_url, headers=headers)
if delete_response.status_code == 204:
print(f" Successfully deleted existing branch {branch}")
# Wait a moment for deletion to complete
import time
time.sleep(2)
# Try creating again
success, _ = _post_github_api("git/refs", headers, owner, repo, branch_data)
if not success:
print(f" ❌ Failed to create branch {branch} after cleanup")
return None
else:
print(f" ✅ Successfully created branch {branch} after cleanup")
else:
print(
f" ❌ Failed to delete existing branch {branch}: {delete_response.status_code}"
)
return None
# Create or update file
file_content = base64.b64encode(content.encode()).decode()
file_data = {
"message": f"Test commit for {title}",
"content": file_content,
"branch": branch,
}
# Check if file exists in main branch first
success, file_info = _get_github_api(
f"contents/{file_path}?ref=main", headers, owner, repo
)
if success and file_info:
# File exists, need SHA for update
file_data["sha"] = file_info["sha"]
print(f" File {file_path} exists, updating with SHA")
else:
print(f" Creating new file {file_path}")
# Use PUT method for file creation/update
url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}"
try:
import requests
response = requests.put(url, headers=headers, json=file_data)
if response.status_code in [200, 201]:
print(f" ✅ Successfully created/updated file {file_path}")
else:
print(
f" ❌ Failed to create/update file {file_path}: {response.status_code} - {response.text}"
)
return None
except Exception as e:
print(f" ❌ Exception creating file {file_path}: {e}")
return None
# Create PR
pr_data = {
"title": title,
"head": branch,
"base": "main",
"body": f"Test PR to validate that {title.split(':')[1].strip()} check fails correctly.",
}
success, pr_response = _post_github_api("pulls", headers, owner, repo, pr_data)
if not success:
print(" ❌ Failed to create PR")
return None
pr_number = pr_response["number"]
print(f" ✅ Created test PR #{pr_number}")
return pr_number
def _close_pr(pr_number: int, headers: Dict[str, str], owner: str, repo: str) -> bool:
"""Close a PR."""
success, _ = _patch_github_api(
f"pulls/{pr_number}", headers, owner, repo, {"state": "closed"}
)
return success
def _run_unit_tests(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Create test PRs to verify workflow correctly fails on bad code."""
print("\n🧪 Running unit tests with failing PRs...")
errors = []
created_prs = []
test_cases = [
{
"title": "Test: Code Quality Failure",
"branch": "test-code-quality-fail",
"file_path": "src/lint-fail-test.js",
"content": "// This file contains intentional ESLint violations\nvar unused_variable = 'this will trigger unused-vars rule'\nconsole.log('missing semicolon - will trigger semi rule')\nconst badly_spaced = 'too many spaces'\nif(true){console.log('missing spaces around braces')}\nfunction unusedFunction() { return 'unused'; }\neeval('alert(\"dangerous eval\")');\nwith (Math) { var x = cos(3 * PI) + sin(LN10) }\nvar a = 1; var a = 2; // redeclared variable",
"expected_failure": "code-quality",
},
{
"title": "Test: Testing Suite Failure",
"branch": "test-testing-fail",
"file_path": "tests/fail-test.test.js",
"content": "const request = require('supertest');\n\ndescribe('Intentional Test Failures', () => {\n test('This test should always fail', () => {\n expect(2 + 2).toBe(5); // Intentionally wrong\n });\n \n test('Another failing test', () => {\n expect(true).toBe(false); // Intentionally wrong\n });\n \n test('Math failure', () => {\n expect(Math.max(1, 2, 3)).toBe(1); // Intentionally wrong\n });\n});",
"expected_failure": "testing-suite",
},
{
"title": "Test: Security Scan Failure",
"branch": "test-security-fail",
"file_path": "src/security-fail-test.js",
"content": "// This file contains patterns that should trigger secret detection\nconst hardcodedPassword = 'admin123password';\nconst fakeApiKey = 'sk_test_' + 'fake123key456here789';\nconst awsLikeKey = 'AKIA' + 'FAKEKEY7EXAMPLE';\nconst dbPassword = 'password' + '=' + 'supersecret123';\nconst tokenPattern = 'token' + '=' + 'ghp_1234567890abcdef';\n\n// These patterns should trigger secret detection\nconsole.log('Password:', hardcodedPassword);\nconsole.log('API Key:', fakeApiKey);\nconsole.log('AWS Key:', awsLikeKey);\nconsole.log('DB Password:', dbPassword);\nconsole.log('Token:', tokenPattern);\n\nmodule.exports = {\n password: hardcodedPassword,\n apiKey: fakeApiKey\n};",
"expected_failure": "security-scan",
},
{
"title": "Test: Build Validation Failure",
"branch": "test-build-fail",
"file_path": "src/build-fail-test.js",
"content": "// This file will cause build/startup failures\nconst express = require('express');\nconst nonExistentModule = require('this-module-does-not-exist-anywhere');\nconst anotherMissing = require('@fake/missing-package');\n\n// This will cause runtime errors during startup\nconst app = express();\n\n// Define a route that will cause issues\napp.get('/test', (req, res) => {\n // Try to use non-existent modules\n nonExistentModule.doSomething();\n anotherMissing.initialize();\n res.send('This should never work');\n});\n\n// Override the listen method to always fail\nconst originalListen = app.listen;\napp.listen = function(port, callback) {\n console.log('Attempting to start server...');\n // This will crash during build validation\n throw new Error('Intentional build failure for testing');\n};\n\nmodule.exports = app;",
"expected_failure": "build-validation",
},
]
for test_case in test_cases:
pr_number = _create_test_pr(
test_case["title"],
test_case["branch"],
test_case["content"],
test_case["file_path"],
headers,
owner,
repo,
)
if pr_number:
created_prs.append(pr_number)
else:
errors.append(f"Failed to create test PR: {test_case['title']}")
if created_prs:
print(f" Created {len(created_prs)} test PRs, waiting for workflows...")
# Wait a bit for workflows to start
time.sleep(5)
# Wait for workflows to complete
_wait_for_workflow_completion(
headers, owner, repo, "pr-automation.yml", max_wait=90
)
# Verify each test PR failed appropriately
for i, pr_number in enumerate(created_prs):
test_case = test_cases[i]
print(
f" Checking test PR #{pr_number} ({test_case['expected_failure']} failure)..."
)
# Get workflow runs for this PR
success, runs_response = _get_github_api(
"actions/runs?event=pull_request&per_page=20", headers, owner, repo
)
if success:
pr_runs = []
for run in runs_response.get("workflow_runs", []):
# Check pull_requests field
for pr in run.get("pull_requests", []):
if pr.get("number") == pr_number:
pr_runs.append(run)
break
# If no runs found via pull_requests, try matching by branch
if not pr_runs:
branch_name = test_case["branch"]
for run in runs_response.get("workflow_runs", []):
if run.get("head_branch") == branch_name:
pr_runs.append(run)
if pr_runs:
latest_run = pr_runs[0]
if latest_run["conclusion"] != "failure":
errors.append(
f"Test PR #{pr_number} should have failed but got: {latest_run['conclusion']}"
)
else:
print(f" ✅ Test PR #{pr_number} correctly failed")
else:
errors.append(f"No workflow runs found for test PR #{pr_number}")
# Clean up test PRs and branches
print(" Cleaning up test PRs and branches...")
for i, pr_number in enumerate(created_prs):
if _close_pr(pr_number, headers, owner, repo):
print(f" ✅ Closed test PR #{pr_number}")
else:
print(f" ⚠️ Failed to close test PR #{pr_number}")
# Delete test branch
branch_name = test_cases[i]["branch"]
import requests
url = f"https://api.github.com/repos/{owner}/{repo}/git/refs/heads/{branch_name}"
response = requests.delete(url, headers=headers)
if response.status_code == 204:
print(f" ✅ Deleted test branch {branch_name}")
else:
print(f" ⚠️ Failed to delete test branch {branch_name}")
return len(errors) == 0, errors
def verify() -> bool:
"""
Verify that the PR automation workflow is working correctly.
"""
load_dotenv(".mcp_env")
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
owner = github_org
repo = "mcpmark-cicd"
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("🔍 Starting PR Automation Workflow Verification")
print("=" * 60)
all_passed = True
# 1. Verify workflow file exists
workflow_ok, workflow_errors = _verify_workflow_file(headers, owner, repo)
if not workflow_ok:
all_passed = False
print("❌ Workflow File Verification Failed:")
for error in workflow_errors:
print(f" - {error}")
else:
print("✅ Workflow File Verification Passed")
# 2. Verify main PR was merged
pr_ok, pr_errors, pr_data = _verify_main_pr_merged(headers, owner, repo)
if not pr_ok:
all_passed = False
print("❌ Main PR Verification Failed:")
for error in pr_errors:
print(f" - {error}")
else:
print("✅ Main PR Verification Passed")
# 3. Verify workflow runs (only if PR verification passed)
if pr_ok and pr_data:
runs_ok, runs_errors = _verify_workflow_runs(pr_data, headers, owner, repo)
if not runs_ok:
all_passed = False
print("❌ Workflow Runs Verification Failed:")
for error in runs_errors:
print(f" - {error}")
else:
print("✅ Workflow Runs Verification Passed")
# 4. Verify PR comments
comments_ok, comments_errors = _verify_pr_comments(
pr_data, headers, owner, repo
)
if not comments_ok:
all_passed = False
print("❌ PR Comments Verification Failed:")
for error in comments_errors:
print(f" - {error}")
else:
print("✅ PR Comments Verification Passed")
# 5. Run unit tests with failing PRs
tests_ok, tests_errors = _run_unit_tests(headers, owner, repo)
if not tests_ok:
all_passed = False
print("❌ Unit Tests Failed:")
for error in tests_errors:
print(f" - {error}")
else:
print("✅ Unit Tests Passed")
print("\n" + "=" * 60)
if all_passed:
print("🎉 All PR Automation Workflow verifications PASSED!")
print("\n📋 Summary:")
print(" ✅ Workflow file exists with correct triggers and 4 parallel jobs")
print(" ✅ Main PR was merged from pr-automation-workflow to main")
print(" ✅ Workflow runs show all 4 jobs executed in parallel and succeeded")
print(" ✅ PR comments contain required automation reports")
print(" ✅ Unit tests confirmed workflow correctly fails on problematic code")
print("\n🤖 The GitHub Actions PR automation workflow is working correctly!")
else:
print("❌ PR Automation Workflow verification FAILED!")
print(" Some components did not meet the expected automation requirements.")
return all_passed
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/missing-semester/assign_contributor_labels/description.md
================================================
Assign an owner to each open issue and open PR by adding labels rather than direct assignees. Only contributors who appear in the past 100 commits are considered. First, collect all such contributors and identify the most frequent author among them. Then, for each open issue or PR, assign via labels according to the following rules:
• If the comments mention an author with @username, add a label in the format assigned-username.
• If multiple authors are mentioned, add labels in the same format for all of them.
• If no authors are mentioned in the comments, add a label for the most frequent contributor from the past 100 commits, using the format assigned-username.
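For reference, the contributor-frequency part can be sketched with the commits API as below (the org placeholder and token handling are assumptions; in practice the task is carried out through the GitHub MCP tools).
```python
# Sketch: count authors over the last 100 commits and pick the most frequent one.
import os
from collections import Counter
import requests

owner, repo = "<your-org>", "missing-semester"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
resp = requests.get(
    f"https://api.github.com/repos/{owner}/{repo}/commits",
    headers=headers,
    params={"per_page": 100},
)
resp.raise_for_status()
logins = [
    commit["author"]["login"]
    for commit in resp.json()
    if commit.get("author")  # commits without a linked GitHub account are skipped
]
counts = Counter(logins)
most_frequent = counts.most_common(1)[0][0]
print("contributors:", sorted(counts))
print("fallback label:", f"assigned-{most_frequent}")
```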
================================================
FILE: tasks/github/standard/missing-semester/assign_contributor_labels/meta.json
================================================
{
"task_id": "assign_contributor_labels",
"task_name": "Assign Contributor Labels",
"category_id": "missing-semester",
"category_name": "Missing Semester",
"description": "Assign labels to open issues and PRs based on contributors mentioned in comments or the most frequent contributor from past 100 commits, using assigned-username format.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"label automation",
"contributor analysis"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/missing-semester",
"stateOriginalUrl": "https://github.com/missing-semester/missing-semester"
}
}
================================================
FILE: tasks/github/standard/missing-semester/assign_contributor_labels/verify.py
================================================
import sys
import os
import requests
from typing import Dict, Optional, Tuple, List
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "missing-semester"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_issue_labels(
issue_number: int,
headers: Dict[str, str],
org: str,
repo: str = "missing-semester"
) -> Optional[List[str]]:
"""Get labels for a specific issue/PR."""
success, result = _get_github_api(f"issues/{issue_number}", headers, org, repo)
if not success or not result:
return None
labels = result.get("labels", [])
return [label["name"] for label in labels]
def verify() -> bool:
"""
Programmatically verify that the labels were assigned correctly to issues and PRs.
"""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying contributor labels assignment task completion...")
# Expected labels configuration
expected_labels = {
# Issues
9: ["assigned-jonhoo", "assigned-anishathalye"], # Issue #9
14: ["assigned-jonhoo", "assigned-anishathalye"], # Issue #14
15: ["assigned-anishathalye"], # Issue #15
# PRs
21: ["assigned-anishathalye"], # PR #21
22: ["assigned-anishathalye"], # PR #22
23: ["assigned-anishathalye"], # PR #23
24: ["assigned-anishathalye"], # PR #24
}
all_passed = True
for item_number, expected in expected_labels.items():
item_type = "Issue" if item_number in [9, 14, 15] else "PR"
print(f"\nChecking {item_type} #{item_number}...")
labels = _get_issue_labels(item_number, headers, github_org, "missing-semester")
if labels is None:
print(f" ❌ Failed to retrieve {item_type} #{item_number}", file=sys.stderr)
all_passed = False
continue
# Sort both lists for comparison
labels_sorted = sorted(labels)
expected_sorted = sorted(expected)
if labels_sorted == expected_sorted:
print(f" ✅ {item_type} #{item_number} has correct labels: {labels_sorted}")
else:
print(f" ❌ {item_type} #{item_number} has incorrect labels", file=sys.stderr)
print(f" Expected: {expected_sorted}", file=sys.stderr)
print(f" Found: {labels_sorted}", file=sys.stderr)
all_passed = False
if all_passed:
print("\n✅ All verification checks passed!")
print("Contributor labels assignment task completed successfully:")
print(" - Issues #9 and #14 have both 'assigned-jonhoo' and 'assigned-anishathalye' labels")
print(" - Issue #15 and all 4 open PRs have 'assigned-anishathalye' label")
else:
print("\n❌ Some verification checks failed", file=sys.stderr)
return all_passed
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/missing-semester/find_legacy_name/description.md
================================================
I remember that a long time ago, *The Missing Semester of Your CS Education* had a different name and domain. There should be some related commit history. Please find the old name and domain and create an **ANSWER.md** file with them, formatted as:
[title](url)
Then push the file to the `master` branch.
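For reference, a minimal sketch of how the answer file could be pushed once the legacy name and domain have been found in the commit history (the owner, token, and the answer itself are placeholders):

```python
import base64

import requests

OWNER, REPO = "<org>", "missing-semester"          # placeholders
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
HEADERS = {"Authorization": "Bearer <token>", "Accept": "application/vnd.github.v3+json"}

# Skim recent commit messages for hints about the earlier name and domain.
for c in requests.get(f"{API}/commits", headers=HEADERS, params={"per_page": 100}).json():
    print(c["sha"][:7], c["commit"]["message"].splitlines()[0])

# Once found, create ANSWER.md on master via the contents API.
answer = "[<old title>](<old url>)\n"
requests.put(
    f"{API}/contents/ANSWER.md",
    headers=HEADERS,
    json={
        "message": "Add ANSWER.md with the legacy course name",
        "content": base64.b64encode(answer.encode()).decode(),
        "branch": "master",
    },
)
```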
================================================
FILE: tasks/github/standard/missing-semester/find_legacy_name/meta.json
================================================
{
"task_id": "find_legacy_name",
"task_name": "Find Legacy Name",
"category_id": "missing-semester",
"category_name": "Missing Semester",
"description": "Find the old name and domain of The Missing Semester course from commit history and document the findings.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/missing-semester",
"stateOriginalUrl": "https://github.com/missing-semester/missing-semester"
}
}
================================================
FILE: tasks/github/standard/missing-semester/find_legacy_name/verify.py
================================================
import sys
import os
import requests
import base64
from typing import Dict, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "missing-semester"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "missing-semester",
ref: str = "master",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def verify() -> bool:
"""
Programmatically verify that the legacy name finding task was completed correctly.
Checks for ANSWER.md file in master branch with the correct content.
"""
# Expected answer content (accept both with and without trailing slash)
EXPECTED_CONTENTS = {
"[Hacker Tools](https://hacker-tools.github.io)",
"[Hacker Tools](https://hacker-tools.github.io/)",
}
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying legacy name finding task completion...")
# 1. Check that ANSWER.md exists in master branch
print("1. Checking ANSWER.md exists in master branch...")
answer_content = _get_file_content("ANSWER.md", headers, github_org, "missing-semester", "master")
if not answer_content:
print("Error: ANSWER.md not found in master branch", file=sys.stderr)
return False
print("✓ ANSWER.md found in master branch")
# 2. Check that the content matches expected answer
print("2. Verifying ANSWER.md content...")
answer_content = answer_content.strip()
if answer_content not in EXPECTED_CONTENTS:
print(f"Error: ANSWER.md content does not match expected answer(s)", file=sys.stderr)
print(f"Expected one of: {sorted(EXPECTED_CONTENTS)}", file=sys.stderr)
print(f"Found: {answer_content}", file=sys.stderr)
return False
print("✓ ANSWER.md contains correct legacy name and URL")
print("\n✅ All verification checks passed!")
print("Legacy name finding task completed successfully:")
print(f" - ANSWER.md created in master branch")
print(f" - Content accepted: {answer_content}")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/missing-semester/find_salient_file/description.md
================================================
I want to know which file has been modified most frequently in the past 100 commits. However, I don't want to consider files related to GitHub Actions.
Please find the file, create an ANSWER.md, and write the file name in it.
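For reference, a minimal sketch of one way to count the modifications with the GitHub REST API (the owner and token are placeholders; `.github/` is used as the GitHub Actions exclusion):

```python
from collections import Counter

import requests

OWNER, REPO = "<org>", "missing-semester"          # placeholders
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
HEADERS = {"Authorization": "Bearer <token>", "Accept": "application/vnd.github.v3+json"}

counts = Counter()
for c in requests.get(f"{API}/commits", headers=HEADERS, params={"per_page": 100}).json():
    detail = requests.get(f"{API}/commits/{c['sha']}", headers=HEADERS).json()
    for changed in detail.get("files", []):
        path = changed["filename"]
        if not path.startswith(".github/"):        # skip GitHub Actions files
            counts[path] += 1

print(counts.most_common(1))                        # candidate for ANSWER.md
```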
================================================
FILE: tasks/github/standard/missing-semester/find_salient_file/meta.json
================================================
{
"task_id": "find_salient_file",
"task_name": "Find Salient File",
"category_id": "missing-semester",
"category_name": "Missing Semester",
"description": "Identify the most frequently modified file in the past 100 commits, excluding GitHub Actions related files, and create an ANSWER.md with the file name.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"commit analysis",
"file tracking",
"git history"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/missing-semester",
"stateOriginalUrl": "https://github.com/missing-semester/missing-semester"
}
}
================================================
FILE: tasks/github/standard/missing-semester/find_salient_file/verify.py
================================================
import sys
import os
import requests
import base64
from typing import Dict, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "missing-semester"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "missing-semester",
ref: str = "master",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def verify() -> bool:
"""
Programmatically verify that the most frequently modified file was identified correctly.
Checks for ANSWER.md file in master branch with the correct content.
"""
# Expected answer content (excluding GitHub Actions files)
EXPECTED_CONTENT = "index.md"
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying salient file identification task completion...")
# 1. Check that ANSWER.md exists in master branch
print("1. Checking ANSWER.md exists in master branch...")
answer_content = _get_file_content("ANSWER.md", headers, github_org, "missing-semester", "master")
if not answer_content:
print("Error: ANSWER.md not found in master branch", file=sys.stderr)
return False
print("✅ ANSWER.md found in master branch")
# 2. Check that the content matches expected answer
print("2. Verifying ANSWER.md content...")
answer_content = answer_content.strip()
if answer_content != EXPECTED_CONTENT:
print(f"Error: ANSWER.md content does not match expected answer", file=sys.stderr)
print(f"Expected: {EXPECTED_CONTENT}", file=sys.stderr)
print(f"Found: {answer_content}", file=sys.stderr)
return False
print("✅ ANSWER.md contains correct filename")
print("\n✅ All verification checks passed!")
print("Salient file identification task completed successfully:")
print(f" - ANSWER.md created in master branch")
print(f" - Content: {EXPECTED_CONTENT}")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/notion/easy/.gitkeep
================================================
================================================
FILE: tasks/notion/easy/computer_science_student_dashboard/simple__code_snippets_go/description.md
================================================
Find the page named "Computer Science Student Dashboard" and extend the **Code Snippets** section with Go content.
**Task Requirements:**
1. Add a bold paragraph that contains exactly the text `Go` to mark the start of the Go snippets.
2. Directly under that paragraph, add three code blocks configured with `language` set to **go**:
a. **Basic Go program** – Caption must be `Basic Go program` and the code content must be exactly:
```go
package main
import "fmt"
func main() {
fmt.Println("Hello, World!")
}
```
b. **For loop in Go** – Caption must be `For loop in Go` and the code content must be exactly:
```go
for i := 0; i < 5; i++ {
fmt.Println(i)
}
```
c. **Function definition in Go** – Caption must be `Function definition in Go` and the code content must be exactly:
```go
func add(a, b int) int {
return a + b
}
```
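For reference, a minimal sketch of how the marker paragraph and the first snippet could be appended with `notion-client` (the target block ID is a placeholder for the parent of the Code Snippets section; the remaining two code blocks follow the same payload shape):

```python
from notion_client import Client

notion = Client(auth="<token>")
CODE_SNIPPETS_BLOCK_ID = "<code-snippets-parent-block-id>"    # placeholder

notion.blocks.children.append(
    block_id=CODE_SNIPPETS_BLOCK_ID,
    children=[
        {   # bold paragraph marking the start of the Go snippets
            "type": "paragraph",
            "paragraph": {"rich_text": [{
                "type": "text",
                "text": {"content": "Go"},
                "annotations": {"bold": True},
            }]},
        },
        {   # first captioned Go code block
            "type": "code",
            "code": {
                "language": "go",
                "caption": [{"type": "text", "text": {"content": "Basic Go program"}}],
                "rich_text": [{"type": "text", "text": {
                    "content": 'package main\n\nimport "fmt"\n\nfunc main() {\n    fmt.Println("Hello, World!")\n}',
                }}],
            },
        },
    ],
)
```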
================================================
FILE: tasks/notion/easy/computer_science_student_dashboard/simple__code_snippets_go/meta.json
================================================
{
"task_id": "simple__code_snippets_go",
"task_name": "Simple Code Snippets Go",
"category_id": "computer_science_student_dashboard",
"category_name": "Computer Science Student Dashboard",
"description": "Add a new Go column to the Code Snippets section between Python and JavaScript columns.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content organization",
"visual formatting",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard"
}
}
================================================
FILE: tasks/notion/easy/computer_science_student_dashboard/simple__code_snippets_go/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
# Expected code blocks (language=go)
EXPECTED_CODE_BLOCKS = [
{
"caption": "Basic Go program",
"code": (
'package main\n\nimport "fmt"\n\nfunc main() {\n fmt.Println("Hello, World!")\n}'
),
},
{
"caption": "For loop in Go",
"code": ("for i := 0; i < 5; i++ {\n fmt.Println(i)\n}"),
},
{
"caption": "Function definition in Go",
"code": ("func add(a, b int) int {\n return a + b\n}"),
},
]
HEADER_TEXT = "Go"
def _normalize(text: str) -> str:
"""Remove trailing spaces on each line and strip leading/trailing blank lines."""
return "\n".join(line.rstrip() for line in text.strip().splitlines())
def _find_page(notion: Client, main_id: str | None) -> str | None:
"""Return a page_id to verify against or None if not found."""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Computer Science Student Dashboard")
return page_id
def _has_bold_header_text(block, text: str) -> bool:
"""Generic bold header/paragraph check for a given text."""
block_type = block.get("type")
if block_type not in {"paragraph", "heading_1", "heading_2", "heading_3"}:
return False
rich_text_list = block.get(block_type, {}).get("rich_text", [])
if not rich_text_list:
return False
plain = "".join(rt.get("plain_text", "") for rt in rich_text_list).strip()
if plain != text:
return False
return any(rt.get("annotations", {}).get("bold", False) for rt in rich_text_list)
def _collect_code_blocks(blocks):
"""Return list of (code_content, caption) tuples for code blocks with language 'go'."""
collected = []
for block in blocks:
if block.get("type") != "code":
continue
code_data = block.get("code", {})
if code_data.get("language") != "go":
continue
code_plain = "".join(
rt.get("plain_text", "") for rt in code_data.get("rich_text", [])
)
caption_plain = "".join(
rt.get("plain_text", "") for rt in code_data.get("caption", [])
)
collected.append((code_plain, caption_plain))
return collected
def verify(notion: Client, main_id: str | None = None) -> bool:
page_id = _find_page(notion, main_id)
if not page_id:
print("Error: Target page not found.", file=sys.stderr)
return False
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Verify header
header_ok = any(_has_bold_header_text(b, HEADER_TEXT) for b in all_blocks)
if not header_ok:
print("Failure: Bold header 'Go' not found.", file=sys.stderr)
return False
# Verify code blocks
code_blocks_found = _collect_code_blocks(all_blocks)
remaining = EXPECTED_CODE_BLOCKS.copy()
for code, caption in code_blocks_found:
norm_code = _normalize(code)
for expected in remaining:
if (
_normalize(expected["code"]) == norm_code
and expected["caption"] == caption
):
remaining.remove(expected)
break
if remaining:
missing = ", ".join(exp["caption"] for exp in remaining)
print(
f"Failure: Missing or incorrect Go code blocks: {missing}", file=sys.stderr
)
return False
print(
"Success: Verified Go header and required Go code blocks."
)
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
sys.exit(0 if verify(notion, main_id) else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/computer_science_student_dashboard/simple__study_session_tracker/description.md
================================================
Create a new study-session entry on the **Computer Science Student Dashboard** page.
1. Locate the ☑️ Habit tracker section of the page.
2. **Insert a new date mention** for `2025-01-29` immediately **after the existing `2022-09-02` items but before the divider block** that follows them. Match the formatting of the existing dates (bold text with a Notion date mention).
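For reference, a minimal sketch using `notion-client` and the `after` parameter of the append-children endpoint (both block IDs are placeholders):

```python
from notion_client import Client

notion = Client(auth="<token>")
HABIT_PARENT_BLOCK_ID = "<habit-tracker-parent-block-id>"     # placeholder
LAST_2022_09_02_BLOCK_ID = "<last-2022-09-02-block-id>"       # placeholder

notion.blocks.children.append(
    block_id=HABIT_PARENT_BLOCK_ID,
    after=LAST_2022_09_02_BLOCK_ID,   # lands before the divider that follows
    children=[{
        "type": "paragraph",
        "paragraph": {"rich_text": [{
            "type": "mention",
            "mention": {"date": {"start": "2025-01-29"}},
            "annotations": {"bold": True},
        }]},
    }],
)
```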
================================================
FILE: tasks/notion/easy/computer_science_student_dashboard/simple__study_session_tracker/meta.json
================================================
{
"task_id": "simple__study_session_tracker",
"task_name": "Simple Study Session Tracker",
"category_id": "computer_science_student_dashboard",
"category_name": "Computer Science Student Dashboard",
"description": "Create a new study-session entry in the Habit tracker section with four unchecked to-do items.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content organization",
"visual formatting",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard"
}
}
================================================
FILE: tasks/notion/easy/computer_science_student_dashboard/simple__study_session_tracker/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verify that the new study-session entry for 2025-01-29 was added correctly.
The script checks that:
1. A bold date-mention with start=2025-01-29 exists.
2. The mention sits after the 2022-09-02 section but before the divider that originally
followed that section.
"""
# ---------------------------------------------------------------------
# Locate the main page -------------------------------------------------
# ---------------------------------------------------------------------
page_id: str | None = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Computer Science Student Dashboard")
if not page_id:
print(
"Error: Page 'Computer Science Student Dashboard' not found.",
file=sys.stderr,
)
return False
# ---------------------------------------------------------------------
# Fetch all blocks under the page (flattened order) --------------------
# ---------------------------------------------------------------------
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# ---------------------------------------------------------------------
# Locate reference blocks ---------------------------------------------
# ---------------------------------------------------------------------
TARGET_DATE = "2025-01-29"
PREVIOUS_DATE = "2022-09-02"
index_previous_date: int | None = None
index_new_date: int | None = None
index_divider_after_previous: int | None = None
for idx, block in enumerate(all_blocks):
# Divider detection (we care only about the first divider that appears after
# the 2022-09-02 block)
if block.get("type") == "divider":
if index_previous_date is not None and index_divider_after_previous is None:
index_divider_after_previous = idx
# We only need to inspect paragraph blocks that contain a date mention
if block.get("type") != "paragraph":
continue
rich_text_list = block["paragraph"].get("rich_text", [])
for rt in rich_text_list:
if (
rt.get("type") != "mention"
or rt.get("mention", {}).get("type") != "date"
):
continue
date_start = rt["mention"]["date"].get("start")
if date_start == PREVIOUS_DATE and index_previous_date is None:
index_previous_date = idx
if date_start == TARGET_DATE and index_new_date is None:
index_new_date = idx
# (1) Verify bold annotation
if not rt.get("annotations", {}).get("bold", False):
print(
"Error: The 2025-01-29 date mention is not bold.",
file=sys.stderr,
)
return False
# Ensure all reference indices were found
if index_previous_date is None:
print("Error: Could not locate the 2022-09-02 date section.", file=sys.stderr)
return False
if index_divider_after_previous is None:
print(
"Error: Could not locate the divider that follows the 2022-09-02 section.",
file=sys.stderr,
)
return False
if index_new_date is None:
print(
"Error: Could not locate the new 2025-01-29 date mention.", file=sys.stderr
)
return False
# (2) Verify ordering
if not (index_previous_date < index_new_date < index_divider_after_previous):
print(
"Error: The 2025-01-29 section is positioned incorrectly.", file=sys.stderr
)
return False
# ---------------------------------------------------------------------
# Success --------------------------------------------------------------
# ---------------------------------------------------------------------
print("Success: Date mention for 2025-01-29 added in the correct position.")
return True
# -------------------------------------------------------------------------
# Command-line entry-point -------------------------------------------------
# -------------------------------------------------------------------------
def main() -> None:
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/it_trouble_shooting_hub/simple__asset_retirement_migration/description.md
================================================
Please migrate expiring assets out of the **IT Inventory** database using the simplified checklist below. Your changes will be verified automatically, so match the details exactly.
---
Task Steps
1. Inside the **IT Trouble Shooting Hub** page, locate the database named **IT Inventory**.
2. Collect every page in **IT Inventory** whose **Status** is **Expired** or **To be returned**.
3. Create a **new full-page database** under the same hub titled **IT Asset Retirement Queue** with exactly these properties (names and types must match):
• Serial – title
• Status – select
• Expiration date – date
4. For every item gathered in step 2, create a page in **IT Asset Retirement Queue**, copy over the Serial, Status, and Expiration date values, then archive the original inventory page once the copy is made.
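For reference, a minimal sketch of the migration with `notion-client` (IDs are placeholders; it assumes the inventory's title property is also named Serial and that each matching row has an expiration date set):

```python
from notion_client import Client

notion = Client(auth="<token>")
HUB_PAGE_ID = "<it-trouble-shooting-hub-page-id>"             # placeholder
INVENTORY_DB_ID = "<it-inventory-database-id>"                # placeholder

# 1. Create the retirement queue with the required schema.
queue = notion.databases.create(
    parent={"type": "page_id", "page_id": HUB_PAGE_ID},
    title=[{"type": "text", "text": {"content": "IT Asset Retirement Queue"}}],
    properties={
        "Serial": {"title": {}},
        "Status": {"select": {}},
        "Expiration date": {"date": {}},
    },
)

# 2. Collect the expiring inventory pages.
expiring = notion.databases.query(
    database_id=INVENTORY_DB_ID,
    filter={"or": [
        {"property": "Status", "select": {"equals": "Expired"}},
        {"property": "Status", "select": {"equals": "To be returned"}},
    ]},
)["results"]

# 3. Copy each page into the queue, then archive the original.
for page in expiring:
    props = page["properties"]
    serial = "".join(t.get("plain_text", "") for t in props["Serial"]["title"])
    notion.pages.create(
        parent={"database_id": queue["id"]},
        properties={
            "Serial": {"title": [{"text": {"content": serial}}]},
            "Status": {"select": {"name": props["Status"]["select"]["name"]}},
            "Expiration date": {"date": props["Expiration date"]["date"]},
        },
    )
    notion.pages.update(page_id=page["id"], archived=True)
```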
================================================
FILE: tasks/notion/easy/it_trouble_shooting_hub/simple__asset_retirement_migration/meta.json
================================================
{
"task_id": "simple__asset_retirement_migration",
"task_name": "Simple Asset Retirement Migration",
"category_id": "it_trouble_shooting_hub",
"category_name": "IT Trouble Shooting Hub",
"description": "Restructure the IT Inventory database by migrating expired assets to a new IT Asset Retirement Queue database.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"database manipulation",
"automated migration",
"conditional filtering",
"data aggregation",
"report generation"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub"
}
}
================================================
FILE: tasks/notion/easy/it_trouble_shooting_hub/simple__asset_retirement_migration/verify.py
================================================
import sys
from typing import Dict
from notion_client import Client
from tasks.utils import notion_utils
def _get_database(root_page_id: str, notion: Client, name: str) -> str | None:
"""Helper that finds a child database by title inside a page."""
return notion_utils.find_database_in_block(notion, root_page_id, name)
def _check_property(props: Dict, name: str, expected_type: str) -> bool:
if name not in props:
print(f"Error: Property '{name}' missing in database.", file=sys.stderr)
return False
if props[name]["type"] != expected_type:
print(
f"Error: Property '{name}' expected type '{expected_type}', found '{props[name]['type']}'.",
file=sys.stderr,
)
return False
return True
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verifies that the IT Asset Retirement Queue was created and populated correctly."""
# -------------------------------------------------------------------------
# Resolve the root IT Trouble Shooting Hub page
# -------------------------------------------------------------------------
root_page_id = None
if main_id:
found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and obj_type == "page":
root_page_id = found_id
if not root_page_id:
root_page_id = notion_utils.find_page(notion, "IT Trouble Shooting Hub")
if not root_page_id:
print(
"Error: Could not locate the 'IT Trouble Shooting Hub' page.",
file=sys.stderr,
)
return False
# -------------------------------------------------------------------------
# Locate the original and new databases
# -------------------------------------------------------------------------
inventory_db_id = _get_database(root_page_id, notion, "IT Inventory")
if not inventory_db_id:
print("Error: 'IT Inventory' database not found.", file=sys.stderr)
return False
retirement_db_id = _get_database(root_page_id, notion, "IT Asset Retirement Queue")
if not retirement_db_id:
print("Error: 'IT Asset Retirement Queue' database not found.", file=sys.stderr)
return False
# -------------------------------------------------------------------------
# Validate schema of the retirement queue database
# -------------------------------------------------------------------------
retirement_db = notion.databases.retrieve(database_id=retirement_db_id)
r_props = retirement_db["properties"]
required_schema = {
"Serial": "title",
"Status": "select",
"Expiration date": "date",
}
for pname, ptype in required_schema.items():
if not _check_property(r_props, pname, ptype):
return False
# -------------------------------------------------------------------------
# Validate that inventory items are moved & archived
# -------------------------------------------------------------------------
expired_filter = {
"property": "Status",
"select": {"equals": "Expired"},
}
to_return_filter = {
"property": "Status",
"select": {"equals": "To be returned"},
}
compound_filter = {"or": [expired_filter, to_return_filter]}
# Query for any *active* items that still match these statuses
remaining_items = notion.databases.query(
database_id=inventory_db_id,
filter=compound_filter,
archived=False,
).get("results", [])
if remaining_items:
print(
f"Error: {len(remaining_items)} 'Expired' / 'To be returned' items still present in IT Inventory.",
file=sys.stderr,
)
return False
# There should be at least one entry in the retirement queue
retirement_pages = notion.databases.query(database_id=retirement_db_id).get(
"results", []
)
expected_serials = {"65XYQ/GB", "36x10PIQ"}
if len(retirement_pages) != len(expected_serials):
print(
f"Error: Expected {len(expected_serials)} retirement pages, found {len(retirement_pages)}.",
file=sys.stderr,
)
return False
serials_seen = set()
for page in retirement_pages:
props = page["properties"]
# Collect Serial title
title_rich = props.get("Serial", {}).get("title", [])
serial_val = "".join([t.get("plain_text", "") for t in title_rich]).strip()
serials_seen.add(serial_val)
if serials_seen != expected_serials:
print(
f"Error: Serial values mismatch. Expected {sorted(expected_serials)}, found {sorted(serials_seen)}.",
file=sys.stderr,
)
return False
print("Success: All verification criteria satisfied.")
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/japan_travel_planner/simple__remove_osaka_itinerary/description.md
================================================
Go to the Japan Travel Planner page, open the Travel Itinerary database, and remove the OSAKA itinerary items after 6 PM (excluding 6 PM) on Day 1 and Day 2.
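For reference, a minimal sketch with `notion-client` (the database ID is a placeholder; the small time parser only understands strings such as "6 PM" or "7:30 PM"):

```python
from notion_client import Client

notion = Client(auth="<token>")
ITINERARY_DB_ID = "<travel-itinerary-database-id>"            # placeholder

def after_6pm(text: str) -> bool:
    """Return True for times strictly after 6 PM, e.g. "7 PM" or "7:30 PM"."""
    text = text.strip().upper()
    if not text.endswith("PM"):
        return False
    hours, _, minutes = text[:-2].strip().partition(":")
    try:
        total = (int(hours) % 12 + 12) * 60 + (int(minutes) if minutes else 0)
    except ValueError:
        return False
    return total > 18 * 60

pages = notion.databases.query(
    database_id=ITINERARY_DB_ID,
    filter={"and": [
        {"property": "Group", "select": {"equals": "Osaka"}},
        {"or": [
            {"property": "Day", "select": {"equals": "Day 1"}},
            {"property": "Day", "select": {"equals": "Day 2"}},
        ]},
    ]},
)["results"]

for page in pages:
    notes = page["properties"]["Notes"]["rich_text"]
    time_text = notes[0]["plain_text"] if notes else ""
    if after_6pm(time_text):
        notion.pages.update(page_id=page["id"], archived=True)
```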
================================================
FILE: tasks/notion/easy/japan_travel_planner/simple__remove_osaka_itinerary/meta.json
================================================
{
"task_id": "simple__remove_osaka_itinerary",
"task_name": "Simple Remove Osaka Itinerary",
"category_id": "japan_travel_planner",
"category_name": "Japan Travel Planner",
"description": "Remove the itinerary items in Osaka after 6 PM from Day 1 and Day 2 travel schedules.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"conditional filtering",
"automated migration"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101"
}
}
================================================
FILE: tasks/notion/easy/japan_travel_planner/simple__remove_osaka_itinerary/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def get_page_title(page_result):
"""Extract title from a page result"""
properties = page_result.get('properties', {})
name_property = properties.get('Name', {})
if name_property.get('type') == 'title':
title_array = name_property.get('title', [])
if title_array and len(title_array) > 0:
return title_array[0].get('plain_text', '')
return ''
def get_page_time(page_result):
"""Extract time from Notes field"""
properties = page_result.get('properties', {})
notes_property = properties.get('Notes', {})
if notes_property.get('type') == 'rich_text':
rich_text_array = notes_property.get('rich_text', [])
if rich_text_array and len(rich_text_array) > 0:
notes_text = rich_text_array[0].get('plain_text', '')
return notes_text.strip()
return ''
def get_page_group(page_result):
"""Extract group/location from page"""
properties = page_result.get('properties', {})
group_property = properties.get('Group', {})
if group_property.get('type') == 'select':
select = group_property.get('select')
if select:
return select.get('name', '')
return ''
def get_page_day(page_result):
"""Extract day from page"""
properties = page_result.get('properties', {})
day_property = properties.get('Day', {})
if day_property.get('type') == 'select':
select = day_property.get('select')
if select:
return select.get('name', '')
return ''
def parse_time_to_minutes(time_str):
"""Convert time string to minutes for comparison
Returns None if time cannot be parsed"""
if not time_str:
return None
# Clean the time string
time_str = time_str.strip().upper()
# Remove any text after the time (e.g., "7:30 PM\n" -> "7:30 PM")
time_str = time_str.split('\n')[0].strip()
# Extract time components
try:
if 'PM' in time_str:
time_part = time_str.replace('PM', '').strip()
if ':' in time_part:
hours, minutes = time_part.split(':')
hours = int(hours)
minutes = int(minutes)
else:
hours = int(time_part)
minutes = 0
# Convert PM hours (add 12 for PM times except 12 PM)
if hours != 12:
hours += 12
return hours * 60 + minutes
elif 'AM' in time_str:
time_part = time_str.replace('AM', '').strip()
if ':' in time_part:
hours, minutes = time_part.split(':')
hours = int(hours)
minutes = int(minutes)
else:
hours = int(time_part)
minutes = 0
# Handle 12 AM (midnight)
if hours == 12:
hours = 0
return hours * 60 + minutes
except:
return None
return None
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that all OSAKA events after 6PM have been removed from Day 1 and Day 2 in the Japan Travel Planner.
Expected items that should be deleted (all in OSAKA, after 6PM, on Day 1 or Day 2):
1. Rikuro's Namba Main Branch - 7 PM (Day 1)
2. Shin Sekai "New World" - 8 PM (Day 2)
3. Katsudon Chiyomatsu - 7:30 PM (Day 2)
4. Ebisubashi Bridge - 9 PM (Day 1)
Note: Kuromon Ichiba Market at 6 PM should NOT be deleted (it's at 6PM, not after)
Items after 6PM on other days (Day 3-8) should NOT be deleted
"""
# Step 1: Find the main Japan Travel Planner page
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Japan Travel Planner page not found.", file=sys.stderr)
return False
else:
# Try to find the page by searching
found_id = notion_utils.find_page(notion, "Japan Travel Planner")
if not found_id:
print("Error: Japan Travel Planner page not found.", file=sys.stderr)
return False
print(f"Found Japan Travel Planner page: {found_id}")
# Step 2: Find the Travel Itinerary database
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
travel_itinerary_db_id = None
for block in all_blocks:
if block and block.get("type") == "child_database":
title = block.get("child_database", {}).get("title", "")
if "Travel Itinerary" in title:
travel_itinerary_db_id = block.get("id")
print(f"Found Travel Itinerary database: {travel_itinerary_db_id}")
break
if not travel_itinerary_db_id:
print("Error: Travel Itinerary database not found", file=sys.stderr)
return False
# Step 3: Query the database for OSAKA items on Day 1 and Day 2
try:
query_result = notion.databases.query(
database_id=travel_itinerary_db_id,
filter={
"and": [
{"property": "Group", "select": {"equals": "Osaka"}},
{"or": [
{"property": "Day", "select": {"equals": "Day 1"}},
{"property": "Day", "select": {"equals": "Day 2"}}
]}
]
}
)
except Exception as e:
print(f"Error querying Travel Itinerary database: {e}", file=sys.stderr)
return False
# Step 4: Check for items that should have been deleted
six_pm_minutes = 18 * 60 # 6 PM in minutes (18:00)
# Expected deleted items (4 specific items after 6 PM on Day 1 and Day 2)
expected_deleted = {
"Rikuro's Namba Main Branch": {"time": "7 PM", "day": "Day 1", "found": False},
"Shin Sekai \"New World\"": {"time": "8 PM", "day": "Day 2", "found": False},
"Katsudon Chiyomatsu": {"time": "7:30 PM", "day": "Day 2", "found": False},
"Ebisubashi Bridge": {"time": "9 PM", "day": "Day 1", "found": False}
}
# Items that should remain (at or before 6 PM)
expected_remaining = {
"Kuromon Ichiba Market": {"time": "6 PM", "found": False}
}
osaka_items_after_6pm = []
osaka_items_at_or_before_6pm = []
# Debug: Show total query results
print(f"Debug: Found {len(query_result.get('results', []))} total OSAKA items on Day 1 and Day 2")
# Process all OSAKA items on Day 1 and Day 2
for page in query_result.get('results', []):
page_title = get_page_title(page).strip()
page_time = get_page_time(page)
page_group = get_page_group(page)
page_day = get_page_day(page)
if page_group != "Osaka":
continue
# Parse time to check if after 6 PM
time_minutes = parse_time_to_minutes(page_time)
if time_minutes is not None and time_minutes > six_pm_minutes:
osaka_items_after_6pm.append({
"title": page_title,
"time": page_time,
"day": page_day,
"id": page.get('id')
})
# Check if this is one of the expected deleted items
for expected_title, expected_info in expected_deleted.items():
# Clean up the titles for comparison
clean_page_title = page_title.strip().lower()
clean_expected_title = expected_title.strip().lower()
# Check for "Rikuro's" or "Rikuro's" (different apostrophe types)
if "rikuro" in clean_page_title and "rikuro" in clean_expected_title:
title_match = True
elif clean_page_title == clean_expected_title:
title_match = True
elif clean_expected_title in clean_page_title or clean_page_title in clean_expected_title:
title_match = True
else:
title_match = False
if title_match and page_day == expected_info["day"]:
print(f"Debug: Found '{page_title}' on {page_day} at {page_time} - matches expected '{expected_title}'")
expected_deleted[expected_title]["found"] = True
elif time_minutes is not None and time_minutes <= six_pm_minutes:
osaka_items_at_or_before_6pm.append({
"title": page_title,
"time": page_time,
"day": page_day,
"id": page.get('id')
})
# Check if this is one of the expected remaining items
for expected_title in expected_remaining:
if expected_title.lower() in page_title.lower() or page_title.lower() in expected_title.lower():
expected_remaining[expected_title]["found"] = True
# Step 5: Verify results
print(f"\nVerification Summary:")
print(f"=" * 50)
all_passed = True
# Check that the 4 expected items after 6 PM have been deleted
print("\n4 Items that should be deleted (after 6 PM on Day 1 and Day 2):")
for item_name, item_info in expected_deleted.items():
if item_info["found"]:
# If found = True, it means the item still exists (was not deleted)
print(f"✗ {item_name} ({item_info['day']}, {item_info['time']}) - Still exists, should be deleted", file=sys.stderr)
all_passed = False
else:
# If found = False, it means the item was deleted correctly
print(f"✓ {item_name} ({item_info['day']}, {item_info['time']}) - Correctly deleted")
# Check that items at or before 6 PM remain
print("\nItems that should remain (at or before 6 PM on Day 1 and Day 2):")
for item_name, item_info in expected_remaining.items():
if item_info["found"]:
print(f"✓ {item_name} ({item_info['time']}) - Correctly retained")
else:
print(f"✗ {item_name} ({item_info['time']}) - Missing, should not be deleted", file=sys.stderr)
all_passed = False
# Report any items after 6 PM that still exist
if osaka_items_after_6pm:
print(f"\n✗ Found {len(osaka_items_after_6pm)} OSAKA item(s) after 6 PM on Day 1/Day 2:", file=sys.stderr)
for item in osaka_items_after_6pm:
print(f" - {item['title']} at {item['time']} ({item['day']})", file=sys.stderr)
else:
print(f"\n✓ No OSAKA items found after 6 PM on Day 1/Day 2 (all correctly deleted)")
# Report count summary
print(f"\nCount Summary:")
print(f"- OSAKA items after 6 PM on Day 1/Day 2 found: {len(osaka_items_after_6pm)} (should be 0)")
print(f"- OSAKA items at/before 6 PM on Day 1/Day 2 found: {len(osaka_items_at_or_before_6pm)}")
print(f"- Expected deletions verified: {sum(1 for item in expected_deleted.values() if not item['found'])}/4")
return all_passed
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
print("\nVerification passed: All 4 required OSAKA events after 6 PM on Day 1 and Day 2 have been removed")
sys.exit(0)
else:
print("\nVerification failed: Some OSAKA events after 6 PM on Day 1/Day 2 still exist")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/online_resume/simple__skills_development_tracker/description.md
================================================
Create a comprehensive skills audit system by performing the following tasks:
**Task Requirements:**
1. Create a new database named "Skills Development Tracker" as a child database in the main resume page with the following properties:
- Name (title property)
- Current Skill (relation to Skills database)
- Current Proficiency (rollup from related skill's "Skill Level" property)
- Target Proficiency (number property with format "percent")
- Gap (formula: Target Proficiency - Current Proficiency)
- Learning Resources (rich text property)
- Progress Notes (rich text property)
2. Populate the Skills Development Tracker database with entries for all skills that have a proficiency level below 70% (0.7):
- For each qualifying skill, create an entry with:
- Name: "[Skill Name] Development Plan"
- Link to the corresponding skill in Skills database
- Target Proficiency: Set to Current + 25% (capped at 95%)
- Learning Resources: "Online courses and practice projects"
- Progress Notes: "Initial assessment completed"
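For reference, a minimal sketch of the database creation with `notion-client` (IDs are placeholders; whether the Gap formula can read the rollup directly depends on how Notion evaluates the rollup's output):

```python
from notion_client import Client

notion = Client(auth="<token>")
RESUME_PAGE_ID = "<resume-page-id>"                # placeholder
SKILLS_DB_ID = "<skills-database-id>"              # placeholder

tracker = notion.databases.create(
    parent={"type": "page_id", "page_id": RESUME_PAGE_ID},
    title=[{"type": "text", "text": {"content": "Skills Development Tracker"}}],
    properties={
        "Name": {"title": {}},
        "Current Skill": {"relation": {"database_id": SKILLS_DB_ID, "single_property": {}}},
        "Current Proficiency": {"rollup": {
            "relation_property_name": "Current Skill",
            "rollup_property_name": "Skill Level",
            "function": "show_original",
        }},
        "Target Proficiency": {"number": {"format": "percent"}},
        "Gap": {"formula": {
            "expression": 'prop("Target Proficiency") - prop("Current Proficiency")'
        }},
        "Learning Resources": {"rich_text": {}},
        "Progress Notes": {"rich_text": {}},
    },
)
```

Populating the tracker then follows the same pattern as the other steps: query the Skills database for rows whose Skill Level is below 0.7 and create one tracker page per match.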
================================================
FILE: tasks/notion/easy/online_resume/simple__skills_development_tracker/meta.json
================================================
{
"task_id": "simple__skills_development_tracker",
"task_name": "Simple Skills Development Tracker",
"category_id": "online_resume",
"category_name": "Online Resume",
"description": "Create a comprehensive skills audit system with development tracking for skills below 70% proficiency.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"database manipulation",
"cross-reference linking",
"conditional filtering",
"data aggregation",
"template population",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume"
}
}
================================================
FILE: tasks/notion/easy/online_resume/simple__skills_development_tracker/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Skills Development Tracker database was created correctly.
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "New Online Resume")
if not page_id:
print("Error: Page 'New Online Resume' not found.", file=sys.stderr)
return False
# Step 1: Verify Skills Development Tracker database exists
tracker_db_id = notion_utils.find_database_in_block(
notion, page_id, "Skills Development Tracker"
)
if not tracker_db_id:
print(
"Error: Database 'Skills Development Tracker' not found.", file=sys.stderr
)
return False
# Step 2: Verify database schema
try:
db_info = notion.databases.retrieve(database_id=tracker_db_id)
properties = db_info.get("properties", {})
# Check required properties
required_props = {
"Name": "title",
"Current Skill": "relation",
"Current Proficiency": "rollup",
"Target Proficiency": "number",
"Gap": "formula",
"Learning Resources": "rich_text",
"Progress Notes": "rich_text",
}
for prop_name, expected_type in required_props.items():
if prop_name not in properties:
print(
f"Error: Property '{prop_name}' not found in database.",
file=sys.stderr,
)
return False
if properties[prop_name]["type"] != expected_type:
print(
f"Error: Property '{prop_name}' has incorrect type. Expected '{expected_type}', got '{properties[prop_name]['type']}'.",
file=sys.stderr,
)
return False
# Verify Target Proficiency is percent format
if (
properties["Target Proficiency"].get("number", {}).get("format")
!= "percent"
):
print(
"Error: Target Proficiency should have 'percent' format.",
file=sys.stderr,
)
return False
except Exception as e:
print(f"Error retrieving database info: {e}", file=sys.stderr)
return False
# Step 3: Get Skills database to check entries
skills_db_id = notion_utils.find_database_in_block(notion, page_id, "Skills")
if not skills_db_id:
print("Error: Skills database not found.", file=sys.stderr)
return False
# Get all skills with proficiency < 70%
skills_below_70 = []
try:
skills_results = notion.databases.query(database_id=skills_db_id).get(
"results", []
)
for skill in skills_results:
skill_level = (
skill.get("properties", {}).get("Skill Level", {}).get("number", 1.0)
)
if skill_level < 0.7:
skill_name = (
skill.get("properties", {}).get("Skill", {}).get("title", [])
)
if skill_name:
skill_name_text = skill_name[0].get("text", {}).get("content", "")
skills_below_70.append(
{
"name": skill_name_text,
"id": skill["id"],
"level": skill_level,
}
)
except Exception as e:
print(f"Error querying Skills database: {e}", file=sys.stderr)
return False
if not skills_below_70:
print("Warning: No skills found with proficiency below 70%.", file=sys.stderr)
# This might be OK if all skills are above 70%
# Step 4: Verify entries in Skills Development Tracker
try:
tracker_results = notion.databases.query(database_id=tracker_db_id).get(
"results", []
)
# Check that we have entries for skills below 70%
if len(skills_below_70) > 0 and len(tracker_results) == 0:
print(
"Error: No entries found in Skills Development Tracker database.",
file=sys.stderr,
)
return False
# Verify each entry
for entry in tracker_results:
props = entry.get("properties", {})
# Check name format
name_prop = props.get("Name", {}).get("title", [])
if not name_prop:
print("Error: Entry missing Name property.", file=sys.stderr)
return False
name_text = name_prop[0].get("text", {}).get("content", "")
if not name_text.endswith(" Development Plan"):
print(
f"Error: Entry name '{name_text}' doesn't follow expected format.",
file=sys.stderr,
)
return False
# Check relation to Skills database
skill_relation = props.get("Current Skill", {}).get("relation", [])
if not skill_relation:
print(
f"Error: Entry '{name_text}' missing Current Skill relation.",
file=sys.stderr,
)
return False
# Check Target Proficiency (should be set)
target_prof = props.get("Target Proficiency", {}).get("number")
if target_prof is None:
print(
f"Error: Entry '{name_text}' missing Target Proficiency.",
file=sys.stderr,
)
return False
# Check Learning Resources
learning_resources = props.get("Learning Resources", {}).get(
"rich_text", []
)
if not learning_resources:
print(
f"Error: Entry '{name_text}' missing Learning Resources.",
file=sys.stderr,
)
return False
# Check Progress Notes
progress_notes = props.get("Progress Notes", {}).get("rich_text", [])
if not progress_notes:
print(
f"Error: Entry '{name_text}' missing Progress Notes.",
file=sys.stderr,
)
return False
except Exception as e:
print(f"Error querying Skills Development Tracker: {e}", file=sys.stderr)
return False
print("Success: Skills Development Tracker database verified successfully.")
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/python_roadmap/simple__expert_level_lessons/description.md
================================================
# Task: Expert Level Learning Path (Simplified)
## Objective
Extend the Python Roadmap with a new Expert Level chapter, create a bridge lesson, and add two expert lessons that build on existing material.
## Requirements
### 1. Add the Expert Level chapter
- **Database**: Chapters
- **Name**: `Expert Level`
- **Icon**: 🟣 (purple circle emoji)
- Make sure it is linked into the roadmap alongside the existing chapters.
### 2. Create the bridge lesson
Create a lesson that connects advanced material to the new chapter:
- **Title**: `Advanced Foundations Review`
- **Status**: Done
- **Chapter**: Link it to `Expert Level`
- **Parent item**: Link to the lesson whose title contains "Control" (e.g., "Control Flow")
- **Sub-items**: Include links to the lessons containing "Decorators" and "Calling API"
### 3. Add two expert lessons
Add the following entries to the Steps database:
| Lesson Title | Status | Chapter | Parent item | Date |
|--------------|--------|---------|-------------|------|
| `Metaprogramming and AST Manipulation` | To Do | Expert Level | Advanced Foundations Review | 2025-09-15 |
| `Async Concurrency Patterns` | To Do | Expert Level | Calling API | 2025-09-20 |
The lessons must inherit the correct chapter link, parent relationship, and due date as shown above.
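For reference, a minimal sketch of the chapter, the bridge lesson, and one expert lesson with `notion-client` (database IDs and the referenced lesson IDs are placeholders; the property names follow the relations described above):

```python
from notion_client import Client

notion = Client(auth="<token>")
CHAPTERS_DB_ID = "<chapters-database-id>"          # placeholder
STEPS_DB_ID = "<steps-database-id>"                # placeholder

chapter = notion.pages.create(
    parent={"database_id": CHAPTERS_DB_ID},
    icon={"type": "emoji", "emoji": "🟣"},
    properties={"Name": {"title": [{"text": {"content": "Expert Level"}}]}},
)

bridge = notion.pages.create(
    parent={"database_id": STEPS_DB_ID},
    properties={
        "Lessons": {"title": [{"text": {"content": "Advanced Foundations Review"}}]},
        "Status": {"status": {"name": "Done"}},
        "Chapters": {"relation": [{"id": chapter["id"]}]},
        "Parent item": {"relation": [{"id": "<control-flow-lesson-id>"}]},
        "Sub-item": {"relation": [
            {"id": "<decorators-lesson-id>"},
            {"id": "<calling-api-lesson-id>"},
        ]},
    },
)

notion.pages.create(
    parent={"database_id": STEPS_DB_ID},
    properties={
        "Lessons": {"title": [{"text": {"content": "Metaprogramming and AST Manipulation"}}]},
        "Status": {"status": {"name": "To Do"}},
        "Chapters": {"relation": [{"id": chapter["id"]}]},
        "Parent item": {"relation": [{"id": bridge["id"]}]},
        "Date": {"date": {"start": "2025-09-15"}},
    },
)
```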
================================================
FILE: tasks/notion/easy/python_roadmap/simple__expert_level_lessons/meta.json
================================================
{
"task_id": "expert_level_lessons",
"task_name": "Expert Level Lessons",
"category_id": "python_roadmap",
"category_name": "Python Roadmap",
"description": "Create an Expert Level chapter with sophisticated prerequisite chains and four expert-level lessons.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"database manipulation",
"cross-reference linking",
"conditional filtering",
"status tracking",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Python-Roadmap-25281626b6d78012bf2bce1fa8711f4d",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/python-roadmap"
}
}
================================================
FILE: tasks/notion/easy/python_roadmap/simple__expert_level_lessons/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
TARGET_PAGE_TITLE = "Python Roadmap"
CHAPTER_NAME = "Expert Level"
CHAPTER_ICON = "🟣"
BRIDGE_TITLE = "Advanced Foundations Review"
REQUIRED_SUBITEM_TITLES = ["Decorators", "Calling API"]
LESSON_REQUIREMENTS = [
{
"title": "Metaprogramming and AST Manipulation",
"status": "To Do",
"date": "2025-09-15",
"parent_title": BRIDGE_TITLE,
},
{
"title": "Async Concurrency Patterns",
"status": "To Do",
"date": "2025-09-20",
"parent_title": "Calling API",
},
]
def _get_database_ids(notion: Client, page_id: str) -> tuple[str | None, str | None]:
"""Return the block IDs for the Chapters and Steps databases on the page."""
chapters_db_id = None
steps_db_id = None
blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
for block in blocks:
if block.get("type") != "child_database":
continue
title = block.get("child_database", {}).get("title", "")
if "Chapters" in title and not chapters_db_id:
chapters_db_id = block["id"]
elif "Steps" in title and not steps_db_id:
steps_db_id = block["id"]
return chapters_db_id, steps_db_id
def _query_step_by_title(notion: Client, database_id: str, title: str, *, exact: bool = True):
"""Return the first step entry matching the given title pattern."""
title_filter = {"equals": title} if exact else {"contains": title}
response = notion.databases.query(
database_id=database_id,
filter={"property": "Lessons", "title": title_filter},
page_size=5,
)
results = response.get("results", [])
return results[0] if results else None
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verify the simplified Expert Level learning path setup."""
# Resolve the roadmap page.
if main_id:
page_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not page_id or object_type != "page":
print("Error: Python Roadmap page not found.", file=sys.stderr)
return False
else:
page_id = notion_utils.find_page(notion, TARGET_PAGE_TITLE)
if not page_id:
print("Error: Python Roadmap page not found.", file=sys.stderr)
return False
# Locate the Chapters and Steps databases.
chapters_db_id, steps_db_id = _get_database_ids(notion, page_id)
if not chapters_db_id:
print("Error: Chapters database not found on the page.", file=sys.stderr)
return False
if not steps_db_id:
print("Error: Steps database not found on the page.", file=sys.stderr)
return False
# Ensure the Expert Level chapter exists with the purple icon.
try:
chapter_resp = notion.databases.query(
database_id=chapters_db_id,
filter={"property": "Name", "title": {"equals": CHAPTER_NAME}},
page_size=1,
)
except Exception as exc:
print(f"Error querying Chapters database: {exc}", file=sys.stderr)
return False
results = chapter_resp.get("results", [])
if not results:
print("Error: Expert Level chapter not found.", file=sys.stderr)
return False
expert_chapter = results[0]
expert_chapter_id = expert_chapter["id"]
icon = expert_chapter.get("icon") or {}
if icon.get("type") != "emoji" or icon.get("emoji") != CHAPTER_ICON:
print("Error: Expert Level chapter must use the purple circle emoji icon.", file=sys.stderr)
return False
print("✓ Expert Level chapter exists with the correct icon.")
# Locate prerequisite lessons (Control Flow, Decorators, Calling API).
control_lesson = _query_step_by_title(notion, steps_db_id, "Control", exact=False)
if not control_lesson:
print("Error: Could not find a lesson containing 'Control' in its title.", file=sys.stderr)
return False
control_lesson_id = control_lesson["id"]
prerequisite_ids = {}
for title in REQUIRED_SUBITEM_TITLES:
lesson = _query_step_by_title(notion, steps_db_id, title, exact=False)
if not lesson:
print(f"Error: Required lesson containing '{title}' not found.", file=sys.stderr)
return False
prerequisite_ids[title] = lesson["id"]
# Verify the bridge lesson.
bridge_lesson = _query_step_by_title(notion, steps_db_id, BRIDGE_TITLE, exact=True)
if not bridge_lesson:
print("Error: Advanced Foundations Review lesson not found.", file=sys.stderr)
return False
status = (bridge_lesson["properties"].get("Status", {}).get("status") or {}).get("name")
if status != "Done":
print("Error: Advanced Foundations Review must have status 'Done'.", file=sys.stderr)
return False
# Ensure chapter relation includes Expert Level.
chapter_rel = bridge_lesson["properties"].get("Chapters", {}).get("relation", [])
if not any(rel["id"] == expert_chapter_id for rel in chapter_rel):
print("Error: Advanced Foundations Review must link to the Expert Level chapter.", file=sys.stderr)
return False
# Parent item should be the control lesson.
parent_rel = bridge_lesson["properties"].get("Parent item", {}).get("relation", [])
if not parent_rel or parent_rel[0]["id"] != control_lesson_id:
print("Error: Advanced Foundations Review should use the control lesson as its Parent item.", file=sys.stderr)
return False
# Sub-items must include the required lessons.
sub_rel = bridge_lesson["properties"].get("Sub-item", {}).get("relation", [])
sub_ids = {rel["id"] for rel in sub_rel}
missing = [title for title, rel_id in prerequisite_ids.items() if rel_id not in sub_ids]
if missing:
print(
f"Error: Advanced Foundations Review must include these lessons as sub-items: {', '.join(missing)}.",
file=sys.stderr,
)
return False
print("✓ Bridge lesson configured with the correct status, chapter, parent, and sub-items.")
# Verify the two expert lessons.
overall_success = True
for spec in LESSON_REQUIREMENTS:
lesson = _query_step_by_title(notion, steps_db_id, spec["title"], exact=True)
if not lesson:
print(f"Error: Lesson '{spec['title']}' not found.", file=sys.stderr)
overall_success = False
continue
lesson_ok = True
# Status check.
status_name = (lesson["properties"].get("Status", {}).get("status") or {}).get("name")
if status_name != spec["status"]:
print(
f"Error: Lesson '{spec['title']}' should have status '{spec['status']}', found '{status_name}'.",
file=sys.stderr,
)
lesson_ok = False
# Chapter relation check.
lesson_chapters = lesson["properties"].get("Chapters", {}).get("relation", [])
if not any(rel["id"] == expert_chapter_id for rel in lesson_chapters):
print(f"Error: Lesson '{spec['title']}' must link to the Expert Level chapter.", file=sys.stderr)
lesson_ok = False
# Parent relation check.
parent_title = spec["parent_title"]
if parent_title == BRIDGE_TITLE:
expected_parent_id = bridge_lesson["id"]
else:
expected_parent_id = prerequisite_ids.get(parent_title)
parent_relation = lesson["properties"].get("Parent item", {}).get("relation", [])
if not expected_parent_id:
print(
f"Error: Could not resolve expected parent '{parent_title}' for lesson '{spec['title']}'.",
file=sys.stderr,
)
lesson_ok = False
else:
if not parent_relation or parent_relation[0]["id"] != expected_parent_id:
print(
f"Error: Lesson '{spec['title']}' should have '{parent_title}' as its Parent item.",
file=sys.stderr,
)
lesson_ok = False
# Date check.
date_prop = lesson["properties"].get("Date", {}).get("date") or {}
if date_prop.get("start") != spec["date"]:
print(
f"Error: Lesson '{spec['title']}' should use date {spec['date']}, found {date_prop.get('start')}.",
file=sys.stderr,
)
lesson_ok = False
if lesson_ok:
print(f"✓ Lesson '{spec['title']}' has the expected properties.")
else:
overall_success = False
if not overall_success:
return False
print("Success: Expert Level chapter, bridge lesson, and expert lessons configured correctly.")
return True
def main() -> None:
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/self_assessment/simple__faq_column_layout/description.md
================================================
Navigate to the "Self Assessment" page and reorganize the FAQ toggle content to make it easier to scan.
**Task Requirements:**
1. Add a column list with two columns inside the FAQ toggle.
2. Move the first two existing Q&A pairs from the FAQ into the left column.
3. Move the third existing Q&A pair into the right column, keeping the original heading/paragraph formatting.
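For reference, the layout change can be scripted with the same `notion_client` library the verifier uses. The sketch below is a minimal illustration under stated assumptions: the integration token and the FAQ toggle's block ID are placeholders you would resolve by reading the page first, and in the real task you would move copies of the existing Q&A blocks into the columns and then archive the originals instead of creating the placeholder text shown here.
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")   # placeholder token
FAQ_TOGGLE_ID = "<faq-toggle-block-id>"  # hypothetical: found by scanning the page blocks

def qa_pair(question: str, answer: str) -> list[dict]:
    """Build a heading_3 + paragraph pair in the shape the verifier looks for."""
    return [
        {"type": "heading_3",
         "heading_3": {"rich_text": [{"type": "text", "text": {"content": question}}]}},
        {"type": "paragraph",
         "paragraph": {"rich_text": [{"type": "text", "text": {"content": answer}}]}},
    ]

# Append a column_list with two columns inside the FAQ toggle (the append endpoint
# accepts nested children; a column_list needs at least two columns). With the real
# content you would place copies of the existing Q&A blocks here and then delete the
# originals so nothing remains directly under the toggle.
notion.blocks.children.append(
    block_id=FAQ_TOGGLE_ID,
    children=[{
        "type": "column_list",
        "column_list": {"children": [
            {"type": "column",
             "column": {"children": qa_pair("Q1?", "A1.") + qa_pair("Q2?", "A2.")}},
            {"type": "column",
             "column": {"children": qa_pair("Q3?", "A3.")}},
        ]},
    }],
)
```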
================================================
FILE: tasks/notion/easy/self_assessment/simple__faq_column_layout/meta.json
================================================
{
"task_id": "simple__faq_column_layout",
"task_name": "Simple FAQ Column Layout",
"category_id": "self_assessment",
"category_name": "Self Assessment",
"description": "Reorganize the FAQ section content into a two-column layout with balanced Q&A pairs.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content organization",
"visual formatting",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d",
"stateOriginalUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d"
}
}
================================================
FILE: tasks/notion/easy/self_assessment/simple__faq_column_layout/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the FAQ toggle has been properly reorganized with a column list.
"""
# Start from main_id if provided
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
# Try to find the Self Assessment page
page_id = notion_utils.find_page(notion, "Self Assessment")
if not page_id:
print("Error: Self Assessment page not found.", file=sys.stderr)
return False
# Get all blocks recursively from the page
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Find the FAQ toggle block
faq_toggle_block = None
faq_toggle_id = None
for block in all_blocks:
if block.get("type") == "toggle":
block_text = notion_utils.get_block_plain_text(block)
if "FAQ" in block_text:
faq_toggle_block = block
faq_toggle_id = block.get("id")
print(f"Found FAQ toggle block: {block_text}")
break
if not faq_toggle_block:
print("Error: FAQ toggle block not found.", file=sys.stderr)
return False
# Find column_list inside the FAQ toggle
column_list_block = None
for block in all_blocks:
if (
block.get("type") == "column_list"
and block.get("parent", {}).get("block_id") == faq_toggle_id
):
column_list_block = block
break
if not column_list_block:
print("Error: No column_list found inside FAQ toggle.", file=sys.stderr)
return False
# Check that there are no Q&A pairs directly under FAQ toggle (outside column_list)
direct_faq_children = []
for block in all_blocks:
if block.get("parent", {}).get("block_id") == faq_toggle_id and block.get(
"id"
) != column_list_block.get("id"):
direct_faq_children.append(block)
# Check if any of these are heading_3 or paragraph blocks (Q&A content)
for block in direct_faq_children:
if block.get("type") in ["heading_3", "paragraph"]:
print(
f"Error: Found Q&A content outside column_list: {notion_utils.get_block_plain_text(block)[:50]}...",
file=sys.stderr,
)
return False
# Find the two columns
columns = []
column_list_id = column_list_block.get("id")
for block in all_blocks:
if (
block.get("type") == "column"
and block.get("parent", {}).get("block_id") == column_list_id
):
columns.append(block)
if len(columns) != 2:
print(f"Error: Expected 2 columns, found {len(columns)}.", file=sys.stderr)
return False
# Count Q&A pairs in each column
qa_counts = []
total_pairs = 0
for i, column in enumerate(columns[:2]):
column_id = column.get("id")
column_blocks = [
block
for block in all_blocks
if block.get("parent", {}).get("block_id") == column_id
]
qa_pairs = 0
j = 0
while j < len(column_blocks):
if (
column_blocks[j].get("type") == "heading_3"
and j + 1 < len(column_blocks)
and column_blocks[j + 1].get("type") == "paragraph"
):
qa_pairs += 1
j += 2
else:
j += 1
qa_counts.append(qa_pairs)
total_pairs += qa_pairs
print(f"Column {i + 1}: Found {qa_pairs} Q&A pairs")
if qa_counts[0] < 2:
print(
f"Error: Left column should contain at least 2 Q&A pairs, found {qa_counts[0]}.",
file=sys.stderr,
)
return False
if qa_counts[1] < 1:
print(
f"Error: Right column should contain at least 1 Q&A pair, found {qa_counts[1]}.",
file=sys.stderr,
)
return False
if total_pairs < 3:
print(
f"Error: Expected at least 3 total Q&A pairs across both columns, found {total_pairs}.",
file=sys.stderr,
)
return False
print(
"Success: FAQ toggle organized with two columns holding the existing Q&A pairs (two on the left, one on the right)."
)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/standard_operating_procedure/simple__section_organization/description.md
================================================
# Task: Reorganize Standard Operating Procedure Page Sections
## Objective
Modify the structure of the Standard Operating Procedure page in Notion by updating the order of two sections.
## Requirements
- Navigate to the Standard Operating Procedure page
- Swap the positions of the "Terminologies" and "Roles & responsibilities" sections
- Preserve all content within each section exactly as is
- Maintain the original formatting and structure of each section
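One possible scripted approach is sketched below with the `notion_client` library. Because the public API has no block reorder or move call, this naive swap re-appends shallow copies of both sections at the end of the page in the desired order and archives the originals; the token and page ID are placeholders, pagination and nested children are ignored, and keeping the sections at their original position would additionally need the append endpoint's `after` option.
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
PAGE_ID = "<standard-operating-procedure-page-id>"  # hypothetical

blocks = notion.blocks.children.list(block_id=PAGE_ID)["results"]

def heading_text(block: dict) -> str:
    if block["type"] != "heading_2":
        return ""
    rich = block["heading_2"].get("rich_text", [])
    return rich[0]["plain_text"] if rich else ""

def section_slice(title: str) -> slice:
    """A section spans its heading_2 up to (not including) the next heading_2."""
    start = next(i for i, b in enumerate(blocks) if heading_text(b) == title)
    end = next((i for i in range(start + 1, len(blocks))
                if blocks[i]["type"] == "heading_2"), len(blocks))
    return slice(start, end)

roles = blocks[section_slice("Roles & responsibilities")]
terms = blocks[section_slice("Terminologies")]

def shallow_copy(block: dict) -> dict:
    # Keep only the type payload; read-only fields (id, timestamps) are dropped, and
    # children nested under the copied blocks are not carried over by this sketch.
    return {"type": block["type"], block["type"]: block[block["type"]]}

# Re-append the two sections with Roles first, then archive the original blocks.
notion.blocks.children.append(
    block_id=PAGE_ID,
    children=[shallow_copy(b) for b in roles + terms],
)
for old in roles + terms:
    notion.blocks.delete(block_id=old["id"])
```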
================================================
FILE: tasks/notion/easy/standard_operating_procedure/simple__section_organization/meta.json
================================================
{
"task_id": "simple__section_organization",
"task_name": "Simple Section Organization",
"category_id": "standard_operating_procedure",
"category_name": "Standard Operating Procedure",
"description": "Reorganize the Standard Operating Procedure page by swapping sections and creating a column layout.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content organization",
"cross-reference linking",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Standard-Operating-Procedure-24381626b6d780a8b678f9e62ae5b152",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/standard-operating-procedure"
}
}
================================================
FILE: tasks/notion/easy/standard_operating_procedure/simple__section_organization/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
TARGET_PAGE_TITLE = "Standard Operating Procedure"
ROLES_HEADING = "Roles & responsibilities"
TERMINOLOGIES_HEADING = "Terminologies"
def _find_heading_indices(blocks: list[dict]) -> tuple[int | None, int | None]:
"""Return the indices of the target headings within the flattened block list."""
roles_index = None
terminologies_index = None
for index, block in enumerate(blocks):
if block.get("type") != "heading_2":
continue
rich_text = block.get("heading_2", {}).get("rich_text", [])
if not rich_text:
continue
heading_text = rich_text[0].get("text", {}).get("content", "")
if heading_text == ROLES_HEADING and roles_index is None:
roles_index = index
elif heading_text == TERMINOLOGIES_HEADING and terminologies_index is None:
terminologies_index = index
return roles_index, terminologies_index
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Ensure the Roles & responsibilities section appears before Terminologies."""
# Resolve page id.
if main_id:
page_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not page_id or object_type != "page":
print("Error: Standard Operating Procedure page not found.", file=sys.stderr)
return False
else:
page_id = notion_utils.find_page(notion, TARGET_PAGE_TITLE)
if not page_id:
print("Error: Standard Operating Procedure page not found.", file=sys.stderr)
return False
# Fetch all blocks (flattened order from top to bottom).
blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
roles_index, terminologies_index = _find_heading_indices(blocks)
if roles_index is None:
print("Error: 'Roles & responsibilities' section not found.", file=sys.stderr)
return False
if terminologies_index is None:
print("Error: 'Terminologies' section not found.", file=sys.stderr)
return False
if roles_index >= terminologies_index:
print(
"Error: Sections are not swapped. 'Roles & responsibilities' should appear before 'Terminologies'.",
file=sys.stderr,
)
return False
print("Success: Section order updated so 'Roles & responsibilities' precedes 'Terminologies'.")
return True
def main() -> None:
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/team_projects/simple__swap_tasks/description.md
================================================
Go to the Team Projects page, find the person responsible for the most tasks (10 in total) and the person responsible for the fewest tasks (3 in total), then swap their assigned tasks.
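As a rough illustration, the swap can be scripted against the Tasks database with `notion_client`. The sketch below assumes a placeholder token and database ID, omits pagination, and uses the same property names ("Assigned", first assignee only) that the verifier below relies on.
```python
from collections import defaultdict
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
TASKS_DB_ID = "<tasks-database-id>"     # hypothetical: the Tasks child database on the page

# Group task pages by their first assignee (single query; pagination omitted).
tasks = notion.databases.query(database_id=TASKS_DB_ID, page_size=100)["results"]
by_person: dict[str, list[dict]] = defaultdict(list)
for task in tasks:
    people = task["properties"]["Assigned"]["people"]
    if people:
        by_person[people[0]["id"]].append(task)

# Identify the busiest and least busy assignees, then hand each one's tasks to the other.
most_id = max(by_person, key=lambda uid: len(by_person[uid]))
least_id = min(by_person, key=lambda uid: len(by_person[uid]))
for task in by_person[most_id]:
    notion.pages.update(page_id=task["id"],
                        properties={"Assigned": {"people": [{"id": least_id}]}})
for task in by_person[least_id]:
    notion.pages.update(page_id=task["id"],
                        properties={"Assigned": {"people": [{"id": most_id}]}})
```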
================================================
FILE: tasks/notion/easy/team_projects/simple__swap_tasks/meta.json
================================================
{
"task_id": "simple__swap_tasks",
"task_name": "Simple Swap Tasks",
"category_id": "team_projects",
"category_name": "Team Projects",
"description": "Find the person responsible for the most and fewest tasks, then swap their assigned tasks.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data aggregation",
"automated migration",
"conditional filtering"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Team-Projects-24e81626b6d7809c982fdb7a25825898",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/gantt-chart"
}
}
================================================
FILE: tasks/notion/easy/team_projects/simple__swap_tasks/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the task assignees have been swapped correctly.
Checks:
1. "Develop a plan for promotion" and "Evaluate different third-party services" have swapped assignees
2. The person with most tasks and person with least tasks have swapped all their tasks
"""
# Step 1: Find the Team Projects page
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Team Projects page not found.", file=sys.stderr)
return False
else:
# Try to find the page by searching
found_id = notion_utils.find_page(notion, "Team Projects")
if not found_id:
print("Error: Team Projects page not found.", file=sys.stderr)
return False
# Get all blocks from the page to find database references
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
# Find Tasks database ID from the page
tasks_db_id = None
for block in all_blocks:
if block and block.get("type") == "child_database":
db_title = block.get("child_database", {}).get("title", "")
if "Tasks" in db_title:
tasks_db_id = block["id"]
break
if not tasks_db_id:
print("Error: Tasks database not found.", file=sys.stderr)
return False
print("\n📋 Starting verification...")
# Step 2: Query all tasks to analyze assignees
try:
all_tasks_response = notion.databases.query(
database_id=tasks_db_id,
page_size=100
)
if not all_tasks_response.get("results"):
print("Error: No tasks found in Tasks database.", file=sys.stderr)
return False
tasks = all_tasks_response["results"]
except Exception as e:
print(f"Error querying Tasks database: {e}", file=sys.stderr)
return False
# Step 3: Check specific tasks have swapped assignees
develop_plan_task = None
evaluate_services_task = None
for task in tasks:
task_name = task["properties"]["Name"]["title"][0]["text"]["content"]
if task_name == "Develop a plan for promotion":
develop_plan_task = task
elif task_name == "Evaluate different third-party services":
evaluate_services_task = task
if not develop_plan_task or not evaluate_services_task:
print("Error: Could not find both required tasks.", file=sys.stderr)
return False
# Get assignees for these tasks
develop_plan_assignees = develop_plan_task["properties"]["Assigned"]["people"]
evaluate_services_assignees = evaluate_services_task["properties"]["Assigned"]["people"]
if not develop_plan_assignees or not evaluate_services_assignees:
print("Error: Tasks don't have assignees.", file=sys.stderr)
return False
develop_plan_assignee_id = develop_plan_assignees[0]["id"]
evaluate_services_assignee_id = evaluate_services_assignees[0]["id"]
# These should be different (swapped)
if develop_plan_assignee_id == evaluate_services_assignee_id:
print("Error: Tasks should have different assignees after swap.", file=sys.stderr)
return False
# Step 4: Count tasks per person
task_counts = {}
unassigned_count = 0
for task in tasks:
assignees = task["properties"]["Assigned"]["people"]
if assignees:
assignee_id = assignees[0]["id"]
if assignee_id not in task_counts:
task_counts[assignee_id] = []
task_counts[assignee_id].append(task["properties"]["Name"]["title"][0]["text"]["content"])
else:
unassigned_count += 1
# Sort by task count
sorted_assignees = sorted(task_counts.items(), key=lambda x: len(x[1]))
if len(sorted_assignees) < 2:
print("Error: Need at least 2 people with tasks to verify swap.", file=sys.stderr)
return False
# Get person with least and most tasks
person_with_least = sorted_assignees[0]
person_with_most = sorted_assignees[-1]
least_id, least_tasks = person_with_least
most_id, most_tasks = person_with_most
# Step 5: Verify the swap pattern
# Original distribution (before swap):
# - 5ac96c02-49a4-4320-8de6-b663ba83126b had 3 tasks (least)
# - ac7a3bd0-c111-4464-8f45-8a857a1abc8a had 10 tasks (most)
# After complete swap, we expect:
# - 5ac96c02-49a4-4320-8de6-b663ba83126b should have 10 tasks
# - ac7a3bd0-c111-4464-8f45-8a857a1abc8a should have 3 tasks
original_least_id = "5ac96c02-49a4-4320-8de6-b663ba83126b"
original_most_id = "ac7a3bd0-c111-4464-8f45-8a857a1abc8a"
# Check if the swap has been completed
swap_completed = False
for assignee_id, assignee_tasks in task_counts.items():
if assignee_id == original_least_id and len(assignee_tasks) == 10:
# Person who had 3 now has 10
for other_id, other_tasks in task_counts.items():
if other_id == original_most_id and len(other_tasks) == 3:
# Person who had 10 now has 3
swap_completed = True
break
# Step 6: Summary
print(f"\n📊 Task Distribution:")
print(f" • Total tasks: {len(tasks)}")
print(f" • Assigned tasks: {len(tasks) - unassigned_count}")
print(f" • Unassigned tasks: {unassigned_count}")
print(f" • People with tasks: {len(task_counts)}")
print(f"\n Task counts by person:")
for assignee_id, assignee_tasks in sorted_assignees:
print(f" - {assignee_id[:8]}...: {len(assignee_tasks)} tasks")
# Step 7: Final verification
print("\n🔍 Verification Results:")
# Check that the swap has created a significant difference
if len(most_tasks) - len(least_tasks) < 5:
print(f"Warning: Difference between most and least is only {len(most_tasks) - len(least_tasks)} tasks", file=sys.stderr)
# Verify specific expected outcomes
verification_passed = True
# Check 1: Specific tasks have been swapped
specific_tasks_swapped = develop_plan_assignee_id != evaluate_services_assignee_id
if specific_tasks_swapped:
print(" ✓ Specific tasks have been swapped")
else:
print(" ✗ Specific tasks were not swapped", file=sys.stderr)
verification_passed = False
# Check 2: Task distribution shows a complete swap
if swap_completed:
print(" ✓ Complete task swap verified (3↔10 tasks)")
else:
# Show actual distribution for debugging
person1_tasks = len(task_counts.get(original_least_id, []))
person2_tasks = len(task_counts.get(original_most_id, []))
print(f" ✗ Swap incomplete! Expected 5ac96c02→10 tasks, ac7a3bd0→3 tasks", file=sys.stderr)
print(f" Actual: 5ac96c02→{person1_tasks} tasks, ac7a3bd0→{person2_tasks} tasks", file=sys.stderr)
verification_passed = False
# Check 3: Total task count is preserved
total_assigned_tasks = sum(len(assignee_tasks) for assignee_tasks in task_counts.values())
expected_total = len(tasks) - unassigned_count
if total_assigned_tasks == expected_total:
print(f" ✓ Total task count preserved ({total_assigned_tasks} assigned)")
else:
print(f" ✗ Task count mismatch: {total_assigned_tasks} vs {expected_total} expected", file=sys.stderr)
verification_passed = False
if verification_passed:
print("\n✅ All verification checks passed!")
return True
else:
print("\n❌ Verification failed", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/toronto_guide/simple__change_color/description.md
================================================
Open the **Toronto Guide** page and refresh the colors of the tags in the **Food** database.
## Requirements
1. Find and open the Toronto Guide page in Notion.
2. Locate the *Food* database on that page.
3. Update every tag in the Food database that is currently pink so that it uses a different color of your choice (any non-pink color is fine).
4. Do not modify callouts or tags in the other databases.
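A hedged sketch of how the recoloring could be scripted: it assumes the Food database exposes a multi-select **Tags** property and that the database update endpoint accepts re-submitting the existing options with a new `color` (keeping each option's `id` so the existing option is updated rather than duplicated). The token and database ID are placeholders.
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
FOOD_DB_ID = "<food-database-id>"       # hypothetical: the Food child database

# Read the current options on the multi-select Tags property.
db = notion.databases.retrieve(database_id=FOOD_DB_ID)
options = db["properties"]["Tags"]["multi_select"]["options"]

# Re-submit every option, recoloring only the pink ones.
updated = [
    {"id": opt["id"], "name": opt["name"],
     "color": "blue" if opt.get("color") == "pink" else opt.get("color")}
    for opt in options
]
notion.databases.update(
    database_id=FOOD_DB_ID,
    properties={"Tags": {"multi_select": {"options": updated}}},
)
```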
================================================
FILE: tasks/notion/easy/toronto_guide/simple__change_color/meta.json
================================================
{
"task_id": "simple__change_color",
"task_name": "Simple Change Color",
"category_id": "toronto_guide",
"category_name": "Toronto Guide",
"description": "Navigate to the Toronto Guide page and change all pink-colored elements to different colors.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"visual formatting",
"conditional filtering"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Toronto-Guide-25281626b6d7802caa7cc394647e901c",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/conquering-toronto-a-destination-guide"
}
}
================================================
FILE: tasks/notion/easy/toronto_guide/simple__change_color/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
TARGET_PAGE_TITLE = "Toronto Guide"
FOOD_DATABASE_KEYWORD = "Food"
TARGET_TAG_NAMES = [
"Middle Eastern",
"Jamaican",
"Indian",
]
def _get_food_database_id(notion: Client, page_id: str) -> str | None:
"""Return the block ID of the Food database shown on the target page."""
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
for block in all_blocks:
if not block or block.get("type") != "child_database":
continue
title = block.get("child_database", {}).get("title", "")
if FOOD_DATABASE_KEYWORD.lower() in title.lower():
return block.get("id")
return None
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Check that all target tags in the Food database are no longer pink."""
# Resolve the Toronto Guide page ID.
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != "page":
print("Error: Toronto Guide page not found.", file=sys.stderr)
return False
page_id = found_id
else:
page_id = notion_utils.find_page(notion, TARGET_PAGE_TITLE)
if not page_id:
print("Error: Toronto Guide page not found.", file=sys.stderr)
return False
# Locate the Food database block.
food_db_id = _get_food_database_id(notion, page_id)
if not food_db_id:
print("Error: Food database not found on the Toronto Guide page.", file=sys.stderr)
return False
# Fetch database definition and inspect tag options.
try:
db_info = notion.databases.retrieve(database_id=food_db_id)
except Exception as exc:
print(f"Error: Unable to retrieve Food database ({exc}).", file=sys.stderr)
return False
tags_property = db_info.get("properties", {}).get("Tags", {})
if tags_property.get("type") != "multi_select":
print("Error: Food database does not have a multi-select Tags property.", file=sys.stderr)
return False
options = tags_property.get("multi_select", {}).get("options", [])
remaining_targets = set(TARGET_TAG_NAMES)
failures = False
for option in options:
tag_name = option.get("name", "").strip()
if tag_name not in remaining_targets:
continue
remaining_targets.discard(tag_name)
color = option.get("color")
if color == "pink":
print(f"Error: Tag '{tag_name}' in Food database is still pink.", file=sys.stderr)
failures = True
else:
print(f"✓ Tag '{tag_name}' color updated to '{color}'.")
if remaining_targets:
print(
f"Error: Food tags not found (expected to exist): {sorted(remaining_targets)}.",
file=sys.stderr,
)
return False
if failures:
return False
print("Success: All Food database tags are now non-pink.")
return True
def main() -> None:
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/company_in_a_box/employee_onboarding/description.md
================================================
Build an integrated **Employee Onboarding** system for the existing **Company In A Box** page.
**Task Requirements:**
1. Create a new **database** titled **Employee Onboarding Checklist** with the following properties *exactly*:
• **Employee Name** – title
• **Start Date** – date
• **Department** – select (options: Product, Marketing, Sales, HR, Engineering)
Populate this database with **3** sample new-hire pages covering three different departments. Every property in each entry must be filled.
2. Under the top-level page **Company In A Box**, create a new child page titled **Onboarding Hub** containing, in order:
1) The **Employee Onboarding Checklist** database embedded at the top.
2) A section headed **Benefits Overview** that includes linked mentions (@-mentions or link-to-page blocks) to **≥ 3** distinct benefit-policy pages from the **Company Wiki** (for example *Benefits policy*, *Vacation Policy*, *Corporate travel*).
3) A section headed **30-Day Timeline** that presents a numbered list with **7** steps covering the first 30 days. **Each step must reference (via @-mention) an existing page or database**.
4) A section headed **Feedback Form** that provides **≥ 3** to-do items for new hires to check off.
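For orientation, the checklist database from requirement 1 could be created roughly as follows with `notion_client`; the token and parent page ID are placeholders, only one sample new-hire row is shown, and the Onboarding Hub page and its sections would be built with similar `pages.create` / `blocks.children.append` calls.
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")          # placeholder token
COMPANY_PAGE_ID = "<company-in-a-box-page-id>"  # hypothetical parent page

# Create the checklist database with exactly the three required properties.
db = notion.databases.create(
    parent={"type": "page_id", "page_id": COMPANY_PAGE_ID},
    title=[{"type": "text", "text": {"content": "Employee Onboarding Checklist"}}],
    properties={
        "Employee Name": {"title": {}},
        "Start Date": {"date": {}},
        "Department": {"select": {"options": [
            {"name": name} for name in ["Product", "Marketing", "Sales", "HR", "Engineering"]
        ]}},
    },
)

# Add one sample new-hire entry; the other two follow the same shape.
notion.pages.create(
    parent={"database_id": db["id"]},
    properties={
        "Employee Name": {"title": [{"text": {"content": "Avery Chen"}}]},
        "Start Date": {"date": {"start": "2025-01-06"}},
        "Department": {"select": {"name": "Engineering"}},
    },
)
```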
================================================
FILE: tasks/notion/standard/company_in_a_box/employee_onboarding/meta.json
================================================
{
"task_id": "employee_onboarding",
"task_name": "Employee Onboarding",
"category_id": "company_in_a_box",
"category_name": "Company In A Box",
"description": "Build an integrated Employee Onboarding system for the existing Company In A Box page with a checklist database, onboarding hub, and feedback form.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"template population",
"cross-reference linking",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Company-In-A-Box-23d81626b6d7800098f3d0e64a706cd8",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/company-in-a-box"
}
}
================================================
FILE: tasks/notion/standard/company_in_a_box/employee_onboarding/verify.py
================================================
import sys
from typing import Dict, Set
from notion_client import Client
from tasks.utils import notion_utils
def _check_db_schema(db_props: Dict[str, Dict], required: Dict[str, str]) -> bool:
"""Return True if every required property exists with the correct type."""
for prop_name, expected_type in required.items():
if prop_name not in db_props:
print(
f"Error: Property '{prop_name}' missing from database.", file=sys.stderr
)
return False
actual_type = db_props[prop_name]["type"]
if actual_type != expected_type:
print(
f"Error: Property '{prop_name}' has type '{actual_type}', expected '{expected_type}'.",
file=sys.stderr,
)
return False
return True
def verify(notion: Client, main_id: str | None = None) -> bool: # noqa: C901
"""Programmatically verify the onboarding system described in description.md."""
DB_TITLE = "Employee Onboarding Checklist"
HUB_PAGE_TITLE = "Onboarding Hub"
DEPARTMENT_OPTIONS: Set[str] = {
"Product",
"Marketing",
"Sales",
"HR",
"Engineering",
}
REQUIRED_DB_PROPERTIES = {
"Employee Name": "title",
"Start Date": "date",
"Department": "select",
}
# 1. Locate onboarding database
db_id = notion_utils.find_database(notion, DB_TITLE)
if not db_id:
print(f"Error: Database '{DB_TITLE}' not found.", file=sys.stderr)
return False
try:
db_obj = notion.databases.retrieve(database_id=db_id)
except Exception as exc:
print(f"Error retrieving database: {exc}", file=sys.stderr)
return False
db_props = db_obj.get("properties", {})
if not _check_db_schema(db_props, REQUIRED_DB_PROPERTIES):
return False
# Extra: validate select options
dept_options = {opt["name"] for opt in db_props["Department"]["select"]["options"]}
if not DEPARTMENT_OPTIONS.issubset(dept_options):
print(
f"Error: Department select options must include {sorted(DEPARTMENT_OPTIONS)}. Current: {sorted(dept_options)}",
file=sys.stderr,
)
return False
# Check there are at least 3 entries in the database
try:
db_pages = notion.databases.query(database_id=db_id).get("results", [])
except Exception as exc:
print(f"Error querying database: {exc}", file=sys.stderr)
return False
if len(db_pages) < 3:
print(
"Error: Less than 3 onboarding entries found in the database.",
file=sys.stderr,
)
return False
# 2. Locate Onboarding Hub page
hub_page_id = notion_utils.find_page(notion, HUB_PAGE_TITLE)
if not hub_page_id:
print(f"Error: Page '{HUB_PAGE_TITLE}' not found.", file=sys.stderr)
return False
# 3. Ensure the onboarding database is embedded in the hub page
embedded_db_id = notion_utils.find_database_in_block(notion, hub_page_id, DB_TITLE)
if embedded_db_id != db_id:
print(
"Error: The Employee Onboarding Checklist database is not embedded in the Onboarding Hub page.",
file=sys.stderr,
)
return False
# 4. Analyse blocks within the hub page for linked mentions, timeline, and feedback form
all_blocks = notion_utils.get_all_blocks_recursively(notion, hub_page_id)
seen_link_targets: Set[str] = set()
numbered_list_count = 0
todo_count = 0
for blk in all_blocks:
blk_type = blk.get("type")
# Direct link-to-page blocks
if blk_type == "link_to_page":
info = blk.get("link_to_page", {})
target_id = info.get("page_id") or info.get("database_id")
if target_id:
seen_link_targets.add(target_id)
continue
# Rich-text mentions inside content blocks
if blk_type in {
"paragraph",
"numbered_list_item",
"bulleted_list_item",
"to_do",
}:
content = blk.get(blk_type, {})
for rt in content.get("rich_text", []):
if rt.get("type") == "mention":
mention = rt.get("mention", {})
if mention.get("type") in {"page", "database"}:
target_id = mention.get("page", {}).get("id") or mention.get(
"database", {}
).get("id")
if target_id:
seen_link_targets.add(target_id)
# Count numbered list items
if blk_type == "numbered_list_item":
numbered_list_count += 1
# Count to-do items in Feedback Form
if blk_type == "to_do":
todo_count += 1
if len(seen_link_targets) < 3:
print(
"Error: Fewer than 3 linked mentions to benefit policy pages found in the Benefits Overview section.",
file=sys.stderr,
)
return False
if numbered_list_count < 7:
print(
"Error: Numbered list contains fewer than 7 steps in the 30-Day Timeline section.",
file=sys.stderr,
)
return False
if todo_count < 3:
print(
"Error: Feedback Form section contains fewer than 3 to-do items.",
file=sys.stderr,
)
return False
print(
"Success: Verified Employee Onboarding Checklist database, Onboarding Hub page, and all required sections."
)
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/company_in_a_box/goals_restructure/description.md
================================================
Please restructure the **Current Goals** section on my **Company In A Box** page as follows:
1. **Add a new goal heading** — create a new `heading_3` block titled:
`🔄 Digital Transformation Initiative`
2. **Convert all four goal headings to toggles** — the three existing goals plus the new heading from step 1:
* ⚙️ Expand Operations to LATAM
* 🛠️ Push for Enterprise
* 🩶 Boost Employee Engagement
* 🔄 Digital Transformation Initiative
3. **Move descriptions inside the toggles** — every paragraph or list that originally sat directly under a goal heading should become a **child block** of that heading after it is made toggleable.
4. **Preserve content & order** — apart from the changes above, do **not** modify the text, formatting, or order of existing goal descriptions.
The end result should be a clean **Current Goals** section containing four toggleable goal headings, each with its corresponding details tucked inside.
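A cautious sketch of one way to do this programmatically: it is not certain that `is_toggleable` can be flipped on an existing heading through the public update endpoint, so this helper recreates a goal heading as a toggleable `heading_3` with its description blocks nested inside and then archives the originals. The token is a placeholder, the recreated heading lands at the end of its parent unless the append endpoint's `after` option is used, and the shallow copies ignore nested children and may need read-only fields stripped.
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token

def make_goal_toggleable(parent_id: str, heading_block: dict,
                         description_blocks: list[dict]) -> None:
    """Recreate a goal heading as a toggleable heading_3 holding its description."""
    # Rebuild the heading text from plain text (drops inline formatting, which the
    # emoji-prefixed goal headings here do not use).
    title = "".join(rt.get("plain_text", "")
                    for rt in heading_block["heading_3"]["rich_text"])
    children = [
        # Shallow copies; read-only fields inside the payloads may need stripping.
        {"type": b["type"], b["type"]: b[b["type"]]}
        for b in description_blocks
    ]
    notion.blocks.children.append(
        block_id=parent_id,
        children=[{
            "type": "heading_3",
            "heading_3": {
                "rich_text": [{"type": "text", "text": {"content": title}}],
                "is_toggleable": True,
                "children": children,
            },
        }],
    )
    # Archive the originals once the toggleable copy exists.
    for old in [heading_block, *description_blocks]:
        notion.blocks.delete(block_id=old["id"])
```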
================================================
FILE: tasks/notion/standard/company_in_a_box/goals_restructure/meta.json
================================================
{
"task_id": "goals_restructure",
"task_name": "Goals Restructure",
"category_id": "company_in_a_box",
"category_name": "Company In A Box",
"description": "Restructure the Current Goals section on the Company In A Box page by adding a new goal heading and converting all goal headings to toggles with content inside.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"content organization",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Company-In-A-Box-23d81626b6d7800098f3d0e64a706cd8",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/company-in-a-box"
}
}
================================================
FILE: tasks/notion/standard/company_in_a_box/goals_restructure/verify.py
================================================
import sys
from typing import List
from notion_client import Client
from tasks.utils import notion_utils
# Expected new goal heading text (including emoji)
NEW_GOAL_HEADING = "🔄 Digital Transformation Initiative"
# Section title to look for
GOALS_SECTION_TITLE = "Current Goals"
def _plain(block) -> str:
"""Return concatenated plain text of a block."""
return notion_utils.get_block_plain_text(block)
# Some Notion rich-text strings may include non-breaking spaces (\xa0) after emoji.
# Normalize them to plain spaces so text matching is robust.
def _normalize_string(s: str) -> str:
return s.replace("\xa0", " ")
def _is_heading(block) -> bool:
return block.get("type") in ["heading_1", "heading_2", "heading_3"]
def _is_toggle(block) -> bool:
"""Determine whether a block is a toggle (standard toggle block or toggle-able heading)."""
btype = block.get("type")
# In our scenario, goal blocks are headings (usually heading_3) marked as toggleable.
if btype in ["heading_1", "heading_2", "heading_3"]:
heading_data = block.get(btype, {})
return heading_data.get("is_toggleable", False)
# Some Notion pages may contain classic toggle blocks (type == "toggle"). They are
# not expected in this task, but keeping this check allows broader compatibility.
return btype == "toggle"
def _get_children(notion: Client, block_id: str) -> List[dict]:
"""Retrieve **direct** children of a block (no pagination handling needed for small test pages)."""
try:
return notion.blocks.children.list(block_id=block_id).get("results", [])
except Exception:
return []
def verify(notion: Client, main_id: str = None) -> bool:
"""Verifies that the Company in a Box page has been updated per the task requirements."""
# 1. Locate the main page
page_id = None
if main_id:
found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and obj_type == "page":
page_id = found_id
if not page_id:
# Try a few case variations just in case
for title in [
"Company In A Box",
]:
page_id = notion_utils.find_page(notion, title)
if page_id:
break
if not page_id:
print("Error: Could not find the 'Company in a Box' page.", file=sys.stderr)
return False
# 2. Recursively locate the "Current Goals" heading and collect its sibling blocks that
# constitute the section.
def _fetch_children(bid: str) -> List[dict]:
try:
return notion.blocks.children.list(block_id=bid).get("results", [])
except Exception:
return []
goals_section_blocks: List[dict] = []
# Breadth-first traversal to find the heading
queue = [page_id]
found_parent = None
found_index = None
while queue and found_parent is None:
parent_id = queue.pop(0)
children = _fetch_children(parent_id)
for idx, child in enumerate(children):
if (
_is_heading(child)
and GOALS_SECTION_TITLE.lower()
in _normalize_string(_plain(child)).lower()
):
found_parent = parent_id
found_index = idx
break
# enqueue grandchildren for further search
for ch in children:
if ch.get("has_children"):
queue.append(ch["id"])
if found_parent is None:
print(
"Error: Could not find the 'Current Goals' heading anywhere in the page.",
file=sys.stderr,
)
return False
# Retrieve siblings once more to get the final list and slice after heading.
siblings = _fetch_children(found_parent)
if found_index is None or found_index >= len(siblings):
print(
"Error: Internal logic issue when locating Current Goals section.",
file=sys.stderr,
)
return False
goals_section_blocks = siblings[found_index + 1 :]
if not goals_section_blocks:
print("Error: 'Current Goals' section appears to be empty.", file=sys.stderr)
return False
# 3. Identify toggle blocks that represent goals
toggle_blocks = [b for b in goals_section_blocks if _is_toggle(b)]
if len(toggle_blocks) != 4:
print(
f"Error: Expected 4 toggle blocks for goals, found {len(toggle_blocks)}.",
file=sys.stderr,
)
return False
# 4. Ensure the new goal heading exists among the toggles
found_new_goal = False
for tb in toggle_blocks:
if (
_normalize_string(NEW_GOAL_HEADING).lower()
in _normalize_string(_plain(tb)).lower()
):
found_new_goal = True
break
if not found_new_goal:
print(
f"Error: Did not find a toggle block with heading '{NEW_GOAL_HEADING}'.",
file=sys.stderr,
)
return False
# 5. Validate that each toggle has at least one child paragraph/description
for tb in toggle_blocks:
if (
_normalize_string(NEW_GOAL_HEADING).lower()
in _normalize_string(_plain(tb)).lower()
):
# Skip checking the new goal itself, as it does not have a description yet.
continue
if not tb.get("has_children", False):
print(
f"Error: Toggle '{_normalize_string(_plain(tb))}' has no child blocks (description not moved).",
file=sys.stderr,
)
return False
children = _get_children(notion, tb["id"])
# Ensure there is at least one content child (paragraph, list item, etc.)
content_types = {
"paragraph",
"bulleted_list_item",
"numbered_list_item",
"to_do",
"callout",
"quote",
}
if not any(c.get("type") in content_types for c in children):
print(
f"Error: Toggle '{_normalize_string(_plain(tb))}' seems to lack any description/content inside it.",
file=sys.stderr,
)
return False
# 6. Confirm that there are **no** residual heading_3 blocks (non-toggle) for the goals
non_toggle_headings = [
b
for b in goals_section_blocks
if b.get("type") == "heading_3" and not _is_toggle(b)
]
if non_toggle_headings:
titles = [_normalize_string(_plain(b)) for b in non_toggle_headings]
print(
f"Error: Found heading_3 blocks that were not converted to toggles: {titles}.",
file=sys.stderr,
)
return False
print(
"Success: Verified goal restructuring with new toggle blocks and descriptions."
)
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/description.md
================================================
Create a quarterly business review dashboard in Notion based on the existing **Company In A Box** workspace.
**Task Requirements:**
1. Inside the **Company Wiki** page you will find a sub-page named **Company Goals**. Extract every departmental objective listed under the four departments — **Product**, **Marketing**, **Sales**, and **HR**.
2. Under the top-level page **Company In A Box**, create a new child page titled **Q4 2024 Business Review Dashboard**.
3. Inside that new page build the following structure (all parts must exist):
1. A single **callout** block near the top that summarises progress toward the three *Current Goals* shown on the main page:
• *LATAM expansion* • *Enterprise push* • *Employee engagement*
(All three phrases must appear in the callout text.)
2. Four separate **section headings** (any heading level) – one for each department (**Product**, **Marketing**, **Sales**, **Human Resources**) – placed below the callout. Under each heading list that department’s objectives in a progress-tracking format (e.g. to-dos, check-box list). Each objective from the **Company Goals** page must appear at least once.
3. Add a **database** named **Action Items** with the following properties *exactly*:
• **Task Name** – title
• **Department** – select (options: Product, Marketing, Sales, HR)
• **Priority** – select (options: High, Medium, Low)
• **Status** – status
Populate this database with **≥ 5** action-item pages derived from the departmental objectives, making sure every field in each entry is filled:
• **Task Name** & **Department** must correctly correspond to the underlying objective/department.
• **Priority** and **Status** can be any allowed value, but they must **not** be left empty.
4. Keep the overall visual style consistent with the existing wiki (use headings, dividers, etc.).
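As a starting point, the sketch below shows how the dashboard page, the summary callout, and the Action Items schema could be created with `notion_client`. It is illustrative only: the token and parent page ID are placeholders, the callout text is sample wording rather than the real goal summary, extracting objectives from Company Goals and building the per-department to-do sections would follow the same block-append pattern, and the required **Status** property is omitted because the public API may not allow creating status properties (add it in the UI if the request rejects it).
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")          # placeholder token
COMPANY_PAGE_ID = "<company-in-a-box-page-id>"  # hypothetical parent page

# Create the dashboard page with the progress callout as its first block.
dashboard = notion.pages.create(
    parent={"page_id": COMPANY_PAGE_ID},
    properties={"title": {"title": [{"type": "text",
        "text": {"content": "Q4 2024 Business Review Dashboard"}}]}},
    children=[{
        "type": "callout",
        "callout": {
            "icon": {"type": "emoji", "emoji": "📊"},
            "rich_text": [{"type": "text", "text": {"content":
                "Progress summary: LATAM expansion on track, Enterprise push underway, "
                "Employee engagement improving."}}],
        },
    }],
)

# Create the Action Items database under the new page (Status property omitted, see above).
notion.databases.create(
    parent={"type": "page_id", "page_id": dashboard["id"]},
    title=[{"type": "text", "text": {"content": "Action Items"}}],
    properties={
        "Task Name": {"title": {}},
        "Department": {"select": {"options": [
            {"name": d} for d in ["Product", "Marketing", "Sales", "HR"]]}},
        "Priority": {"select": {"options": [
            {"name": p} for p in ["High", "Medium", "Low"]]}},
    },
)
```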
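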
================================================
FILE: tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/meta.json
================================================
{
"task_id": "quarterly_review_dashboard",
"task_name": "Quarterly Review Dashboard",
"category_id": "company_in_a_box",
"category_name": "Company In A Box",
"description": "Create a quarterly business review dashboard in Notion based on the existing Company In A Box workspace with department objectives and action items database.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"data aggregation",
"report generation",
"status tracking",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Company-In-A-Box-23d81626b6d7800098f3d0e64a706cd8",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/company-in-a-box"
}
}
================================================
FILE: tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/verify.py
================================================
import sys
from typing import List
from notion_client import Client
from tasks.utils import notion_utils
def _contains_keywords(text: str, keywords: List[str]) -> bool:
lowered = text.lower()
return all(kw.lower() in lowered for kw in keywords)
def verify(notion: Client, main_id: str = None) -> bool:
"""Programmatically verify that the dashboard page and its contents meet the
requirements described in description.md.
"""
DASHBOARD_TITLE = "Q4 2024 Business Review Dashboard"
PARENT_PAGE_TITLE = "Company In A Box"
CALL_OUT_KEYWORDS = ["latam", "enterprise", "employee engagement"]
DEPARTMENTS = ["Product", "Marketing", "Sales", "Human Resources"]
REQUIRED_DB_PROPERTIES = {
"Task Name": "title",
"Department": "select",
"Priority": "select",
"Status": "status",
}
PRIORITY_OPTIONS = {"High", "Medium", "Low"}
# 1. Locate the dashboard page
page_id = None
if main_id:
found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and obj_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, DASHBOARD_TITLE)
if not page_id:
print(f"Error: Page '{DASHBOARD_TITLE}' not found.", file=sys.stderr)
return False
# Optional: ensure it is a child of Company In A Box
try:
page_obj = notion.pages.retrieve(page_id=page_id)
parent_id = page_obj.get("parent", {}).get("page_id")
if parent_id:
parent_page = notion.pages.retrieve(page_id=parent_id)
parent_title_rt = (
parent_page.get("properties", {}).get("title", {}).get("title", [])
)
parent_title = (
parent_title_rt[0].get("plain_text") if parent_title_rt else None
)
if parent_title != PARENT_PAGE_TITLE:
print(
f"Error: Dashboard page is not a direct child of '{PARENT_PAGE_TITLE}'.",
file=sys.stderr,
)
return False
except Exception:
pass # parent check is best-effort only
# 2. Verify callout with keywords
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
callout_ok = False
for block in all_blocks:
if block.get("type") == "callout":
callout_text = notion_utils.get_block_plain_text(block)
if _contains_keywords(callout_text, CALL_OUT_KEYWORDS):
callout_ok = True
break
if not callout_ok:
print(
"Error: No callout found that includes all three Current Goal keywords (LATAM, Enterprise, Employee engagement).",
file=sys.stderr,
)
return False
# 3. Verify department section headings
found_depts = set()
for block in all_blocks:
if block.get("type") in {"heading_1", "heading_2", "heading_3"}:
heading_text = notion_utils.get_block_plain_text(block)
for dept in DEPARTMENTS:
if dept.lower() in heading_text.lower():
found_depts.add(dept)
if set(DEPARTMENTS) != found_depts:
missing = set(DEPARTMENTS) - found_depts
print(
f"Error: Missing department headings: {', '.join(missing)}.",
file=sys.stderr,
)
return False
# 4. Verify Action Items database exists and has correct schema
db_id = notion_utils.find_database_in_block(notion, page_id, "Action Items")
if not db_id:
print(
"Error: Database 'Action Items' not found on the dashboard.",
file=sys.stderr,
)
return False
try:
db = notion.databases.retrieve(database_id=db_id)
except Exception as exc:
print(f"Error: Unable to retrieve database: {exc}", file=sys.stderr)
return False
db_props = db.get("properties", {})
for prop_name, expected_type in REQUIRED_DB_PROPERTIES.items():
if prop_name not in db_props:
print(
f"Error: Property '{prop_name}' missing from database.", file=sys.stderr
)
return False
actual_type = db_props[prop_name]["type"]
if isinstance(expected_type, list):
if actual_type not in expected_type:
print(
f"Error: Property '{prop_name}' has type '{actual_type}', expected one of {expected_type}.",
file=sys.stderr,
)
return False
else:
if actual_type != expected_type:
print(
f"Error: Property '{prop_name}' has type '{actual_type}', expected '{expected_type}'.",
file=sys.stderr,
)
return False
# Extra check for Priority options
if prop_name == "Priority":
options = {opt["name"] for opt in db_props[prop_name]["select"]["options"]}
if not PRIORITY_OPTIONS.issubset(options):
print(
f"Error: Priority property options must include High/Medium/Low. Current options: {options}",
file=sys.stderr,
)
return False
# 5. Verify at least 5 action items exist
try:
pages = notion.databases.query(database_id=db_id).get("results", [])
except Exception as exc:
print(f"Error querying database pages: {exc}", file=sys.stderr)
return False
if len(pages) < 5:
print("Error: Database contains fewer than 5 action items.", file=sys.stderr)
return False
# Optional: Verify Department values valid
for page in pages:
props = page.get("properties", {})
# Task Name must be non-empty
title_rt = props.get("Task Name", {}).get("title", [])
task_name = title_rt[0].get("plain_text") if title_rt else ""
if not task_name.strip():
print(
f"Error: Action item '{page.get('id')}' is missing a Task Name.",
file=sys.stderr,
)
return False
# Department must be valid
dept_select = (props.get("Department", {}).get("select") or {}).get("name")
if not dept_select or dept_select not in DEPARTMENTS:
print(
f"Error: Action item '{page.get('id')}' has invalid or missing Department value.",
file=sys.stderr,
)
return False
# Priority and Status must be set (any value)
priority_val = (props.get("Priority", {}).get("select") or {}).get("name")
status_val = (props.get("Status", {}).get("status") or {}).get("name")
if not priority_val or not status_val:
print(
f"Error: Action item '{page.get('id')}' must have both Priority and Status set.",
file=sys.stderr,
)
return False
print(
"Success: Verified Business Review Dashboard, departmental sections, callout, and Action Items database with ≥5 entries."
)
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/code_snippets_go/description.md
================================================
Find the page named "Computer Science Student Dashboard" and add a new Go column to the "Code Snippets" section.
**Task Requirements:**
1. In the "Code Snippets" section, create (or locate) a column dedicated to the Go programming language. **This column must appear between the existing Python and JavaScript columns** within the same column list.
2. At the top of the Go column, add a bold paragraph that contains exactly the text `Go`.
3. Under the header paragraph, add three code-block blocks configured with `language` set to **go**:
a. **Basic Go program** – Caption must be `Basic Go program` and the code content must be exactly:
```go
package main
import "fmt"
func main() {
fmt.Println("Hello, World!")
}
```
b. **For loop in Go** – Caption must be `For loop in Go` and the code content must be exactly:
```go
for i := 0; i < 5; i++ {
fmt.Println(i)
}
```
c. **Function definition in Go** – Caption must be `Function definition in Go` and the code content must be exactly:
```go
func add(a, b int) int {
return a + b
}
```
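A minimal sketch of how the Go header and the first required code block could be appended with `notion_client`: the column's block ID is a placeholder, the other two code blocks follow the same shape, and the code string must reproduce the description above exactly, indentation included.
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
GO_COLUMN_ID = "<go-column-block-id>"   # hypothetical: the newly inserted Go column

# The code content must match the task description exactly, including indentation.
GO_HELLO = 'package main\n\nimport "fmt"\n\nfunc main() {\n    fmt.Println("Hello, World!")\n}'

notion.blocks.children.append(
    block_id=GO_COLUMN_ID,
    children=[
        # Bold "Go" header paragraph at the top of the column.
        {"type": "paragraph",
         "paragraph": {"rich_text": [{"type": "text", "text": {"content": "Go"},
                                      "annotations": {"bold": True}}]}},
        # First required code block; the other two differ only in caption and content.
        {"type": "code",
         "code": {"language": "go",
                  "rich_text": [{"type": "text", "text": {"content": GO_HELLO}}],
                  "caption": [{"type": "text", "text": {"content": "Basic Go program"}}]}},
    ],
)
```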
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/code_snippets_go/meta.json
================================================
{
"task_id": "code_snippets_go",
"task_name": "Code Snippets Go",
"category_id": "computer_science_student_dashboard",
"category_name": "Computer Science Student Dashboard",
"description": "Add a new Go column to the Code Snippets section between Python and JavaScript columns.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"content organization",
"visual formatting",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard"
}
}
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/code_snippets_go/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
# Expected code blocks (language=go)
EXPECTED_CODE_BLOCKS = [
{
"caption": "Basic Go program",
"code": (
'package main\n\nimport "fmt"\n\nfunc main() {\n fmt.Println("Hello, World!")\n}'
),
},
{
"caption": "For loop in Go",
"code": ("for i := 0; i < 5; i++ {\n fmt.Println(i)\n}"),
},
{
"caption": "Function definition in Go",
"code": ("func add(a, b int) int {\n return a + b\n}"),
},
]
HEADER_TEXT = "Go"
def _normalize(text: str) -> str:
"""Remove trailing spaces on each line and strip leading/trailing blank lines."""
return "\n".join(line.rstrip() for line in text.strip().splitlines())
def _find_page(notion: Client, main_id: str | None) -> str | None:
"""Return a page_id to verify against or None if not found."""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Computer Science Student Dashboard")
return page_id
def _has_bold_header_text(block, text: str) -> bool:
"""Generic bold header/paragraph check for a given text."""
block_type = block.get("type")
if block_type not in {"paragraph", "heading_1", "heading_2", "heading_3"}:
return False
rich_text_list = block.get(block_type, {}).get("rich_text", [])
if not rich_text_list:
return False
plain = "".join(rt.get("plain_text", "") for rt in rich_text_list).strip()
if plain != text:
return False
return any(rt.get("annotations", {}).get("bold", False) for rt in rich_text_list)
def _go_column_order_correct(notion: Client, page_id: str) -> bool:
"""Return True if there exists a column list where Python → Go → JavaScript order holds."""
# Gather all blocks once (flat list) to locate column_list blocks
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
column_list_ids = [
blk["id"] for blk in all_blocks if blk.get("type") == "column_list"
]
for cl_id in column_list_ids:
# Retrieve columns in explicit order
columns = notion.blocks.children.list(block_id=cl_id).get("results", [])
header_to_idx: dict[str, int] = {}
for idx, col in enumerate(columns):
# Recursively inspect blocks within this column
col_blocks = notion_utils.get_all_blocks_recursively(notion, col["id"])
for blk in col_blocks:
if _has_bold_header_text(blk, "Python"):
header_to_idx.setdefault("Python", idx)
elif _has_bold_header_text(blk, "Go"):
header_to_idx.setdefault("Go", idx)
elif _has_bold_header_text(blk, "JavaScript"):
header_to_idx.setdefault("JavaScript", idx)
# Short-circuit if all three found within current traversal
if len(header_to_idx) == 3:
break
if (
"Python" in header_to_idx
and "Go" in header_to_idx
and "JavaScript" in header_to_idx
and header_to_idx["Python"]
< header_to_idx["Go"]
< header_to_idx["JavaScript"]
):
return True
return False
def _collect_code_blocks(blocks):
"""Return list of (code_content, caption) tuples for code blocks with language 'go'."""
collected = []
for block in blocks:
if block.get("type") != "code":
continue
code_data = block.get("code", {})
if code_data.get("language") != "go":
continue
code_plain = "".join(
rt.get("plain_text", "") for rt in code_data.get("rich_text", [])
)
caption_plain = "".join(
rt.get("plain_text", "") for rt in code_data.get("caption", [])
)
collected.append((code_plain, caption_plain))
return collected
def verify(notion: Client, main_id: str | None = None) -> bool:
page_id = _find_page(notion, main_id)
if not page_id:
print("Error: Target page not found.", file=sys.stderr)
return False
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Verify header
header_ok = any(_has_bold_header_text(b, HEADER_TEXT) for b in all_blocks)
if not header_ok:
print("Failure: Bold header 'Go' not found.", file=sys.stderr)
return False
# Verify code blocks
code_blocks_found = _collect_code_blocks(all_blocks)
remaining = EXPECTED_CODE_BLOCKS.copy()
for code, caption in code_blocks_found:
norm_code = _normalize(code)
for expected in remaining:
if (
_normalize(expected["code"]) == norm_code
and expected["caption"] == caption
):
remaining.remove(expected)
break
if remaining:
missing = ", ".join(exp["caption"] for exp in remaining)
print(
f"Failure: Missing or incorrect Go code blocks: {missing}", file=sys.stderr
)
return False
# Verify column order Python → Go → JavaScript
if not _go_column_order_correct(notion, page_id):
print(
"Failure: Go column is not positioned between Python and JavaScript.",
file=sys.stderr,
)
return False
print(
"Success: Verified Go column with required code blocks and correct positioning."
)
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
sys.exit(0 if verify(notion, main_id) else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/courses_internships_relation/description.md
================================================
Your goal is to connect the `Courses` and `Internship search` databases inside the **Computer Science Student Dashboard** page and populate them with sample data that can be verified automatically.
**Task Requirements:**
1. In the **Courses** database, add a new **relation** property named **Related Internships** that points to the **Internship search** database.
2. Ensure the relation is **bidirectional** by adding a relation property in the **Internship search** database named **Relevant Courses** that points back to the **Courses** database.
3. Create **exactly three** new pages in the **Courses** database with realistic computer-science course data. Each course page must include **all** of the following properties and values:
• **Code** (text) – unique codes `CS301`, `CS302`, and `CS303` respectively
• **Name** (text) – pick appropriate names (e.g., *Computer Networks*, *Operating Systems*, *Machine Learning*)
• **Credit** (number) – any positive integer
• **Status** (status) – choose from `Planned`, `In Progress`, or `Completed`
• **Related Internships** (relation) – link to at least one internship created in step 4.
4. Create **exactly two** new pages in the **Internship search** database with complete application information. Each internship page must include **all** of the following properties and values:
• **Company** (text) – `OpenAI` and `Google` respectively
• **Role** (text) – `Machine Learning Intern` and `Software Engineering Intern`
• **Status** (status) – set to `Interested`
• **Relevant Courses** (relation) – link to one or more of the courses created in step 3.
5. Every course created in step 3 must be linked to at least one internship from step 4 **and** every internship must be linked back to at least one course.
The task is considered complete when the relation properties exist, the specified course and internship pages are present with the exact values above, and the relations correctly connect the two databases in both directions.
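For reference, a minimal sketch of the API calls involved, using the `notion-client` Python SDK. The database and page IDs are hypothetical placeholders you would resolve first, and the exact `dual_property` payload for a two-way relation should be confirmed against the current Notion API reference:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")                 # hypothetical token placeholder
courses_db_id = "<COURSES_DB_ID>"                      # hypothetical
internships_db_id = "<INTERNSHIP_SEARCH_DB_ID>"        # hypothetical
internship_page_id = "<OPENAI_INTERNSHIP_PAGE_ID>"     # hypothetical

# Steps 1-2: add a two-way (dual property) relation on Courses; Notion creates the
# synced property on the Internship search database, which can then be renamed to
# "Relevant Courses" if it is not created with that name.
notion.databases.update(
    database_id=courses_db_id,
    properties={
        "Related Internships": {
            "relation": {"database_id": internships_db_id, "dual_property": {}}
        }
    },
)

# Step 3: one of the three course pages, already linked to an internship page.
notion.pages.create(
    parent={"database_id": courses_db_id},
    properties={
        "Name": {"title": [{"text": {"content": "Computer Networks"}}]},
        "Code": {"rich_text": [{"text": {"content": "CS301"}}]},
        "Credit": {"number": 3},
        "Status": {"status": {"name": "Planned"}},
        "Related Internships": {"relation": [{"id": internship_page_id}]},
    },
)
```

Because the relation is dual, linking a course to an internship also populates the reverse relation on the internship page, so step 5 is satisfied once every page has at least one link.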
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/courses_internships_relation/meta.json
================================================
{
"task_id": "courses_internships_relation",
"task_name": "Courses Internships Relation",
"category_id": "computer_science_student_dashboard",
"category_name": "Computer Science Student Dashboard",
"description": "Connect the Courses and Internship search databases with bidirectional relations and populate with sample data.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"cross-reference linking",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard"
}
}
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/courses_internships_relation/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
# ---------------------------------------------------------------------------
# Constants -----------------------------------------------------------------
# ---------------------------------------------------------------------------
MAIN_PAGE_TITLE = "Computer Science Student Dashboard"
COURSES_DB_TITLE = "Courses"
INTERNSHIP_DB_TITLE = "Internship search"
COURSE_CODES = {"CS301", "CS302", "CS303"}
COURSE_RELATION_NAME = "Related Internships"
INTERNSHIP_RELATION_NAME = "Relevant Courses"
INTERNSHIP_COMPANIES = {"OpenAI", "Google"}
# ---------------------------------------------------------------------------
# Helper functions -----------------------------------------------------------
# ---------------------------------------------------------------------------
def _locate_main_page(notion: Client, main_id: str | None) -> str | None:
"""Return the page_id of the dashboard page or None if not found."""
page_id = None
if main_id:
found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and obj_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, MAIN_PAGE_TITLE)
return page_id
def _locate_database(notion: Client, parent_page_id: str, db_title: str) -> str | None:
"""Recursively search for a child database by title and return its id."""
return notion_utils.find_database_in_block(notion, parent_page_id, db_title)
# ---------------------------------------------------------------------------
# Verification logic ---------------------------------------------------------
# ---------------------------------------------------------------------------
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verify completion of the Courses ↔ Internship relation task."""
# ------------------------------------------------------------------
# Locate main page and databases -----------------------------------
# ------------------------------------------------------------------
page_id = _locate_main_page(notion, main_id)
if not page_id:
print(f"Error: Page '{MAIN_PAGE_TITLE}' not found.", file=sys.stderr)
return False
courses_db_id = _locate_database(notion, page_id, COURSES_DB_TITLE)
internships_db_id = _locate_database(notion, page_id, INTERNSHIP_DB_TITLE)
if not courses_db_id:
print(f"Error: Database '{COURSES_DB_TITLE}' not found.", file=sys.stderr)
return False
if not internships_db_id:
print(f"Error: Database '{INTERNSHIP_DB_TITLE}' not found.", file=sys.stderr)
return False
# ------------------------------------------------------------------
# Validate relation properties -------------------------------------
# ------------------------------------------------------------------
courses_db_obj = notion.databases.retrieve(database_id=courses_db_id)
internships_db_obj = notion.databases.retrieve(database_id=internships_db_id)
courses_props = courses_db_obj.get("properties", {})
internships_props = internships_db_obj.get("properties", {})
# Courses → Internships relation
if COURSE_RELATION_NAME not in courses_props:
print(
f"Error: Property '{COURSE_RELATION_NAME}' missing in Courses database.",
file=sys.stderr,
)
return False
course_rel_prop = courses_props[COURSE_RELATION_NAME]
if (
course_rel_prop.get("type") != "relation"
or course_rel_prop["relation"].get("database_id") != internships_db_id
):
print(
"Error: Courses relation property is not configured correctly.",
file=sys.stderr,
)
return False
# Internships → Courses relation
if INTERNSHIP_RELATION_NAME not in internships_props:
print(
f"Error: Property '{INTERNSHIP_RELATION_NAME}' missing in Internship search database.",
file=sys.stderr,
)
return False
intern_rel_prop = internships_props[INTERNSHIP_RELATION_NAME]
if (
intern_rel_prop.get("type") != "relation"
or intern_rel_prop["relation"].get("database_id") != courses_db_id
):
print(
"Error: Internship relation property is not configured correctly.",
file=sys.stderr,
)
return False
# ------------------------------------------------------------------
# Validate course pages --------------------------------------------
# ------------------------------------------------------------------
course_pages = notion.databases.query(database_id=courses_db_id).get("results", [])
valid_course_count = 0
course_page_id_set = set()
internship_ids_seen: set[str] = set()
for page in course_pages:
props = page.get("properties", {})
code_rts = props.get("Code", {}).get("rich_text", [])
code_val = "".join(rt.get("plain_text", "") for rt in code_rts).strip()
if code_val not in COURSE_CODES:
continue # not one of the new course entries we care about
# Check required scalar props
title_rts = props.get("Name", {}).get("title", [])
name_ok = bool("".join(rt.get("plain_text", "") for rt in title_rts).strip())
credits_ok = props.get("Credit", {}).get("number") is not None
status_name = props.get("Status", {}).get("status", {}).get("name", "")
status_allowed = {"planned", "in progress", "completed"}
status_ok = status_name.lower() in status_allowed
# Relation must point to at least one internship
relations = props.get(COURSE_RELATION_NAME, {}).get("relation", [])
if not (name_ok and credits_ok and status_ok and relations):
print(
f"Error: Course '{code_val}' is missing required property values or relations, or wrong values.",
file=sys.stderr,
)
return False
# Collect IDs for further mutual check
course_page_id_set.add(page["id"])
internship_ids_seen.update(rel["id"] for rel in relations)
valid_course_count += 1
if valid_course_count != 3:
print(
f"Error: Expected exactly 3 new course pages with codes {COURSE_CODES}, found {valid_course_count}.",
file=sys.stderr,
)
return False
# ------------------------------------------------------------------
# Validate internship pages ----------------------------------------
# ------------------------------------------------------------------
internship_pages = notion.databases.query(database_id=internships_db_id).get(
"results", []
)
valid_intern_count = 0
internship_page_ids = set()
course_ids_seen_from_intern: set[str] = set()
for page in internship_pages:
props = page.get("properties", {})
company_rts = props.get("Company", {}).get("rich_text", [])
company = "".join(rt.get("plain_text", "") for rt in company_rts).strip()
if company not in INTERNSHIP_COMPANIES:
continue # not one of the two new internships
role_rts = props.get("Role", {}).get("title", [])
role_ok = bool("".join(rt.get("plain_text", "") for rt in role_rts).strip())
status_name = props.get("Status", {}).get("status", {}).get("name", "")
status_ok = status_name.lower() == "interested"
relations = props.get(INTERNSHIP_RELATION_NAME, {}).get("relation", [])
if not (role_ok and status_ok and relations):
print(
f"Error: Internship at '{company}' is missing required property values or relations, or wrong values.",
file=sys.stderr,
)
return False
internship_page_ids.add(page["id"])
course_ids_seen_from_intern.update(rel["id"] for rel in relations)
valid_intern_count += 1
if valid_intern_count != 2:
print(
f"Error: Expected exactly 2 new internship pages for companies {INTERNSHIP_COMPANIES}, found {valid_intern_count}.",
file=sys.stderr,
)
return False
# ------------------------------------------------------------------
# Mutual relation consistency --------------------------------------
# ------------------------------------------------------------------
# Each relation from courses should point to one of the two internships identified
if not internship_ids_seen.issubset(internship_page_ids):
print(
"Error: Some course relations point to pages outside the expected internships.",
file=sys.stderr,
)
return False
# Each relation from internships should point back to the three course pages identified
if not course_ids_seen_from_intern.issubset(course_page_id_set):
print(
"Error: Some internship relations point to pages outside the expected courses.",
file=sys.stderr,
)
return False
print(
"Success: Verified bidirectional relations, course and internship entries as required."
)
return True
# ---------------------------------------------------------------------------
# CLI entry-point -----------------------------------------------------------
# ---------------------------------------------------------------------------
def main() -> None:
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
sys.exit(0 if verify(notion, main_id) else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/description.md
================================================
Your goal is to create a new study-session entry in the **Computer Science Student Dashboard** page.
1. Locate the ☑️ Habit tracker section of the page.
2. **Insert a new date section for `2025-01-29`** immediately **after the existing `2022-09-02` to-do items but *before* the divider block** that follows them. Format the new date like the existing ones, with a bold date mention, and leave all of its to-do items unchecked initially.
3. Directly **beneath** this new date mention, add **exactly four unchecked to-do blocks** with the following plain text (including the leading emoji on each line):
• 🧠 Review algorithms for technical interview
• 📚 Study database systems chapter 7
• ⚡ Practice system design problems
• 🎯 Complete data structures assignment
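For reference, a minimal sketch of the block payloads with the `notion-client` SDK. The block IDs are hypothetical placeholders, and the `after` positioning argument of the children-append endpoint should be confirmed against the API/SDK version in use:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")                         # hypothetical token placeholder
parent_block_id = "<HABIT_TRACKER_PARENT_BLOCK_ID>"            # hypothetical
last_2022_09_02_todo_id = "<LAST_2022_09_02_TODO_BLOCK_ID>"    # hypothetical anchor block

# Bold date mention followed by the four unchecked to-dos.
new_blocks = [
    {
        "type": "paragraph",
        "paragraph": {
            "rich_text": [{
                "type": "mention",
                "mention": {"date": {"start": "2025-01-29"}},
                "annotations": {"bold": True},
            }]
        },
    }
] + [
    {
        "type": "to_do",
        "to_do": {
            "rich_text": [{"type": "text", "text": {"content": text}}],
            "checked": False,
        },
    }
    for text in [
        "🧠 Review algorithms for technical interview",
        "📚 Study database systems chapter 7",
        "⚡ Practice system design problems",
        "🎯 Complete data structures assignment",
    ]
]

# Appending with `after` keeps the new section before the divider that follows
# the 2022-09-02 to-dos.
notion.blocks.children.append(
    block_id=parent_block_id,
    children=new_blocks,
    after=last_2022_09_02_todo_id,
)
```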
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/meta.json
================================================
{
"task_id": "study_session_tracker",
"task_name": "Study Session Tracker",
"category_id": "computer_science_student_dashboard",
"category_name": "Computer Science Student Dashboard",
"description": "Create a new study-session entry in the Habit tracker section with four unchecked to-do items.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"content organization",
"visual formatting",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard"
}
}
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
from typing import Dict
def _normalize_string(s: str) -> str:
"""Replace non-breaking space with regular space for safe comparison."""
return s.replace("\xa0", " ")
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verify that the new study-session entry for 2025-01-29 was added correctly.
The script checks that:
1. A bold date-mention with start=2025-01-29 exists.
2. The mention sits after the 2022-09-02 section but before the divider that originally
followed that section.
3. Exactly four specified to-do items follow the new date mention and they are all unchecked.
"""
# ---------------------------------------------------------------------
# Locate the main page -------------------------------------------------
# ---------------------------------------------------------------------
page_id: str | None = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Computer Science Student Dashboard")
if not page_id:
print(
"Error: Page 'Computer Science Student Dashboard' not found.",
file=sys.stderr,
)
return False
# ---------------------------------------------------------------------
# Fetch all blocks under the page (flattened order) --------------------
# ---------------------------------------------------------------------
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# ---------------------------------------------------------------------
# Locate reference blocks ---------------------------------------------
# ---------------------------------------------------------------------
TARGET_DATE = "2025-01-29"
PREVIOUS_DATE = "2022-09-02"
index_previous_date: int | None = None
index_new_date: int | None = None
index_divider_after_previous: int | None = None
for idx, block in enumerate(all_blocks):
# Divider detection (we care only about the first divider that appears after
# the 2022-09-02 block)
if block.get("type") == "divider":
if index_previous_date is not None and index_divider_after_previous is None:
index_divider_after_previous = idx
# We only need to inspect paragraph blocks that contain a date mention
if block.get("type") != "paragraph":
continue
rich_text_list = block["paragraph"].get("rich_text", [])
for rt in rich_text_list:
if (
rt.get("type") != "mention"
or rt.get("mention", {}).get("type") != "date"
):
continue
date_start = rt["mention"]["date"].get("start")
if date_start == PREVIOUS_DATE and index_previous_date is None:
index_previous_date = idx
if date_start == TARGET_DATE and index_new_date is None:
index_new_date = idx
# (1) Verify bold annotation
if not rt.get("annotations", {}).get("bold", False):
print(
"Error: The 2025-01-29 date mention is not bold.",
file=sys.stderr,
)
return False
# Ensure all reference indices were found
if index_previous_date is None:
print("Error: Could not locate the 2022-09-02 date section.", file=sys.stderr)
return False
if index_divider_after_previous is None:
print(
"Error: Could not locate the divider that follows the 2022-09-02 section.",
file=sys.stderr,
)
return False
if index_new_date is None:
print(
"Error: Could not locate the new 2025-01-29 date mention.", file=sys.stderr
)
return False
# (2) Verify ordering
if not (index_previous_date < index_new_date < index_divider_after_previous):
print(
"Error: The 2025-01-29 section is positioned incorrectly.", file=sys.stderr
)
return False
# ---------------------------------------------------------------------
# Verify to-do items under the new date section ------------------------
# ---------------------------------------------------------------------
expected_texts = [
"🧠 Review algorithms for technical interview",
"📚 Study database systems chapter 7",
"⚡ Practice system design problems",
"🎯 Complete data structures assignment",
]
expected_todos: Dict[str, bool] = {
_normalize_string(t): False for t in expected_texts
}
# Look through the blocks that lie between the new date mention and the divider
for block in all_blocks[index_new_date + 1 : index_divider_after_previous]:
if block.get("type") != "to_do":
# Any non to-do block inside this range indicates mis-placement.
# We simply ignore it – correctness is determined by presence of required to-dos.
continue
plain_text = notion_utils.get_block_plain_text(block).strip()
plain_text_norm = _normalize_string(plain_text)
if plain_text_norm in expected_todos:
# (3a) Verify the to-do is unchecked
if block["to_do"].get("checked", False):
print(f"Error: To-do '{plain_text}' is checked.", file=sys.stderr)
return False
expected_todos[plain_text_norm] = True
missing_items = [text for text, found in expected_todos.items() if not found]
if missing_items:
print(f"Error: Missing to-do items: {missing_items}", file=sys.stderr)
return False
# ---------------------------------------------------------------------
# Success --------------------------------------------------------------
# ---------------------------------------------------------------------
print("Success: Study session for 2025-01-29 added correctly.")
return True
# -------------------------------------------------------------------------
# Command-line entry-point -------------------------------------------------
# -------------------------------------------------------------------------
def main() -> None:
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/asset_retirement_migration/description.md
================================================
Please restructure the **IT Inventory** database as described below. Your automation will be checked by an automated script, so follow every detail exactly.
---
Task Steps
1. Inside the **IT Trouble Shooting Hub** page, locate the database named **IT Inventory**.
2. Query this database and collect every page whose **Status** property is **Expired** or **To be returned**.
3. Create a **new full-page database** directly under the same IT Trouble Shooting Hub page called **IT Asset Retirement Queue**.
4. Configure this new database so that it contains **exactly** the following properties (spellings and types must match):
• Serial – title
• Tags – multi_select
• Status – select
• Vendor – select
• Expiration date – date
• Retirement Reason – select with option set { **Expired License**, **Hardware Obsolete**, **Security Risk**, **User Offboarding** }
5. For every inventory item gathered in step 2:
a. Create a corresponding page in **IT Asset Retirement Queue** and copy over the values of the Serial, Tags, Status, Vendor and Expiration date properties.
b. Set **Retirement Reason** to one of the four options above (choose the most appropriate).
c. Archive the original inventory page **after** the new page has been created.
6. After all items are migrated:
a. Update the **description** of the **IT Asset Retirement Queue** database so it is **exactly** `AUTO-GENERATED MIGRATION COMPLETED` (no additional text).
b. Create a new page under **IT Trouble Shooting Hub** titled **Retirement Migration Log**. Inside this page, add a **callout block** whose text follows the exact pattern:
`Successfully migrated [NUMBER] assets to the retirement queue on 2025-03-24.`
• `[NUMBER]` is the total number of items moved.
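The migration reduces to a filtered query, page creation in the new database, archiving the originals, and a final description update. A minimal sketch with the `notion-client` SDK, assuming `Serial` is the title property in both databases and using hypothetical IDs:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")            # hypothetical token placeholder
inventory_db_id = "<IT_INVENTORY_DB_ID>"          # hypothetical
retirement_db_id = "<RETIREMENT_QUEUE_DB_ID>"     # hypothetical

# Step 2: every item whose Status is Expired or To be returned.
items = notion.databases.query(
    database_id=inventory_db_id,
    filter={"or": [
        {"property": "Status", "select": {"equals": "Expired"}},
        {"property": "Status", "select": {"equals": "To be returned"}},
    ]},
)["results"]

# Step 5: copy each item into the retirement queue, then archive the original.
for item in items:
    props = item["properties"]
    notion.pages.create(
        parent={"database_id": retirement_db_id},
        properties={
            # Writing the retrieved value arrays back is usually accepted;
            # rebuild them from plain_text / name if the API rejects extra fields.
            "Serial": {"title": props["Serial"]["title"]},
            "Tags": {"multi_select": props["Tags"]["multi_select"]},
            "Status": {"select": {"name": props["Status"]["select"]["name"]}},
            "Vendor": {"select": {"name": props["Vendor"]["select"]["name"]}},
            "Expiration date": {"date": props["Expiration date"]["date"]},
            "Retirement Reason": {"select": {"name": "Expired License"}},  # pick per item in practice
        },
    )
    notion.pages.update(page_id=item["id"], archived=True)

# Step 6a: the description must match the expected text exactly.
notion.databases.update(
    database_id=retirement_db_id,
    description=[{"type": "text", "text": {"content": "AUTO-GENERATED MIGRATION COMPLETED"}}],
)
```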
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/asset_retirement_migration/meta.json
================================================
{
"task_id": "asset_retirement_migration",
"task_name": "Asset Retirement Migration",
"category_id": "it_trouble_shooting_hub",
"category_name": "IT Trouble Shooting Hub",
"description": "Restructure the IT Inventory database by migrating expired assets to a new IT Asset Retirement Queue database.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"automated migration",
"conditional filtering",
"data aggregation",
"report generation"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub"
}
}
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/asset_retirement_migration/verify.py
================================================
import sys
from typing import Dict, Set
from notion_client import Client
from tasks.utils import notion_utils
def _get_database(root_page_id: str, notion: Client, name: str) -> str | None:
"""Helper that finds a child database by title inside a page."""
return notion_utils.find_database_in_block(notion, root_page_id, name)
def _check_property(props: Dict, name: str, expected_type: str) -> bool:
if name not in props:
print(f"Error: Property '{name}' missing in database.", file=sys.stderr)
return False
if props[name]["type"] != expected_type:
print(
f"Error: Property '{name}' expected type '{expected_type}', found '{props[name]['type']}'.",
file=sys.stderr,
)
return False
return True
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verifies that the IT Asset Retirement Queue was created and populated correctly."""
# -------------------------------------------------------------------------
# Resolve the root IT Trouble Shooting Hub page
# -------------------------------------------------------------------------
root_page_id = None
if main_id:
found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and obj_type == "page":
root_page_id = found_id
if not root_page_id:
root_page_id = notion_utils.find_page(notion, "IT Trouble Shooting Hub")
if not root_page_id:
print(
"Error: Could not locate the 'IT Trouble Shooting Hub' page.",
file=sys.stderr,
)
return False
# -------------------------------------------------------------------------
# Locate the original and new databases
# -------------------------------------------------------------------------
inventory_db_id = _get_database(root_page_id, notion, "IT Inventory")
if not inventory_db_id:
print("Error: 'IT Inventory' database not found.", file=sys.stderr)
return False
retirement_db_id = _get_database(root_page_id, notion, "IT Asset Retirement Queue")
if not retirement_db_id:
print("Error: 'IT Asset Retirement Queue' database not found.", file=sys.stderr)
return False
# -------------------------------------------------------------------------
# Validate schema of the retirement queue database
# -------------------------------------------------------------------------
retirement_db = notion.databases.retrieve(database_id=retirement_db_id)
r_props = retirement_db["properties"]
required_schema = {
"Serial": "title",
"Tags": "multi_select",
"Status": "select",
"Vendor": "select",
"Expiration date": "date",
"Retirement Reason": "select",
}
for pname, ptype in required_schema.items():
if not _check_property(r_props, pname, ptype):
return False
# Check Retirement Reason options
expected_reason_options: Set[str] = {
"Expired License",
"Hardware Obsolete",
"Security Risk",
"User Offboarding",
}
actual_options = {
opt["name"] for opt in r_props["Retirement Reason"]["select"]["options"]
}
if actual_options != expected_reason_options:
print(
"Error: 'Retirement Reason' select options mismatch.\n"
f"Expected: {sorted(expected_reason_options)}\n"
f"Found: {sorted(actual_options)}",
file=sys.stderr,
)
return False
# ---------------------------------------------------------------
    # Validate database description matches the required phrase exactly
# ---------------------------------------------------------------
desc_rich = retirement_db.get("description", [])
desc_text = "".join([t.get("plain_text", "") for t in desc_rich])
required_desc = "AUTO-GENERATED MIGRATION COMPLETED"
if desc_text.strip() != required_desc:
print(
f"Error: Retirement database description must be exactly '{required_desc}'.",
file=sys.stderr,
)
return False
# -------------------------------------------------------------------------
# Validate that inventory items are moved & archived
# -------------------------------------------------------------------------
expired_filter = {
"property": "Status",
"select": {"equals": "Expired"},
}
to_return_filter = {
"property": "Status",
"select": {"equals": "To be returned"},
}
compound_filter = {"or": [expired_filter, to_return_filter]}
# Query for any *active* items that still match these statuses
remaining_items = notion.databases.query(
database_id=inventory_db_id,
filter=compound_filter,
archived=False,
).get("results", [])
if remaining_items:
print(
f"Error: {len(remaining_items)} 'Expired' / 'To be returned' items still present in IT Inventory.",
file=sys.stderr,
)
return False
    # The retirement queue should contain exactly the expected migrated entries
retirement_pages = notion.databases.query(database_id=retirement_db_id).get(
"results", []
)
expected_serials = {"65XYQ/GB", "36x10PIQ"}
if len(retirement_pages) != len(expected_serials):
print(
f"Error: Expected {len(expected_serials)} retirement pages, found {len(retirement_pages)}.",
file=sys.stderr,
)
return False
# Each retirement page must have a Retirement Reason
serials_seen = set()
for page in retirement_pages:
props = page["properties"]
reason = props.get("Retirement Reason", {}).get("select", {})
if not reason or reason.get("name") not in expected_reason_options:
print(
f"Error: Page {page['id']} missing valid 'Retirement Reason'.",
file=sys.stderr,
)
return False
# Collect Serial title
title_rich = props.get("Serial", {}).get("title", [])
serial_val = "".join([t.get("plain_text", "") for t in title_rich]).strip()
serials_seen.add(serial_val)
if serials_seen != expected_serials:
print(
f"Error: Serial values mismatch. Expected {sorted(expected_serials)}, found {sorted(serials_seen)}.",
file=sys.stderr,
)
return False
# -----------------------------------------------------------------
# Verify the migration log page and callout block contents
# -----------------------------------------------------------------
log_page_title = "Retirement Migration Log"
log_page_id = notion_utils.find_page(notion, log_page_title)
if not log_page_id:
print(f"Error: Page '{log_page_title}' not found.", file=sys.stderr)
return False
# Search for a callout block with required pattern
import re
callout_pattern = re.compile(
r"Successfully migrated (\d+) assets to the retirement queue on 2025-03-24\."
)
blocks = notion_utils.get_all_blocks_recursively(notion, log_page_id)
match_found = False
for blk in blocks:
if blk.get("type") == "callout":
text = notion_utils.get_block_plain_text(blk)
m = callout_pattern.search(text)
if m:
migrated_num = int(m.group(1))
if migrated_num == len(expected_serials):
match_found = True
else:
print(
f"Error: Callout reports {migrated_num} assets, but {len(retirement_pages)} retirement pages found.",
file=sys.stderr,
)
return False
break
if not match_found:
print(
"Error: Required callout block not found in migration log page.",
file=sys.stderr,
)
return False
print("Success: All verification criteria satisfied.")
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/security_audit_ticket/description.md
================================================
Please help me create a comprehensive security audit ticket based on the data already stored in the **IT Trouble Shooting Hub** page.
Your automation should:
1. In the **IT Inventory** database, find every item whose **Expiration date** is **before 2023-07-15**.
2. In the **IT FAQs** database, look up any FAQ entries that have the **"Security"** tag.
3. **Create a new page** inside the **IT Requests** database with **exact title**:
`Quarterly Security Audit - Expired Assets Review`
4. Set its **Priority** property to **High**.
5. Set its **Due** property to **2023-06-22**.
6. In the page body, add a bullet-list block that enumerates **each expired inventory item**. **Each bullet item must follow this exact text format (including the dashes):**
`[SERIAL] - [TAG] - [RECOMMENDATION]`
• `[SERIAL]` is the item’s Serial value.
• `[TAG]` is the first tag assigned to the inventory item (e.g., "Laptop").
• `[RECOMMENDATION]` is a brief action you suggest based on the security FAQ entry (any text is acceptable).
Example (do **not** copy):
`ABC123 - Laptop - Renew warranty and enable disk encryption`
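A minimal sketch of the query-and-create flow with the `notion-client` SDK. Database IDs are hypothetical placeholders, the FAQ lookup from step 2 is omitted, and the recommendation text is illustrative only:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")       # hypothetical token placeholder
inventory_db_id = "<IT_INVENTORY_DB_ID>"     # hypothetical
requests_db_id = "<IT_REQUESTS_DB_ID>"       # hypothetical

# Step 1: items whose Expiration date is before 2023-07-15.
expired = notion.databases.query(
    database_id=inventory_db_id,
    filter={"property": "Expiration date", "date": {"before": "2023-07-15"}},
)["results"]

# Steps 3-6: create the ticket with one bullet per expired item.
bullets = []
for item in expired:
    props = item["properties"]
    serial = "".join(t["plain_text"] for t in props["Serial"]["title"])  # assumes Serial is the title
    tags = props["Tags"]["multi_select"]
    first_tag = tags[0]["name"] if tags else ""
    advice = "Review against the security FAQ and renew or retire"       # any text is acceptable
    bullets.append({
        "type": "bulleted_list_item",
        "bulleted_list_item": {
            "rich_text": [{"type": "text", "text": {"content": f"{serial} - {first_tag} - {advice}"}}]
        },
    })

notion.pages.create(
    parent={"database_id": requests_db_id},
    properties={
        "Task name": {"title": [{"text": {"content": "Quarterly Security Audit - Expired Assets Review"}}]},
        "Priority": {"select": {"name": "High"}},
        "Due": {"date": {"start": "2023-06-22"}},
    },
    children=bullets,
)
```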
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/security_audit_ticket/meta.json
================================================
{
"task_id": "security_audit_ticket",
"task_name": "Security Audit Ticket",
"category_id": "it_trouble_shooting_hub",
"category_name": "IT Trouble Shooting Hub",
"description": "Create a comprehensive security audit ticket based on expired inventory items and security FAQ entries.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"conditional filtering",
"database manipulation",
"data aggregation",
"report generation"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub"
}
}
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/security_audit_ticket/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
import re
def _get_title_text(page_properties: dict) -> str:
"""Extract the plain text of the first title property from a page."""
for prop in page_properties.values():
if prop.get("type") == "title":
title_rich = prop.get("title", [])
if title_rich:
return title_rich[0].get("plain_text")
return ""
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verify that the automation created the expected security audit ticket."""
# ----------------------------------------------------------------------------------
# Locate the root page (IT Trouble Shooting Hub) either via main_id or by title.
# ----------------------------------------------------------------------------------
root_page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
root_page_id = found_id
if not root_page_id:
root_page_id = notion_utils.find_page(notion, "IT Trouble Shooting Hub")
if not root_page_id:
print(
"Error: Could not locate the 'IT Trouble Shooting Hub' page.",
file=sys.stderr,
)
return False
# ----------------------------------------------------------------------------------
# Find the IT Requests database under the root page.
# ----------------------------------------------------------------------------------
requests_db_id = notion_utils.find_database_in_block(
notion, root_page_id, "IT Requests"
)
if not requests_db_id:
print(
"Error: 'IT Requests' database not found in the workspace.", file=sys.stderr
)
return False
# ----------------------------------------------------------------------------------
# Search for the expected ticket inside the IT Requests database.
# ----------------------------------------------------------------------------------
expected_title = "Quarterly Security Audit - Expired Assets Review"
results = notion.databases.query(database_id=requests_db_id).get("results", [])
target_page = None
for page in results:
title_text = _get_title_text(page.get("properties", {}))
if title_text == expected_title:
target_page = page
break
if not target_page:
print(
f"Failure: Ticket with title '{expected_title}' was not found in 'IT Requests' database.",
file=sys.stderr,
)
return False
props = target_page.get("properties", {})
# ----------------------------------------------------------------------------------
# Validate Priority property.
# ----------------------------------------------------------------------------------
priority_value = props.get("Priority", {}).get("select", {}).get("name")
if priority_value != "High":
print(
f"Failure: Expected Priority 'High', found '{priority_value}'.",
file=sys.stderr,
)
return False
# ----------------------------------------------------------------------------------
# Validate Due date property.
# ----------------------------------------------------------------------------------
due_date_start = props.get("Due", {}).get("date", {}).get("start")
expected_due_iso = "2023-06-22"
if not due_date_start or not due_date_start.startswith(expected_due_iso):
print(
f"Failure: Expected Due date '{expected_due_iso}', found '{due_date_start}'.",
file=sys.stderr,
)
return False
# ----------------------------------------------------------------------------------
# Validate the bulleted list contains the correct expired items in required format.
# ----------------------------------------------------------------------------------
page_id = target_page["id"]
blocks = notion.blocks.children.list(block_id=page_id).get("results", [])
bullet_texts = [
notion_utils.get_block_plain_text(b)
for b in blocks
if b.get("type") == "bulleted_list_item"
]
expected_items = {
"192371-8910/54": "Computer Accessory",
"32x11PIP": "Computer Accessory",
"76x87PCY": "Laptop",
"36x10PIQ": "Computer Accessory",
"65XYQ/GB": "License",
}
if len(bullet_texts) != len(expected_items):
print(
f"Failure: Expected {len(expected_items)} bullet items, found {len(bullet_texts)}.",
file=sys.stderr,
)
return False
bullet_pattern = re.compile(r"^\s*(.*?)\s+-\s+(.*?)\s+-\s+(.+?)\s*$")
matched = set()
for text in bullet_texts:
m = bullet_pattern.match(text)
if not m:
print(
f"Failure: Bullet item '{text}' does not follow ' - - ' format.",
file=sys.stderr,
)
return False
serial, tag, advice = m.group(1).strip(), m.group(2).strip(), m.group(3).strip()
if serial not in expected_items:
print(
f"Failure: Unexpected Serial '{serial}' found in bullet list.",
file=sys.stderr,
)
return False
if expected_items[serial] != tag:
print(
f"Failure: Serial '{serial}' expected tag '{expected_items[serial]}', found '{tag}'.",
file=sys.stderr,
)
return False
if not advice:
print(
f"Failure: Bullet item for Serial '{serial}' is missing a recommendation/advice.",
file=sys.stderr,
)
return False
matched.add(serial)
if len(matched) != len(expected_items):
missing = set(expected_items.keys()) - matched
print(
f"Failure: Missing bullet items for serials: {', '.join(missing)}.",
file=sys.stderr,
)
return False
print("Success: All verification criteria satisfied.")
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/description.md
================================================
**Task Overview**
My IT knowledge base contains pages whose verification status has expired:
**Task Requirements**
1. Locate the database named **"IT Homepage"** inside the main page **"It Trouble Shooting Hub"**.
2. Within that database, find every page (except for **"It Inventory"**) where the **Verification** property state contains `expired`.
3. For **each** expired page:
• Insert a **callout block** at the very top (as the first child block) whose rich-text content is:
`VERIFICATION EXPIRED - This page needs review and re-verification`
• Set the callout’s icon to ⚠️.
• Set the callout’s colour to `red_background`.
4. Create a new entry in the **"IT Requests"** database with:
• Title (property **Task name**) **exactly** `Batch Verification Update Required`.
• **Priority** set to `High`.
• **Status** set to `In progress`.
• In the page body add a **bulleted list** where each bullet is a **mention** of the page processed in step 3 (i.e., use the Notion mention object linking to that page).
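For reference, a minimal sketch of the callout payload and the batch request page with the `notion-client` SDK. The IDs are hypothetical placeholders, Status is assumed to be a status-type property, and note that a plain children-append adds blocks at the end, so making the callout the *first* child requires extra positioning handling:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")               # hypothetical token placeholder
requests_db_id = "<IT_REQUESTS_DB_ID>"               # hypothetical
expired_page_ids = ["<PAGE_ID_1>", "<PAGE_ID_2>"]    # hypothetical, from step 2

# Step 3: callout payload expected on each expired page (must end up as the first child).
callout = {
    "type": "callout",
    "callout": {
        "rich_text": [{
            "type": "text",
            "text": {"content": "VERIFICATION EXPIRED - This page needs review and re-verification"},
        }],
        "icon": {"type": "emoji", "emoji": "⚠️"},
        "color": "red_background",
    },
}
for pid in expired_page_ids:
    # Plain append places the block at the end; reposition it to the top as required.
    notion.blocks.children.append(block_id=pid, children=[callout])

# Step 4: the batch-update request with one page mention per bullet.
notion.pages.create(
    parent={"database_id": requests_db_id},
    properties={
        "Task name": {"title": [{"text": {"content": "Batch Verification Update Required"}}]},
        "Priority": {"select": {"name": "High"}},
        "Status": {"status": {"name": "In progress"}},
    },
    children=[
        {
            "type": "bulleted_list_item",
            "bulleted_list_item": {
                "rich_text": [{"type": "mention", "mention": {"page": {"id": pid}}}]
            },
        }
        for pid in expired_page_ids
    ],
)
```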
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/meta.json
================================================
{
"task_id": "verification_expired_update",
"task_name": "Verification Expired Update",
"category_id": "it_trouble_shooting_hub",
"category_name": "IT Trouble Shooting Hub",
"description": "Update pages with expired verification status by adding warning callouts and creating a batch update request.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"conditional filtering",
"visual formatting",
"database manipulation",
"cross-reference linking",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub"
}
}
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
CALL_OUT_TEXT = "VERIFICATION EXPIRED - This page needs review and re-verification"
CALL_OUT_ICON = "⚠️"
CALL_OUT_COLOR = "red_background"
IT_HOMEPAGE_DB_TITLE = "IT Homepage"
IT_REQUESTS_DB_TITLE = "IT Requests"
REQUEST_TITLE = "Batch Verification Update Required"
PRIORITY_HIGH = "High"
STATUS_IN_PROGRESS = "In progress"
def _get_main_page_id(notion: Client, main_id: str | None) -> str | None:
"""Resolve the main page id starting from CLI arg or by title search."""
if main_id:
found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and obj_type == "page":
return found_id
# Fallback to title search (case-insensitive)
return notion_utils.find_page(notion, "It Trouble Shooting Hub")
def _fetch_database_id(
notion: Client, parent_page_id: str, db_title: str
) -> str | None:
"""Locate a child database by title inside a given page."""
return notion_utils.find_database_in_block(notion, parent_page_id, db_title)
def _expired_pages(notion: Client, db_id: str) -> list[dict]:
"""Return list of page objects with Verification.state == 'expired'."""
# Query all pages (API max 100 per call). If many pages expected, iterate.
results = notion.databases.query(database_id=db_id).get("results", [])
expired = []
for page in results:
verification_prop = page.get("properties", {}).get("Verification", {})
state = verification_prop.get("verification", {}).get("state")
# Skip the IT Inventory database entry
title_prop = page.get("properties", {}).get("Page", {}).get("title", [])
title_text = title_prop[0].get("plain_text") if title_prop else ""
if title_text.strip().lower() == "it inventory":
continue
if state and "expired" in state.lower():
expired.append(page)
return expired
def _check_callout_present(notion: Client, page_id: str) -> bool:
"""Verify the specified callout is the first child block of the page."""
children = notion.blocks.children.list(block_id=page_id, page_size=1).get(
"results", []
)
if not children:
return False
first_block = children[0]
if first_block.get("type") != "callout":
return False
data = first_block.get("callout", {})
# Check color
if data.get("color") != CALL_OUT_COLOR:
return False
# Check icon
icon = data.get("icon", {})
if icon.get("type") != "emoji" or icon.get("emoji") != CALL_OUT_ICON:
return False
# Check text content (callout rich text plain text)
plain_text = notion_utils.get_block_plain_text(first_block)
return CALL_OUT_TEXT in plain_text
def _find_request_page(notion: Client, db_id: str) -> dict | None:
"""Find the IT Request page with the expected title."""
# Use a simple search inside database
res = notion.databases.query(
database_id=db_id,
filter={"property": "Task name", "title": {"equals": REQUEST_TITLE}},
).get("results", [])
return res[0] if res else None
def _check_request_properties(page: dict) -> bool:
props = page.get("properties", {})
priority = props.get("Priority", {}).get("select", {}).get("name")
status = (
props.get("Status", {}).get("status", {}).get("name")
if props.get("Status", {}).get("status")
else props.get("Status", {}).get("select", {}).get("name")
)
return priority == PRIORITY_HIGH and status == STATUS_IN_PROGRESS
def _request_page_contains_mentions(
notion: Client, request_page_id: str, expected_page_ids: list[str]
) -> bool:
children = notion.blocks.children.list(block_id=request_page_id, page_size=100).get(
"results", []
)
bullet_blocks = [b for b in children if b.get("type") == "bulleted_list_item"]
mentioned_ids: set[str] = set()
for block in bullet_blocks:
rich_text = block.get("bulleted_list_item", {}).get("rich_text", [])
for rt in rich_text:
if rt.get("type") == "mention":
mention = rt.get("mention", {})
if mention.get("type") == "page":
mentioned_ids.add(mention.get("page", {}).get("id"))
if len(mentioned_ids) < len(expected_page_ids):
return False
return all(pid in mentioned_ids for pid in expected_page_ids)
def verify(notion: Client, main_id: str | None = None) -> bool:
main_page_id = _get_main_page_id(notion, main_id)
if not main_page_id:
print(
"Error: Could not locate the main page 'It Trouble Shooting Hub'.",
file=sys.stderr,
)
return False
# Locate required databases
it_home_db_id = _fetch_database_id(notion, main_page_id, IT_HOMEPAGE_DB_TITLE)
it_req_db_id = _fetch_database_id(notion, main_page_id, IT_REQUESTS_DB_TITLE)
if not all([it_home_db_id, it_req_db_id]):
print(
"Error: Required databases not found under the main page.", file=sys.stderr
)
return False
# Identify expired pages
expired_pages = _expired_pages(notion, it_home_db_id)
if not expired_pages:
print(
"Failure: No expired pages found; expected at least one for this task.",
file=sys.stderr,
)
return False
# Verify callout on each expired page
for pg in expired_pages:
pid = pg["id"]
if not _check_callout_present(notion, pid):
print(
f"Failure: Callout missing or incorrect on page {pid}.", file=sys.stderr
)
return False
# Verify IT Request entry
request_page = _find_request_page(notion, it_req_db_id)
if not request_page:
print(
"Failure: IT Request 'Batch Verification Update Required' not found.",
file=sys.stderr,
)
return False
if not _check_request_properties(request_page):
print("Failure: Priority or Status incorrect on IT Request.", file=sys.stderr)
return False
# Verify bullet list in IT Request body
expired_titles = []
for p in expired_pages:
title_prop = p.get("properties", {}).get("Page", {}).get("title", [])
title_text = title_prop[0].get("plain_text") if title_prop else None
if title_text:
expired_titles.append(title_text)
expected_page_ids = [p["id"] for p in expired_pages]
if not _request_page_contains_mentions(
notion, request_page["id"], expected_page_ids
):
print(
"Failure: IT Request body does not contain mentions for all affected pages.",
file=sys.stderr,
)
return False
print("Success: All verification checks passed.")
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/description.md
================================================
Create a comprehensive daily itinerary overview page to help organize my Japan travel plans. I need you to create a new page called 'Daily Itinerary Overview' as a child of the main Japan Travel Planner page.
**Task Requirements:**
1. Create a new page titled 'Daily Itinerary Overview' as a child page of the main Japan Travel Planner page
2. Query the Travel Itinerary database to retrieve all activities
3. Structure the page with the following specific format:
- Add a heading_1 block with text "📅 Daily Itinerary Overview"
- Add a heading_2 block with text "📊 Trip Summary"
- Under Trip Summary, add a paragraph listing the total number of visited activities
- Create heading_2 blocks for "🌅 Day 1", "🌆 Day 2", and "🌃 Day 3"
- Under each day heading, list the activities scheduled for that day as a to-do list
- Each activity (as a to-do item) should show: Activity Name - City (if available), for example "Osaka Castle - Osaka". Check the item off if the activity has been visited.
4. The summary paragraph must contain the exact text "Total activities visited (from Day 1 to Day 3): [NUMBER]" where [NUMBER] is the actual count.
5. Ensure all headings use the exact emoji and text format specified above
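A minimal sketch of the page structure with the `notion-client` SDK, assuming the parent page ID has been resolved and the Travel Itinerary rows have already been grouped into `activities_by_day` (name, city, visited); the title-property payload for page-parented pages should be checked against the Notion API reference:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")              # hypothetical token placeholder
planner_page_id = "<JAPAN_TRAVEL_PLANNER_PAGE_ID>"  # hypothetical
# Grouped database rows: {"Day 1": [{"name": ..., "city": ..., "visited": bool}], ...}
activities_by_day = {"Day 1": [], "Day 2": [], "Day 3": []}  # placeholder data

visited_total = sum(a["visited"] for day in activities_by_day.values() for a in day)
summary_text = f"Total activities visited (from Day 1 to Day 3): {visited_total}"

def heading(level: int, text: str) -> dict:
    return {f"heading_{level}": {"rich_text": [{"text": {"content": text}}]}}

children = [
    heading(1, "📅 Daily Itinerary Overview"),
    heading(2, "📊 Trip Summary"),
    {"paragraph": {"rich_text": [{"text": {"content": summary_text}}]}},
]
for day, emoji in [("Day 1", "🌅"), ("Day 2", "🌆"), ("Day 3", "🌃")]:
    children.append(heading(2, f"{emoji} {day}"))
    for act in activities_by_day[day]:
        label = f"{act['name']} - {act['city']}" if act["city"] else act["name"]
        children.append({
            "to_do": {
                "rich_text": [{"text": {"content": label}}],
                "checked": act["visited"],  # checked only if the activity was visited
            }
        })

notion.pages.create(
    parent={"page_id": planner_page_id},
    properties={"title": {"title": [{"text": {"content": "Daily Itinerary Overview"}}]}},
    children=children,
)
```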
================================================
FILE: tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/meta.json
================================================
{
"task_id": "daily_itinerary_overview",
"task_name": "Daily Itinerary Overview",
"category_id": "japan_travel_planner",
"category_name": "Japan Travel Planner",
"description": "Create a comprehensive daily itinerary overview page to organize Japan travel plans with structured day-by-day activities.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"data aggregation",
"report generation",
"visual formatting",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe"
}
}
================================================
FILE: tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/verify.py
================================================
import sys
import re
from notion_client import Client
from tasks.utils import notion_utils
def verify_todo_database_correspondence(all_blocks, activities_by_day, _):
"""
Verify that to-do items in the overview page correspond exactly to database activities.
"""
# Extract to-do items organized by day from the overview page
todos_by_day = {"Day 1": [], "Day 2": [], "Day 3": []}
current_day = None
checked_todos_count = 0
for block in all_blocks:
block_type = block.get("type")
block_text = notion_utils.get_block_plain_text(block)
# Track which day section we're in
if block_type == "heading_2":
if "🌅 Day 1" in block_text:
current_day = "Day 1"
elif "🌆 Day 2" in block_text:
current_day = "Day 2"
elif "🌃 Day 3" in block_text:
current_day = "Day 3"
else:
current_day = None # Reset for non-day headings
# Collect to-do items under day headings
elif block_type == "to_do" and current_day:
to_do_data = block.get("to_do", {})
is_checked = to_do_data.get("checked", False)
if is_checked:
checked_todos_count += 1
todos_by_day[current_day].append(
{"text": block_text, "checked": is_checked}
)
# Verify each day's activities match
for day in ["Day 1", "Day 2", "Day 3"]:
db_activities = activities_by_day[day]
page_todos = todos_by_day[day]
# Check if counts match
if len(db_activities) != len(page_todos):
print(
f"Error: {day} activity count mismatch. Database has {len(db_activities)} activities, page has {len(page_todos)} to-dos.",
file=sys.stderr,
)
return False
# Verify each database activity has corresponding to-do
for db_activity in db_activities:
expected_format = f"{db_activity['name']}"
if db_activity["city"]:
expected_format += f" - {db_activity['city']}"
# Find matching to-do item
matching_todo = None
for todo in page_todos:
if (
expected_format in todo["text"]
or db_activity["name"] in todo["text"]
):
matching_todo = todo
break
if not matching_todo:
print(
f"Error: {day} - Database activity '{expected_format}' not found in to-do list.",
file=sys.stderr,
)
return False
# Verify checked status matches visited status
if db_activity["visited"] != matching_todo["checked"]:
status_desc = "checked" if db_activity["visited"] else "unchecked"
actual_desc = "checked" if matching_todo["checked"] else "unchecked"
print(
f"Error: {day} - Activity '{db_activity['name']}' should be {status_desc} but is {actual_desc}.",
file=sys.stderr,
)
return False
# Verify summary count matches checked to-dos
for block in all_blocks:
if block.get("type") == "paragraph":
block_text = notion_utils.get_block_plain_text(block)
if "Total activities visited (from Day 1 to Day 3): 8" in block_text:
print(
f"Success: Daily Itinerary Overview page created with correct structure. All {checked_todos_count} visited activities match database."
)
return True
print(
f"Error: Summary shows incorrect visited activity count. Expected: {checked_todos_count} (based on checked to-do items)",
file=sys.stderr,
)
return False
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Daily Itinerary Overview page has been created correctly.
"""
# Find the main Japan Travel Planner page
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Japan Travel Planner")
if not page_id:
print("Error: Main 'Japan Travel Planner' page not found.", file=sys.stderr)
return False
# Find the Daily Itinerary Overview child page
overview_page_id = None
try:
# Get all child pages of the main page
response = notion.search(
query="Daily Itinerary Overview",
filter={"property": "object", "value": "page"},
)
for result in response.get("results", []):
# Check if this page is a child of the main page
parent = result.get("parent", {})
if parent.get("type") == "page_id" and parent.get("page_id") == page_id:
overview_page_id = result["id"]
break
if not overview_page_id:
# Alternative method: check page title directly
for result in response.get("results", []):
title_list = (
result.get("properties", {}).get("title", {}).get("title", [])
)
for title_obj in title_list:
if "Daily Itinerary Overview" in title_obj.get("plain_text", ""):
overview_page_id = result["id"]
break
if overview_page_id:
break
except Exception as e:
print(
f"Error searching for Daily Itinerary Overview page: {e}", file=sys.stderr
)
return False
if not overview_page_id:
print(
"Error: 'Daily Itinerary Overview' page not found as child of main page.",
file=sys.stderr,
)
return False
# Get all blocks from the overview page
all_blocks = notion_utils.get_all_blocks_recursively(notion, overview_page_id)
# Required content to verify - must appear in this exact order
required_headings_sequence = [
("📅 Daily Itinerary Overview", "heading_1"),
("📊 Trip Summary", "heading_2"),
("🌅 Day 1", "heading_2"),
("🌆 Day 2", "heading_2"),
("🌃 Day 3", "heading_2"),
]
found_headings_in_order = []
found_summary = False
summary_has_correct_format = False
found_todo_items = False
# Check each block and track heading sequence
for block in all_blocks:
block_text = notion_utils.get_block_plain_text(block)
block_type = block.get("type")
# Check for required headings in sequence
for heading_text, expected_type in required_headings_sequence:
if heading_text in block_text and block_type == expected_type:
found_headings_in_order.append((heading_text, expected_type))
# Check for trip summary paragraph
if (
block_type == "paragraph"
and "Total activities visited (from Day 1 to Day 3):" in block_text
):
found_summary = True
# Check if the format is correct (contains a number)
if re.search(
r"Total activities visited \(from Day 1 to Day 3\):\s*\d+", block_text
):
summary_has_correct_format = True
# Check for to-do list items (activities under day headings)
if block_type == "to_do":
found_todo_items = True
# Check if to-do items follow the format "Activity Name - City"
if " - " in block_text:
# Format appears to be correct (contains dash separator)
pass
# Verify all required headings are found in correct sequence
if len(found_headings_in_order) != len(required_headings_sequence):
missing_headings = []
for heading_text, heading_type in required_headings_sequence:
if (heading_text, heading_type) not in found_headings_in_order:
missing_headings.append(f"{heading_text} ({heading_type})")
print(f"Error: Missing required headings: {missing_headings}", file=sys.stderr)
return False
# Verify headings appear in correct order
for i, (found_heading, found_type) in enumerate(found_headings_in_order):
expected_heading, expected_type = required_headings_sequence[i]
if found_heading != expected_heading or found_type != expected_type:
print(
f"Error: Headings not in correct order. Expected '{expected_heading}' ({expected_type}) at position {i + 1}, but found '{found_heading}' ({found_type})",
file=sys.stderr,
)
return False
# Verify trip summary exists and has correct format
if not found_summary:
print(
"Error: Trip summary paragraph with 'Total activities visite' not found.",
file=sys.stderr,
)
return False
if not summary_has_correct_format:
print(
"Error: Trip summary does not have correct format 'Total activities visited: [NUMBER]'.",
file=sys.stderr,
)
return False
# Verify to-do list items exist (activities should be in to-do format)
if not found_todo_items:
print(
"Error: No to-do list items found. Activities should be listed as to-do items under day headings.",
file=sys.stderr,
)
return False
# Additional verification: Check if Travel Itinerary database exists and has data
try:
itinerary_db_id = notion_utils.find_database_in_block(
notion, page_id, "Travel Itinerary"
)
if not itinerary_db_id:
itinerary_db_id = notion_utils.find_database(notion, "Travel Itinerary")
if itinerary_db_id:
# Query the database to get all activities
db_response = notion.databases.query(database_id=itinerary_db_id)
db_activities = db_response.get("results", [])
# Organize database activities by day
activities_by_day = {"Day 1": [], "Day 2": [], "Day 3": []}
visited_count = 0
for result in db_activities:
properties = result.get("properties", {})
# Extract activity info
activity_info = {"name": "", "city": "", "visited": False, "day": None}
for prop_name, prop_value in properties.items():
prop_type = prop_value.get("type")
# Get activity name (usually from title property)
if prop_type == "title" and prop_value.get("title"):
activity_info["name"] = prop_value["title"][0]["plain_text"]
# Get city info
elif "city" in prop_name.lower() and prop_type in [
"rich_text",
"select",
]:
if prop_type == "rich_text" and prop_value.get("rich_text"):
activity_info["city"] = prop_value["rich_text"][0][
"plain_text"
]
elif prop_type == "select" and prop_value.get("select"):
activity_info["city"] = prop_value["select"]["name"]
# Get visited status
elif prop_type == "checkbox":
if prop_value.get("checkbox"):
activity_info["visited"] = True
visited_count += 1
# Get day info
elif "day" in prop_name.lower() and prop_type in [
"select",
"rich_text",
]:
if prop_type == "select" and prop_value.get("select"):
day_value = prop_value["select"]["name"]
if day_value in activities_by_day:
activity_info["day"] = day_value
elif prop_type == "rich_text" and prop_value.get("rich_text"):
day_value = prop_value["rich_text"][0]["plain_text"]
if day_value in activities_by_day:
activity_info["day"] = day_value
# Add to appropriate day if day is specified
if activity_info["day"] and activity_info["name"]:
activities_by_day[activity_info["day"]].append(activity_info)
# Now verify to-do items match database activities
return verify_todo_database_correspondence(
all_blocks, activities_by_day, visited_count
)
else:
print(
"Warning: Travel Itinerary database not found, using to-do items for count verification."
)
# Count checked to-do items in the overview page even without database
checked_todos_count = 0
for block in all_blocks:
if block.get("type") == "to_do":
to_do_data = block.get("to_do", {})
if to_do_data.get("checked", False):
checked_todos_count += 1
# Verify the summary shows the correct visited count based on checked to-dos
for block in all_blocks:
if block.get("type") == "paragraph":
block_text = notion_utils.get_block_plain_text(block)
if f"Total activities visited: {checked_todos_count}" in block_text:
print(
f"Success: Daily Itinerary Overview page created with correct structure and {checked_todos_count} visited activities."
)
return True
print(
f"Error: Summary shows incorrect visited activity count. Expected: {checked_todos_count} (based on checked to-do items)",
file=sys.stderr,
)
return False
except Exception as e:
print(f"Warning: Could not verify activity count: {e}")
print("Success: Daily Itinerary Overview page created with correct structure.")
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/japan_travel_planner/packing_progress_summary/description.md
================================================
I'm preparing for my Japan trip and need to organize my packing list. Please help me:
**Step 1: Update Items in the Packing List Database**
In the Clothes category, mark all items as packed except for the hat. After this, check the `SIM Card` entry and the `Wallet` entry.
**Step 2: Create Packing Progress Summary**
After adding the items, create a new section in the main Japan Travel Planner page immediately after the "Packing List 💼" heading. This section should contain:
1. A paragraph block with the bold text "**Packing Progress Summary**"
2. Followed by bullet list items showing statistics for each category in the format:
- "Category: X/Y packed" (where X is packed items, Y is total items), for example: "Shoes: 2/10 packed"
- ...
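
For reference, here is a minimal sketch (not part of the task statement) of how the per-category statistics and the summary blocks could be produced with the `notion-client` Python SDK. The auth token placeholder, the IDs passed in, and the use of the `after` parameter to position the new blocks below the heading are assumptions; the `Type` and `Packed` property names follow this template.

```python
from collections import defaultdict

from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # hypothetical token


def build_packing_summary(packing_db_id: str, page_id: str, heading_block_id: str) -> None:
    """Compute 'Category: X/Y packed' stats and append them after the Packing List heading."""
    stats = defaultdict(lambda: {"packed": 0, "total": 0})
    for item in notion.databases.query(database_id=packing_db_id).get("results", []):
        props = item.get("properties", {})
        packed = props.get("Packed", {}).get("checkbox", False)
        for option in props.get("Type", {}).get("multi_select", []):
            category = option.get("name", "")
            stats[category]["total"] += 1
            if packed:
                stats[category]["packed"] += 1

    children = [
        {
            "object": "block",
            "type": "paragraph",
            "paragraph": {
                "rich_text": [
                    {
                        "type": "text",
                        "text": {"content": "Packing Progress Summary"},
                        "annotations": {"bold": True},
                    }
                ]
            },
        }
    ]
    for category, s in sorted(stats.items()):
        children.append(
            {
                "object": "block",
                "type": "bulleted_list_item",
                "bulleted_list_item": {
                    "rich_text": [
                        {
                            "type": "text",
                            "text": {"content": f"{category}: {s['packed']}/{s['total']} packed"},
                        }
                    ]
                },
            }
        )
    # `after` (supported by recent Notion API versions) places the blocks right below the heading.
    notion.blocks.children.append(block_id=page_id, children=children, after=heading_block_id)
```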
================================================
FILE: tasks/notion/standard/japan_travel_planner/packing_progress_summary/meta.json
================================================
{
"task_id": "packing_progress_summary",
"task_name": "Packing Progress Summary",
"category_id": "japan_travel_planner",
"category_name": "Japan Travel Planner",
"description": "Update packing list items and create a progress summary section showing statistics for each category.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"data aggregation",
"report generation",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101"
}
}
================================================
FILE: tasks/notion/standard/japan_travel_planner/packing_progress_summary/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that:
1. All Clothes items except hat are marked as packed
2. SIM Card and Wallet entries are checked
3. Packing Progress Summary section is created with statistics
"""
# Find the main Japan Travel Planner page
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Japan Travel Planner")
if not page_id:
print("Error: Page 'Japan Travel Planner' not found.", file=sys.stderr)
return False
# Find the Packing List database
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
packing_list_db_id = None
packing_list_heading_id = None
for i, block in enumerate(all_blocks):
# Find the Packing List heading
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Packing List" in heading_text and "💼" in heading_text:
packing_list_heading_id = block["id"]
# Look for the database after this heading
for j in range(i + 1, len(all_blocks)):
if all_blocks[j].get("type") == "child_database":
packing_list_db_id = all_blocks[j]["id"]
break
break
if not packing_list_db_id:
print("Error: Packing List database not found.", file=sys.stderr)
return False
# Query the database for all items
try:
db_items = notion.databases.query(database_id=packing_list_db_id)
# Track items for verification
clothes_items = []
sim_card_found = False
sim_card_packed = False
wallet_found = False
wallet_packed = False
# Process all items
for page in db_items.get("results", []):
props = page.get("properties", {})
# Get item name
name_prop = props.get("Name", {})
if name_prop.get("type") == "title":
name = "".join(
[t.get("plain_text", "") for t in name_prop.get("title", [])]
)
else:
continue
# Get type (multi_select)
type_prop = props.get("Type", {})
types = []
if type_prop.get("type") == "multi_select":
types = [
opt.get("name", "") for opt in type_prop.get("multi_select", [])
]
# Get packed status
packed_prop = props.get("Packed", {})
packed = False
if packed_prop.get("type") == "checkbox":
packed = packed_prop.get("checkbox", False)
# Check specific items
if name == "SIM Card":
sim_card_found = True
sim_card_packed = packed
elif name == "Wallet":
wallet_found = True
wallet_packed = packed
# Track Clothes items
if "Clothes" in types:
clothes_items.append(
{"name": name, "packed": packed, "is_hat": "hat" in name.lower()}
)
# Verify Clothes items (all packed except hat)
for item in clothes_items:
if item["is_hat"]:
if item["packed"]:
print(
"Error: Hat should not be packed but is marked as packed.",
file=sys.stderr,
)
return False
else:
if not item["packed"]:
print(
f"Error: Clothes item '{item['name']}' should be packed but is not.",
file=sys.stderr,
)
return False
print("Success: All Clothes items are correctly marked (packed except hat).")
# Verify SIM Card and Wallet
if not sim_card_found:
print("Error: SIM Card entry not found.", file=sys.stderr)
return False
if not sim_card_packed:
print("Error: SIM Card entry is not checked (packed).", file=sys.stderr)
return False
if not wallet_found:
print("Error: Wallet entry not found.", file=sys.stderr)
return False
if not wallet_packed:
print("Error: Wallet entry is not checked (packed).", file=sys.stderr)
return False
print("Success: SIM Card and Wallet entries are checked.")
except Exception as e:
print(f"Error querying Packing List database: {e}", file=sys.stderr)
return False
# Expected ground truth statistics
expected_stats = {
"Clothes": {"packed": 12, "total": 13},
"Electronics": {"packed": 1, "total": 10},
"Essentials": {"packed": 1, "total": 12},
"Miscellaneous": {"packed": 0, "total": 10},
"Shoes": {"packed": 0, "total": 2},
"Toiletries": {"packed": 0, "total": 19},
}
# Verify Packing Progress Summary section
# Re-fetch blocks to get updated content
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Find the Packing List heading again and check blocks after it
packing_heading_index = None
for i, block in enumerate(all_blocks):
if block.get("id") == packing_list_heading_id:
packing_heading_index = i
break
summary_found = False
statistics_verified = True
found_statistics = {}
if packing_heading_index is not None:
# Look for summary in the next few blocks
for i in range(
packing_heading_index + 1, min(packing_heading_index + 15, len(all_blocks))
):
block = all_blocks[i]
block_text = notion_utils.get_block_plain_text(block)
# Check for "Packing Progress Summary" paragraph
if "Packing Progress Summary" in block_text:
summary_found = True
# Check if it's bold
if block.get("type") == "paragraph":
rich_text_list = block.get("paragraph", {}).get("rich_text", [])
for text_obj in rich_text_list:
if "Packing Progress Summary" in text_obj.get("text", {}).get(
"content", ""
):
if not text_obj.get("annotations", {}).get("bold", False):
print(
"Error: 'Packing Progress Summary' text is not bold.",
file=sys.stderr,
)
return False
# Check for statistics bullet points in format "Category: X/Y packed"
if (
block.get("type") == "bulleted_list_item"
and ":" in block_text
and "/" in block_text
and "packed" in block_text
):
# Parse the statistic line
# Expected format: "Category: X/Y packed"
try:
parts = block_text.split(":")
if len(parts) >= 2:
category = parts[0].strip()
stats_part = parts[1].strip()
# Extract X/Y from "X/Y packed"
if "/" in stats_part and "packed" in stats_part:
nums = stats_part.split("packed")[0].strip()
if "/" in nums:
x_str, y_str = nums.split("/")
x = int(x_str.strip())
y = int(y_str.strip())
found_statistics[category] = {"packed": x, "total": y}
                except (ValueError, IndexError):
                    pass  # Skip bullet points that do not parse as statistics
if not summary_found:
print(
"Error: 'Packing Progress Summary' section not found after Packing List heading.",
file=sys.stderr,
)
return False
if not found_statistics:
print(
"Error: No valid packing statistics bullet points found in format 'Category: X/Y packed'.",
file=sys.stderr,
)
return False
# Verify the statistics match the expected values
for category, stats in expected_stats.items():
if category not in found_statistics:
print(
f"Error: Category '{category}' missing from Packing Progress Summary.",
file=sys.stderr,
)
statistics_verified = False
else:
found = found_statistics[category]
if found["packed"] != stats["packed"] or found["total"] != stats["total"]:
print(
f"Error: Statistics mismatch for '{category}': expected {stats['packed']}/{stats['total']} packed, found {found['packed']}/{found['total']} packed.",
file=sys.stderr,
)
statistics_verified = False
# Check for extra categories in summary that don't exist in expected
for category in found_statistics:
if category not in expected_stats:
print(
f"Error: Unexpected category '{category}' in summary.", file=sys.stderr
)
statistics_verified = False
if not statistics_verified:
return False
print("Success: Packing Progress Summary section created with correct statistics.")
# print(f"Verified statistics: {', '.join(f'{k}: {v['packed']}/{v['total']} packed' for k, v in expected_stats.items())}")
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/japan_travel_planner/remove_osaka_itinerary/description.md
================================================
Go to Japan Travel Planner and remove the Osaka itinerary items scheduled after 6 PM (excluding 6 PM) on Day 1 and Day 2.
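
For reference, a minimal sketch (not part of the task itself) of how the qualifying entries could be located and removed with the `notion-client` SDK; the token placeholder, the property names (`Group`, `Day`, a `Notes` field holding the time), and the archive-to-delete approach are assumptions based on this template and the verifier below.

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # hypothetical token


def minutes_from_notes(notes: str):
    """Parse strings like '7:30 PM' into minutes since midnight; return None if unparseable."""
    text = notes.strip().upper().split("\n")[0]
    for suffix, offset in (("PM", 12 * 60), ("AM", 0)):
        if suffix in text:
            hours, _, minutes = text.replace(suffix, "").strip().partition(":")
            try:
                return (int(hours) % 12) * 60 + (int(minutes) if minutes else 0) + offset
            except ValueError:
                return None
    return None


def remove_late_osaka_items(itinerary_db_id: str) -> None:
    """Archive Osaka entries on Day 1/Day 2 whose time is strictly after 6 PM."""
    query = notion.databases.query(
        database_id=itinerary_db_id,
        filter={
            "and": [
                {"property": "Group", "select": {"equals": "Osaka"}},
                {
                    "or": [
                        {"property": "Day", "select": {"equals": "Day 1"}},
                        {"property": "Day", "select": {"equals": "Day 2"}},
                    ]
                },
            ]
        },
    )
    for page in query.get("results", []):
        notes = page["properties"].get("Notes", {}).get("rich_text", [])
        minutes = minutes_from_notes(notes[0].get("plain_text", "")) if notes else None
        if minutes is not None and minutes > 18 * 60:  # strictly after 6 PM
            notion.pages.update(page_id=page["id"], archived=True)  # archiving removes the entry
```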
================================================
FILE: tasks/notion/standard/japan_travel_planner/remove_osaka_itinerary/meta.json
================================================
{
"task_id": "remove_osaka_itinerary",
"task_name": "Remove Osaka Itinerary",
"category_id": "japan_travel_planner",
"category_name": "Japan Travel Planner",
"description": "Remove the itinerary items in Osaka after 6 PM from Day 1 and Day 2 travel schedules.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"conditional filtering",
"automated migration"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101"
}
}
================================================
FILE: tasks/notion/standard/japan_travel_planner/remove_osaka_itinerary/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def get_page_title(page_result):
"""Extract title from a page result"""
properties = page_result.get('properties', {})
name_property = properties.get('Name', {})
if name_property.get('type') == 'title':
title_array = name_property.get('title', [])
if title_array and len(title_array) > 0:
return title_array[0].get('plain_text', '')
return ''
def get_page_time(page_result):
"""Extract time from Notes field"""
properties = page_result.get('properties', {})
notes_property = properties.get('Notes', {})
if notes_property.get('type') == 'rich_text':
rich_text_array = notes_property.get('rich_text', [])
if rich_text_array and len(rich_text_array) > 0:
notes_text = rich_text_array[0].get('plain_text', '')
return notes_text.strip()
return ''
def get_page_group(page_result):
"""Extract group/location from page"""
properties = page_result.get('properties', {})
group_property = properties.get('Group', {})
if group_property.get('type') == 'select':
select = group_property.get('select')
if select:
return select.get('name', '')
return ''
def get_page_day(page_result):
"""Extract day from page"""
properties = page_result.get('properties', {})
day_property = properties.get('Day', {})
if day_property.get('type') == 'select':
select = day_property.get('select')
if select:
return select.get('name', '')
return ''
def parse_time_to_minutes(time_str):
"""Convert time string to minutes for comparison
Returns None if time cannot be parsed"""
if not time_str:
return None
# Clean the time string
time_str = time_str.strip().upper()
# Remove any text after the time (e.g., "7:30 PM\n" -> "7:30 PM")
time_str = time_str.split('\n')[0].strip()
# Extract time components
try:
if 'PM' in time_str:
time_part = time_str.replace('PM', '').strip()
if ':' in time_part:
hours, minutes = time_part.split(':')
hours = int(hours)
minutes = int(minutes)
else:
hours = int(time_part)
minutes = 0
# Convert PM hours (add 12 for PM times except 12 PM)
if hours != 12:
hours += 12
return hours * 60 + minutes
elif 'AM' in time_str:
time_part = time_str.replace('AM', '').strip()
if ':' in time_part:
hours, minutes = time_part.split(':')
hours = int(hours)
minutes = int(minutes)
else:
hours = int(time_part)
minutes = 0
# Handle 12 AM (midnight)
if hours == 12:
hours = 0
return hours * 60 + minutes
    except ValueError:
        return None
return None
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that all OSAKA events after 6PM have been removed from Day 1 and Day 2 in the Japan Travel Planner.
Expected items that should be deleted (all in OSAKA, after 6PM, on Day 1 or Day 2):
1. Rikuro's Namba Main Branch - 7 PM (Day 1)
2. Shin Sekai "New World" - 8 PM (Day 2)
3. Katsudon Chiyomatsu - 7:30 PM (Day 2)
4. Ebisubashi Bridge - 9 PM (Day 1)
Note: Kuromon Ichiba Market at 6 PM should NOT be deleted (it's at 6PM, not after)
Items after 6PM on other days (Day 3-8) should NOT be deleted
"""
# Step 1: Find the main Japan Travel Planner page
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Japan Travel Planner page not found.", file=sys.stderr)
return False
else:
# Try to find the page by searching
found_id = notion_utils.find_page(notion, "Japan Travel Planner")
if not found_id:
print("Error: Japan Travel Planner page not found.", file=sys.stderr)
return False
print(f"Found Japan Travel Planner page: {found_id}")
# Step 2: Find the Travel Itinerary database
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
travel_itinerary_db_id = None
for block in all_blocks:
if block and block.get("type") == "child_database":
title = block.get("child_database", {}).get("title", "")
if "Travel Itinerary" in title:
travel_itinerary_db_id = block.get("id")
print(f"Found Travel Itinerary database: {travel_itinerary_db_id}")
break
if not travel_itinerary_db_id:
print("Error: Travel Itinerary database not found", file=sys.stderr)
return False
# Step 3: Query the database for OSAKA items on Day 1 and Day 2
try:
query_result = notion.databases.query(
database_id=travel_itinerary_db_id,
filter={
"and": [
{"property": "Group", "select": {"equals": "Osaka"}},
{"or": [
{"property": "Day", "select": {"equals": "Day 1"}},
{"property": "Day", "select": {"equals": "Day 2"}}
]}
]
}
)
except Exception as e:
print(f"Error querying Travel Itinerary database: {e}", file=sys.stderr)
return False
# Step 4: Check for items that should have been deleted
six_pm_minutes = 18 * 60 # 6 PM in minutes (18:00)
# Expected deleted items (4 specific items after 6 PM on Day 1 and Day 2)
expected_deleted = {
"Rikuro's Namba Main Branch": {"time": "7 PM", "day": "Day 1", "found": False},
"Shin Sekai \"New World\"": {"time": "8 PM", "day": "Day 2", "found": False},
"Katsudon Chiyomatsu": {"time": "7:30 PM", "day": "Day 2", "found": False},
"Ebisubashi Bridge": {"time": "9 PM", "day": "Day 1", "found": False}
}
# Items that should remain (at or before 6 PM)
expected_remaining = {
"Kuromon Ichiba Market": {"time": "6 PM", "found": False}
}
osaka_items_after_6pm = []
osaka_items_at_or_before_6pm = []
# Debug: Show total query results
print(f"Debug: Found {len(query_result.get('results', []))} total OSAKA items on Day 1 and Day 2")
# Process all OSAKA items on Day 1 and Day 2
for page in query_result.get('results', []):
page_title = get_page_title(page).strip()
page_time = get_page_time(page)
page_group = get_page_group(page)
page_day = get_page_day(page)
if page_group != "Osaka":
continue
# Parse time to check if after 6 PM
time_minutes = parse_time_to_minutes(page_time)
if time_minutes is not None and time_minutes > six_pm_minutes:
osaka_items_after_6pm.append({
"title": page_title,
"time": page_time,
"day": page_day,
"id": page.get('id')
})
# Check if this is one of the expected deleted items
for expected_title, expected_info in expected_deleted.items():
# Clean up the titles for comparison
clean_page_title = page_title.strip().lower()
clean_expected_title = expected_title.strip().lower()
# Check for "Rikuro's" or "Rikuro's" (different apostrophe types)
if "rikuro" in clean_page_title and "rikuro" in clean_expected_title:
title_match = True
elif clean_page_title == clean_expected_title:
title_match = True
elif clean_expected_title in clean_page_title or clean_page_title in clean_expected_title:
title_match = True
else:
title_match = False
if title_match and page_day == expected_info["day"]:
print(f"Debug: Found '{page_title}' on {page_day} at {page_time} - matches expected '{expected_title}'")
expected_deleted[expected_title]["found"] = True
elif time_minutes is not None and time_minutes <= six_pm_minutes:
osaka_items_at_or_before_6pm.append({
"title": page_title,
"time": page_time,
"day": page_day,
"id": page.get('id')
})
# Check if this is one of the expected remaining items
for expected_title in expected_remaining:
if expected_title.lower() in page_title.lower() or page_title.lower() in expected_title.lower():
expected_remaining[expected_title]["found"] = True
# Step 5: Verify results
print(f"\nVerification Summary:")
print(f"=" * 50)
all_passed = True
# Check that the 4 expected items after 6 PM have been deleted
print("\n4 Items that should be deleted (after 6 PM on Day 1 and Day 2):")
for item_name, item_info in expected_deleted.items():
if item_info["found"]:
# If found = True, it means the item still exists (was not deleted)
print(f"✗ {item_name} ({item_info['day']}, {item_info['time']}) - Still exists, should be deleted", file=sys.stderr)
all_passed = False
else:
# If found = False, it means the item was deleted correctly
print(f"✓ {item_name} ({item_info['day']}, {item_info['time']}) - Correctly deleted")
# Check that items at or before 6 PM remain
print("\nItems that should remain (at or before 6 PM on Day 1 and Day 2):")
for item_name, item_info in expected_remaining.items():
if item_info["found"]:
print(f"✓ {item_name} ({item_info['time']}) - Correctly retained")
else:
print(f"✗ {item_name} ({item_info['time']}) - Missing, should not be deleted", file=sys.stderr)
all_passed = False
# Report any items after 6 PM that still exist
if osaka_items_after_6pm:
print(f"\n✗ Found {len(osaka_items_after_6pm)} OSAKA item(s) after 6 PM on Day 1/Day 2:", file=sys.stderr)
for item in osaka_items_after_6pm:
print(f" - {item['title']} at {item['time']} ({item['day']})", file=sys.stderr)
else:
print(f"\n✓ No OSAKA items found after 6 PM on Day 1/Day 2 (all correctly deleted)")
# Report count summary
print(f"\nCount Summary:")
print(f"- OSAKA items after 6 PM on Day 1/Day 2 found: {len(osaka_items_after_6pm)} (should be 0)")
print(f"- OSAKA items at/before 6 PM on Day 1/Day 2 found: {len(osaka_items_at_or_before_6pm)}")
print(f"- Expected deletions verified: {sum(1 for item in expected_deleted.values() if not item['found'])}/4")
return all_passed
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
print("\nVerification passed: All 4 required OSAKA events after 6 PM on Day 1 and Day 2 have been removed")
sys.exit(0)
else:
print("\nVerification failed: Some OSAKA events after 6 PM on Day 1/Day 2 still exist")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/description.md
================================================
Please find the restaurants that appear in Day 1 of the Travel Itinerary database, then create corresponding entries in the Expenses database, one restaurant per entry. Set the date uniformly to Jan 1, 2025, and the cost uniformly to $120. Display the restaurant name in the Expense field. Set Category to Dining. For Comment, use the Description from the corresponding restaurant page. Leave other properties empty.
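
For reference, a minimal sketch (not part of the task itself) of how this could be scripted with the `notion-client` SDK. The property names (`Name`, `Description`, `Day`, `Type`, `Expense`, `Date`, `Transaction Amount`, `Category`, `Comment`) mirror the verifier below; the token placeholder and the assumption that the Day 1 restaurants are the entries tagged `Food` are illustrative.

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # hypothetical token


def sync_day1_restaurants(itinerary_db_id: str, expenses_db_id: str) -> None:
    """Create one Expenses entry per Day 1 restaurant found in the Travel Itinerary database."""
    restaurants = notion.databases.query(
        database_id=itinerary_db_id,
        filter={
            "and": [
                {"property": "Day", "select": {"equals": "Day 1"}},
                {"property": "Type", "multi_select": {"contains": "Food"}},
            ]
        },
    ).get("results", [])

    for entry in restaurants:
        props = entry.get("properties", {})
        name = "".join(t.get("plain_text", "") for t in props.get("Name", {}).get("title", []))
        description = "".join(
            t.get("plain_text", "") for t in props.get("Description", {}).get("rich_text", [])
        )
        notion.pages.create(
            parent={"database_id": expenses_db_id},
            properties={
                "Expense": {"title": [{"text": {"content": name}}]},
                "Date": {"date": {"start": "2025-01-01"}},
                "Transaction Amount": {"number": 120},
                "Category": {"multi_select": [{"name": "Dining"}]},
                "Comment": {"rich_text": [{"text": {"content": description}}]},
            },
        )
```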
================================================
FILE: tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/meta.json
================================================
{
"task_id": "restaurant_expenses_sync",
"task_name": "Restaurant Expenses Sync",
"category_id": "japan_travel_planner",
"category_name": "Japan Travel Planner",
"description": "Find restaurants from Day 1 Travel Itinerary and create corresponding entries in the Expenses database.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"conditional filtering",
"database manipulation",
"cross-reference linking",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101"
}
}
================================================
FILE: tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that restaurants from Day 1 of Travel Itinerary have corresponding expense entries.
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Japan Travel Planner")
if not page_id:
print("Error: Page 'Japan Travel Planner' not found.", file=sys.stderr)
return False
# Find Travel Itinerary database
itinerary_db_id = notion_utils.find_database_in_block(
notion, page_id, "Travel Itinerary"
)
if not itinerary_db_id:
print("Error: Database 'Travel Itinerary' not found.", file=sys.stderr)
return False
# Find Expenses database
expenses_db_id = notion_utils.find_database_in_block(notion, page_id, "Expenses")
if not expenses_db_id:
print("Error: Database 'Expenses' not found.", file=sys.stderr)
return False
    # Restaurant descriptions are read from the Travel Itinerary database entries
    places_db_id = notion_utils.find_database_in_block(
        notion, page_id, "Travel Itinerary"
    )
    if not places_db_id:
        print("Error: Database 'Travel Itinerary' not found.", file=sys.stderr)
        return False
# Query Day 1 restaurants from Travel Itinerary
try:
itinerary_results = notion.databases.query(
database_id=itinerary_db_id,
filter={
"and": [
{"property": "Day", "select": {"equals": "Day 1"}},
{"property": "Type", "multi_select": {"contains": "Food"}},
]
},
).get("results", [])
except Exception as e:
print(f"Error querying Travel Itinerary database: {e}", file=sys.stderr)
return False
if not itinerary_results:
print(
"Error: No restaurants found for Day 1 in Travel Itinerary.",
file=sys.stderr,
)
return False
# Extract restaurant names
restaurant_names = []
for entry in itinerary_results:
props = entry.get("properties", {})
name_prop = props.get("Name", {})
name_text = "".join(t.get("plain_text", "") for t in name_prop.get("title", []))
if name_text:
restaurant_names.append(name_text.strip())
if not restaurant_names:
print("Error: No restaurant names found in Day 1 entries.", file=sys.stderr)
return False
    # Fetch restaurant descriptions from the Travel Itinerary database
try:
places_results = notion.databases.query(database_id=places_db_id).get(
"results", []
)
except Exception as e:
print(f"Error querying Japan Places to Visit database: {e}", file=sys.stderr)
return False
# Create a map of restaurant names to descriptions
restaurant_descriptions = {}
for place in places_results:
props = place.get("properties", {})
name_prop = props.get("Name", {})
name_text = "".join(t.get("plain_text", "") for t in name_prop.get("title", []))
desc_prop = props.get("Description", {})
desc_text = "".join(
t.get("plain_text", "") for t in desc_prop.get("rich_text", [])
)
if name_text and desc_text:
restaurant_descriptions[name_text.strip()] = desc_text.strip()
# Query Expenses database
try:
expenses_results = notion.databases.query(database_id=expenses_db_id).get(
"results", []
)
except Exception as e:
print(f"Error querying Expenses database: {e}", file=sys.stderr)
return False
# Verify each restaurant has a corresponding expense entry
verified_restaurants = []
for restaurant_name in restaurant_names:
found_matching_expense = False
expected_description = restaurant_descriptions.get(restaurant_name, "")
for expense in expenses_results:
props = expense.get("properties", {})
# Check Expense field (title)
expense_prop = props.get("Expense", {})
expense_text = "".join(
t.get("plain_text", "") for t in expense_prop.get("title", [])
)
if expense_text.strip() != restaurant_name:
continue
# Check Date
date_prop = props.get("Date", {})
date_start = date_prop.get("date", {}).get("start")
if date_start != "2025-01-01":
continue
# Check Transaction Amount
amount_prop = props.get("Transaction Amount", {})
amount = amount_prop.get("number")
if amount != 120:
continue
# Check Category contains Dining
category_prop = props.get("Category", {})
categories = [c.get("name") for c in category_prop.get("multi_select", [])]
if "Dining" not in categories:
continue
# Check Comment matches description (if description exists)
if expected_description:
comment_prop = props.get("Comment", {})
comment_text = "".join(
t.get("plain_text", "") for t in comment_prop.get("rich_text", [])
)
if comment_text.strip().replace(
"\u202f", " "
) != expected_description.replace("\u202f", " "):
continue
found_matching_expense = True
verified_restaurants.append(restaurant_name)
break
if not found_matching_expense:
print(
f"Error: No matching expense entry found for restaurant '{restaurant_name}'.",
file=sys.stderr,
)
return False
if len(verified_restaurants) == len(restaurant_names):
print(
f"Success: Found matching expense entries for all {len(restaurant_names)} Day 1 restaurants."
)
return True
else:
print(
f"Error: Only {len(verified_restaurants)} out of {len(restaurant_names)} restaurants have matching expense entries.",
file=sys.stderr,
)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/online_resume/layout_adjustment/description.md
================================================
Please go to my Online Resume page and adjust the Skills display with the following requirements:
## Skills Section Adjustment
1. Delete the Skills database from the right side of the page
2. Add a new Skills section on the left side, under the Languages section
3. Format skills as "[icon] skill description (type)", for example "✨✨ Photoshop (Design Tool)"
- Use ✨✨ icon for skills with level >= 50%
- Use ✨ icon for skills with level < 50%
## Work History and Education Layout Adjustment
1. Adjust the layout so that logo/image columns take up 50% width in each section
- Note: Column width ratio might not be returned by API when columns are equal (50/50)
2. Replace all images/icons with black placeholder images using URL containing "https://singlecolorimage.com/get/000000/1024x128"
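
For reference, a minimal sketch (not part of the task itself) of how the new Skills section could be rendered and appended with the `notion-client` SDK. The `Skill` and `Skill Level` property names come from this template, while the `Type` select property and the left-column block ID are assumptions; the heading, divider, and bold paragraph structure follows the verifier below.

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # hypothetical token


def skill_lines(skills_db_id: str):
    """Render each skill as '[icon] name (type)', e.g. '✨✨ Photoshop (Design Tool)'."""
    lines = []
    for row in notion.databases.query(database_id=skills_db_id).get("results", []):
        props = row.get("properties", {})
        name = "".join(t.get("plain_text", "") for t in props.get("Skill", {}).get("title", []))
        level = props.get("Skill Level", {}).get("number") or 0  # stored as a fraction, e.g. 0.6
        skill_type = (props.get("Type", {}).get("select") or {}).get("name", "")  # assumed property
        icon = "✨✨" if level >= 0.5 else "✨"
        lines.append(f"{icon} {name} ({skill_type})")
    return lines


def append_skills_section(left_column_id: str, skills_db_id: str) -> None:
    """Append a Skills heading, a divider, and one bold paragraph per skill to the left column."""
    children = [
        {"object": "block", "type": "heading_2",
         "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Skills"}}]}},
        {"object": "block", "type": "divider", "divider": {}},
    ]
    for line in skill_lines(skills_db_id):
        children.append(
            {"object": "block", "type": "paragraph",
             "paragraph": {"rich_text": [{"type": "text", "text": {"content": line},
                                          "annotations": {"bold": True}}]}}
        )
    notion.blocks.children.append(block_id=left_column_id, children=children)
```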
================================================
FILE: tasks/notion/standard/online_resume/layout_adjustment/meta.json
================================================
{
"task_id": "layout_adjustment",
"task_name": "Layout Adjustment",
"category_id": "online_resume",
"category_name": "Online Resume",
"description": "This task involves modifying the layout and content of an online resume page by restructuring the Skills section with icon indicators and adjusting the Work History and Education sections to use equal column widths with placeholder images.",
"author": "Xiangyan Liu",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"content organization",
"visual formatting",
"conditional filtering",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume"
}
}
================================================
FILE: tasks/notion/standard/online_resume/layout_adjustment/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Skills display has been adjusted correctly:
1. Skills database on the right side should be deleted
2. Skills section should be added on the left side under Languages
3. Skills should be formatted with correct icons based on skill level
4. Work History and Education sections should use black placeholder images
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Online Resume")
if not page_id:
print("Error: Page 'Online Resume' not found.", file=sys.stderr)
return False
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Step 1: Verify Skills database is NOT in the right column anymore
# Find the main column list
for block in all_blocks:
if block.get("type") == "column_list":
column_list_id = block["id"]
columns = notion_utils.get_all_blocks_recursively(notion, column_list_id)
# Check if this is the main two-column layout
if len(columns) == 2:
# Find the right column (usually the one with larger width ratio)
for column in columns:
if column.get("type") == "column":
width_ratio = column.get("column", {}).get("width_ratio", 0)
# Right column typically has width_ratio > 0.5
if width_ratio > 0.5:
right_column_id = column["id"]
right_column_blocks = notion_utils.get_all_blocks_recursively(
notion, right_column_id
)
# Check if Skills database exists in right column
for right_block in right_column_blocks:
if (
right_block.get("type") == "child_database"
and right_block.get("child_database", {}).get("title") == "Skills"
):
print(
"Error: Skills database still exists in the right column.",
file=sys.stderr,
)
return False
# Step 2: Find the left column and verify Skills section exists there
skills_section_found = False
skills_with_double_sparkles = []
skills_with_single_sparkle = []
# First, find the main column_list (top-level)
main_column_list_id = None
for block in all_blocks:
if block.get("type") == "column_list" and block.get("parent", {}).get("type") == "page_id":
main_column_list_id = block["id"]
break
if not main_column_list_id:
print("Error: Main column list not found.", file=sys.stderr)
return False
# Get the columns directly
columns = notion_utils.get_all_blocks_recursively(notion, main_column_list_id)
# Find the left column (the one with width_ratio around 0.25)
left_column_id = None
for column in columns:
if column.get("type") == "column":
width_ratio = column.get("column", {}).get("width_ratio", 0)
# Left column has width_ratio around 0.25
if 0.2 <= width_ratio <= 0.3:
left_column_id = column["id"]
break
if not left_column_id:
print("Error: Left column not found.", file=sys.stderr)
return False
# Get all blocks in the left column
left_column_blocks = notion_utils.get_all_blocks_recursively(notion, left_column_id)
# Find Languages heading
languages_index = -1
for i, left_block in enumerate(left_column_blocks):
if (
left_block.get("type") == "heading_2"
and "Languages" in notion_utils.get_block_plain_text(left_block)
):
languages_index = i
break
if languages_index == -1:
print("Error: Languages heading not found in left column.", file=sys.stderr)
return False
# Look for Skills heading after Languages
for i in range(languages_index + 1, len(left_column_blocks)):
left_block = left_column_blocks[i]
if (
left_block.get("type") == "heading_2"
and "Skills" in notion_utils.get_block_plain_text(left_block)
):
skills_section_found = True
# Check divider after Skills heading
if i + 1 < len(left_column_blocks):
next_block = left_column_blocks[i + 1]
if next_block.get("type") != "divider":
print(
"Error: Divider not found after Skills heading.",
file=sys.stderr,
)
return False
# Collect skills after divider
for j in range(i + 2, len(left_column_blocks)):
skill_block = left_column_blocks[j]
if skill_block.get("type") == "paragraph":
skill_text = notion_utils.get_block_plain_text(skill_block)
if skill_text and skill_text.strip(): # Check for non-empty text
# Check if text is bold
rich_text = skill_block.get("paragraph", {}).get("rich_text", [])
if rich_text and not rich_text[0].get("annotations", {}).get("bold"):
print(
f"Error: Skill '{skill_text}' is not bold.",
file=sys.stderr,
)
return False
# Check icon format
if skill_text.startswith("✨✨"):
skills_with_double_sparkles.append(skill_text)
elif skill_text.startswith("✨"):
skills_with_single_sparkle.append(skill_text)
else:
print(
f"Error: Skill '{skill_text}' doesn't start with sparkle icon.",
file=sys.stderr,
)
return False
# Check format includes type in parentheses
if "(" not in skill_text or ")" not in skill_text:
print(
f"Error: Skill '{skill_text}' doesn't include type in parentheses.",
file=sys.stderr,
)
return False
elif skill_block.get("type") in ["heading_1", "heading_2", "heading_3"]:
# Stop when we reach another section
break
break
if not skills_section_found:
print(
"Error: Skills section not found in the left column under Languages.",
file=sys.stderr,
)
return False
# Step 3: Verify we have the expected skills
expected_double_sparkle_skills = [
"Photoshop",
"Figma",
"Notion",
"Framer"
]
expected_single_sparkle_skills = [
"Webflow",
"Rive",
"CSS + Basic JS"
]
# Check if all expected skills are present
for skill_name in expected_double_sparkle_skills:
found = any(skill_name in skill for skill in skills_with_double_sparkles)
if not found:
print(
f"Error: Expected skill '{skill_name}' with ✨✨ not found.",
file=sys.stderr,
)
return False
for skill_name in expected_single_sparkle_skills:
found = any(skill_name in skill for skill in skills_with_single_sparkle)
if not found:
print(
f"Error: Expected skill '{skill_name}' with ✨ not found.",
file=sys.stderr,
)
return False
# Step 4: Verify Work History and Education sections have black placeholder images
work_history_images_found = 0
education_images_found = 0
black_placeholder_url = "https://singlecolorimage.com/get/000000/"
# Find Work History and Education sections in the right column
right_column_id = None
for column in columns:
if column.get("type") == "column":
width_ratio = column.get("column", {}).get("width_ratio", 0.5)
# Right column has width_ratio around 0.75 or no width_ratio (which means equal split)
if width_ratio > 0.6 or width_ratio == 0.5:
right_column_id = column["id"]
break
if right_column_id:
right_column_blocks = notion_utils.get_all_blocks_recursively(notion, right_column_id)
# Find Work History section
work_history_index = -1
education_index = -1
for i, block in enumerate(right_column_blocks):
if block.get("type") == "heading_1":
heading_text = notion_utils.get_block_plain_text(block)
if "Work History" in heading_text:
work_history_index = i
elif "Education" in heading_text:
education_index = i
# Check Work History column lists for images
if work_history_index != -1:
for i in range(work_history_index + 1, min(education_index if education_index > work_history_index else len(right_column_blocks), len(right_column_blocks))):
block = right_column_blocks[i]
if block.get("type") == "column_list":
column_list_blocks = notion_utils.get_all_blocks_recursively(notion, block["id"])
for column in column_list_blocks:
if column.get("type") == "column":
# Check width_ratio - must be 50% (0.5) or absent (which defaults to 50%)
col_width = column.get("column", {}).get("width_ratio")
# First column should be image column (either no ratio=50%, or exactly 0.5)
if col_width is None or col_width == 0.5:
column_contents = notion_utils.get_all_blocks_recursively(notion, column["id"])
for content_block in column_contents:
if content_block.get("type") == "embed":
embed_url = content_block.get("embed", {}).get("url", "")
if black_placeholder_url in embed_url:
work_history_images_found += 1
elif content_block.get("type") == "image":
# Also check for image blocks with external URL
image_url = content_block.get("image", {}).get("external", {}).get("url", "")
if black_placeholder_url in image_url:
work_history_images_found += 1
break # Only check first column
# Check Education column list for images
if education_index != -1:
for i in range(education_index + 1, len(right_column_blocks)):
block = right_column_blocks[i]
if block.get("type") == "heading_1":
break # Stop at next section
if block.get("type") == "column_list":
column_list_blocks = notion_utils.get_all_blocks_recursively(notion, block["id"])
for column in column_list_blocks:
if column.get("type") == "column":
# Check width_ratio - must be 50% (0.5) or absent (which defaults to 50%)
col_width = column.get("column", {}).get("width_ratio")
# First column should be image column (either no ratio=50%, or exactly 0.5)
if col_width is None or col_width == 0.5:
column_contents = notion_utils.get_all_blocks_recursively(notion, column["id"])
for content_block in column_contents:
if content_block.get("type") == "embed":
embed_url = content_block.get("embed", {}).get("url", "")
if black_placeholder_url in embed_url:
education_images_found += 1
elif content_block.get("type") == "image":
image_url = content_block.get("image", {}).get("external", {}).get("url", "")
if black_placeholder_url in image_url:
education_images_found += 1
break # Only check first column
break # Only check first column_list in Education
# Verify images were found
if work_history_images_found < 2:
print(
f"Warning: Expected at least 2 Work History images with black placeholder, found {work_history_images_found}.",
file=sys.stderr,
)
return False
if education_images_found < 1:
print(
f"Warning: Expected at least 1 Education image with black placeholder, found {education_images_found}.",
file=sys.stderr,
)
return False
print("Success: Skills display adjusted correctly.")
print(f"- Found {len(skills_with_double_sparkles)} skills with ✨✨ (skill level >= 50%)")
print(f"- Found {len(skills_with_single_sparkle)} skills with ✨ (skill level < 50%)")
print("- Skills database removed from right column")
print("- Skills section added to left column under Languages")
print(f"- Found {work_history_images_found} Work History images with black placeholder")
print(f"- Found {education_images_found} Education images with black placeholder")
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/online_resume/projects_section_update/description.md
================================================
Find the page named "Online Resume" and reorganize the projects section to showcase only the most recent and relevant work.
**Task Requirements:**
1. Delete the project named "Knitties eComm Website" from the Projects database since it's from 2022 and no longer relevant
2. Create a new project entry called "Zapier Dashboard Redesign" with:
- Description: "Led the complete redesign of Zapier's main dashboard, focusing on improved usability and modern design patterns. Implemented new navigation system and responsive layouts."
- Date: Start "2024-01-01", End "2024-06-30"
- Tags: Add the existing "UI Design" tag, and create a new tag "Enterprise" with purple color, then add both tags to this project
- Phone: Same as the phone number under the Contact section
- Url: Same as the personal website under the Contact section
3. After the Projects database block, add the following blocks in sequence:
- A divider block
- A heading_2 block with text "Current Focus"
- A paragraph block with content that dynamically references:
- The highest skill level from your Skills database (find the skill with the highest Skill Level percentage)
- Incorporate this into the text: "The Zapier Dashboard Redesign represents my most impactful recent work, leveraging my expertise in [highest skill name] ([skill level]%) to deliver enterprise-grade solutions that prioritize both aesthetics and functionality."
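
For reference, a minimal sketch (not part of the task itself) of how the new entry and the trailing blocks could be created with the `notion-client` SDK. The token placeholder, the phone/website arguments (copied from the Contact section), and the use of the `after` parameter are assumptions; setting the purple color on the new `Enterprise` tag would additionally require updating the Tags options via `databases.update`.

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # hypothetical token

DESCRIPTION = (
    "Led the complete redesign of Zapier's main dashboard, focusing on improved usability "
    "and modern design patterns. Implemented new navigation system and responsive layouts."
)


def add_zapier_project(projects_db_id: str, phone: str, website: str) -> None:
    """Create the 'Zapier Dashboard Redesign' entry; phone/website come from the Contact section."""
    notion.pages.create(
        parent={"database_id": projects_db_id},
        properties={
            "Name": {"title": [{"text": {"content": "Zapier Dashboard Redesign"}}]},
            "Description": {"rich_text": [{"text": {"content": DESCRIPTION}}]},
            "Date": {"date": {"start": "2024-01-01", "end": "2024-06-30"}},
            # pages.create alone assigns a default color to a new option; the purple color for
            # "Enterprise" has to be set on the Tags property schema via databases.update.
            "Tags": {"multi_select": [{"name": "UI Design"}, {"name": "Enterprise"}]},
            "Phone": {"phone_number": phone},
            "Url": {"url": website},
        },
    )


def append_current_focus(page_id: str, projects_db_block_id: str, skill: str, level_pct: int) -> None:
    """Append divider + 'Current Focus' heading + summary paragraph after the Projects database."""
    text = (
        "The Zapier Dashboard Redesign represents my most impactful recent work, leveraging my "
        f"expertise in {skill} ({level_pct}%) to deliver enterprise-grade solutions that prioritize "
        "both aesthetics and functionality."
    )
    notion.blocks.children.append(
        block_id=page_id,
        after=projects_db_block_id,  # `after` requires a Notion API version with positioned appends
        children=[
            {"object": "block", "type": "divider", "divider": {}},
            {"object": "block", "type": "heading_2",
             "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Current Focus"}}]}},
            {"object": "block", "type": "paragraph",
             "paragraph": {"rich_text": [{"type": "text", "text": {"content": text}}]}},
        ],
    )
```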
================================================
FILE: tasks/notion/standard/online_resume/projects_section_update/meta.json
================================================
{
"task_id": "projects_section_update",
"task_name": "Projects Section Update",
"category_id": "online_resume",
"category_name": "Online Resume",
"description": "Reorganize the projects section by removing outdated projects and adding new relevant work with proper formatting.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"template population",
"data aggregation",
"visual formatting",
"cross-reference linking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume"
}
}
================================================
FILE: tasks/notion/standard/online_resume/projects_section_update/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the projects section has been reorganized correctly with cross-section references.
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Online Resume")
if not page_id:
print("Error: Page 'Online Resume' not found.", file=sys.stderr)
return False
# Find the Projects database
projects_db_id = notion_utils.find_database_in_block(notion, page_id, "Projects")
if not projects_db_id:
print("Error: Database 'Projects' not found.", file=sys.stderr)
return False
# Find the Skills database to get the highest skill level
skills_db_id = notion_utils.find_database_in_block(notion, page_id, "Skills")
if not skills_db_id:
print("Error: Database 'Skills' not found.", file=sys.stderr)
return False
# Query Skills database to find the highest skill level
skills_results = notion.databases.query(database_id=skills_db_id).get("results", [])
highest_skill_name = ""
highest_skill_level = 0
for skill_page in skills_results:
properties = skill_page.get("properties", {})
skill_name_prop = properties.get("Skill", {}).get("title", [])
skill_level_prop = properties.get("Skill Level", {}).get("number")
if skill_name_prop and skill_level_prop is not None:
skill_name = skill_name_prop[0].get("text", {}).get("content", "")
if skill_level_prop > highest_skill_level:
highest_skill_level = skill_level_prop
highest_skill_name = skill_name
if not highest_skill_name:
print("Error: Could not find any skills with skill levels.", file=sys.stderr)
return False
# Query Projects database
projects_results = notion.databases.query(database_id=projects_db_id).get(
"results", []
)
# Check that "Knitties eComm Website" is deleted
for page in projects_results:
properties = page.get("properties", {})
name_prop = properties.get("Name", {}).get("title", [])
if (
name_prop
and name_prop[0].get("text", {}).get("content") == "Knitties eComm Website"
):
print(
"Failure: 'Knitties eComm Website' project was not deleted.",
file=sys.stderr,
)
return False
# Check that "Zapier Dashboard Redesign" exists with correct properties
zapier_project_found = False
for page in projects_results:
properties = page.get("properties", {})
name_prop = properties.get("Name", {}).get("title", [])
if (
name_prop
and name_prop[0].get("text", {}).get("content")
== "Zapier Dashboard Redesign"
):
zapier_project_found = True
# Check description contains reference to UI Design Internship
desc_prop = properties.get("Description", {}).get("rich_text", [])
if not desc_prop:
print("Failure: Zapier project has no description.", file=sys.stderr)
return False
description_text = desc_prop[0].get("text", {}).get("content", "")
base_desc = "Led the complete redesign of Zapier's main dashboard, focusing on improved usability and modern design patterns. Implemented new navigation system and responsive layouts."
if base_desc not in description_text:
print(
"Failure: Zapier project description is missing base content.",
file=sys.stderr,
)
return False
# Check date
date_prop = properties.get("Date", {}).get("date", {})
if (
not date_prop
or date_prop.get("start") != "2024-01-01"
or date_prop.get("end") != "2024-06-30"
):
print(
"Failure: Zapier project date range is incorrect.", file=sys.stderr
)
return False
# Check tags
tags_prop = properties.get("Tags", {}).get("multi_select", [])
tag_names = {tag.get("name") for tag in tags_prop}
if "UI Design" not in tag_names or "Enterprise" not in tag_names:
print(
"Failure: Zapier project is missing required tags.", file=sys.stderr
)
return False
# Check phone
phone_prop = properties.get("Phone", {}).get("phone_number", [])
if not phone_prop or phone_prop != "+44 7871263013":
print(
"Failure: Zapier project phone number is incorrect.",
file=sys.stderr,
)
return
# Check url
url_prop = properties.get("Url", {}).get("url", [])
if not url_prop or url_prop != "www.zinenwine.com":
print("Failure: Zapier project url is incorrect.", file=sys.stderr)
return
# Check Enterprise tag color
enterprise_tag_purple = False
for tag in tags_prop:
if tag.get("name") == "Enterprise" and tag.get("color") == "purple":
enterprise_tag_purple = True
break
if not enterprise_tag_purple:
print(
"Failure: Enterprise tag does not have purple color.",
file=sys.stderr,
)
return False
break
if not zapier_project_found:
print(
"Failure: 'Zapier Dashboard Redesign' project not found.", file=sys.stderr
)
return False
# Find the Projects database block and verify blocks after it
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Find the Projects database block
projects_db_index = -1
for i, block in enumerate(all_blocks):
if (
block.get("type") == "child_database"
and block.get("child_database", {}).get("title") == "Projects"
):
projects_db_index = i
break
if projects_db_index == -1:
print("Error: Could not find Projects database block.", file=sys.stderr)
return False
# Check blocks after Projects database
    if projects_db_index + 3 >= len(all_blocks):
print("Failure: Not enough blocks after Projects database.", file=sys.stderr)
return False
# Check divider block
divider_block = all_blocks[projects_db_index + 1]
if divider_block.get("type") != "divider":
print(
"Failure: Expected divider block after Projects database.", file=sys.stderr
)
return False
# Check heading block
heading_block = all_blocks[projects_db_index + 2]
if heading_block.get("type") != "heading_2":
print("Failure: Expected heading_2 block after divider.", file=sys.stderr)
return False
heading_text = heading_block.get("heading_2", {}).get("rich_text", [])
if (
not heading_text
or heading_text[0].get("text", {}).get("content") != "Current Focus"
):
print("Failure: Heading text is incorrect.", file=sys.stderr)
return False
# Check paragraph block with dynamic skill reference
paragraph_block = all_blocks[projects_db_index + 3]
if paragraph_block.get("type") != "paragraph":
print("Failure: Expected paragraph block after heading.", file=sys.stderr)
return False
paragraph_text = paragraph_block.get("paragraph", {}).get("rich_text", [])
if not paragraph_text:
print("Failure: Paragraph block is empty.", file=sys.stderr)
return False
paragraph_content = paragraph_text[0].get("text", {}).get("content", "")
# Check that paragraph contains the base text
base_text = "The Zapier Dashboard Redesign represents my most impactful recent work, leveraging my expertise in"
if base_text not in paragraph_content:
print("Failure: Paragraph does not contain base text.", file=sys.stderr)
return False
# Check that paragraph references the highest skill
skill_level_percent = int(highest_skill_level * 100)
expected_skill_ref = f"{highest_skill_name} ({skill_level_percent}%)"
if expected_skill_ref not in paragraph_content:
print(
f"Failure: Paragraph does not reference highest skill '{expected_skill_ref}'.",
file=sys.stderr,
)
return False
# Check that paragraph contains the ending text
ending_text = (
"enterprise-grade solutions that prioritize both aesthetics and functionality"
)
if ending_text not in paragraph_content:
print(
"Failure: Paragraph does not contain proper ending text.", file=sys.stderr
)
return False
print(
f"Success: Projects section has been reorganized correctly with cross-section references (highest skill: {highest_skill_name} at {skill_level_percent}%)."
)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/online_resume/skills_development_tracker/description.md
================================================
Create a comprehensive skills audit system by performing the following tasks:
**Task Requirements:**
1. Create a new database named "Skills Development Tracker" as a child database in the main resume page with the following properties:
- Name (title property)
- Current Skill (relation to Skills database)
- Current Proficiency (rollup from related skill's "Skill Level" property)
- Target Proficiency (number property with format "percent")
- Gap (formula: Target Proficiency - Current Proficiency)
- Learning Resources (rich text property)
- Progress Notes (rich text property)
2. Populate the Skills Development Tracker database with entries for all skills that have a proficiency level below 70% (0.7):
- For each qualifying skill, create an entry with:
- Name: "[Skill Name] Development Plan"
- Link to the corresponding skill in Skills database
- Target Proficiency: Set to Current + 25% (capped at 95%)
- Learning Resources: "Online courses and practice projects"
- Progress Notes: "Initial assessment completed"
3. Create a callout block immediately after the Skills section (after the Skills database) with:
- Background color: blue_background
- Icon: 🎯 (target emoji)
- Content: "Focus Areas: [3 skills with lowest current proficiency]"
================================================
FILE: tasks/notion/standard/online_resume/skills_development_tracker/meta.json
================================================
{
"task_id": "skills_development_tracker",
"task_name": "Skills Development Tracker",
"category_id": "online_resume",
"category_name": "Online Resume",
"description": "Create a comprehensive skills audit system with development tracking for skills below 70% proficiency.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"cross-reference linking",
"conditional filtering",
"data aggregation",
"template population",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume"
}
}
================================================
FILE: tasks/notion/standard/online_resume/skills_development_tracker/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Skills Development Tracker database and callout block were created correctly.
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "New Online Resume")
if not page_id:
print("Error: Page 'New Online Resume' not found.", file=sys.stderr)
return False
# Step 1: Verify Skills Development Tracker database exists
tracker_db_id = notion_utils.find_database_in_block(
notion, page_id, "Skills Development Tracker"
)
if not tracker_db_id:
print(
"Error: Database 'Skills Development Tracker' not found.", file=sys.stderr
)
return False
# Step 2: Verify database schema
try:
db_info = notion.databases.retrieve(database_id=tracker_db_id)
properties = db_info.get("properties", {})
# Check required properties
required_props = {
"Name": "title",
"Current Skill": "relation",
"Current Proficiency": "rollup",
"Target Proficiency": "number",
"Gap": "formula",
"Learning Resources": "rich_text",
"Progress Notes": "rich_text",
}
for prop_name, expected_type in required_props.items():
if prop_name not in properties:
print(
f"Error: Property '{prop_name}' not found in database.",
file=sys.stderr,
)
return False
if properties[prop_name]["type"] != expected_type:
print(
f"Error: Property '{prop_name}' has incorrect type. Expected '{expected_type}', got '{properties[prop_name]['type']}'.",
file=sys.stderr,
)
return False
# Verify Target Proficiency is percent format
if (
properties["Target Proficiency"].get("number", {}).get("format")
!= "percent"
):
print(
"Error: Target Proficiency should have 'percent' format.",
file=sys.stderr,
)
return False
except Exception as e:
print(f"Error retrieving database info: {e}", file=sys.stderr)
return False
# Step 3: Get Skills database to check entries
skills_db_id = notion_utils.find_database_in_block(notion, page_id, "Skills")
if not skills_db_id:
print("Error: Skills database not found.", file=sys.stderr)
return False
# Get all skills with proficiency < 70%
skills_below_70 = []
try:
skills_results = notion.databases.query(database_id=skills_db_id).get(
"results", []
)
for skill in skills_results:
skill_level = (
skill.get("properties", {}).get("Skill Level", {}).get("number", 1.0)
)
if skill_level < 0.7:
skill_name = (
skill.get("properties", {}).get("Skill", {}).get("title", [])
)
if skill_name:
skill_name_text = skill_name[0].get("text", {}).get("content", "")
skills_below_70.append(
{
"name": skill_name_text,
"id": skill["id"],
"level": skill_level,
}
)
except Exception as e:
print(f"Error querying Skills database: {e}", file=sys.stderr)
return False
if not skills_below_70:
print("Warning: No skills found with proficiency below 70%.", file=sys.stderr)
# This might be OK if all skills are above 70%
# Step 4: Verify entries in Skills Development Tracker
try:
tracker_results = notion.databases.query(database_id=tracker_db_id).get(
"results", []
)
# Check that we have entries for skills below 70%
if len(skills_below_70) > 0 and len(tracker_results) == 0:
print(
"Error: No entries found in Skills Development Tracker database.",
file=sys.stderr,
)
return False
# Verify each entry
for entry in tracker_results:
props = entry.get("properties", {})
# Check name format
name_prop = props.get("Name", {}).get("title", [])
if not name_prop:
print("Error: Entry missing Name property.", file=sys.stderr)
return False
name_text = name_prop[0].get("text", {}).get("content", "")
if not name_text.endswith(" Development Plan"):
print(
f"Error: Entry name '{name_text}' doesn't follow expected format.",
file=sys.stderr,
)
return False
# Check relation to Skills database
skill_relation = props.get("Current Skill", {}).get("relation", [])
if not skill_relation:
print(
f"Error: Entry '{name_text}' missing Current Skill relation.",
file=sys.stderr,
)
return False
# Check Target Proficiency (should be set)
target_prof = props.get("Target Proficiency", {}).get("number")
if target_prof is None:
print(
f"Error: Entry '{name_text}' missing Target Proficiency.",
file=sys.stderr,
)
return False
# Check Learning Resources
learning_resources = props.get("Learning Resources", {}).get(
"rich_text", []
)
if not learning_resources:
print(
f"Error: Entry '{name_text}' missing Learning Resources.",
file=sys.stderr,
)
return False
# Check Progress Notes
progress_notes = props.get("Progress Notes", {}).get("rich_text", [])
if not progress_notes:
print(
f"Error: Entry '{name_text}' missing Progress Notes.",
file=sys.stderr,
)
return False
except Exception as e:
print(f"Error querying Skills Development Tracker: {e}", file=sys.stderr)
return False
# Step 5: Verify callout block exists after Skills section
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Find Skills database block
skills_db_block_index = None
for i, block in enumerate(all_blocks):
if (
block.get("type") == "child_database"
and block.get("child_database", {}).get("title") == "Skills"
):
skills_db_block_index = i
break
if skills_db_block_index is None:
print("Error: Could not find Skills database block.", file=sys.stderr)
return False
# Look for callout block after Skills database
callout_found = False
    # Guard against the Skills database being the last block on the page.
    if skills_db_block_index + 1 >= len(all_blocks):
        print("Error: No block found after the Skills database.", file=sys.stderr)
        return False
    block = all_blocks[skills_db_block_index + 1]
if block.get("type") == "callout":
callout_data = block.get("callout", {})
# Check background color
if callout_data.get("color") != "blue_background":
print("Error: Could not find callout block with blue background.")
return False
# Check icon
icon = callout_data.get("icon", {})
if icon.get("type") != "emoji" or icon.get("emoji") != "🎯":
print("Error: Could not find callout block with 🎯 emoji.")
return False
# Check content starts with "Focus Areas:"
rich_text = callout_data.get("rich_text", [])
if rich_text:
content = rich_text[0].get("text", {}).get("content", "")
if (
content.startswith("Focus Areas:")
and "CSS + Basic JS" in content
and "Webflow" in content
and "Rive" in content
):
callout_found = True
print(f"Success: Found callout block with content: {content}")
else:
print("Error: Could not find callout block with required text content.")
return False
if not callout_found:
print(
"Error: Could not find callout block with Focus Areas after Skills section.",
file=sys.stderr,
)
return False
print(
"Success: Skills Development Tracker database and callout block verified successfully."
)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/online_resume/work_history_addition/description.md
================================================
Hi! I realized I forgot to include one work experience on my resume page titled "Online Resume." Could you please help me add it to the "Work History" section?
The position is "Research Assistant," and it took place from January to August 2023. The description should be: "Assisted in conducting user experience research projects at my bachelor’s program, supporting data collection, analyzing user feedback, and preparing research reports. Developed strong skills in research methodologies and improved collaboration with interdisciplinary teams."
For the image or logo, please use the one from the "Education" section (my bachelor’s school) to keep everything consistent.
Also, please make sure that the formatting — including font style, size, and layout — matches the existing entries in the Work History section so it looks seamless.
Thank you!
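For reference, here is a minimal sketch of the kind of API call this edit maps to, using the `notion_client` SDK already used by this repo's verify scripts. The page ID and logo URL are placeholders, the two-column layout with a bold title, an italic gray date, and a plain description mirrors the existing entries, and the sketch uses an external image URL for brevity (the actual task reuses the Notion-hosted image from the Education section).

```python
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

PAGE_ID = "<online-resume-page-id>"                    # placeholder
EDUCATION_LOGO_URL = "<education-section-image-url>"   # placeholder

DESCRIPTION = (
    "Assisted in conducting user experience research projects at my bachelor’s "
    "program, supporting data collection, analyzing user feedback, and preparing "
    "research reports. Developed strong skills in research methodologies and "
    "improved collaboration with interdisciplinary teams."
)

def text(content, **annotations):
    """Build a single rich_text run with optional annotations (bold, italic, color)."""
    run = {"type": "text", "text": {"content": content}}
    if annotations:
        run["annotations"] = annotations
    return run

new_entry = {
    "object": "block",
    "type": "column_list",
    "column_list": {
        "children": [
            {   # Narrow column holding the logo reused from the Education section.
                "object": "block",
                "type": "column",
                "column": {"children": [
                    {"object": "block", "type": "image",
                     "image": {"type": "external", "external": {"url": EDUCATION_LOGO_URL}}},
                ]},
            },
            {   # Wide column holding the title, dates, and description.
                "object": "block",
                "type": "column",
                "column": {"children": [
                    {"object": "block", "type": "paragraph",
                     "paragraph": {"rich_text": [text("Research Assistant", bold=True)]}},
                    {"object": "block", "type": "paragraph",
                     "paragraph": {"rich_text": [text("January - August 2023", italic=True, color="gray")]}},
                    {"object": "block", "type": "paragraph",
                     "paragraph": {"rich_text": [text(DESCRIPTION)]}},
                ]},
            },
        ]
    },
}

# Appending to the page places the entry at the bottom; a full solution would
# position it under the Work History heading next to the existing entries.
notion.blocks.children.append(block_id=PAGE_ID, children=[new_entry])
```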
================================================
FILE: tasks/notion/standard/online_resume/work_history_addition/meta.json
================================================
{
"task_id": "work_history_addition",
"task_name": "Work History Addition",
"category_id": "online_resume",
"category_name": "Online Resume",
"description": "Add a Research Assistant position to the Work History section with consistent formatting and university logo.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"template population",
"cross-reference linking",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume"
}
}
================================================
FILE: tasks/notion/standard/online_resume/work_history_addition/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the new work history entry for 'Research Assistant' has been added correctly.
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Online Resume")
if not page_id:
print("Error: Page 'Online Resume' not found.", file=sys.stderr)
return False
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
def find_image_url_under_heading(blocks, heading_text, notion_client):
heading_index = -1
for i, block in enumerate(blocks):
block_type = block.get("type")
if block_type == "heading_1":
if heading_text in notion_utils.get_block_plain_text(block):
heading_index = i
break
if heading_index == -1:
return None
for i in range(heading_index + 1, len(blocks)):
block = blocks[i]
if block.get("type") in ["heading_1", "heading_2", "heading_3"]:
break
if block.get("type") == "image" and block.get("image", {}).get("file"):
return block.get("image", {}).get("file", {}).get("url")
if block.get("type") == "column_list":
column_list_id = block["id"]
columns = notion_utils.get_all_blocks_recursively(
notion_client, column_list_id
)
for column in columns:
if column.get("type") == "column":
column_id = column["id"]
column_blocks = notion_utils.get_all_blocks_recursively(
notion_client, column_id
)
for inner_block in column_blocks:
if inner_block.get("type") == "image" and inner_block.get(
"image", {}
).get("file"):
return (
inner_block.get("image", {})
.get("file", {})
.get("url")
)
return None
def get_block_annotations(block):
block_type = block.get("type")
if not block_type:
return {}
block_content = block.get(block_type)
if not block_content:
return {}
rich_text_list = block_content.get("rich_text", [])
if not rich_text_list:
return {}
return rich_text_list[0].get("annotations", {})
education_image_url = find_image_url_under_heading(all_blocks, "Education", notion)
if not education_image_url:
print(
"Error: Could not find the image in the 'Education' section.",
file=sys.stderr,
)
return False
heading_text = "Work History"
heading_index = -1
for i, block in enumerate(all_blocks):
if block.get(
"type"
) == "heading_1" and heading_text in notion_utils.get_block_plain_text(block):
heading_index = i
break
if heading_index == -1:
print(f"Error: Could not find the '{heading_text}' heading.", file=sys.stderr)
return False
for i in range(heading_index + 1, len(all_blocks)):
block = all_blocks[i]
if block.get("type") in ["heading_1", "heading_2", "heading_3"]:
break
if block.get("type") == "column_list":
column_list_id = block["id"]
columns = notion_utils.get_all_blocks_recursively(notion, column_list_id)
if len(columns) < 2:
continue
            # Identify the image and text columns by width ratio; skip this
            # column_list if either one is missing.
            image_column = None
            text_column = None
            for column in columns:
                if column.get("type") == "column":
                    if column.get("column", {}).get("width_ratio") == 0.125:
                        image_column = column
                    elif column.get("column", {}).get("width_ratio") == 0.875:
                        text_column = column
            if image_column is None or text_column is None:
                continue
            image_column_blocks = notion_utils.get_all_blocks_recursively(
                notion, image_column["id"]
            )
            text_column_blocks = notion_utils.get_all_blocks_recursively(
                notion, text_column["id"]
            )
column_image_url = None
for inner_block in image_column_blocks:
if inner_block.get("type") == "image" and inner_block.get(
"image", {}
).get("file"):
column_image_url = (
inner_block.get("image", {}).get("file", {}).get("url")
)
break
if (
not column_image_url
or column_image_url[:100] != education_image_url[:100]
):
continue
for j, inner_block in enumerate(text_column_blocks):
if "Research Assistant" in notion_utils.get_block_plain_text(
inner_block
):
title_annotations = get_block_annotations(inner_block)
if j + 2 < len(text_column_blocks):
date_block = text_column_blocks[j + 1]
description_block = text_column_blocks[j + 2]
date_text = "January - August 2023"
description_text = "Assisted in conducting user experience research projects at my bachelor’s program, supporting data collection, analyzing user feedback, and preparing research reports. Developed strong skills in research methodologies and improved collaboration with interdisciplinary teams."
date_annotations = get_block_annotations(date_block)
description_annotations = get_block_annotations(
description_block
)
if (
date_text in notion_utils.get_block_plain_text(date_block)
and description_text
in notion_utils.get_block_plain_text(description_block)
and title_annotations.get("bold")
and date_annotations.get("italic")
and date_annotations.get("color") == "gray"
and description_annotations.get("color") == "default"
and description_annotations.get("italic") != True
and description_annotations.get("bold") != True
):
print("Success: Verified new work history entry.")
return True
print("Failure: Could not verify the new work history entry.", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/python_roadmap/expert_level_lessons/description.md
================================================
# Task: Expert Level Learning Path with Complex Prerequisites
## Objective
Create an Expert Level chapter in the Python Roadmap with sophisticated prerequisite chains that require deep understanding of the existing course structure.
## Requirements
### 1. Create Expert Level Chapter
- **Database**: Chapters database
- **Properties**:
- Name: `Expert Level`
- Icon: 🟣 (purple circle emoji)
- Must appear after Advanced Level in the database
### 2. Create Bridge Lesson
Create a lesson that bridges advanced and expert content:
- **Title**: `Advanced Foundations Review`
- **Status**: Done
- **Chapter**: Link to Expert Level
- **Parent item**: Link to the lesson that currently has status "In Progress" and contains "Control" in its title
- **Sub-items**: Must link to exactly these three lessons:
- The lesson with title containing "Decorators"
- The lesson with title containing "Calling API"
- The lesson with title containing "Regular Expressions"
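A minimal sketch of the two creation calls these first two requirements map to, assuming placeholder IDs that a real solution would resolve by querying the Chapters and Steps databases first, and the property names used elsewhere on this board (`Name`, `Lessons`, `Status`, `Chapters`, `Parent item`, `Sub-item`):

```python
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

# Placeholder IDs -- resolve these by locating the Chapters/Steps databases and
# the referenced lessons (Control Flow, Decorators, Calling API, Regular Expressions).
CHAPTERS_DB_ID = "<chapters-database-id>"
STEPS_DB_ID = "<steps-database-id>"
CONTROL_FLOW_ID = "<control-flow-lesson-id>"
PREREQ_IDS = ["<decorators-id>", "<calling-api-id>", "<regular-expressions-id>"]

# Requirement 1: the Expert Level chapter with a purple circle icon.
expert_chapter = notion.pages.create(
    parent={"database_id": CHAPTERS_DB_ID},
    icon={"type": "emoji", "emoji": "🟣"},
    properties={"Name": {"title": [{"text": {"content": "Expert Level"}}]}},
)

# Requirement 2: the bridge lesson linking advanced and expert content.
notion.pages.create(
    parent={"database_id": STEPS_DB_ID},
    properties={
        "Lessons": {"title": [{"text": {"content": "Advanced Foundations Review"}}]},
        "Status": {"status": {"name": "Done"}},
        "Chapters": {"relation": [{"id": expert_chapter["id"]}]},
        "Parent item": {"relation": [{"id": CONTROL_FLOW_ID}]},
        "Sub-item": {"relation": [{"id": pid} for pid in PREREQ_IDS]},
    },
)
```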
### 3. Create Expert Level Lessons
Add exactly 4 expert lessons to the Steps database:
**Lesson 1**: `Metaprogramming and AST Manipulation`
- Status: To Do
- Chapter: Expert Level
- Parent item: Link to "Advanced Foundations Review"
- Date: 2025-09-15
**Lesson 2**: `Async Concurrency Patterns`
- Status: To Do
- Chapter: Expert Level
- Parent item: Link to the lesson titled "Calling API"
- Date: 2025-09-20
**Lesson 3**: `Memory Management and GC Tuning`
- Status: In Progress
- Chapter: Expert Level
- Parent item: Link to "Advanced Foundations Review"
- Sub-item: Must have exactly 2 links:
- Link to any lesson from "Data Structures" that has status "To Do"
- Link to the lesson containing "OOP" in its title
- Date: 2025-09-25
**Lesson 4**: `Building Python C Extensions`
- Status: To Do
- Chapter: Expert Level
- Parent item: Link to "Metaprogramming and AST Manipulation"
- Date: 2025-10-01
### 4. Update Existing Lessons
- Change the status of "Decorators" from "To Do" to "Done"
- Add "Async Concurrency Patterns" as a Sub-item to "Error Handling"
- Update "Control Flow" status from "In Progress" to "Done"
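These updates correspond to `pages.update` calls; a rough sketch with placeholder lesson IDs (relation properties are replaced wholesale on update, so the existing Sub-item list is read first):

```python
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

DECORATORS_ID = "<decorators-lesson-id>"                        # placeholder
CONTROL_FLOW_ID = "<control-flow-lesson-id>"                    # placeholder
ERROR_HANDLING_ID = "<error-handling-lesson-id>"                # placeholder
ASYNC_PATTERNS_ID = "<async-concurrency-patterns-lesson-id>"    # placeholder

# Flip the two statuses to Done.
for lesson_id in (DECORATORS_ID, CONTROL_FLOW_ID):
    notion.pages.update(
        page_id=lesson_id,
        properties={"Status": {"status": {"name": "Done"}}},
    )

# Add the new expert lesson as a Sub-item of Error Handling, keeping any
# relations that are already there.
existing = notion.pages.retrieve(page_id=ERROR_HANDLING_ID)["properties"]["Sub-item"]["relation"]
notion.pages.update(
    page_id=ERROR_HANDLING_ID,
    properties={"Sub-item": {"relation": existing + [{"id": ASYNC_PATTERNS_ID}]}},
)
```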
### 5. Create Learning Path Notes
Add content to the "Advanced Foundations Review" lesson page:
- **Block 1**: Heading 2 with text `Prerequisites Checklist`
- **Block 2**: Bulleted list with exactly 3 items:
- `✅ Advanced Python Features (Decorators, Context Managers)`
- `✅ API Integration and Async Basics`
- `✅ Pattern Matching and Text Processing`
- **Block 3**: Paragraph with text: `This lesson serves as a checkpoint before entering expert-level content. Ensure you have mastered all prerequisites listed above.`
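This page content maps onto a single `blocks.children.append` call; a minimal sketch, assuming a placeholder ID for the Advanced Foundations Review page:

```python
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

BRIDGE_LESSON_ID = "<advanced-foundations-review-page-id>"  # placeholder

def rt(content):
    """Plain rich_text payload for a single text run."""
    return [{"type": "text", "text": {"content": content}}]

notion.blocks.children.append(
    block_id=BRIDGE_LESSON_ID,
    children=[
        {"object": "block", "type": "heading_2",
         "heading_2": {"rich_text": rt("Prerequisites Checklist")}},
        {"object": "block", "type": "bulleted_list_item",
         "bulleted_list_item": {"rich_text": rt("✅ Advanced Python Features (Decorators, Context Managers)")}},
        {"object": "block", "type": "bulleted_list_item",
         "bulleted_list_item": {"rich_text": rt("✅ API Integration and Async Basics")}},
        {"object": "block", "type": "bulleted_list_item",
         "bulleted_list_item": {"rich_text": rt("✅ Pattern Matching and Text Processing")}},
        {"object": "block", "type": "paragraph",
         "paragraph": {"rich_text": rt(
             "This lesson serves as a checkpoint before entering expert-level "
             "content. Ensure you have mastered all prerequisites listed above."
         )}},
    ],
)
```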
================================================
FILE: tasks/notion/standard/python_roadmap/expert_level_lessons/meta.json
================================================
{
"task_id": "expert_level_lessons",
"task_name": "Expert Level Lessons",
"category_id": "python_roadmap",
"category_name": "Python Roadmap",
"description": "Create an Expert Level chapter with sophisticated prerequisite chains and four expert-level lessons.",
"author": "Lingjun Chen",
"created_at": "2025-08-02",
"difficulty": "L3",
"tags": [
"database manipulation",
"cross-reference linking",
"conditional filtering",
"status tracking",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Python-Roadmap-25281626b6d78012bf2bce1fa8711f4d",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/python-roadmap"
}
}
================================================
FILE: tasks/notion/standard/python_roadmap/expert_level_lessons/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Expert Level chapter and its lessons have been created correctly with complex prerequisites.
"""
# Step 1: Find the main page and get database IDs
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Main page not found.", file=sys.stderr)
return False
else:
# Try to find the main page by searching
found_id = notion_utils.find_page(notion, "Python Roadmap")
if not found_id:
print("Error: Main page not found.", file=sys.stderr)
return False
print(f"Found main page: {found_id}")
# Get all blocks from the page to find database references
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
print(f"Found {len(all_blocks)} blocks")
# Find database IDs from the page
chapters_db_id = None
steps_db_id = None
for block in all_blocks:
if block and block.get("type") == "child_database":
db_title = block.get("child_database", {}).get("title", "")
if "Chapters" in db_title:
chapters_db_id = block["id"]
print(f"Found Chapters database: {chapters_db_id}")
elif "Steps" in db_title:
steps_db_id = block["id"]
print(f"Found Steps database: {steps_db_id}")
if not chapters_db_id:
print("Error: Chapters database not found.", file=sys.stderr)
return False
if not steps_db_id:
print("Error: Steps database not found.", file=sys.stderr)
return False
print("Starting verification...")
# Step 2: Verify the Expert Level chapter exists
print("2. Checking for Expert Level chapter...")
expert_chapter_id = None
try:
chapters_response = notion.databases.query(
database_id=chapters_db_id,
filter={
"property": "Name",
"title": {
"equals": "Expert Level"
}
}
)
if not chapters_response.get("results"):
print(f"Error: Expert Level chapter not found in Chapters database.", file=sys.stderr)
return False
expert_chapter = chapters_response["results"][0]
expert_chapter_id = expert_chapter["id"]
# Check chapter icon (purple circle)
chapter_icon = expert_chapter.get("icon")
if not chapter_icon or chapter_icon.get("type") != "emoji" or chapter_icon.get("emoji") != "🟣":
print(f"Error: Expert Level chapter does not have the correct purple circle emoji icon.", file=sys.stderr)
return False
print(f"✓ Expert Level chapter found with correct icon: 🟣")
except Exception as e:
print(f"Error querying Chapters database: {e}", file=sys.stderr)
return False
# Step 3: Find Control Flow lesson (In Progress status)
print("3. Finding Control Flow lesson...")
control_flow_id = None
try:
control_flow_response = notion.databases.query(
database_id=steps_db_id,
filter={
"and": [
{
"property": "Lessons",
"title": {
"contains": "Control"
}
},
{
"property": "Status",
"status": {
"equals": "Done" # Should be updated to Done
}
}
]
}
)
if control_flow_response.get("results"):
control_flow_lesson = control_flow_response["results"][0]
control_flow_id = control_flow_lesson["id"]
print(f"✓ Found Control Flow lesson with status 'Done'")
else:
print(f"Error: Control Flow lesson not found with status 'Done'.", file=sys.stderr)
return False
except Exception as e:
print(f"Error finding Control Flow lesson: {e}", file=sys.stderr)
return False
# Step 4: Find prerequisite lessons
print("4. Finding prerequisite lessons...")
decorators_id = None
calling_api_id = None
regex_id = None
try:
# Find Decorators (should be Done)
decorators_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"contains": "Decorators"
}
}
)
if decorators_response.get("results"):
decorators_lesson = decorators_response["results"][0]
decorators_id = decorators_lesson["id"]
# Check status is Done
if decorators_lesson["properties"]["Status"]["status"]["name"] != "Done":
print(f"Error: Decorators lesson should have status 'Done'.", file=sys.stderr)
return False
print(f"✓ Found Decorators lesson with status 'Done'")
else:
print(f"Error: Decorators lesson not found.", file=sys.stderr)
return False
# Find Calling API
calling_api_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": "Calling API"
}
}
)
if calling_api_response.get("results"):
calling_api_lesson = calling_api_response["results"][0]
calling_api_id = calling_api_lesson["id"]
print(f"✓ Found Calling API lesson")
else:
print(f"Error: Calling API lesson not found.", file=sys.stderr)
return False
# Find Regular Expressions
regex_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"contains": "Regular Expressions"
}
}
)
if regex_response.get("results"):
regex_lesson = regex_response["results"][0]
regex_id = regex_lesson["id"]
print(f"✓ Found Regular Expressions lesson")
else:
print(f"Error: Regular Expressions lesson not found.", file=sys.stderr)
return False
except Exception as e:
print(f"Error finding prerequisite lessons: {e}", file=sys.stderr)
return False
# Step 5: Verify Advanced Foundations Review bridge lesson
print("5. Checking Advanced Foundations Review bridge lesson...")
bridge_id = None
try:
bridge_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": "Advanced Foundations Review"
}
}
)
if not bridge_response.get("results"):
print(f"Error: Advanced Foundations Review lesson not found.", file=sys.stderr)
return False
bridge_lesson = bridge_response["results"][0]
bridge_id = bridge_lesson["id"]
# Check status is Done
if bridge_lesson["properties"]["Status"]["status"]["name"] != "Done":
print(f"Error: Advanced Foundations Review should have status 'Done'.", file=sys.stderr)
return False
# Check linked to Expert Level chapter
bridge_chapters = bridge_lesson["properties"]["Chapters"]["relation"]
if not any(rel["id"] == expert_chapter_id for rel in bridge_chapters):
print(f"Error: Advanced Foundations Review not linked to Expert Level chapter.", file=sys.stderr)
return False
# Check Parent item is Control Flow
bridge_parent = bridge_lesson["properties"]["Parent item"]["relation"]
if not bridge_parent or bridge_parent[0]["id"] != control_flow_id:
print(f"Error: Advanced Foundations Review should have Control Flow as Parent item.", file=sys.stderr)
return False
# Check Sub-items (should have at least 3 specific lessons plus any that reference it as parent)
bridge_subitems = bridge_lesson["properties"]["Sub-item"]["relation"]
required_subitems = {decorators_id, calling_api_id, regex_id}
actual_subitems = {item["id"] for item in bridge_subitems}
if not required_subitems.issubset(actual_subitems):
print(f"Error: Advanced Foundations Review should have at least these 3 sub-items: Decorators, Calling API, Regular Expressions.", file=sys.stderr)
return False
# Due to bidirectional relations, lessons that have this as parent will also appear as sub-items
# We expect at least 5: 3 initial + 2 that reference it as parent (Metaprogramming and Memory Management)
if len(bridge_subitems) < 5:
print(f"Error: Advanced Foundations Review should have at least 5 sub-items (3 initial + 2 from parent relations), found {len(bridge_subitems)}.", file=sys.stderr)
return False
print(f"✓ Advanced Foundations Review has {len(bridge_subitems)} sub-items, including the 3 required ones")
print(f"✓ Advanced Foundations Review found with correct properties")
except Exception as e:
print(f"Error checking bridge lesson: {e}", file=sys.stderr)
return False
# Step 6: Verify the 4 expert lessons
print("6. Checking the 4 expert lessons...")
# Note: Async Concurrency Patterns will have Error Handling as parent (due to sub-item relation)
# We'll need to find Error Handling's ID first
error_handling_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": "Error Handling"
}
}
)
error_handling_id = None
if error_handling_response.get("results"):
error_handling_id = error_handling_response["results"][0]["id"]
else:
print(f"Error: Error Handling lesson not found.", file=sys.stderr)
return False
expert_lessons = {
"Metaprogramming and AST Manipulation": {
"status": "To Do",
"parent": bridge_id,
"date": "2025-09-15"
},
"Async Concurrency Patterns": {
"status": "To Do",
"parent": error_handling_id, # Parent is Error Handling due to sub-item relation
"date": "2025-09-20"
},
"Memory Management and GC Tuning": {
"status": "In Progress",
"parent": bridge_id,
"date": "2025-09-25"
},
"Building Python C Extensions": {
"status": "To Do",
"date": "2025-10-01"
}
}
lesson_ids = {}
try:
for lesson_name, expected in expert_lessons.items():
lesson_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": lesson_name
}
}
)
if not lesson_response.get("results"):
print(f"Error: Lesson '{lesson_name}' not found.", file=sys.stderr)
return False
lesson = lesson_response["results"][0]
lesson_ids[lesson_name] = lesson["id"]
# Check status
if lesson["properties"]["Status"]["status"]["name"] != expected["status"]:
print(f"Error: Lesson '{lesson_name}' should have status '{expected['status']}'.", file=sys.stderr)
return False
# Check linked to Expert Level chapter
lesson_chapters = lesson["properties"]["Chapters"]["relation"]
if not any(rel["id"] == expert_chapter_id for rel in lesson_chapters):
print(f"Error: Lesson '{lesson_name}' not linked to Expert Level chapter.", file=sys.stderr)
return False
# Check date
lesson_date = lesson["properties"]["Date"]["date"]
            # A missing date should fail too, since each lesson specifies a Date.
            if not lesson_date or lesson_date.get("start") != expected["date"]:
print(f"Error: Lesson '{lesson_name}' should have date '{expected['date']}'.", file=sys.stderr)
return False
# Check parent item for lessons that have specific parent requirements
if "parent" in expected:
lesson_parent = lesson["properties"]["Parent item"]["relation"]
if not lesson_parent or lesson_parent[0]["id"] != expected["parent"]:
print(f"Error: Lesson '{lesson_name}' should have correct parent item.", file=sys.stderr)
return False
print(f"✓ Lesson '{lesson_name}' found with correct properties")
# Special checks for Building Python C Extensions parent relationship
# (other parent checks are handled in the loop above)
building_lesson = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": "Building Python C Extensions"
}
}
)["results"][0]
building_parent = building_lesson["properties"]["Parent item"]["relation"]
if not building_parent or building_parent[0]["id"] != lesson_ids["Metaprogramming and AST Manipulation"]:
print(f"Error: Building Python C Extensions should have Metaprogramming and AST Manipulation as parent.", file=sys.stderr)
return False
# Memory Management should have 2 sub-items
memory_lesson = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": "Memory Management and GC Tuning"
}
}
)["results"][0]
memory_subitems = memory_lesson["properties"]["Sub-item"]["relation"]
if len(memory_subitems) != 2:
print(f"Error: Memory Management and GC Tuning should have exactly 2 sub-items.", file=sys.stderr)
return False
except Exception as e:
print(f"Error checking expert lessons: {e}", file=sys.stderr)
return False
# Step 7: Verify Error Handling has Async Concurrency Patterns as sub-item
print("7. Checking Error Handling sub-item...")
try:
error_handling_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": "Error Handling"
}
}
)
if error_handling_response.get("results"):
error_handling_lesson = error_handling_response["results"][0]
error_subitems = error_handling_lesson["properties"]["Sub-item"]["relation"]
if not any(item["id"] == lesson_ids["Async Concurrency Patterns"] for item in error_subitems):
print(f"Error: Error Handling should have Async Concurrency Patterns as sub-item.", file=sys.stderr)
return False
print(f"✓ Error Handling has Async Concurrency Patterns as sub-item")
else:
print(f"Error: Error Handling lesson not found.", file=sys.stderr)
return False
except Exception as e:
print(f"Error checking Error Handling: {e}", file=sys.stderr)
return False
# Step 8: Verify block content in Advanced Foundations Review
print("8. Checking Advanced Foundations Review page content...")
try:
blocks = notion_utils.get_all_blocks_recursively(notion, bridge_id)
if len(blocks) < 3:
print(f"Error: Advanced Foundations Review should have at least 3 blocks.", file=sys.stderr)
return False
# Check Block 1: Heading 2
block1 = blocks[0]
if block1.get("type") != "heading_2":
print(f"Error: First block should be heading_2.", file=sys.stderr)
return False
heading_text = block1.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "")
if heading_text != "Prerequisites Checklist":
print(f"Error: Heading should be 'Prerequisites Checklist'.", file=sys.stderr)
return False
# Check Block 2: Bulleted list
block2 = blocks[1]
if block2.get("type") != "bulleted_list_item":
print(f"Error: Second block should be bulleted_list_item.", file=sys.stderr)
return False
# Check Block 3 and 4 are also bulleted list items
if len(blocks) >= 4:
block3 = blocks[2]
block4 = blocks[3]
if block3.get("type") != "bulleted_list_item" or block4.get("type") != "bulleted_list_item":
print(f"Error: Blocks 2-4 should be bulleted list items.", file=sys.stderr)
return False
# Check last block is paragraph
last_block = blocks[-1]
if last_block.get("type") != "paragraph":
print(f"Error: Last block should be paragraph.", file=sys.stderr)
return False
paragraph_text = last_block.get("paragraph", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "")
if "checkpoint" not in paragraph_text.lower():
print(f"Error: Paragraph should contain text about checkpoint.", file=sys.stderr)
return False
print(f"✓ Advanced Foundations Review page has correct content structure")
except Exception as e:
print(f"Error checking page content: {e}", file=sys.stderr)
return False
# Step 9: Final verification counts
print("9. Verifying final state counts...")
try:
# Count total lessons by status
all_lessons = notion.databases.query(database_id=steps_db_id, page_size=100)["results"]
done_lessons = [l for l in all_lessons if l["properties"]["Status"]["status"]["name"] == "Done"]
done_count = len(done_lessons)
in_progress_count = sum(1 for l in all_lessons if l["properties"]["Status"]["status"]["name"] == "In Progress")
# Print out all Done lessons for debugging
if done_count != 14:
print(f"Found {done_count} Done lessons (expected 14):", file=sys.stderr)
for lesson in done_lessons:
lesson_name = lesson["properties"]["Lessons"]["title"][0]["text"]["content"]
print(f" - {lesson_name}", file=sys.stderr)
return False
if in_progress_count != 1:
print(f"Error: Should have 1 In Progress lesson, found {in_progress_count}.", file=sys.stderr)
return False
# Verify Expert Level has 5 lessons
expert_chapter_updated = notion.databases.query(
database_id=chapters_db_id,
filter={
"property": "Name",
"title": {
"equals": "Expert Level"
}
}
)["results"][0]
expert_steps = expert_chapter_updated["properties"]["Steps"]["relation"]
if len(expert_steps) != 5:
print(f"Error: Expert Level should have exactly 5 lessons, found {len(expert_steps)}.", file=sys.stderr)
return False
print(f"✓ Final state counts are correct")
except Exception as e:
print(f"Error verifying final counts: {e}", file=sys.stderr)
return False
print("🎉 All verification checks passed!")
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/python_roadmap/learning_metrics_dashboard/description.md
================================================
# Task: Learning Metrics Dashboard
## Objective
Create a comprehensive Learning Metrics Dashboard section in the Python Roadmap page that displays precise statistics and recommendations based on the Steps database content.
## Requirements
### 1. Section Placement
- Add new content immediately after the Learning Materials section (before `Whether you're starting from scratch or`).
### 2. Dashboard Header
- **Type**: heading_3
- **Text**: `📊 Learning Metrics Dashboard`
### 3. Course Statistics Block
- **Type**: callout
- **Background Color**: Brown
- **Icon**: None
- **Title**: **Course Statistics** (bold, heading_3). Use the same color scheme as other callout headings.
- **Content**: Bulleted list with the following items in exact order:
- `Total Lessons: [X]` (count all entries in Steps database)
- `Completed: [X] ([Y]%)` (count Status="Done", calculate percentage to 1 decimal)
- `In Progress: [X] ([Y]%)` (count Status="In Progress", calculate percentage to 1 decimal)
- `Beginner Level: [X] lessons ([Y] completed)` (filter by Chapters relation to Beginner Level)
- `Intermediate Level: [X] lessons ([Y] completed)` (filter by Chapters relation to Intermediate Level)
- `Advanced Level: [X] lessons ([Y] completed)` (filter by Chapters relation to Advanced Level)
### 4. Completed Topics Section
- **Type**: toggle
- **Text**: `🏆 Completed Topics (Click to expand)`
- **Nested Content**: Numbered list containing exactly 5 items
- List lessons with Status="Done"
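As a rough illustration, the statistics above can be derived from one query of the Steps database; the database ID below is a placeholder and the `Status`/`Chapters` property names are the ones used elsewhere on this board:

```python
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

STEPS_DB_ID = "<steps-database-id>"  # placeholder: locate the Steps database on the page first

lessons = notion.databases.query(database_id=STEPS_DB_ID, page_size=100)["results"]

def status_name(lesson):
    status = lesson["properties"]["Status"]["status"]
    return status["name"] if status else None

total = len(lessons)
done = sum(1 for l in lessons if status_name(l) == "Done")
in_progress = sum(1 for l in lessons if status_name(l) == "In Progress")

# Percentages are reported to one decimal place, as required above.
done_pct = round(done / total * 100, 1) if total else 0.0
in_progress_pct = round(in_progress / total * 100, 1) if total else 0.0

stats_lines = [
    f"Total Lessons: {total}",
    f"Completed: {done} ({done_pct}%)",
    f"In Progress: {in_progress} ({in_progress_pct}%)",
    # The three per-level lines are derived the same way, filtering each lesson's
    # "Chapters" relation against the Beginner/Intermediate/Advanced chapter IDs.
]
```

The resulting strings would then become `bulleted_list_item` children of the brown Course Statistics callout, and the lessons with Status "Done" would populate the toggle as `numbered_list_item` blocks.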
================================================
FILE: tasks/notion/standard/python_roadmap/learning_metrics_dashboard/meta.json
================================================
{
"task_id": "learning_metrics_dashboard",
"task_name": "Learning Metrics Dashboard",
"category_id": "python_roadmap",
"category_name": "Python Roadmap",
"description": "Create a comprehensive Learning Metrics Dashboard section displaying precise statistics and recommendations based on the Steps database.",
"author": "Lingjun Chen",
"created_at": "2025-08-02",
"difficulty": "L3",
"tags": [
"data aggregation",
"conditional filtering",
"report generation",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Python-Roadmap-25281626b6d78012bf2bce1fa8711f4d",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/python-roadmap"
}
}
================================================
FILE: tasks/notion/standard/python_roadmap/learning_metrics_dashboard/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def get_page_title_from_result(page_result):
"""
Extract the title from a page result object from database query.
"""
properties = page_result.get('properties', {})
# Try common title property names
for prop_name in ['Name', 'Title', 'title', 'Lessons']:
if prop_name in properties:
prop = properties[prop_name]
if prop.get('type') == 'title':
title_array = prop.get('title', [])
if title_array and len(title_array) > 0:
return title_array[0].get('plain_text', '')
return ''
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Learning Metrics Dashboard has been implemented correctly according to description.md.
"""
# Step 1: Find the main page and get all blocks
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Main page not found.", file=sys.stderr)
return False
else:
# Try to find the main page by searching
found_id = notion_utils.find_page(notion, "Python Roadmap")
if not found_id:
print("Error: Main page not found.", file=sys.stderr)
return False
print(f"Found main page: {found_id}")
# Get Steps database to calculate expected statistics
steps_db_id = notion_utils.find_database(notion, "Steps")
if not steps_db_id:
print("Error: Steps database not found.", file=sys.stderr)
return False
# Query Steps database to get all lessons
steps_data = notion.databases.query(database_id=steps_db_id)
total_lessons = len(steps_data['results'])
completed_count = 0
in_progress_count = 0
completed_lessons = []
# Get Chapters database for level information
chapters_db_id = notion_utils.find_database(notion, "Chapters")
if not chapters_db_id:
print("Error: Chapters database not found.", file=sys.stderr)
return False
# Query Chapters database to get level information
chapters_data = notion.databases.query(database_id=chapters_db_id)
level_ids = {
'Beginner Level': None,
'Intermediate Level': None,
'Advanced Level': None
}
for chapter in chapters_data['results']:
chapter_name = get_page_title_from_result(chapter)
if chapter_name in level_ids:
level_ids[chapter_name] = chapter['id']
# Initialize level counts
level_counts = {
'Beginner Level': {'total': 0, 'completed': 0},
'Intermediate Level': {'total': 0, 'completed': 0},
'Advanced Level': {'total': 0, 'completed': 0}
}
# Count lessons by status and level
for lesson in steps_data['results']:
status = lesson['properties']['Status']['status']
if status and status['name'] == 'Done':
completed_count += 1
lesson_title = get_page_title_from_result(lesson)
if lesson_title:
completed_lessons.append(lesson_title)
elif status and status['name'] == 'In Progress':
in_progress_count += 1
# Count by level
chapters_relation = lesson['properties']['Chapters']['relation']
for chapter_ref in chapters_relation:
chapter_id = chapter_ref['id']
for level_name, level_id in level_ids.items():
if chapter_id == level_id:
level_counts[level_name]['total'] += 1
if status and status['name'] == 'Done':
level_counts[level_name]['completed'] += 1
# Calculate percentages
completed_percentage = round((completed_count / total_lessons * 100), 1) if total_lessons > 0 else 0
in_progress_percentage = round((in_progress_count / total_lessons * 100), 1) if total_lessons > 0 else 0
print(f"Expected statistics:")
print(f" Total Lessons: {total_lessons}")
print(f" Completed: {completed_count} ({completed_percentage}%)")
print(f" In Progress: {in_progress_count} ({in_progress_percentage}%)")
print(f" Beginner Level: {level_counts['Beginner Level']['total']} lessons ({level_counts['Beginner Level']['completed']} completed)")
print(f" Intermediate Level: {level_counts['Intermediate Level']['total']} lessons ({level_counts['Intermediate Level']['completed']} completed)")
print(f" Advanced Level: {level_counts['Advanced Level']['total']} lessons ({level_counts['Advanced Level']['completed']} completed)")
print(f" Completed lessons (first 5): {completed_lessons[:5]}")
# Get all blocks from the page
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
print(f"Found {len(all_blocks)} blocks")
# Step 2: Verify the required elements in order
learning_materials_idx = -1
dashboard_heading_idx = -1
callout_idx = -1
toggle_idx = -1
whether_paragraph_idx = -1 # Track the "Whether you're starting from scratch" paragraph
# Track what we've verified
callout_has_brown_bg = False
callout_has_no_icon = False
callout_has_course_statistics_title = False
callout_title_has_correct_colors = False
statistics_items_found = []
completed_topics_found = []
# Expected statistics content
expected_statistics = [
f"Total Lessons: {total_lessons}",
f"Completed: {completed_count} ({completed_percentage}%)",
f"In Progress: {in_progress_count} ({in_progress_percentage}%)",
f"Beginner Level: {level_counts['Beginner Level']['total']} lessons ({level_counts['Beginner Level']['completed']} completed)",
f"Intermediate Level: {level_counts['Intermediate Level']['total']} lessons ({level_counts['Intermediate Level']['completed']} completed)",
f"Advanced Level: {level_counts['Advanced Level']['total']} lessons ({level_counts['Advanced Level']['completed']} completed)"
]
# Check blocks in order
for i, block in enumerate(all_blocks):
if block is None:
continue
block_type = block.get("type")
# 1. Check for Learning Materials heading (requirement 1)
if learning_materials_idx == -1 and block_type == "heading_3":
block_text = notion_utils.get_block_plain_text(block)
if "🎓 Learning Materials" in block_text or "Learning Materials" in block_text:
learning_materials_idx = i
print(f"✓ Requirement 1: Found Learning Materials heading at position {i}")
# 2. Check for Learning Metrics Dashboard heading after Learning Materials (requirement 2)
elif learning_materials_idx != -1 and dashboard_heading_idx == -1 and block_type == "heading_3":
block_text = notion_utils.get_block_plain_text(block)
if "📊 Learning Metrics Dashboard" in block_text:
dashboard_heading_idx = i
print(f"✓ Requirement 2: Found Learning Metrics Dashboard heading at position {i}")
# 3. Check for callout block after Dashboard heading (requirement 3)
elif dashboard_heading_idx != -1 and callout_idx == -1 and block_type == "callout":
callout_idx = i
print(f" Found callout block at position {i}")
# Check brown background (requirement 3.1)
if block.get("callout", {}).get("color") == "brown_background":
callout_has_brown_bg = True
print(f" ✓ Requirement 3.1: Callout has brown background")
# Check no icon (requirement 3.2)
icon = block.get("callout", {}).get("icon")
if icon is None:
callout_has_no_icon = True
print(f" ✓ Requirement 3.2: Callout has no icon")
# Get nested blocks for Course Statistics title and content
nested_blocks = notion_utils.get_all_blocks_recursively(notion, block.get("id"))
for nested in nested_blocks:
# Check for heading_3 only as per requirement
if nested and nested.get("type") == "heading_3":
# Check for "Course Statistics" title with correct formatting
rich_text = nested.get("heading_3", {}).get("rich_text", [])
course_found = False
course_correct = False
statistics_found = False
statistics_correct = False
for text_item in rich_text:
text_content = text_item.get("text", {}).get("content", "")
annotations = text_item.get("annotations", {})
color = annotations.get("color", "default")
is_bold = annotations.get("bold", False)
if "Course" in text_content:
course_found = True
# Check if Course is blue and bold
if color == "blue" and is_bold:
course_correct = True
print(f" ✓ 'Course' has blue color and is bold")
else:
print(f" ✗ 'Course' color: {color}, bold: {is_bold} (should be blue and bold)")
if "Statistics" in text_content:
statistics_found = True
# Check if Statistics is yellow and bold
if color == "yellow" and is_bold:
statistics_correct = True
print(f" ✓ 'Statistics' has yellow color and is bold")
else:
print(f" ✗ 'Statistics' color: {color}, bold: {is_bold} (should be yellow and bold)")
if course_found and statistics_found:
callout_has_course_statistics_title = True
if course_correct and statistics_correct:
callout_title_has_correct_colors = True
print(f" ✓ Requirement 3.3: Callout has 'Course Statistics' title with correct colors")
else:
print(f" ✗ Requirement 3.3: Title found but colors/formatting incorrect")
# Check for statistics items in bulleted list
elif nested and nested.get("type") == "bulleted_list_item":
item_text = notion_utils.get_block_plain_text(nested)
for expected_item in expected_statistics:
if expected_item in item_text:
if expected_item not in statistics_items_found:
statistics_items_found.append(expected_item)
print(f" ✓ Requirement 3.4: Found statistics item: {expected_item}")
# 4. Check for Completed Topics toggle after callout (requirement 4)
elif callout_idx != -1 and toggle_idx == -1 and block_type == "toggle":
block_text = notion_utils.get_block_plain_text(block)
if "🏆 Completed Topics (Click to expand)" in block_text:
toggle_idx = i
print(f"✓ Requirement 4: Found Completed Topics toggle at position {i}")
# Get nested blocks for completed topics list
nested_blocks = notion_utils.get_all_blocks_recursively(notion, block.get("id"))
for nested in nested_blocks:
if nested and nested.get("type") == "numbered_list_item":
item_text = notion_utils.get_block_plain_text(nested)
if item_text and item_text in completed_lessons:
completed_topics_found.append(item_text)
print(f" ✓ Requirement 4.1: Found completed topic: {item_text}")
# 5. Check for "Whether you're starting from scratch" paragraph (should be after dashboard content)
elif block_type == "paragraph" and whether_paragraph_idx == -1:
block_text = notion_utils.get_block_plain_text(block)
            # Match both straight and curly apostrophes in "you're".
            if (
                "Whether you're starting from scratch" in block_text
                or "Whether you’re starting from scratch" in block_text
            ):
whether_paragraph_idx = i
print(f" Found 'Whether you're starting from scratch' paragraph at position {i}")
# Step 3: Verify all requirements were met
print(f"\nVerification Summary:")
all_passed = True
# Requirement 1: Learning Materials section found
if learning_materials_idx == -1:
print("✗ Requirement 1: Learning Materials section NOT found", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 1: Learning Materials section found")
# Requirement 2: Learning Metrics Dashboard heading after Learning Materials and before "Whether..." paragraph
if dashboard_heading_idx == -1:
print("✗ Requirement 2: Learning Metrics Dashboard heading NOT found", file=sys.stderr)
all_passed = False
elif dashboard_heading_idx <= learning_materials_idx:
print("✗ Requirement 2: Learning Metrics Dashboard heading not AFTER Learning Materials", file=sys.stderr)
all_passed = False
elif whether_paragraph_idx != -1 and dashboard_heading_idx >= whether_paragraph_idx:
print("✗ Requirement 2: Learning Metrics Dashboard heading not BEFORE 'Whether you're starting from scratch' paragraph", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 2: Learning Metrics Dashboard heading found after Learning Materials")
if whether_paragraph_idx != -1:
print(" ✓ Dashboard content is correctly placed before 'Whether you're starting from scratch' paragraph")
# Requirement 3: Course Statistics callout block with all specifications
if callout_idx == -1:
print("✗ Requirement 3: Course Statistics callout block NOT found", file=sys.stderr)
all_passed = False
else:
if not callout_has_brown_bg:
print("✗ Requirement 3.1: Callout does NOT have brown background", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 3.1: Callout has brown background")
if not callout_has_no_icon:
print("✗ Requirement 3.2: Callout has an icon (should have none)", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 3.2: Callout has no icon")
if not callout_has_course_statistics_title:
print("✗ Requirement 3.3: Callout does NOT have 'Course Statistics' title", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 3.3: Callout has 'Course Statistics' title")
if not callout_title_has_correct_colors:
print("✗ Requirement 3.3.1: Title does NOT have correct colors (blue for Course, yellow for Statistics)", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 3.3.1: Title has correct colors")
# Check all statistics items
missing_items = [item for item in expected_statistics if item not in statistics_items_found]
if missing_items:
print(f"✗ Requirement 3.4: Missing statistics items: {missing_items}", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 3.4: All 6 statistics items found")
# Requirement 4: Completed Topics toggle
if toggle_idx == -1:
print("✗ Requirement 4: Completed Topics toggle NOT found", file=sys.stderr)
all_passed = False
elif toggle_idx <= callout_idx:
print("✗ Requirement 4: Completed Topics toggle not AFTER callout", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 4: Completed Topics toggle found after callout")
# Check that exactly 5 completed topics are listed
if len(completed_topics_found) != 5:
if len(completed_topics_found) < 5:
print(f"✗ Requirement 4.1: Only {len(completed_topics_found)} completed topics found (need exactly 5)", file=sys.stderr)
else:
print(f"✗ Requirement 4.1: Found {len(completed_topics_found)} completed topics (need exactly 5, not more)", file=sys.stderr)
all_passed = False
else:
print(f"✓ Requirement 4.1: Found exactly 5 completed topics as required")
# Requirement 5: Proper integration (implicitly checked by order)
if all_passed:
print("✓ Requirement 5: All content properly integrated in correct order")
return all_passed
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
print("Verification passed")
sys.exit(0)
else:
print("Verification failed")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/self_assessment/faq_column_layout/description.md
================================================
Navigate to the "Self Assessment" page and reorganize the content under the FAQ toggle as follows:
**Task Requirements:**
1. Add a column list with two columns inside the FAQ toggle
2. Move the first two existing Q&A pairs from the FAQ to the left column
3. Move the third existing Q&A pair to the right column
4. Add one additional Q&A pair in the right column to match the format, so both columns have exactly 2 Q&A pairs
5. Ensure all Q&A pairs maintain consistent formatting (heading_3 for questions, paragraph for answers)
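Since the Notion API has no block-move operation, one workable approach is to recreate the Q&A pairs inside a new column list appended to the FAQ toggle and then archive the originals; a minimal sketch with placeholder text and IDs:

```python
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

FAQ_TOGGLE_ID = "<faq-toggle-block-id>"  # placeholder: locate the toggle on the page first

def qa(question, answer):
    """One Q&A pair: a heading_3 question followed by a paragraph answer."""
    return [
        {"object": "block", "type": "heading_3",
         "heading_3": {"rich_text": [{"type": "text", "text": {"content": question}}]}},
        {"object": "block", "type": "paragraph",
         "paragraph": {"rich_text": [{"type": "text", "text": {"content": answer}}]}},
    ]

# Read the existing Q&A text first, then rebuild it inside the two columns.
left_pairs = (
    qa("<first existing question>", "<first existing answer>")
    + qa("<second existing question>", "<second existing answer>")
)
right_pairs = (
    qa("<third existing question>", "<third existing answer>")
    + qa("<new question>", "<new answer>")
)

notion.blocks.children.append(
    block_id=FAQ_TOGGLE_ID,
    children=[{
        "object": "block",
        "type": "column_list",
        "column_list": {
            "children": [
                {"object": "block", "type": "column", "column": {"children": left_pairs}},
                {"object": "block", "type": "column", "column": {"children": right_pairs}},
            ]
        },
    }],
)

# The original top-level Q&A blocks would then be removed, e.g.
# notion.blocks.delete(block_id=old_block_id) for each moved block.
```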
================================================
FILE: tasks/notion/standard/self_assessment/faq_column_layout/meta.json
================================================
{
"task_id": "faq_column_layout",
"task_name": "FAQ Column Layout",
"category_id": "self_assessment",
"category_name": "Self Assessment",
"description": "Reorganize the FAQ section content into a two-column layout with balanced Q&A pairs.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"content organization",
"visual formatting",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d",
"stateOriginalUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d"
}
}
================================================
FILE: tasks/notion/standard/self_assessment/faq_column_layout/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the FAQ toggle has been properly reorganized with a column list.
"""
# Start from main_id if provided
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
# Try to find the Self Assessment page
page_id = notion_utils.find_page(notion, "Self Assessment")
if not page_id:
print("Error: Self Assessment page not found.", file=sys.stderr)
return False
# Get all blocks recursively from the page
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Find the FAQ toggle block
faq_toggle_block = None
faq_toggle_id = None
for block in all_blocks:
if block.get("type") == "toggle":
block_text = notion_utils.get_block_plain_text(block)
if "FAQ" in block_text:
faq_toggle_block = block
faq_toggle_id = block.get("id")
print(f"Found FAQ toggle block: {block_text}")
break
if not faq_toggle_block:
print("Error: FAQ toggle block not found.", file=sys.stderr)
return False
# Find column_list inside the FAQ toggle
column_list_block = None
for block in all_blocks:
if (
block.get("type") == "column_list"
and block.get("parent", {}).get("block_id") == faq_toggle_id
):
column_list_block = block
break
if not column_list_block:
print("Error: No column_list found inside FAQ toggle.", file=sys.stderr)
return False
# Check that there are no Q&A pairs directly under FAQ toggle (outside column_list)
direct_faq_children = []
for block in all_blocks:
if block.get("parent", {}).get("block_id") == faq_toggle_id and block.get(
"id"
) != column_list_block.get("id"):
direct_faq_children.append(block)
# Check if any of these are heading_3 or paragraph blocks (Q&A content)
for block in direct_faq_children:
if block.get("type") in ["heading_3", "paragraph"]:
print(
f"Error: Found Q&A content outside column_list: {notion_utils.get_block_plain_text(block)[:50]}...",
file=sys.stderr,
)
return False
# Find the two columns
columns = []
column_list_id = column_list_block.get("id")
for block in all_blocks:
if (
block.get("type") == "column"
and block.get("parent", {}).get("block_id") == column_list_id
):
columns.append(block)
if len(columns) != 2:
print(f"Error: Expected 2 columns, found {len(columns)}.", file=sys.stderr)
return False
# Check each column has exactly 2 Q&A pairs
for i, column in enumerate(columns):
column_id = column.get("id")
# Find blocks inside this column
column_blocks = []
for block in all_blocks:
if block.get("parent", {}).get("block_id") == column_id:
column_blocks.append(block)
# Count Q&A pairs (should be heading_3 followed by paragraph)
qa_pairs = 0
j = 0
while j < len(column_blocks):
if (
column_blocks[j].get("type") == "heading_3"
and j + 1 < len(column_blocks)
and column_blocks[j + 1].get("type") == "paragraph"
):
qa_pairs += 1
j += 2 # Skip both question and answer
else:
j += 1
if qa_pairs != 2:
print(
f"Error: Column {i + 1} has {qa_pairs} Q&A pairs, expected 2.",
file=sys.stderr,
)
return False
print(f"Column {i + 1}: Found {qa_pairs} Q&A pairs ✓")
print(
"Success: FAQ toggle properly organized with 2 columns, each containing 2 Q&A pairs."
)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/self_assessment/hyperfocus_analysis_report/description.md
================================================
Go to my Self Assessment page, and then create a hyperfocus analysis report by analyzing sessions with high productivity but significant challenges.
**Task Requirements:**
1. Create a new page titled "Hyperfocus Analysis Report" as a child of the Self Assessment page. The new page should be located between the 'Why Use the Term "Hyperfocus"?' callout and the divider line that follows it.
2. Query the "Hyperfocus Self-Assessment Worksheet" database to find all sessions where:
- Work Completion Rate is greater than 80% (0.8)
- At least one challenge is present in the Challenges field
3. For each qualifying session, create a section with:
- A heading showing the date and activity type (format: YYYY-MM-DD Activity)
- A bullet list containing:
- Focus factors used (e.g., Focus factors: XXX, YYY)
- Energy level and mood (format: "Energy: X/10, Mood: Y/10")
- Challenges faced (e.g., Challenges: XXX, YYY)
- Strategies that helped overcome challenges (e.g., Strategies: XXX, YYY)
- Work completion rate (format: "Completion: XX%")
4. At the top of the page, add a callout block (type: "info") with:
- Title: "Top 2 Most Effective Strategies"
- Content: List the 2 most frequently used strategies from all sessions, each on a new line with format "• Strategy Name (used in X sessions)"
**Structure Requirements:**
- The page must have the exact title "Hyperfocus Analysis Report"
- Each session section must start with a level 2 heading
- All session details must be in bullet point format
- The summary callout must be at the top of the page before any session details
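For reference, a small sketch of the two database queries this report needs, using a placeholder database ID and the property names that appear in this worksheet (`Work Completion Rate`, `Challenges`, `Key Strategies Used`):

```python
from collections import Counter
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

WORKSHEET_DB_ID = "<hyperfocus-worksheet-database-id>"  # placeholder

# Requirement 2: sessions with >80% completion and at least one challenge.
qualifying = notion.databases.query(
    database_id=WORKSHEET_DB_ID,
    filter={
        "and": [
            {"property": "Work Completion Rate", "number": {"greater_than": 0.8}},
            {"property": "Challenges", "multi_select": {"is_not_empty": True}},
        ]
    },
)["results"]

# Requirement 4: the top 2 strategies are counted over ALL sessions,
# not just the qualifying ones.
all_sessions = notion.databases.query(database_id=WORKSHEET_DB_ID)["results"]
strategy_counts = Counter(
    s["name"]
    for session in all_sessions
    for s in session["properties"]["Key Strategies Used"]["multi_select"]
)
top_two = strategy_counts.most_common(2)

callout_lines = [f"• {name} (used in {count} sessions)" for name, count in top_two]
```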
================================================
FILE: tasks/notion/standard/self_assessment/hyperfocus_analysis_report/meta.json
================================================
{
"task_id": "hyperfocus_analysis_report",
"task_name": "Hyperfocus Analysis Report",
"category_id": "self_assessment",
"category_name": "Self Assessment",
"description": "Create a hyperfocus analysis report by analyzing high-productivity sessions with challenges.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"conditional filtering",
"data aggregation",
"report generation",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d",
"stateOriginalUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d"
}
}
================================================
FILE: tasks/notion/standard/self_assessment/hyperfocus_analysis_report/verify.py
================================================
import sys
import re
from notion_client import Client
from tasks.utils import notion_utils
from collections import Counter
def validate_comma_separated(text: str, expected_items: list) -> bool:
"""
Validates that a comma-separated list contains expected items (case-insensitive).
"""
if not text or not expected_items:
return False
# Extract items from text
items = [item.strip().lower() for item in text.split(",")]
expected_lower = [item.lower() for item in expected_items]
# Check if all expected items are present
for expected in expected_lower:
if not any(expected in item or item in expected for item in items):
return False
return True
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Hyperfocus Analysis Report has been created correctly.
"""
# Find the Self Assessment page
self_assessment_page_id = main_id
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
self_assessment_page_id = found_id
if not self_assessment_page_id:
# Try to find by name
self_assessment_page_id = notion_utils.find_page(notion, "Self Assessment")
if not self_assessment_page_id:
print("Error: Self Assessment page not found.", file=sys.stderr)
return False
# Find the Hyperfocus Analysis Report page
report_page_id = None
report_position = -1
callout_position = -1
divider_position = -1
children = notion.blocks.children.list(block_id=self_assessment_page_id).get(
"results", []
)
for i, child in enumerate(children):
# Track position of callout with "Why Use the Term"
if child.get("type") == "callout":
callout_text = notion_utils.get_block_plain_text(child)
if "Why Use the Term" in callout_text and "Hyperfocus" in callout_text:
callout_position = i
# Track position of divider
elif child.get("type") == "divider":
if callout_position != -1 and divider_position == -1:
divider_position = i
# Find the report page
elif child.get("type") == "child_page":
page_data = notion.pages.retrieve(page_id=child["id"])
title_prop = (
page_data.get("properties", {}).get("title", {}).get("title", [])
)
if (
title_prop
and title_prop[0].get("plain_text") == "Hyperfocus Analysis Report"
):
report_page_id = child["id"]
report_position = i
if not report_page_id:
print("Error: 'Hyperfocus Analysis Report' page not found.", file=sys.stderr)
return False
# Verify position
if callout_position == -1:
print(
"Error: Could not find 'Why Use the Term \"Hyperfocus\"?' callout.",
file=sys.stderr,
)
return False
if divider_position == -1:
print("Error: Could not find divider after the callout.", file=sys.stderr)
return False
if not (callout_position < report_position < divider_position):
print(
f"Error: Report page is not positioned between callout and divider. Positions: callout={callout_position}, report={report_position}, divider={divider_position}",
file=sys.stderr,
)
return False
# Get all blocks from the report page
all_blocks = notion_utils.get_all_blocks_recursively(notion, report_page_id)
# Find the database in the Self Assessment page
database_id = None
for block in notion_utils.get_all_blocks_recursively(
notion, self_assessment_page_id
):
if block.get("type") == "child_database":
db_data = notion.databases.retrieve(database_id=block["id"])
db_title = "".join(
[t.get("plain_text", "") for t in db_data.get("title", [])]
)
if "Hyperfocus Self-Assessment Worksheet" in db_title:
database_id = block["id"]
break
if not database_id:
print(
"Error: Database 'Hyperfocus Self-Assessment Worksheet' not found.",
file=sys.stderr,
)
return False
# Query database for sessions with >80% completion rate and challenges
query_results = notion.databases.query(
database_id=database_id,
filter={
"and": [
{"property": "Work Completion Rate", "number": {"greater_than": 0.8}},
{"property": "Challenges", "multi_select": {"is_not_empty": True}},
]
},
).get("results", [])
if not query_results:
print(
"Warning: No sessions found with >80% completion rate and challenges.",
file=sys.stderr,
)
# Still check if the page structure is correct
# Verify page structure
has_callout = False
has_top_strategies = False
session_count = 0
found_sessions = {} # Track sessions by date for validation
# Track strategies for validation - count from ALL sessions
all_sessions = notion.databases.query(database_id=database_id).get("results", [])
all_strategies = []
for session in all_sessions:
strategies = (
session.get("properties", {})
.get("Key Strategies Used", {})
.get("multi_select", [])
)
all_strategies.extend([s.get("name") for s in strategies])
strategy_counts = Counter(all_strategies)
top_2_strategies = strategy_counts.most_common(2)
# Build expected sessions from query results with all data
expected_sessions = {}
for result in query_results:
date_prop = result.get("properties", {}).get("Date", {}).get("date", {})
activity_prop = (
result.get("properties", {}).get("Activity", {}).get("select", {})
)
if date_prop and date_prop.get("start") and activity_prop:
date_str = date_prop["start"]
activity_name = activity_prop.get("name", "")
# Extract all session data for validation
focus_factors = [
f.get("name", "")
for f in result.get("properties", {})
.get("Focus Factors", {})
.get("multi_select", [])
]
challenges = [
c.get("name", "")
for c in result.get("properties", {})
.get("Challenges", {})
.get("multi_select", [])
]
strategies = [
s.get("name", "")
for s in result.get("properties", {})
.get("Key Strategies Used", {})
.get("multi_select", [])
]
energy = result.get("properties", {}).get("Energy Level", {}).get("number")
mood = result.get("properties", {}).get("Mood", {}).get("number")
completion = (
result.get("properties", {})
.get("Work Completion Rate", {})
.get("number")
)
expected_sessions[date_str] = {
"activity": activity_name,
"focus_factors": focus_factors,
"challenges": challenges,
"strategies": strategies,
"energy": energy,
"mood": mood,
"completion": completion,
}
current_session_date = None
current_session_data = None
session_bullet_points = {} # Track bullet points for each session
for i, block in enumerate(all_blocks):
block_type = block.get("type")
# Check for callout at the top
if block_type == "callout" and i < 5: # Should be near the top
callout_text = notion_utils.get_block_plain_text(block)
if "Top 2 Most Effective Strategies" in callout_text:
has_callout = True
# Check if it contains strategy information
s1, n1 = top_2_strategies[0]
s2, n2 = top_2_strategies[1]
t1 = f"{s1} (used in {n1} sessions)"
t2 = f"{s2} (used in {n2} sessions)"
if t1 in callout_text and t2 in callout_text:
has_top_strategies = True
break
# Check for session headings with format YYYY-MM-DD Activity
if block_type == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
# Check if heading matches expected format
for date_str, session_data in expected_sessions.items():
activity = session_data["activity"]
expected_heading = f"{date_str} {activity}"
if expected_heading in heading_text:
found_sessions[date_str] = session_data
session_count += 1
current_session_date = date_str
current_session_data = session_data
session_bullet_points[date_str] = []
break
# Check for bullet points with session details
if block_type == "bulleted_list_item" and current_session_data:
bullet_text = notion_utils.get_block_plain_text(block)
# Track bullet points for current session
if current_session_date:
session_bullet_points[current_session_date].append(bullet_text)
# Validate specific bullet point content
if bullet_text.startswith("Focus factors"):
content = bullet_text.split(":", 1)[1].strip()
expected_factors = current_session_data.get("focus_factors", [])
if not validate_comma_separated(content, expected_factors):
print(
f"Error: Focus factors mismatch for {current_session_date}. Expected: {expected_factors}, Found: {content}",
file=sys.stderr,
)
return False
elif "Energy" in bullet_text and "Mood" in bullet_text:
# Extract energy and mood values
energy_match = re.search(r"Energy:\s*(\d+)/10", bullet_text)
mood_match = re.search(r"Mood:\s*(\d+)/10", bullet_text)
if energy_match and mood_match:
found_energy = int(energy_match.group(1))
found_mood = int(mood_match.group(1))
expected_energy = current_session_data.get("energy")
expected_mood = current_session_data.get("mood")
if found_energy != expected_energy or found_mood != expected_mood:
print(
f"Error: Energy/Mood mismatch for {current_session_date}. Expected: Energy: {expected_energy}/10, Mood: {expected_mood}/10",
file=sys.stderr,
)
return False
else:
print(
f"Error: Invalid Energy/Mood format for {current_session_date}",
file=sys.stderr,
)
return False
elif bullet_text.startswith("Challenges"):
content = bullet_text.split(":", 1)[1].strip()
expected_challenges = current_session_data.get("challenges", [])
if not validate_comma_separated(content, expected_challenges):
print(
f"Error: Challenges mismatch for {current_session_date}. Expected: {expected_challenges}, Found: {content}",
file=sys.stderr,
)
return False
elif bullet_text.startswith("Strategies"):
content = bullet_text.split(":", 1)[1].strip()
expected_strategies = current_session_data.get("strategies", [])
if len(expected_strategies) > 0 and not validate_comma_separated(
content, expected_strategies
):
print(
f"Error: Strategies mismatch for {current_session_date}. Expected: {expected_strategies}, Found: {content}",
file=sys.stderr,
)
return False
elif bullet_text.startswith("Completion"):
# Extract completion percentage
completion_match = re.search(r"Completion:\s*(\d+)%", bullet_text)
if completion_match:
found_completion = int(completion_match.group(1))
expected_completion = int(
current_session_data.get("completion", 0) * 100
)
if found_completion != expected_completion:
print(
f"Error: Completion rate mismatch for {current_session_date}. Expected: {expected_completion}%, Found: {found_completion}%",
file=sys.stderr,
)
return False
else:
print(
f"Error: Invalid completion format for {current_session_date}",
file=sys.stderr,
)
return False
# Verify all sessions have complete bullet points
for date_str, bullets in session_bullet_points.items():
bullets_text = " ".join(bullets)
required_items = [
"Focus factors",
"Energy:",
"Mood:",
"Challenges",
"Strategies",
"Completion",
]
missing_items = []
for item in required_items:
if item not in bullets_text:
missing_items.append(item)
if missing_items:
print(
f"Error: Missing bullet points for session {date_str}: {', '.join(missing_items)}",
file=sys.stderr,
)
return False
# Verify all requirements
if not has_callout:
print(
"Error: Missing callout block with 'Top 2 Most Effective Strategies'.",
file=sys.stderr,
)
return False
if not has_top_strategies and len(top_2_strategies) > 0:
print("Error: Callout doesn't contain strategy information.", file=sys.stderr)
return False
if query_results and session_count == 0:
print("Error: No session sections found with proper headings.", file=sys.stderr)
return False
# Check if all expected sessions are present
missing_sessions = []
for date_str in expected_sessions.keys():
if date_str not in found_sessions:
missing_sessions.append(date_str)
if missing_sessions:
print(
f"Error: Missing session sections for dates: {', '.join(missing_sessions)}",
file=sys.stderr,
)
return False
if query_results and session_count < len(query_results):
print(
f"Warning: Found {session_count} session sections but expected {len(query_results)}.",
file=sys.stderr,
)
print(
"Success: Hyperfocus Analysis Report created with proper structure and content."
)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/self_assessment/numbered_list_emojis/description.md
================================================
Please find all numbered list items in the Self Assessment page and use Notion tools to replace the numbers with corresponding emoji numbers (e.g., 1️⃣, 2️⃣, 3️⃣). For example:
If the original numbered list is:
1. First step
2. Second step
3. Third step
It should become:
1️⃣ First step
2️⃣ Second step
3️⃣ Third step
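Because Notion renders the numbers of `numbered_list_item` blocks automatically, one plausible approach is to replace each item with a paragraph block whose text starts with the emoji digit. A minimal sketch, assuming the official `notion-client` Python SDK; the token and page id are placeholders and only top-level blocks are handled:
```python
# Sketch only: swap top-level numbered_list_item blocks for emoji-prefixed paragraphs.
from notion_client import Client

EMOJI = {1: "1️⃣", 2: "2️⃣", 3: "3️⃣", 4: "4️⃣", 5: "5️⃣"}

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
page_id = "<self-assessment-page-id>"  # placeholder id

counter = 0
for block in notion.blocks.children.list(block_id=page_id)["results"]:
    if block["type"] != "numbered_list_item":
        counter = 0  # numbering restarts after any non-list block
        continue
    counter += 1
    text = "".join(rt["plain_text"] for rt in block["numbered_list_item"]["rich_text"])
    notion.blocks.children.append(
        block_id=page_id,
        children=[{
            "type": "paragraph",
            "paragraph": {"rich_text": [{"type": "text", "text": {"content": f"{EMOJI.get(counter, str(counter))} {text}"}}]},
        }],
    )
    notion.blocks.delete(block_id=block["id"])
```
Note that this sketch appends the replacements at the end of the page; keeping each replacement at its original position and converting items nested inside other blocks would need extra handling.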
================================================
FILE: tasks/notion/standard/self_assessment/numbered_list_emojis/meta.json
================================================
{
"task_id": "numbered_list_emojis",
"task_name": "Numbered List Emojis",
"category_id": "self_assessment",
"category_name": "Self Assessment",
"description": "Replace numbered list items with corresponding emoji numbers for better visual formatting.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"visual formatting",
"automated migration"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d",
"stateOriginalUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d"
}
}
================================================
FILE: tasks/notion/standard/self_assessment/numbered_list_emojis/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that numbered lists have been replaced with emoji numbers.
"""
# Start from main_id if provided, otherwise search for the page
self_assessment_page_id = main_id
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
self_assessment_page_id = found_id
if not self_assessment_page_id:
# Try to find by name
self_assessment_page_id = notion_utils.find_page(notion, "Self Assessment")
if not self_assessment_page_id:
print("Error: Self Assessment page not found.", file=sys.stderr)
return False
# Get all blocks recursively from the main page
all_blocks = notion_utils.get_all_blocks_recursively(
notion, self_assessment_page_id
)
# Find all numbered_list_item blocks
numbered_list_items = []
for block in all_blocks:
if block.get("type") == "numbered_list_item":
numbered_list_items.append(block)
if len(numbered_list_items) > 0:
print(
f"Error: found {len(numbered_list_items)} numbered list items that should be converted to emoji numbers",
file=sys.stderr,
)
# return False
required_items = [
"1️⃣ Record Each Hyperfocus Session:",
"2️⃣ Review and Reflect:",
"3️⃣ Adjust and Optimize:",
'1️⃣ Harvard Business Review: "The Making of a Corporate Athlete"',
'2️⃣ "Hyperfocus: How to Be More Productive in a World of Distraction" by Chris Bailey',
'3️⃣ "Attention Management: How to Create Success and Gain Productivity Every Day" by Maura Thomas',
'4️⃣ "Deep Work: Rules for Focused Success in a Distracted World" by Cal Newport',
"1️⃣ Record Each Hyperfocus Session:",
"2️⃣ Review and Reflect:",
"3️⃣ Adjust and Optimize:",
"1️⃣ What time of day do you feel most focused?",
"2️⃣ Which environment helps you concentrate the most?",
"3️⃣ What type of tasks do you find yourself getting lost in?",
]
# Make a copy to track which items we've found
remaining_items = required_items.copy()
# Iterate through all blocks to find matching text
for block in all_blocks:
block_text = notion_utils.get_block_plain_text(block).strip()
# Check if this block's text matches any of our required items
if block_text in remaining_items:
remaining_items.remove(block_text)
print(f"Found: {block_text}")
# Check if all required items were found
if len(remaining_items) == 0:
print("Success: All numbered lists have been converted to emoji numbers")
return True
else:
print(f"Error: Missing {len(remaining_items)} required items:", file=sys.stderr)
for item in remaining_items:
print(f" - {item}", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/standard_operating_procedure/deployment_process_sop/description.md
================================================
Using Notion tools, complete the SOP template (a Notion page titled 'Standard Operating Procedure') by filling in all sections with comprehensive, interconnected content for a "Software Deployment Process" SOP, ensuring all cross-references, terminologies, and procedural steps are properly linked and validated.
**Task Requirements:**
1. **Update the SOP header information** (in the left column):
- Change the heading_1 "SOP Title" text to "Software Deployment Process"
- Update the paragraph "Created 2023-10-25" to "Created 2025-01-19"
- Update the paragraph "Responsible department:" to "Responsible department: DevOps Engineering Team"
- Update the People team page's callout to: "DevOps Engineering Team Wiki - Contains team contact information, escalation procedures, and deployment schedules. Access required for all deployment activities."
2. **Fill the Purpose section** with exactly this content:
- Replace the placeholder paragraph (starts with "↓ Summarize the procedure") with: "This SOP defines the standardized process for deploying software applications to production environments, ensuring zero-downtime deployments, proper rollback procedures, and compliance with security protocols. This procedure applies to all production deployments and must be followed by all engineering teams."
3. **Complete the Context section** with:
- Replace the placeholder paragraph (starts with "↓ Add any related and useful information") with: "Software deployments are critical operations that can impact system availability and user experience. This process has been developed based on industry best practices and our incident response learnings from Q3 2023. All deployments must go through automated testing pipelines and require approval from designated reviewers."
- Update all THREE child_pages under the "Relevant Docs" toggle:
- First child_page callout (Contacting IT): "Change Management Policy (SOP-001) - Defines approval workflows and change review processes for all production modifications."
- Second child_page callout (Team lunches): "Incident Response Procedures (SOP-003) - Emergency procedures for handling deployment failures and system outages."
- Third child_page callout (Sending swag): "Security Compliance Guidelines (SOP-007) - Security requirements and validation steps for production deployments."
4. **Define comprehensive Terminologies** by:
- Replace the placeholder paragraph (starts with "↓ Add any unfamiliar or domain specific words") with: "Essential deployment terminology for team understanding:"
- Replace the existing bulleted_list_item "Term: The definition of the term" with these four exact items:
- "Blue-Green Deployment: A deployment strategy that maintains two identical production environments"
- "Rollback Window: The maximum time allowed to revert a deployment (30 minutes)"
- "Smoke Test: Initial verification tests run immediately after deployment"
- "Production Gateway: The approval checkpoint before production release"
5. **Populate Tools section** with:
- Replace the placeholder paragraph (starts with "↓ Add any relevant tools") with: "Critical tools required for deployment operations:"
- Update the TWO existing child_pages:
- First child_page callout: "Jenkins CI/CD Pipeline - Primary deployment automation tool with integrated testing and approval workflows. Required for all automated deployments."
- Second child_page callout: "Kubernetes Dashboard - Container orchestration monitoring and management interface for deployment verification and rollback operations."
6. **Complete Roles & responsibilities** with:
- Replace the placeholder paragraph (starts with "↓ Define who will be executing") with: "The following roles are essential for successful deployment execution:"
- Replace the existing empty bulleted_list_item with these four exact items:
- "DevOps Engineer: Executes deployment, monitors system health, initiates rollbacks if needed"
- "Lead Developer: Reviews code changes, approves deployment package, validates functionality"
- "QA Engineer: Verifies smoke tests, confirms user acceptance criteria"
- "Security Officer: Validates security compliance, approves security-sensitive deployments"
7. **Create detailed Procedure section** with:
- Replace the placeholder paragraph (starts with "↓ Create a step by step procedure") with: "Follow these steps in sequence. Do not skip steps or perform them out of order."
- Replace the THREE existing numbered_list_items with:
- "Pre-deployment: Verify all automated tests pass, obtain required approvals from Lead Developer and Security Officer, confirm rollback plan is documented and tested"
- "Deployment execution: Deploy to staging environment first, run comprehensive smoke tests, obtain final Production Gateway approval, deploy to production using blue-green strategy"
- "Post-deployment: Monitor system metrics for minimum 30 minutes, validate all functionality using automated tests, document deployment results in change log, notify all stakeholders via deployment notification system"
================================================
FILE: tasks/notion/standard/standard_operating_procedure/deployment_process_sop/meta.json
================================================
{
"task_id": "deployment_process_sop",
"task_name": "Deployment Process SOP",
"category_id": "standard_operating_procedure",
"category_name": "Standard Operating Procedure",
"description": "Complete the SOP template with comprehensive content for a Software Deployment Process with interconnected sections.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"template population",
"cross-reference linking",
"content organization",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Standard-Operating-Procedure-24381626b6d780a8b678f9e62ae5b152",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/standard-operating-procedure"
}
}
================================================
FILE: tasks/notion/standard/standard_operating_procedure/deployment_process_sop/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies comprehensive SOP template completion with exact content matching.
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Standard Operating Procedure")
if not page_id:
print("Error: Page 'Standard Operating Procedure' not found.", file=sys.stderr)
return False
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
verification_results = []
# Check 1: Verify SOP header information updates
sop_title_found = False
created_date_found = False
responsible_dept_found = False
header_callout_found = False
for block in all_blocks:
if block.get("type") == "heading_1":
heading_text = notion_utils.get_block_plain_text(block)
if "Software Deployment Process" in heading_text:
sop_title_found = True
verification_results.append("✅ SOP Title updated correctly")
elif block.get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(block)
if "Created 2025-01-19" in para_text:
created_date_found = True
verification_results.append("✅ Created date updated correctly")
elif "Responsible department: DevOps Engineering Team" in para_text:
responsible_dept_found = True
verification_results.append(
"✅ Responsible department updated correctly"
)
elif block.get("type") == "child_page":
# Check child pages recursively for callout content - specifically the People team page
try:
child_page_info = notion.pages.retrieve(page_id=block["id"])
child_page_title = ""
if (
"properties" in child_page_info
and "title" in child_page_info["properties"]
):
title_list = child_page_info["properties"]["title"].get("title", [])
if title_list:
child_page_title = title_list[0].get("plain_text", "")
except:
child_page_title = ""
child_blocks = notion_utils.get_all_blocks_recursively(notion, block["id"])
for child_block in child_blocks:
if child_block.get("type") == "callout":
callout_text = notion_utils.get_block_plain_text(child_block)
# Look for the People team page with the DevOps Engineering Team Wiki callout
if (
"DevOps Engineering Team Wiki" in callout_text
and "deployment schedules" in callout_text
and "deployment activities" in callout_text
):
header_callout_found = True
verification_results.append(
"✅ Header People team page callout updated correctly"
)
# Check 2: Verify Purpose section content
purpose_found = False
expected_purpose = "This SOP defines the standardized process for deploying software applications to production environments"
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Purpose" in heading_text:
# Check next paragraph after Purpose heading
for j in range(i + 1, min(i + 5, len(all_blocks))):
next_block = all_blocks[j]
if next_block.get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(next_block)
if (
expected_purpose in para_text
and "engineering teams" in para_text
):
purpose_found = True
verification_results.append(
"✅ Purpose section content updated correctly"
)
break
break
# Check 3: Verify Context section and child_page callouts
context_found = False
child_pages_updated = 0
expected_context = "Software deployments are critical operations that can impact system availability"
expected_child_callouts = [
(
"Change Management Policy (SOP-001)",
"Defines approval workflows and change review processes for all production modifications",
"Contacting IT",
),
(
"Incident Response Procedures (SOP-003)",
"Emergency procedures for handling deployment failures and system outages",
"Team lunches",
),
(
"Security Compliance Guidelines (SOP-007)",
"Security requirements and validation steps for production deployments",
"Sending swag",
),
]
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Context" in heading_text:
# Check paragraph content
for j in range(i + 1, min(i + 10, len(all_blocks))):
next_block = all_blocks[j]
if next_block.get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(next_block)
if expected_context in para_text and "Q3 2023" in para_text:
context_found = True
elif next_block.get("type") == "toggle":
# Check child pages under toggle
toggle_blocks = notion_utils.get_all_blocks_recursively(
notion, next_block["id"]
)
for toggle_child in toggle_blocks:
if toggle_child.get("type") == "child_page":
# Get the child page title to match with expected callouts
try:
child_page_info = notion.pages.retrieve(
page_id=toggle_child["id"]
)
child_page_title = ""
if (
"properties" in child_page_info
and "title" in child_page_info["properties"]
):
title_list = child_page_info["properties"][
"title"
].get("title", [])
if title_list:
child_page_title = title_list[0].get(
"plain_text", ""
)
except:
child_page_title = ""
child_blocks = notion_utils.get_all_blocks_recursively(
notion, toggle_child["id"]
)
for child_block in child_blocks:
if child_block.get("type") == "callout":
callout_text = (
notion_utils.get_block_plain_text(
child_block
)
)
for (
expected_title,
expected_content,
expected_page_title,
) in expected_child_callouts:
if (
expected_title in callout_text
and expected_content in callout_text
and expected_page_title
in child_page_title
):
child_pages_updated += 1
verification_results.append(
f"✅ Context child_page '{expected_page_title}' updated correctly"
)
break
if context_found:
verification_results.append("✅ Context section content updated correctly")
if child_pages_updated == 3:
verification_results.append(
"✅ All 3 Context child_page callouts updated correctly"
)
else:
verification_results.append(
f"❌ Only {child_pages_updated}/3 Context child_page callouts updated correctly (Contacting IT, Team lunches, Sending swag)"
)
# Check 4: Verify Terminologies section with exact 4 bulleted items
terminologies_found = False
terminology_items = []
expected_terminologies = [
"Blue-Green Deployment: A deployment strategy that maintains two identical production environments",
"Rollback Window: The maximum time allowed to revert a deployment (30 minutes)",
"Smoke Test: Initial verification tests run immediately after deployment",
"Production Gateway: The approval checkpoint before production release",
]
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Terminologies" in heading_text:
# Check for intro paragraph
for j in range(i + 1, min(i + 2, len(all_blocks))):
if all_blocks[j].get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(all_blocks[j])
if "Essential deployment terminology" in para_text:
terminologies_found = True
break
# Check bulleted list items
for j in range(i + 1, min(i + 10, len(all_blocks))):
next_block = all_blocks[j]
if next_block.get("type") == "bulleted_list_item":
item_text = notion_utils.get_block_plain_text(next_block)
terminology_items.append(item_text)
elif next_block.get("type") in [
"heading_1",
"heading_2",
"heading_3",
]:
break
break
terminology_matches = sum(
1
for expected in expected_terminologies
if any(expected in item for item in terminology_items)
)
if terminologies_found and len(terminology_items) == 4 and terminology_matches == 4:
verification_results.append(
"✅ Terminologies section with exactly 4 correct items"
)
else:
verification_results.append(
f"❌ Terminologies: expected 4 items, found {len(terminology_items)}, {terminology_matches} correct"
)
# Check 5: Verify Tools section with 2 child_page callouts
tools_found = False
tools_child_pages = 0
expected_tools = [
("Jenkins CI/CD Pipeline", "automated deployments"),
("Kubernetes Dashboard", "rollback operations"),
]
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Tools" in heading_text:
# Check intro paragraph
for j in range(i + 1, min(i + 2, len(all_blocks))):
if all_blocks[j].get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(all_blocks[j])
if "Critical tools required" in para_text:
tools_found = True
break
# Check child pages
for j in range(i + 1, min(i + 10, len(all_blocks))):
next_block = all_blocks[j]
if next_block.get("type") == "child_page":
child_blocks = notion_utils.get_all_blocks_recursively(
notion, next_block["id"]
)
for child_block in child_blocks:
if child_block.get("type") == "callout":
callout_text = notion_utils.get_block_plain_text(
child_block
)
for expected_title, expected_content in expected_tools:
if (
expected_title in callout_text
and expected_content in callout_text
):
tools_child_pages += 1
break
elif next_block.get("type") in [
"heading_1",
"heading_2",
"heading_3",
]:
break
break
if tools_found and tools_child_pages == 2:
verification_results.append(
"✅ Tools section with 2 correctly updated child_page callouts"
)
else:
verification_results.append(
f"❌ Tools section: expected 2 child_pages updated, found {tools_child_pages}"
)
# Check 6: Verify Roles & responsibilities with exactly 4 bulleted items
roles_found = False
role_items = []
expected_roles = [
"DevOps Engineer: Executes deployment, monitors system health, initiates rollbacks if needed",
"Lead Developer: Reviews code changes, approves deployment package, validates functionality",
"QA Engineer: Verifies smoke tests, confirms user acceptance criteria",
"Security Officer: Validates security compliance, approves security-sensitive deployments",
]
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Roles" in heading_text and "responsibilities" in heading_text:
# Check intro paragraph
for j in range(i + 1, min(i + 2, len(all_blocks))):
if all_blocks[j].get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(all_blocks[j])
if "essential for successful deployment execution" in para_text:
roles_found = True
break
# Check bulleted list items
for j in range(i + 1, min(i + 10, len(all_blocks))):
next_block = all_blocks[j]
if next_block.get("type") == "bulleted_list_item":
item_text = notion_utils.get_block_plain_text(next_block)
role_items.append(item_text)
elif next_block.get("type") in [
"heading_1",
"heading_2",
"heading_3",
]:
break
break
role_matches = sum(
1 for expected in expected_roles if any(expected in item for item in role_items)
)
if roles_found and len(role_items) == 4 and role_matches == 4:
verification_results.append(
"✅ Roles & responsibilities section with exactly 4 correct items"
)
else:
verification_results.append(
f"❌ Roles section: expected 4 items, found {len(role_items)}, {role_matches} correct"
)
# Check 7: Verify Procedure section with exactly 3 numbered items
procedure_found = False
procedure_items = []
expected_procedures = [
("Pre-deployment", "Lead Developer and Security Officer", "rollback plan"),
("Deployment execution", "staging environment first", "blue-green strategy"),
(
"Post-deployment",
"minimum 30 minutes",
"stakeholders via deployment notification",
),
]
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Procedure" in heading_text:
# Check intro paragraph
for j in range(i + 1, min(i + 2, len(all_blocks))):
if all_blocks[j].get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(all_blocks[j])
if "Follow these steps in sequence" in para_text:
procedure_found = True
break
# Check numbered list items
for j in range(i + 1, min(i + 10, len(all_blocks))):
next_block = all_blocks[j]
if next_block.get("type") == "numbered_list_item":
item_text = notion_utils.get_block_plain_text(next_block)
procedure_items.append(item_text)
elif next_block.get("type") in [
"heading_1",
"heading_2",
"heading_3",
]:
break
break
procedure_matches = 0
for item_text in procedure_items:
for expected_title, expected_content1, expected_content2 in expected_procedures:
if (
expected_title in item_text
and expected_content1 in item_text
and expected_content2 in item_text
):
procedure_matches += 1
break
if procedure_found and len(procedure_items) == 3 and procedure_matches == 3:
verification_results.append("✅ Procedure section with exactly 3 correct items")
else:
verification_results.append(
f"❌ Procedure: expected 3 items, found {len(procedure_items)}, {procedure_matches} correct"
)
# Calculate overall success
total_checks = 14 # Number of major verification points
successful_checks = sum(
1 for result in verification_results if result.startswith("✅")
)
# Print all verification results
print("\n=== SOP Template Verification Results ===", file=sys.stderr)
for result in verification_results:
print(result, file=sys.stderr)
print(f"\n=== Summary: {successful_checks}/{total_checks} checks passed ===")
# Must pass ALL checks to succeed
success = (
sop_title_found
and created_date_found
and responsible_dept_found
and header_callout_found
and purpose_found
and context_found
and child_pages_updated == 3
and terminologies_found
and len(terminology_items) == 4
and terminology_matches == 4
and tools_found
and tools_child_pages == 2
and roles_found
and len(role_items) == 4
and role_matches == 4
and procedure_found
and len(procedure_items) == 3
and procedure_matches == 3
)
if success:
print("\n🎉 SUCCESS: All SOP template requirements completed correctly!")
return True
else:
print(
f"\n❌ FAILURE: SOP template verification failed. {successful_checks}/{total_checks} requirements met.",
file=sys.stderr,
)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/standard_operating_procedure/section_organization/description.md
================================================
# Task: Reorganize Standard Operating Procedure Page Sections
## Objective
Modify the structure of the Standard Operating Procedure page in Notion by reorganizing sections through swapping and creating a column layout.
## Requirements
### Step 1: Swap Sections
- Navigate to the Standard Operating Procedure page
- Swap the positions of the "Terminologies" and "Roles & responsibilities" sections
- Preserve all content within each section exactly as is
- Maintain the original formatting and structure of each section
### Step 2: Create Column Layout
- After swapping, arrange the "Tools" section and the section immediately below it ("Terminologies") into a 2-column layout
- Position the "Tools" section in the left column
- Position the "Terminologies" section in the right column
- In the "Tools" column, add links to the Notion and Figma pages using appropriate reference blocks
- Preserve the original child pages from the "Tools" section in a toggle block placed below the column layout, with the toggle titled "original pages"
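A minimal sketch of the 2-column layout in Step 2, assuming the official `notion-client` Python SDK; the page id and the ids of the Notion and Figma pages are placeholders:
```python
# Sketch only: append a column_list holding "Tools" (left) and "Terminologies" (right).
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token

def h2(text):
    return {"type": "heading_2", "heading_2": {"rich_text": [{"type": "text", "text": {"content": text}}]}}

notion.blocks.children.append(
    block_id="<sop-page-id>",  # placeholder id
    children=[{
        "type": "column_list",
        "column_list": {
            "children": [
                {"type": "column", "column": {"children": [
                    h2("Tools"),
                    {"type": "link_to_page", "link_to_page": {"type": "page_id", "page_id": "<notion-page-id>"}},
                    {"type": "link_to_page", "link_to_page": {"type": "page_id", "page_id": "<figma-page-id>"}},
                ]}},
                {"type": "column", "column": {"children": [h2("Terminologies")]}},
            ]
        },
    }],
)
```
Moving the existing section content into the columns and creating the "original pages" toggle are left out of the sketch.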
================================================
FILE: tasks/notion/standard/standard_operating_procedure/section_organization/meta.json
================================================
{
"task_id": "section_organization",
"task_name": "Section Organization",
"category_id": "standard_operating_procedure",
"category_name": "Standard Operating Procedure",
"description": "Reorganize the Standard Operating Procedure page by swapping sections and creating a column layout.",
"author": "Xiangyan Liu",
"created_at": "2025-08-11",
"difficulty": "L3",
"tags": [
"content organization",
"cross-reference linking",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Standard-Operating-Procedure-24381626b6d780a8b678f9e62ae5b152",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/standard-operating-procedure"
}
}
================================================
FILE: tasks/notion/standard/standard_operating_procedure/section_organization/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Standard Operating Procedure page has been reorganized correctly.
"""
# Step 1: Find the Standard Operating Procedure page
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Standard Operating Procedure page not found.", file=sys.stderr)
return False
else:
# Try to find the page by searching
found_id = notion_utils.find_page(notion, "Standard Operating Procedure")
if not found_id:
print("Error: Standard Operating Procedure page not found.", file=sys.stderr)
return False
print(f"Found Standard Operating Procedure page: {found_id}")
# Get all blocks from the page
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
print(f"Found {len(all_blocks)} blocks")
print("Starting verification...")
# Step 2: Verify the structure and section order
print("2. Checking page structure and section order...")
# Expected structure after the initial content and dividers
# We'll look for main sections by their headings
roles_index = None
tools_column_index = None
toggle_index = None
procedure_index = None
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = ""
rich_text = block.get("heading_2", {}).get("rich_text", [])
if rich_text:
heading_text = rich_text[0].get("text", {}).get("content", "")
if heading_text == "Roles & responsibilities":
roles_index = i
print(f"✓ Found 'Roles & responsibilities' section at index {i}")
elif heading_text == "Procedure":
procedure_index = i
print(f"✓ Found 'Procedure' section at index {i}")
# Check for column_list (containing Tools and Terminologies)
for i, block in enumerate(all_blocks):
if block.get("type") == "column_list":
# Check if this is the right column_list (should be after Roles & responsibilities)
if roles_index and i > roles_index:
tools_column_index = i
print(f"✓ Found column_list at index {i}")
break
# Check for toggle block with "original pages"
for i, block in enumerate(all_blocks):
if block.get("type") == "toggle":
toggle_text = ""
rich_text = block.get("toggle", {}).get("rich_text", [])
if rich_text:
toggle_text = rich_text[0].get("text", {}).get("content", "")
if toggle_text.lower() == "original pages":
toggle_index = i
print(f"✓ Found 'original pages' toggle at index {i}")
break
# Step 3: Verify section order
print("3. Verifying section order...")
if roles_index is None:
print("Error: 'Roles & responsibilities' section not found.", file=sys.stderr)
return False
if tools_column_index is None:
print("Error: Column layout not found.", file=sys.stderr)
return False
if toggle_index is None:
print("Error: 'original pages' toggle not found.", file=sys.stderr)
return False
if procedure_index is None:
print("Error: 'Procedure' section not found.", file=sys.stderr)
return False
# Verify order: Roles & responsibilities < column_list < toggle < Procedure
if not (roles_index < tools_column_index < toggle_index < procedure_index):
print("Error: Sections are not in the correct order.", file=sys.stderr)
print(f" Expected order: Roles & responsibilities ({roles_index}) < column_list ({tools_column_index}) < toggle ({toggle_index}) < Procedure ({procedure_index})", file=sys.stderr)
return False
print("✓ Sections are in the correct order")
# Step 4: Verify column_list structure
print("4. Verifying column layout structure...")
column_list_block = all_blocks[tools_column_index]
column_list_id = column_list_block.get("id")
# Get direct children of column_list (should be columns only)
try:
column_response = notion.blocks.children.list(block_id=column_list_id)
column_children = column_response.get("results", [])
except Exception as e:
print(f"Error getting column children: {e}", file=sys.stderr)
return False
if len(column_children) < 2:
print(f"Error: Column list should have at least 2 columns, found {len(column_children)}.", file=sys.stderr)
return False
# Verify left column (Tools)
left_column = column_children[0]
if left_column.get("type") != "column":
print("Error: First child of column_list should be a column.", file=sys.stderr)
return False
left_column_id = left_column.get("id")
left_column_blocks = notion_utils.get_all_blocks_recursively(notion, left_column_id)
# Check for Tools heading and link_to_page blocks in left column
tools_heading_found = False
link_to_page_count = 0
for block in left_column_blocks:
if block.get("type") == "heading_2":
heading_text = block.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "")
if heading_text == "Tools":
tools_heading_found = True
print("✓ Found 'Tools' heading in left column")
elif block.get("type") == "link_to_page":
link_to_page_count += 1
if not tools_heading_found:
print("Error: 'Tools' heading not found in left column.", file=sys.stderr)
return False
# Check for link_to_page blocks in Tools column
if link_to_page_count < 2:
print(f"Error: Tools column should have at least 2 link_to_page blocks, found {link_to_page_count}.", file=sys.stderr)
return False
print(f"✓ Found {link_to_page_count} link_to_page blocks in Tools column")
# Verify right column (Terminologies)
right_column = column_children[1]
if right_column.get("type") != "column":
print("Error: Second child of column_list should be a column.", file=sys.stderr)
return False
right_column_id = right_column.get("id")
right_column_blocks = notion_utils.get_all_blocks_recursively(notion, right_column_id)
# Check for Terminologies heading in right column
terminologies_heading_found = False
for block in right_column_blocks:
if block.get("type") == "heading_2":
heading_text = block.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "")
if heading_text == "Terminologies":
terminologies_heading_found = True
print("✓ Found 'Terminologies' heading in right column")
break
if not terminologies_heading_found:
print("Error: 'Terminologies' heading not found in right column.", file=sys.stderr)
return False
# Step 5: Verify toggle block content
print("5. Verifying toggle block content...")
toggle_block = all_blocks[toggle_index]
toggle_id = toggle_block.get("id")
# Get direct children of toggle
try:
toggle_response = notion.blocks.children.list(block_id=toggle_id)
toggle_children = toggle_response.get("results", [])
except Exception as e:
print(f"Error getting toggle children: {e}", file=sys.stderr)
return False
# Check for child_page blocks (Notion and Figma)
notion_page_found = False
figma_page_found = False
for block in toggle_children:
if block.get("type") == "child_page":
title = block.get("child_page", {}).get("title", "")
if title == "Notion":
notion_page_found = True
print("✓ Found 'Notion' child page in toggle")
elif title == "Figma":
figma_page_found = True
print("✓ Found 'Figma' child page in toggle")
if not notion_page_found:
print("Error: 'Notion' child page not found in toggle block.", file=sys.stderr)
return False
if not figma_page_found:
print("Error: 'Figma' child page not found in toggle block.", file=sys.stderr)
return False
# Step 6: Verify that original sections no longer exist at top level
print("6. Verifying original sections have been removed from top level...")
# Check that there's no standalone "Terminologies" heading before "Roles & responsibilities"
for i in range(0, roles_index if roles_index else len(all_blocks)):
block = all_blocks[i]
if block.get("type") == "heading_2":
heading_text = block.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "")
if heading_text == "Terminologies":
print("Error: 'Terminologies' section found before 'Roles & responsibilities'.", file=sys.stderr)
return False
# Check that there's no standalone "Tools" heading outside the column
tools_outside_column = False
for i, block in enumerate(all_blocks):
if i == tools_column_index:
continue # Skip the column_list itself
if block.get("type") == "heading_2":
heading_text = block.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "")
if heading_text == "Tools" and i != tools_column_index:
# Check if this is NOT inside the column
parent_id = block.get("parent", {}).get("block_id")
if parent_id != left_column_id:
tools_outside_column = True
break
if tools_outside_column:
print("Error: Standalone 'Tools' section found outside column layout.", file=sys.stderr)
return False
print("✓ Original sections have been properly reorganized")
# Step 7: Final summary
print("\n7. Final verification summary:")
print("✓ 'Roles & responsibilities' and 'Terminologies' sections have been swapped")
print("✓ 'Tools' and 'Terminologies' are in a 2-column layout")
print("✓ Links to Notion and Figma pages are in the Tools column")
print("✓ Original child pages are preserved in 'original pages' toggle")
print("✓ Page structure is correct")
print("\n✅ All verification checks passed!")
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/team_projects/priority_tasks_table/description.md
================================================
Hi! In my Team Projects page, please create a five-column table block that lists all tasks meeting either of the following conditions:
1. The progress is 50% or less, or
2. The task has priority P0 but is not yet completed (i.e., progress not at 100%).
You should query this information from the existing “Projects” database.
In the newly created table:
• Each row should represent one task
• All fields should be stored as plain text (not relations, formulas, or linked properties)
• The table should be sorted by expected end date (End Date) in ascending order, so that the first entry is the one with the earliest end date
The table should include the following headers:
• Project
• Eng Hours
• Progress
• Start Date
• End Date
Please make sure all relevant tasks are included. Thank you!
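For illustration, the query and the table block could be built roughly as below, assuming the official `notion-client` Python SDK; the database and page ids are placeholders, and the "Progress" and "Priority" property names are assumptions about the Projects database:
```python
# Sketch only: compound OR filter plus a plain-text table block.
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token

rows = notion.databases.query(
    database_id="<projects-database-id>",  # placeholder id
    filter={
        "or": [
            {"property": "Progress", "number": {"less_than_or_equal_to": 0.5}},
            {"and": [
                {"property": "Priority", "select": {"equals": "P0"}},
                {"property": "Progress", "number": {"less_than": 1}},
            ]},
        ]
    },
)["results"]

def cell(text):
    return [{"type": "text", "text": {"content": text}}]

table_rows = [{"type": "table_row", "table_row": {"cells": [
    cell("Project"), cell("Eng Hours"), cell("Progress"), cell("Start Date"), cell("End Date"),
]}}]
# ...one table_row per result, sorted by End Date ascending, would be appended here...

notion.blocks.children.append(
    block_id="<team-projects-page-id>",  # placeholder id
    children=[{
        "type": "table",
        "table": {"table_width": 5, "has_column_header": True, "has_row_header": False, "children": table_rows},
    }],
)
```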
================================================
FILE: tasks/notion/standard/team_projects/priority_tasks_table/meta.json
================================================
{
"task_id": "priority_tasks_table",
"task_name": "Priority Tasks Table",
"category_id": "team_projects",
"category_name": "Team Projects",
"description": "Create a five-column table listing tasks with 50% or less progress or P0 priority tasks not completed.",
"author": "Zijian Wu",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"conditional filtering",
"database manipulation",
"data aggregation",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Team-Projects-24e81626b6d7809c982fdb7a25825898",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/gantt-chart"
}
}
================================================
FILE: tasks/notion/standard/team_projects/priority_tasks_table/verify.py
================================================
import sys
from datetime import datetime
from notion_client import Client
from tasks.utils import notion_utils
EXPECTED_HEADERS = ["Project", "Eng Hours", "Progress", "Start Date", "End Date"]
EXPECTED_ROWS = [
{
"Project": "Improve response times for support requests",
"Eng Hours": 100,
"Progress": 0.33, # 33%
"Start Date": "2024-10-30",
"End Date": "2024-11-17",
},
{
"Project": "Add a new social media integration",
"Eng Hours": 200,
"Progress": 0.40, # 40%
"Start Date": "2024-11-07",
"End Date": "2024-11-25",
},
{
"Project": "Integrate with a popular third-party service",
"Eng Hours": 250,
"Progress": 0.20, # 20%
"Start Date": "2024-11-10",
"End Date": "2024-11-18",
},
{
"Project": "Create customer knowledge base",
"Eng Hours": 150,
"Progress": 0.40, # 40%
"Start Date": "2024-11-19",
"End Date": "2024-11-25",
},
{
"Project": "Redesign the onboarding process",
"Eng Hours": 300,
"Progress": 0.75, # 75%
"Start Date": "2024-11-20",
"End Date": "2024-12-04",
},
{
"Project": "Publish support knowledge base",
"Eng Hours": None, # N/A
"Progress": 0.0, # 0%
"Start Date": "2024-11-27",
"End Date": "2024-11-29",
},
]
# Sort the expected rows by End Date so we can directly compare order
EXPECTED_ROWS.sort(key=lambda r: r["End Date"])
def _plain_text_from_cell(cell):
"""Concatenate plain_text from a single cell (list of rich_text)."""
return "".join(rt.get("plain_text", "") for rt in cell).strip()
def _parse_progress(value: str):
"""Convert a progress string like '40%', '40.0 %', '0.4' to float in range 0-1."""
value = value.strip()
if not value:
return None
has_percent = "%" in value
# Remove percent sign and any spaces
value = value.replace("%", "").strip()
try:
num = float(value)
if has_percent or num > 1:
num /= 100.0
return num
except ValueError:
return None
def _parse_eng_hours(value: str):
value = value.strip().lower()
if value in {"n/a", "na", "", "—", "-"}:
return None
try:
return float(value)
except ValueError:
return None
def _parse_date(value: str):
value = value.strip()
try:
return datetime.strptime(value, "%Y-%m-%d").date()
except ValueError:
return None
def verify(notion: Client, main_id: str = None) -> bool:
"""Verify that the last table in the 'Team Projects' page matches EXPECTED_ROWS and headers."""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and object_type == 'page':
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Team Projects")
if not page_id:
print("Error: Page 'Team Projects' not found.", file=sys.stderr)
return False
# Fetch all blocks to locate table blocks
blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
table_blocks = [b for b in blocks if b.get("type") == "table"]
if not table_blocks:
print("Error: No table blocks found in 'Team Projects' page.", file=sys.stderr)
return False
table_block = table_blocks[-1] # Use the last table block
table_id = table_block["id"]
# Retrieve table rows
rows = notion.blocks.children.list(block_id=table_id).get("results", [])
if not rows:
print("Error: Table block has no rows.", file=sys.stderr)
return False
# Validate headers
header_cells = rows[0].get("table_row", {}).get("cells", [])
headers = [_plain_text_from_cell(c) for c in header_cells]
if headers != EXPECTED_HEADERS:
print(f"Error: Table headers mismatch. Found {headers}, expected {EXPECTED_HEADERS}.", file=sys.stderr)
return False
# Parse data rows
data_rows = []
for r in rows[1:]:
cells = r.get("table_row", {}).get("cells", [])
if len(cells) < 5:
continue # Skip malformed rows
project = _plain_text_from_cell(cells[0])
eng_hours_raw = _plain_text_from_cell(cells[1])
progress_raw = _plain_text_from_cell(cells[2])
start_raw = _plain_text_from_cell(cells[3])
end_raw = _plain_text_from_cell(cells[4])
row_dict = {
"Project": project,
"Eng Hours": _parse_eng_hours(eng_hours_raw),
"Progress": _parse_progress(progress_raw),
"Start Date": start_raw.strip(),
"End Date": end_raw.strip(),
}
data_rows.append(row_dict)
if len(data_rows) != len(EXPECTED_ROWS):
print(f"Error: Expected {len(EXPECTED_ROWS)} data rows, found {len(data_rows)}.", file=sys.stderr)
return False
# Verify sorting by End Date ascending
parsed_end_dates = [_parse_date(r["End Date"]) for r in data_rows]
if any(d is None for d in parsed_end_dates):
print("Error: One or more End Date values could not be parsed.", file=sys.stderr)
return False
if parsed_end_dates != sorted(parsed_end_dates):
print("Error: Table is not sorted by End Date ascending.", file=sys.stderr)
return False
# Create mapping from project -> row for comparison
data_map = {r["Project"]: r for r in data_rows}
for expected in EXPECTED_ROWS:
proj = expected["Project"]
if proj not in data_map:
print(f"Error: Project '{proj}' not found in table.", file=sys.stderr)
return False
actual = data_map[proj]
# Compare Eng Hours
expected_hours = expected["Eng Hours"]
actual_hours = actual["Eng Hours"]
if expected_hours is None:
if actual_hours is not None:
print(f"Error: Eng Hours for '{proj}' expected to be empty/N\u204aA but found '{actual_hours}'.", file=sys.stderr)
return False
else:
if actual_hours is None or abs(actual_hours - expected_hours) > 1e-2:
print(f"Error: Eng Hours for '{proj}' mismatch. Expected {expected_hours}, found {actual_hours}.", file=sys.stderr)
return False
# Compare Progress with tolerance
expected_progress = expected["Progress"]
actual_progress = actual["Progress"]
if actual_progress is None or abs(actual_progress - expected_progress) > 1e-2:
print(f"Error: Progress for '{proj}' mismatch. Expected {expected_progress}, found {actual_progress}.", file=sys.stderr)
return False
# Compare Start and End Dates (string equality)
if actual["Start Date"] != expected["Start Date"]:
print(f"Error: Start Date for '{proj}' mismatch. Expected {expected['Start Date']}, found {actual['Start Date']}.", file=sys.stderr)
return False
if actual["End Date"] != expected["End Date"]:
print(f"Error: End Date for '{proj}' mismatch. Expected {expected['End Date']}, found {actual['End Date']}.", file=sys.stderr)
return False
print("Success: Verified table block contents and order successfully.")
return True
def main():
"""Execute verification and exit with status code."""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/team_projects/swap_tasks/description.md
================================================
Go to the Team Projects page, find the person responsible for the most tasks and the person responsible for the fewest tasks, then swap their assigned tasks.
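A minimal sketch of the counting-and-swap logic, assuming the official `notion-client` Python SDK and that assignees live in an "Assigned" people property of the Tasks database; the token and database id are placeholders:
```python
# Sketch only: count tasks per assignee, then swap the two extremes.
from collections import Counter
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
tasks = notion.databases.query(database_id="<tasks-database-id>", page_size=100)["results"]

counts = Counter()
for task in tasks:
    people = task["properties"]["Assigned"]["people"]
    if people:
        counts[people[0]["id"]] += 1

most_id, _ = counts.most_common(1)[0]
fewest_id, _ = counts.most_common()[-1]

for task in tasks:
    people = task["properties"]["Assigned"]["people"]
    if not people:
        continue
    if people[0]["id"] == most_id:
        notion.pages.update(page_id=task["id"], properties={"Assigned": {"people": [{"id": fewest_id}]}})
    elif people[0]["id"] == fewest_id:
        notion.pages.update(page_id=task["id"], properties={"Assigned": {"people": [{"id": most_id}]}})
```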
================================================
FILE: tasks/notion/standard/team_projects/swap_tasks/meta.json
================================================
{
"task_id": "swap_tasks",
"task_name": "Swap Tasks",
"category_id": "team_projects",
"category_name": "Team Projects",
"description": "Find the person responsible for the most and fewest tasks, then swap their assigned tasks.",
"author": "Xiangyan Liu",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data aggregation",
"automated migration",
"conditional filtering"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Team-Projects-24e81626b6d7809c982fdb7a25825898",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/gantt-chart"
}
}
================================================
FILE: tasks/notion/standard/team_projects/swap_tasks/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the task assignees have been swapped correctly.
Checks:
1. "Develop a plan for promotion" and "Evaluate different third-party services" have swapped assignees
2. The person with most tasks and person with least tasks have swapped all their tasks
"""
# Step 1: Find the Team Projects page
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Team Projects page not found.", file=sys.stderr)
return False
else:
# Try to find the page by searching
found_id = notion_utils.find_page(notion, "Team Projects")
if not found_id:
print("Error: Team Projects page not found.", file=sys.stderr)
return False
# Get all blocks from the page to find database references
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
# Find Tasks database ID from the page
tasks_db_id = None
for block in all_blocks:
if block and block.get("type") == "child_database":
db_title = block.get("child_database", {}).get("title", "")
if "Tasks" in db_title:
tasks_db_id = block["id"]
break
if not tasks_db_id:
print("Error: Tasks database not found.", file=sys.stderr)
return False
print("\n📋 Starting verification...")
# Step 2: Query all tasks to analyze assignees
try:
all_tasks_response = notion.databases.query(
database_id=tasks_db_id,
page_size=100
)
if not all_tasks_response.get("results"):
print("Error: No tasks found in Tasks database.", file=sys.stderr)
return False
tasks = all_tasks_response["results"]
except Exception as e:
print(f"Error querying Tasks database: {e}", file=sys.stderr)
return False
# Step 3: Check specific tasks have swapped assignees
develop_plan_task = None
evaluate_services_task = None
for task in tasks:
task_name = task["properties"]["Name"]["title"][0]["text"]["content"]
if task_name == "Develop a plan for promotion":
develop_plan_task = task
elif task_name == "Evaluate different third-party services":
evaluate_services_task = task
if not develop_plan_task or not evaluate_services_task:
print("Error: Could not find both required tasks.", file=sys.stderr)
return False
# Get assignees for these tasks
develop_plan_assignees = develop_plan_task["properties"]["Assigned"]["people"]
evaluate_services_assignees = evaluate_services_task["properties"]["Assigned"]["people"]
if not develop_plan_assignees or not evaluate_services_assignees:
print("Error: Tasks don't have assignees.", file=sys.stderr)
return False
develop_plan_assignee_id = develop_plan_assignees[0]["id"]
evaluate_services_assignee_id = evaluate_services_assignees[0]["id"]
# These should be different (swapped)
if develop_plan_assignee_id == evaluate_services_assignee_id:
print("Error: Tasks should have different assignees after swap.", file=sys.stderr)
return False
# Step 4: Count tasks per person
task_counts = {}
unassigned_count = 0
for task in tasks:
assignees = task["properties"]["Assigned"]["people"]
if assignees:
assignee_id = assignees[0]["id"]
if assignee_id not in task_counts:
task_counts[assignee_id] = []
task_counts[assignee_id].append(task["properties"]["Name"]["title"][0]["text"]["content"])
else:
unassigned_count += 1
# Sort by task count
sorted_assignees = sorted(task_counts.items(), key=lambda x: len(x[1]))
if len(sorted_assignees) < 2:
print("Error: Need at least 2 people with tasks to verify swap.", file=sys.stderr)
return False
# Get person with least and most tasks
person_with_least = sorted_assignees[0]
person_with_most = sorted_assignees[-1]
least_id, least_tasks = person_with_least
most_id, most_tasks = person_with_most
# Step 5: Verify the swap pattern
# Original distribution (before swap):
# - 5ac96c02-49a4-4320-8de6-b663ba83126b had 3 tasks (least)
# - ac7a3bd0-c111-4464-8f45-8a857a1abc8a had 10 tasks (most)
# After complete swap, we expect:
# - 5ac96c02-49a4-4320-8de6-b663ba83126b should have 10 tasks
# - ac7a3bd0-c111-4464-8f45-8a857a1abc8a should have 3 tasks
original_least_id = "5ac96c02-49a4-4320-8de6-b663ba83126b"
original_most_id = "ac7a3bd0-c111-4464-8f45-8a857a1abc8a"
# Check if the swap has been completed
swap_completed = False
for assignee_id, assignee_tasks in task_counts.items():
if assignee_id == original_least_id and len(assignee_tasks) == 10:
# Person who had 3 now has 10
for other_id, other_tasks in task_counts.items():
if other_id == original_most_id and len(other_tasks) == 3:
# Person who had 10 now has 3
swap_completed = True
break
# Step 6: Summary
print(f"\n📊 Task Distribution:")
print(f" • Total tasks: {len(tasks)}")
print(f" • Assigned tasks: {len(tasks) - unassigned_count}")
print(f" • Unassigned tasks: {unassigned_count}")
print(f" • People with tasks: {len(task_counts)}")
print(f"\n Task counts by person:")
for assignee_id, assignee_tasks in sorted_assignees:
print(f" - {assignee_id[:8]}...: {len(assignee_tasks)} tasks")
# Step 7: Final verification
print("\n🔍 Verification Results:")
# Check that the swap has created a significant difference
if len(most_tasks) - len(least_tasks) < 5:
print(f"Warning: Difference between most and least is only {len(most_tasks) - len(least_tasks)} tasks", file=sys.stderr)
# Verify specific expected outcomes
verification_passed = True
# Check 1: Specific tasks have been swapped
specific_tasks_swapped = develop_plan_assignee_id != evaluate_services_assignee_id
if specific_tasks_swapped:
print(" ✓ Specific tasks have been swapped")
else:
print(" ✗ Specific tasks were not swapped", file=sys.stderr)
verification_passed = False
# Check 2: Task distribution shows a complete swap
if swap_completed:
print(" ✓ Complete task swap verified (3↔10 tasks)")
else:
# Show actual distribution for debugging
person1_tasks = len(task_counts.get(original_least_id, []))
person2_tasks = len(task_counts.get(original_most_id, []))
print(f" ✗ Swap incomplete! Expected 5ac96c02→10 tasks, ac7a3bd0→3 tasks", file=sys.stderr)
print(f" Actual: 5ac96c02→{person1_tasks} tasks, ac7a3bd0→{person2_tasks} tasks", file=sys.stderr)
verification_passed = False
# Check 3: Total task count is preserved
total_assigned_tasks = sum(len(tasks) for _, tasks in task_counts.items())
expected_total = len(tasks) - unassigned_count
if total_assigned_tasks == expected_total:
print(f" ✓ Total task count preserved ({total_assigned_tasks} assigned)")
else:
print(f" ✗ Task count mismatch: {total_assigned_tasks} vs {expected_total} expected", file=sys.stderr)
verification_passed = False
if verification_passed:
print("\n✅ All verification checks passed!")
return True
else:
print("\n❌ Verification failed", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/toronto_guide/change_color/description.md
================================================
Navigate to the Toronto Guide page in Notion and change all pink-colored elements (tags and callout colors) to different colors.
## Requirements
1. Find and access the Toronto Guide page in Notion
2. Identify and change all pink elements including:
- Pink tags in databases
- Pink callout backgrounds
3. Change all pink colors to any other color of your choice
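As a rough illustration (an assumption-laden sketch, not part of the requirements), recoloring a pink database tag and a pink callout with the official `notion-client` SDK could look like this; the token and IDs are placeholders:
```python
# Illustrative sketch only: recolor a pink multi_select tag and a pink callout.
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token

# 1) Database tag: read the current options, change only the pink ones' color,
#    and write the full option list back (names and IDs stay the same).
db = notion.databases.retrieve(database_id="<ACTIVITIES_DB_ID>")
options = db["properties"]["Tags"]["multi_select"]["options"]
for option in options:
    if option.get("color") == "pink":
        option["color"] = "blue"  # any non-pink color
notion.databases.update(
    database_id="<ACTIVITIES_DB_ID>",
    properties={"Tags": {"multi_select": {"options": options}}},
)

# 2) Callout block: update just its color to a non-pink background.
notion.blocks.update(block_id="<CALLOUT_BLOCK_ID>", callout={"color": "blue_background"})
```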
================================================
FILE: tasks/notion/standard/toronto_guide/change_color/meta.json
================================================
{
"task_id": "change_color",
"task_name": "Change Color",
"category_id": "toronto_guide",
"category_name": "Toronto Guide",
"description": "Navigate to the Toronto Guide page and change all pink-colored elements to different colors.",
"author": "Xiangyan Liu",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"visual formatting",
"conditional filtering"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Toronto-Guide-25281626b6d7802caa7cc394647e901c",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/conquering-toronto-a-destination-guide"
}
}
================================================
FILE: tasks/notion/standard/toronto_guide/change_color/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def get_page_title(page_result):
"""Extract title from a page result"""
properties = page_result.get('properties', {})
for prop_name in ['Name', 'Title', 'title']:
if prop_name in properties:
prop = properties[prop_name]
if prop.get('type') == 'title':
title_array = prop.get('title', [])
if title_array and len(title_array) > 0:
return title_array[0].get('plain_text', '')
return ''
def get_page_tags(page_result):
"""Extract tags from a page result"""
properties = page_result.get('properties', {})
tags_property = properties.get('Tags', {})
if tags_property.get('type') == 'multi_select':
tags = tags_property.get('multi_select', [])
return [tag.get('name') for tag in tags]
return []
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that all pink colors have been changed in the Toronto Guide page.
Expected pink elements that should be changed:
1. Callout: "Welcome to Toronto!" with red_background (originally should be pink)
2. Activities database tags:
- "Parks" tag (High Park, Evergreen Brickworks)
- "Neighbourhood" tag (Ossington Strip, Chinatown, Little Italy, Kensington Market, Queen west, The beaches)
3. Food database tags:
- "Middle Eastern" (Byblos Downtown)
- "Jamaican" (Crumbs Patties)
- "Indian" (Leela Indian Food Bar)
4. Cafes database tag:
- "Food" (Cafe Landwer)
These elements should exist with the same name/content but different colors.
Tag distributions should remain the same.
"""
# Step 1: Find the main Toronto Guide page
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Toronto Guide page not found.", file=sys.stderr)
return False
else:
# Try to find the page by searching
found_id = notion_utils.find_page(notion, "Toronto Guide")
if not found_id:
print("Error: Toronto Guide page not found.", file=sys.stderr)
return False
print(f"Found Toronto Guide page: {found_id}")
# Get all blocks from the page
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
print(f"Found {len(all_blocks)} blocks")
# Expected elements and their distributions
expected_pink_elements = {
"callout": {
"text": "Welcome to Toronto!",
"found": False,
"has_pink": False,
"exists": False
},
"activities_tags": {
"Parks": {
"found": False,
"has_pink": False,
"expected_items": ["High Park", "Evergreen Brickworks"],
"actual_items": []
},
"Neighbourhood": {
"found": False,
"has_pink": False,
"expected_items": ["Ossington Strip", "Chinatown", "Little Italy", "Kensington Market", "Queen west", "The beaches"],
"actual_items": []
}
},
"food_tags": {
"Middle Eastern": {
"found": False,
"has_pink": False,
"expected_items": ["Byblos Downtown"],
"actual_items": []
},
"Jamaican": {
"found": False,
"has_pink": False,
"expected_items": ["Crumbs Patties"],
"actual_items": []
},
"Indian": {
"found": False,
"has_pink": False,
"expected_items": ["Leela Indian Food Bar"],
"actual_items": []
}
},
"cafes_tags": {
"Food": {
"found": False,
"has_pink": False,
"expected_items": ["Cafe Landwer"],
"actual_items": []
}
}
}
# Database IDs
activities_db_id = None
food_db_id = None
cafes_db_id = None
# Step 2: Check all blocks for callouts and find databases
for block in all_blocks:
if block is None:
continue
block_type = block.get("type")
# Check for the specific callout block
if block_type == "callout":
callout_text = notion_utils.get_block_plain_text(block)
if "Welcome to Toronto!" in callout_text:
expected_pink_elements["callout"]["exists"] = True
expected_pink_elements["callout"]["found"] = True
color = block.get("callout", {}).get("color", "")
if "pink" in color.lower():
expected_pink_elements["callout"]["has_pink"] = True
print(f"✗ Callout 'Welcome to Toronto!' still has pink color: {color}")
else:
print(f"✓ Callout 'Welcome to Toronto!' has non-pink color: {color}")
# Find child databases
elif block_type == "child_database":
title = block.get("child_database", {}).get("title", "")
block_id = block.get("id")
if "Activities" in title:
activities_db_id = block_id
print(f"Found Activities database: {block_id}")
elif "Food" in title:
food_db_id = block_id
print(f"Found Food database: {block_id}")
elif "Cafes" in title or "Café" in title:
cafes_db_id = block_id
print(f"Found Cafes database: {block_id}")
# Step 3: Check Activities database for specific tags and their distributions
if activities_db_id:
try:
# Get database properties
db_info = notion.databases.retrieve(database_id=activities_db_id)
tags_property = db_info.get("properties", {}).get("Tags", {})
if tags_property.get("type") == "multi_select":
options = tags_property.get("multi_select", {}).get("options", [])
for option in options:
tag_name = option.get("name").strip()
tag_color = option.get("color")
if tag_name in expected_pink_elements["activities_tags"]:
expected_pink_elements["activities_tags"][tag_name]["found"] = True
if tag_color == "pink":
expected_pink_elements["activities_tags"][tag_name]["has_pink"] = True
print(f"✗ Activities tag '{tag_name}' still has pink color")
else:
print(f"✓ Activities tag '{tag_name}' changed to {tag_color}")
# Query database to check tag distributions
query_result = notion.databases.query(database_id=activities_db_id)
for page in query_result.get('results', []):
page_title = get_page_title(page).strip()
page_tags = get_page_tags(page)
for tag_name in expected_pink_elements["activities_tags"]:
if tag_name in page_tags:
expected_pink_elements["activities_tags"][tag_name]["actual_items"].append(page_title)
except Exception as e:
print(f"Error checking Activities database: {e}", file=sys.stderr)
return False
else:
print("Error: Activities database not found", file=sys.stderr)
return False
# Step 4: Check Food database for specific tags and their distributions
if food_db_id:
try:
# Get database properties
db_info = notion.databases.retrieve(database_id=food_db_id)
tags_property = db_info.get("properties", {}).get("Tags", {})
if tags_property.get("type") == "multi_select":
options = tags_property.get("multi_select", {}).get("options", [])
for option in options:
tag_name = option.get("name").strip()
tag_color = option.get("color")
if tag_name in expected_pink_elements["food_tags"]:
expected_pink_elements["food_tags"][tag_name]["found"] = True
if tag_color == "pink":
expected_pink_elements["food_tags"][tag_name]["has_pink"] = True
print(f"✗ Food tag '{tag_name}' still has pink color")
else:
print(f"✓ Food tag '{tag_name}' changed to {tag_color}")
# Query database to check tag distributions
query_result = notion.databases.query(database_id=food_db_id)
for page in query_result.get('results', []):
page_title = get_page_title(page).strip()
page_tags = get_page_tags(page)
for tag_name in expected_pink_elements["food_tags"]:
if tag_name in page_tags:
expected_pink_elements["food_tags"][tag_name]["actual_items"].append(page_title)
except Exception as e:
print(f"Error checking Food database: {e}", file=sys.stderr)
return False
else:
print("Error: Food database not found", file=sys.stderr)
return False
# Step 5: Check Cafes database for specific tags and their distributions
if cafes_db_id:
try:
# Get database properties
db_info = notion.databases.retrieve(database_id=cafes_db_id)
tags_property = db_info.get("properties", {}).get("Tags", {})
if tags_property.get("type") == "multi_select":
options = tags_property.get("multi_select", {}).get("options", [])
for option in options:
tag_name = option.get("name").strip()
tag_color = option.get("color")
if tag_name in expected_pink_elements["cafes_tags"]:
expected_pink_elements["cafes_tags"][tag_name]["found"] = True
if tag_color == "pink":
expected_pink_elements["cafes_tags"][tag_name]["has_pink"] = True
print(f"✗ Cafes tag '{tag_name}' still has pink color")
else:
print(f"✓ Cafes tag '{tag_name}' changed to {tag_color}")
# Query database to check tag distributions
query_result = notion.databases.query(database_id=cafes_db_id)
for page in query_result.get('results', []):
page_title = get_page_title(page).strip()
page_tags = get_page_tags(page)
for tag_name in expected_pink_elements["cafes_tags"]:
if tag_name in page_tags:
expected_pink_elements["cafes_tags"][tag_name]["actual_items"].append(page_title)
except Exception as e:
print(f"Error checking Cafes database: {e}", file=sys.stderr)
return False
else:
print("Error: Cafes database not found", file=sys.stderr)
return False
# Step 6: Verify all requirements
print(f"\nVerification Summary:")
all_passed = True
# Check callout
if not expected_pink_elements["callout"]["exists"]:
print("✗ 'Welcome to Toronto!' callout not found", file=sys.stderr)
all_passed = False
elif expected_pink_elements["callout"]["has_pink"]:
print("✗ Callout still has pink background", file=sys.stderr)
all_passed = False
else:
print("✓ Callout color changed from pink")
# Check Activities tags
print("\nActivities Database Tags:")
for tag_name, tag_info in expected_pink_elements["activities_tags"].items():
if not tag_info["found"]:
print(f"✗ Activities tag '{tag_name}' not found (may have been renamed)", file=sys.stderr)
# Don't fail if tag was renamed, as that's acceptable
elif tag_info["has_pink"]:
print(f"✗ Activities tag '{tag_name}' still has pink color", file=sys.stderr)
all_passed = False
else:
print(f"✓ Activities tag '{tag_name}' color changed from pink")
# Check distribution
expected_set = set(tag_info["expected_items"])
actual_set = set(tag_info["actual_items"])
if tag_info["found"] and expected_set != actual_set:
print(f" ✗ Tag distribution mismatch for '{tag_name}':", file=sys.stderr)
print(f" Expected: {sorted(expected_set)}", file=sys.stderr)
print(f" Actual: {sorted(actual_set)}", file=sys.stderr)
# Note: We don't fail on distribution mismatch if tag was renamed
if not (expected_set - actual_set): # If all expected items are present
print(f" (Additional items found, but all expected items are present)")
elif tag_info["found"]:
print(f" ✓ Tag distribution maintained for '{tag_name}'")
# Check Food tags
print("\nFood Database Tags:")
for tag_name, tag_info in expected_pink_elements["food_tags"].items():
if not tag_info["found"]:
print(f"✗ Food tag '{tag_name}' not found (may have been renamed)", file=sys.stderr)
# Don't fail if tag was renamed, as that's acceptable
elif tag_info["has_pink"]:
print(f"✗ Food tag '{tag_name}' still has pink color", file=sys.stderr)
all_passed = False
else:
print(f"✓ Food tag '{tag_name}' color changed from pink")
# Check distribution
expected_set = set(tag_info["expected_items"])
actual_set = set(tag_info["actual_items"])
if tag_info["found"] and expected_set != actual_set:
print(f" ✗ Tag distribution mismatch for '{tag_name}':", file=sys.stderr)
print(f" Expected: {sorted(expected_set)}", file=sys.stderr)
print(f" Actual: {sorted(actual_set)}", file=sys.stderr)
elif tag_info["found"]:
print(f" ✓ Tag distribution maintained for '{tag_name}'")
# Check Cafes tags
print("\nCafes Database Tags:")
for tag_name, tag_info in expected_pink_elements["cafes_tags"].items():
if not tag_info["found"]:
print(f"✗ Cafes tag '{tag_name}' not found (may have been renamed)", file=sys.stderr)
# Don't fail if tag was renamed, as that's acceptable
elif tag_info["has_pink"]:
print(f"✗ Cafes tag '{tag_name}' still has pink color", file=sys.stderr)
all_passed = False
else:
print(f"✓ Cafes tag '{tag_name}' color changed from pink")
# Check distribution
expected_set = set(tag_info["expected_items"])
actual_set = set(tag_info["actual_items"])
if tag_info["found"] and expected_set != actual_set:
print(f" ✗ Tag distribution mismatch for '{tag_name}':", file=sys.stderr)
print(f" Expected: {sorted(expected_set)}", file=sys.stderr)
print(f" Actual: {sorted(actual_set)}", file=sys.stderr)
elif tag_info["found"]:
print(f" ✓ Tag distribution maintained for '{tag_name}'")
# Additional check: ensure no other pink elements exist
print("\nChecking for any other pink elements...")
other_pink_found = False
# Check all callouts for pink
for block in all_blocks:
if block and block.get("type") == "callout":
color = block.get("callout", {}).get("color", "")
if "pink" in color.lower():
callout_text = notion_utils.get_block_plain_text(block)[:50]
if "Welcome to Toronto!" not in callout_text:
print(f"✗ Found unexpected pink callout: {callout_text}...", file=sys.stderr)
other_pink_found = True
if other_pink_found:
all_passed = False
else:
print("✓ No unexpected pink elements found")
return all_passed
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
print("\nVerification passed: All expected pink colors have been changed")
sys.exit(0)
else:
print("\nVerification failed: Some pink colors still exist or elements are missing")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/toronto_guide/weekend_adventure_planner/description.md
================================================
Create a comprehensive weekend adventure planner that analyzes the Toronto Guide databases and generates a structured itinerary page. I need you to create a new page called 'Perfect Weekend Adventure' as a child of the main Toronto Guide page.
**Task Requirements:**
1. Create a new page titled 'Perfect Weekend Adventure' as a child page of the main Toronto Guide page
2. Query the Activities database to identify all activities that have the "Beaches" tag
3. Query the Food database to find all restaurants with "Turkish" or "Hakka" tags
4. Query the Cafes database to retrieve all cafes entries
5. Structure the page with the following specific format:
- Add a heading_1 block with text "🎒 Perfect Weekend Adventure"
- Add a heading_2 block with text "🏖️ Beach Activities"
- Under Beach Activities, create a bulleted list with all activities that have the "Beaches" tag, showing: Name - Google Maps Link (if available)
- Add a heading_2 block with text "🍽️ Cultural Dining Experience"
- Under Cultural Dining, create a numbered list of all restaurants with "Turkish" or "Hakka" tags, formatted as: Restaurant Name (Tag: [actual tag name])
- Add a heading_2 block with text "☕ Coffee Break Spots"
- Under Coffee Break Spots, create a toggle block titled "Top Cafes to Visit" containing all cafe entries as to-do items (unchecked), each showing just the cafe name
- Add a heading_2 block with text "📊 Weekend Summary"
- Under Weekend Summary, add a paragraph with the exact text: "This weekend includes [X] beach activities, [Y] cultural dining options, and [Z] coffee spots to explore!" where [X], [Y], and [Z] are the actual counts
6. After the summary paragraph, add a divider block
7. Finally, add a callout block with the 💡 emoji containing the text: "Pro tip: Check the Seasons database for the best time to enjoy outdoor activities!"
8. Ensure all headings use the exact emoji and text format specified above
9. The lists must be in the exact format specified (bulleted for beaches, numbered for restaurants, to-do for cafes)
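For orientation only, a minimal sketch (assuming the official `notion-client` SDK; IDs, item texts, and counts are placeholders) of the child page and a few of the block shapes described above:
```python
# Illustrative sketch only -- not the reference solution for this task.
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token

def rich(text):
    """Build a rich_text array from a plain string."""
    return [{"type": "text", "text": {"content": text}}]

# Requirement 1: child page under the main Toronto Guide page.
page = notion.pages.create(
    parent={"page_id": "<TORONTO_GUIDE_PAGE_ID>"},
    properties={"title": [{"type": "text", "text": {"content": "Perfect Weekend Adventure"}}]},
)

# Requirements 5-7: a sample of the required blocks (headings, one bullet,
# the cafes toggle with an unchecked to-do, the divider, and the callout).
notion.blocks.children.append(
    block_id=page["id"],
    children=[
        {"object": "block", "type": "heading_1",
         "heading_1": {"rich_text": rich("🎒 Perfect Weekend Adventure")}},
        {"object": "block", "type": "heading_2",
         "heading_2": {"rich_text": rich("🏖️ Beach Activities")}},
        {"object": "block", "type": "bulleted_list_item",
         "bulleted_list_item": {"rich_text": rich("<Activity Name> - <Google Maps Link>")}},
        {"object": "block", "type": "heading_2",
         "heading_2": {"rich_text": rich("☕ Coffee Break Spots")}},
        {"object": "block", "type": "toggle",
         "toggle": {"rich_text": rich("Top Cafes to Visit"),
                    "children": [{"object": "block", "type": "to_do",
                                  "to_do": {"rich_text": rich("<Cafe Name>"), "checked": False}}]}},
        {"object": "block", "type": "divider", "divider": {}},
        {"object": "block", "type": "callout",
         "callout": {"rich_text": rich("Pro tip: Check the Seasons database for the best time to enjoy outdoor activities!"),
                     "icon": {"type": "emoji", "emoji": "💡"}}},
    ],
)
```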
================================================
FILE: tasks/notion/standard/toronto_guide/weekend_adventure_planner/meta.json
================================================
{
"task_id": "weekend_adventure_planner",
"task_name": "Weekend Adventure Planner",
"category_id": "toronto_guide",
"category_name": "Toronto Guide",
"description": "Create a comprehensive weekend adventure planner that analyzes Toronto Guide databases and generates a structured itinerary page.",
"author": "Xiangyan Liu",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"conditional filtering",
"data aggregation",
"report generation",
"visual formatting",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Toronto-Guide-25281626b6d7802caa7cc394647e901c",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/conquering-toronto-a-destination-guide"
}
}
================================================
FILE: tasks/notion/standard/toronto_guide/weekend_adventure_planner/verify.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Perfect Weekend Adventure page has been created correctly.
"""
# Find the main Toronto Guide page
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Toronto Guide")
if not page_id:
print("Error: Main 'Toronto Guide' page not found.", file=sys.stderr)
return False
# Find the Perfect Weekend Adventure child page
adventure_page_id = None
try:
response = notion.search(
query="Perfect Weekend Adventure",
filter={"property": "object", "value": "page"}
)
for result in response.get("results", []):
parent = result.get("parent", {})
if parent.get("type") == "page_id" and parent.get("page_id") == page_id:
adventure_page_id = result["id"]
break
if not adventure_page_id:
for result in response.get("results", []):
title_list = result.get("properties", {}).get("title", {}).get("title", [])
for title_obj in title_list:
if "Perfect Weekend Adventure" in title_obj.get("plain_text", ""):
adventure_page_id = result["id"]
break
if adventure_page_id:
break
except Exception as e:
print(f"Error searching for Perfect Weekend Adventure page: {e}", file=sys.stderr)
return False
if not adventure_page_id:
print("Error: 'Perfect Weekend Adventure' page not found as child of main page.", file=sys.stderr)
return False
# Get all blocks from the adventure page
all_blocks = notion_utils.get_all_blocks_recursively(notion, adventure_page_id)
# Get databases from the main Toronto Guide page
activities_db_id = None
food_db_id = None
cafes_db_id = None
main_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
for block in main_blocks:
if block.get("type") == "child_database":
title = block.get("child_database", {}).get("title", "")
if "Activities" in title:
activities_db_id = block.get("id")
elif "Food" in title:
food_db_id = block.get("id")
elif "Cafes" in title or "Caf�" in title:
cafes_db_id = block.get("id")
# Query databases to get expected data
beach_activities = []
cultural_restaurants = []
cafes_list = []
if activities_db_id:
try:
db_response = notion.databases.query(database_id=activities_db_id)
for page in db_response.get("results", []):
properties = page.get("properties", {})
tags_prop = properties.get("Tags", {})
if tags_prop.get("type") == "multi_select":
tags = [tag.get("name") for tag in tags_prop.get("multi_select", [])]
if "Beaches" in tags:
name_prop = properties.get("Name", {})
if name_prop.get("type") == "title" and name_prop.get("title"):
name = name_prop["title"][0]["plain_text"]
url_prop = properties.get("Google Maps Link", {})
url = url_prop.get("url", "") if url_prop.get("type") == "url" else ""
beach_activities.append({"name": name, "url": url})
except Exception as e:
print(f"Error querying Activities database: {e}", file=sys.stderr)
return False
if food_db_id:
try:
db_response = notion.databases.query(database_id=food_db_id)
for page in db_response.get("results", []):
properties = page.get("properties", {})
tags_prop = properties.get("Tags", {})
if tags_prop.get("type") == "multi_select":
tags = [tag.get("name") for tag in tags_prop.get("multi_select", [])]
for tag in tags:
if tag in ["Turkish", "Hakka"]:
name_prop = properties.get("Name", {})
if name_prop.get("type") == "title" and name_prop.get("title"):
name = name_prop["title"][0]["plain_text"]
cultural_restaurants.append({"name": name, "tag": tag})
break
except Exception as e:
print(f"Error querying Food database: {e}", file=sys.stderr)
return False
if cafes_db_id:
try:
db_response = notion.databases.query(database_id=cafes_db_id)
for page in db_response.get("results", []):
properties = page.get("properties", {})
name_prop = properties.get("Name", {})
if name_prop.get("type") == "title" and name_prop.get("title"):
name = name_prop["title"][0]["plain_text"]
cafes_list.append(name)
except Exception as e:
print(f"Error querying Cafes database: {e}", file=sys.stderr)
return False
# Required headings and their types
required_headings = [
("🎒 Perfect Weekend Adventure", "heading_1"),
("🏖️ Beach Activities", "heading_2"),
("🍽️ Cultural Dining Experience", "heading_2"),
("☕ Coffee Break Spots", "heading_2"),
("📊 Weekend Summary", "heading_2")
]
# Track verification results
found_headings = set()
found_beach_list = False
found_restaurant_list = False
found_toggle_with_cafes = False
found_summary = False
found_divider = False
found_callout = False
# Variables to track counts
beach_count = 0
restaurant_count = 0
cafe_count = 0
current_section = None
is_in_toggle = False
for block in all_blocks:
block_type = block.get("type")
block_text = notion_utils.get_block_plain_text(block)
# Check headings
for heading_text, expected_type in required_headings:
if heading_text in block_text and block_type == expected_type:
found_headings.add(heading_text)
current_section = heading_text
# Check Beach Activities section
if current_section == "🏖️ Beach Activities" and block_type == "bulleted_list_item":
found_beach_list = True
beach_count += 1
# Verify format includes name and potentially URL
for activity in beach_activities:
if activity["name"] in block_text:
if activity["url"] and activity["url"] not in block_text:
print(f"Warning: Beach activity '{activity['name']}' missing URL", file=sys.stderr)
# Check Cultural Dining section
elif current_section == "🍽️ Cultural Dining Experience" and block_type == "numbered_list_item":
found_restaurant_list = True
restaurant_count += 1
# Check format: Restaurant Name (Tag: [tag])
for restaurant in cultural_restaurants:
if restaurant["name"] in block_text and f"Tag: {restaurant['tag']}" in block_text:
pass # Format is correct
# Check Coffee Break Spots section
elif current_section == "☕ Coffee Break Spots":
if block_type == "toggle" and "Top Cafes to Visit" in block_text:
is_in_toggle = True
found_toggle_with_cafes = True
elif is_in_toggle and block_type == "to_do":
cafe_count += 1
# Verify unchecked status
to_do_data = block.get("to_do", {})
if to_do_data.get("checked", False):
print(f"Error: Cafe to-do item should be unchecked: {block_text}", file=sys.stderr)
return False
elif block_type in ["heading_1", "heading_2", "heading_3"]:
is_in_toggle = False
# Check Weekend Summary section
elif current_section == "📊 Weekend Summary" and block_type == "paragraph":
expected_text = f"This weekend includes {len(beach_activities)} beach activities, {len(cultural_restaurants)} cultural dining options, and {len(cafes_list)} coffee spots to explore!"
if expected_text in block_text:
found_summary = True
# Check for divider after summary
if block_type == "divider":
found_divider = True
# Check for callout with pro tip
if block_type == "callout":
callout_data = block.get("callout", {})
icon = callout_data.get("icon", {})
if icon.get("type") == "emoji" and icon.get("emoji") == "💡":
if "Pro tip: Check the Seasons database for the best time to enjoy outdoor activities!" in block_text:
found_callout = True
# Verify all required elements
all_passed = True
# Check all headings are present
for heading_text, _ in required_headings:
if heading_text not in found_headings:
print(f"Error: Missing required heading: {heading_text}", file=sys.stderr)
all_passed = False
# Check beach activities list
if not found_beach_list:
print("Error: Beach activities bulleted list not found", file=sys.stderr)
all_passed = False
elif beach_count != len(beach_activities):
print(f"Error: Expected {len(beach_activities)} beach activities, found {beach_count}", file=sys.stderr)
all_passed = False
# Check restaurant list
if not found_restaurant_list:
print("Error: Cultural dining numbered list not found", file=sys.stderr)
all_passed = False
elif restaurant_count != len(cultural_restaurants):
print(f"Error: Expected {len(cultural_restaurants)} cultural restaurants, found {restaurant_count}", file=sys.stderr)
all_passed = False
# Check cafes toggle
if not found_toggle_with_cafes:
print("Error: Toggle block 'Top Cafes to Visit' not found", file=sys.stderr)
all_passed = False
elif cafe_count != len(cafes_list):
print(f"Error: Expected {len(cafes_list)} cafes, found {cafe_count}", file=sys.stderr)
all_passed = False
# Check summary
if not found_summary:
print("Error: Weekend summary with correct counts not found", file=sys.stderr)
all_passed = False
# Check divider
if not found_divider:
print("Error: Divider block not found after summary", file=sys.stderr)
all_passed = False
# Check callout
if not found_callout:
print("Error: Callout with pro tip not found", file=sys.stderr)
all_passed = False
if all_passed:
print(f"Success: Perfect Weekend Adventure page created with all required elements.")
print(f"- {len(beach_activities)} beach activities")
print(f"- {len(cultural_restaurants)} cultural dining options")
print(f"- {len(cafes_list)} coffee spots")
return True
else:
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright/easy/.gitkeep
================================================
================================================
FILE: tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/description.md
================================================
# Cloudflare Turnstile Authentication Challenge Task
Use Playwright MCP tools to complete the Cloudflare Turnstile authentication challenge.
## Requirements:
1. Navigate to https://eval-web.mcpmark.ai/auth/turnstile
2. Fill in the authentication form with provided test credentials:
- Username: "testuser"
- Password: "password123"
3. Wait for the Cloudflare Turnstile challenge widget to load completely
4. Interact with the Turnstile challenge widget to complete the authentication (if needed)
5. Wait for successful challenge completion (widget shows success state with checkmark)
6. Submit the form by clicking the "Sign In" button
7. Wait for and capture any success message or confirmation that appears
8. Output the success message captured in step 7
## Notes:
- Use the provided test credentials: testuser / password123
- The page shows the success message inline; it does not redirect to a separate success page
- Wait for all UI state changes before proceeding to the next step
- Verify both Turnstile completion and form submission success
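For reference only, the same flow sketched with the plain Playwright Python API rather than MCP tools (selectors and waits are assumptions about the page, not verified against it):
```python
# Illustrative sketch only: the Turnstile flow in plain Playwright (not MCP).
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://eval-web.mcpmark.ai/auth/turnstile")

    page.get_by_label("Username").fill("testuser")     # assumed field label
    page.get_by_label("Password").fill("password123")  # assumed field label

    # Wait for the Turnstile widget iframe to load before submitting.
    page.wait_for_selector("iframe[src*='challenges.cloudflare.com']", timeout=30_000)

    page.get_by_role("button", name="Sign In").click()

    # The success message is rendered inline on the same page.
    print(page.get_by_text("Authentication successful").text_content())
    browser.close()
```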
================================================
FILE: tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/meta.json
================================================
{
"task_id": "cloudflare_turnstile_challenge",
"task_name": "Cloudflare Turnstile Challenge",
"category_id": "eval_web",
"category_name": "Eval Web",
"description": "Navigate websites with Cloudflare Turnstile protection, handle security challenges, bypass bot detection mechanisms, and successfully access protected content using automated browser interactions.",
"author": "Allison Zhan",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"user interaction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/mcpmark-eval-website.mp4",
"stateOriginalUrl": "https://mcp-eval-website.vercel.app/auth/turnstile"
}
}
================================================
FILE: tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/verify.py
================================================
#!/usr/bin/env python3
"""
Simplified verification script for Playwright Cloudflare Turnstile authentication task.
This script only verifies that the model successfully reported capturing the expected
success message by checking the last assistant message in messages.json.
"""
import sys
import json
import os
# Expected success message that agent should capture
EXPECTED_SUCCESS_MESSAGE = "Authentication successful! Security challenge verified."
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, 'r') as f:
messages = json.load(f)
# Find the last assistant message with status completed
for message in reversed(messages):
if (message.get('role') == 'assistant' and
message.get('status') == 'completed' and
message.get('type') == 'message'):
content = message.get('content', [])
# Extract text from content
if isinstance(content, list):
for item in content:
if isinstance(item, dict) and item.get('type') in ['text', 'output_text']:
return item.get('text', '')
elif isinstance(content, str):
return content
print("Warning: No completed assistant message found", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def verify():
"""
Verifies that the model's last response contains the expected success message.
"""
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if not model_response:
print("No model response found", file=sys.stderr)
return False
print(f"\nModel response (first 500 chars): {model_response[:500]}...", file=sys.stderr)
# Check if the expected success message is in the model's response
if EXPECTED_SUCCESS_MESSAGE in model_response:
print(f"\n✓ Success message found: '{EXPECTED_SUCCESS_MESSAGE}'", file=sys.stderr)
return True
else:
print(f"\n✗ Success message NOT found: '{EXPECTED_SUCCESS_MESSAGE}'", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = verify()
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright/standard/eval_web/extraction_table/data.csv
================================================
Title, Rating, Likes, Views, Replies
React 18 New Features Deep Dive, "4.8", 856, 12543, 89
Vue 3 Composition API in Practice, "4.5", 743, 9876, 67
Advanced TypeScript Types Guide, "4.9", 924, 15432, 102
Node.js Performance Optimization, "4.2", 567, 8765, 45
Frontend Engineering Best Practices, "4.7", 812, 11234, 78
Microservices Architecture Patterns, "4.3", 634, 9543, 56
Docker Containerization Deployment, "4.6", 789, 10876, 71
Kubernetes Cluster Management, "4.4", 698, 9234, 63
GraphQL API Design Principles, "4.8", 876, 13456, 94
Webpack 5 Configuration Guide, "4.1", 523, 7654, 38
Vite Build Tool Usage, "4.5", 745, 10123, 69
ESLint Code Standards, "4.7", 823, 11567, 82
Unit Testing Best Practices, "4.3", 612, 8934, 51
Performance Monitoring & Optimization, "4.9", 945, 16234, 108
Security Protection Strategies, "4.2", 578, 8456, 47
Database Design Principles, "4.6", 767, 10567, 73
Caching Strategies Implementation, "4.4", 689, 9123, 61
Message Queue Applications, "4.8", 834, 12876, 87
Distributed Systems Design, "4.0", 456, 6789, 34
Cloud Native Development, "4.5", 723, 9876, 65
DevOps Process Optimization, "4.7", 801, 11234, 79
Machine Learning Introduction, "4.1", 534, 7543, 41
Artificial Intelligence Applications, "4.6", 778, 10456, 74
Blockchain Technology Fundamentals, "4.3", 645, 8765, 53
Mobile Development Techniques, "4.9", 912, 14567, 97
Cross-Platform Solutions, "4.2", 589, 8234, 48
Progressive Web App Development, "4.8", 867, 12345, 91
Web3 Development Guide, "4.4", 712, 9567, 64
NFT Smart Contracts, "4.5", 756, 10234, 70
DeFi Protocol Design, "4.7", 834, 11876, 83
Game Engine Development, "4.3", 623, 8567, 52
3D Graphics Rendering, "4.6", 789, 10678, 75
Audio Video Processing, "4.1", 545, 7234, 42
IoT Applications, "4.8", 856, 12567, 88
Edge Computing Practices, "4.2", 567, 8345, 46
5G Network Technology, "4.9", 923, 15123, 103
Quantum Computing Principles, "4.4", 678, 9345, 62
Bioinformatics Analysis, "4.5", 734, 9876, 68
Data Science Methods, "4.7", 812, 11456, 80
Algorithms and Data Structures, "4.3", 634, 8678, 54
System Design Interview, "4.6", 778, 10345, 76
Code Refactoring Techniques, "4.8", 845, 12234, 89
Open Source Contributions, "4.2", 556, 7890, 43
Technical Team Management, "4.5", 723, 9567, 66
Product Thinking Development, "4.9", 901, 14234, 95
User Experience Design, "4.1", 512, 7123, 39
Interface Interaction Optimization, "4.7", 789, 10890, 77
Accessibility Design, "4.4", 667, 8901, 58
SEO Optimization Strategies, "4.6", 756, 10123, 72
Social Media Operations, "4.3", 623, 8456, 55
Serverless Architecture, "4.7", 834, 11234, 81
API Gateway Design, "4.2", 567, 8765, 49
Microservice Communication, "4.8", 892, 13567, 95
Event-Driven Architecture, "4.5", 723, 9876, 67
CQRS Pattern Implementation, "4.3", 645, 8234, 54
Domain-Driven Design, "4.6", 778, 10456, 73
Clean Architecture Principles, "4.4", 689, 9123, 62
Hexagonal Architecture, "4.1", 534, 7543, 42
Onion Architecture, "4.5", 712, 9567, 65
Event Sourcing Patterns, "4.7", 823, 11876, 79
Saga Pattern for Distributed Systems, "4.3", 612, 8934, 53
Circuit Breaker Pattern, "4.8", 856, 12543, 87
Bulkhead Pattern, "4.2", 578, 8456, 47
Retry Pattern Implementation, "4.6", 767, 10567, 74
Timeout Pattern, "4.4", 698, 9234, 63
Rate Limiting Strategies, "4.9", 934, 15432, 103
Load Balancing Techniques, "4.1", 523, 7654, 39
Service Mesh Architecture, "4.5", 745, 10123, 69
Istio Service Mesh, "4.7", 812, 11567, 82
Envoy Proxy Configuration, "4.3", 634, 9543, 56
Consul Service Discovery, "4.6", 789, 10876, 71
Kubernetes Ingress, "4.4", 676, 9345, 58
Helm Chart Development, "4.8", 845, 12234, 89
Terraform Infrastructure, "4.2", 556, 7890, 44
Ansible Automation, "4.5", 723, 9567, 66
Jenkins Pipeline, "4.7", 801, 11234, 78
GitLab CI/CD, "4.3", 623, 8567, 52
GitHub Actions, "4.6", 789, 10678, 75
Azure DevOps, "4.1", 512, 7123, 41
AWS CodePipeline, "4.8", 867, 12345, 91
Docker Compose, "4.4", 712, 9567, 64
Kubernetes Operators, "4.5", 756, 10234, 70
Custom Resource Definitions, "4.7", 834, 11876, 83
Pod Security Policies, "4.3", 623, 8567, 52
Network Policies, "4.6", 789, 10678, 75
RBAC Configuration, "4.1", 545, 7234, 42
Secret Management, "4.8", 856, 12567, 88
ConfigMap Usage, "4.2", 567, 8345, 46
Persistent Volumes, "4.9", 923, 15123, 103
StatefulSets, "4.4", 678, 9345, 62
DaemonSets, "4.5", 734, 9876, 68
Jobs and CronJobs, "4.7", 812, 11456, 80
Horizontal Pod Autoscaler, "4.3", 634, 8678, 54
Vertical Pod Autoscaler, "4.6", 778, 10345, 76
Cluster Autoscaler, "4.8", 845, 12234, 89
Resource Quotas, "4.2", 556, 7890, 43
Limit Ranges, "4.5", 723, 9567, 66
================================================
FILE: tasks/playwright/standard/eval_web/extraction_table/description.md
================================================
# Web Data Extraction Task
Use Playwright MCP tools to extract all data from the specified website and present it in CSV format.
## Requirements:
1. Navigate to https://eval-web.mcpmark.ai/extraction
2. Wait for the page to fully load
3. Extract all data content from the page, including:
- Title
- Rating
- Likes
- Views
- Replies
4. Organize the extracted data into CSV format
5. Ensure data completeness and accuracy
6. Output ONLY the complete CSV formatted data (no additional text or explanations)
## CSV Data Example:
```csv
Title, Rating, Likes, Views, Replies
SEO Optimization, "4.6", 756, 10123, 72
Vue 3 Composition API, "4.5", 743, 9876, 67
Advanced TypeScript Types Guide, "4.9", 924, 15432, 102
Node.js Performance Optimization, "4.2", 567, 8765, 45
Frontend Engineering Best Practices, "4.7", 812, 11234, 78
```
## Notes:
- Ensure extraction of all visible data rows
- Maintain data format consistency
- All numeric data (Rating, Likes, Views, Replies) should NOT have quotes, only text data containing commas should be wrapped in quotes
- Wait for the page to fully load before starting data extraction
- Verify the quantity and format of extracted data are correct
- **IMPORTANT: Final output must contain ONLY CSV data - no explanatory text, descriptions, or other content**
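A tiny sketch of the quoting rule from the notes (illustrative only; the row values are taken from the example above): numeric columns stay bare, and a title is quoted only if it itself contains a comma.
```python
# Illustrative sketch of the quoting rule described in the notes above.
def format_row(title, rating, likes, views, replies):
    # Quote the title only when it contains a comma; numeric columns stay bare.
    title_out = f'"{title}"' if "," in title else title
    return f"{title_out}, {rating}, {likes}, {views}, {replies}"

print("Title, Rating, Likes, Views, Replies")
print(format_row("Advanced TypeScript Types Guide", 4.9, 924, 15432, 102))
```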
================================================
FILE: tasks/playwright/standard/eval_web/extraction_table/meta.json
================================================
{
"task_id": "extraction_table",
"task_name": "Extraction Table",
"category_id": "eval_web",
"category_name": "Eval Web",
"description": "Extract structured data from complex web tables, parse multi-level headers, handle dynamic content loading, transform data formats, and export comprehensive datasets.",
"author": "Arvin Xu",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"data extraction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/mcpmark-eval-website.mp4",
"stateOriginalUrl": "https://eval-web.mcpmark.ai/extraction"
}
}
================================================
FILE: tasks/playwright/standard/eval_web/extraction_table/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for checking Playwright web data extraction tasks.
This script verifies whether the model successfully extracted CSV format data from web pages
by checking the last assistant message in messages.json.
"""
import sys
import json
import os
import re
import csv
from io import StringIO
# Expected CSV header (must match exactly, including spaces)
EXPECTED_HEADER_LINE = "Title, Rating, Likes, Views, Replies"
EXPECTED_HEADERS = ["Title", "Rating", "Likes", "Views", "Replies"]
# Exact number of data rows (must match data.csv exactly)
EXPECTED_DATA_ROWS = 97
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"| MCP_MESSAGES: {messages_path}")
if not messages_path:
print("| Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, 'r') as f:
messages = json.load(f)
# Find the last assistant message with status completed
for message in reversed(messages):
if (message.get('role') == 'assistant' and
message.get('status') == 'completed' and
message.get('type') == 'message'):
content = message.get('content', [])
# Extract text from content
if isinstance(content, list):
for item in content:
if isinstance(item, dict) and item.get('type') in ['text', 'output_text']:
return item.get('text', '')
elif isinstance(content, str):
return content
print("| Warning: No completed assistant message found", file=sys.stderr)
return None
except Exception as e:
print(f"| Error reading messages file: {str(e)}", file=sys.stderr)
return None
def extract_csv_from_response(response):
"""
Extract CSV data from model response.
"""
# Look for CSV code blocks
csv_pattern = r'```(?:csv)?\s*\n(.*?)\n```'
matches = re.findall(csv_pattern, response, re.DOTALL | re.IGNORECASE)
if matches:
return matches[-1].strip() # Return the last CSV block
# If no code block found, try to find CSV data starting with header
lines = response.split('\n')
csv_start = -1
# Stricter header matching: look for a line containing "Title", "Rating", and "Likes"
for i, line in enumerate(lines):
if "Title" in line and "Rating" in line and "Likes" in line:
csv_start = i
break
if csv_start >= 0:
# Extract from header until empty line or non-CSV format line
csv_lines = []
for line in lines[csv_start:]:
line = line.strip()
if not line or not (',' in line):
if csv_lines: # If we already have data, stop at empty line
break
continue
csv_lines.append(line)
if len(csv_lines) > 100: # Prevent extracting too many rows
break
return '\n'.join(csv_lines)
return None
def validate_csv_data(csv_text):
"""
Validate CSV data format and content, must match data.csv exactly.
"""
if not csv_text:
return False, "CSV data not found"
try:
lines = csv_text.strip().split('\n')
# Check total number of rows (1 header row + data rows)
expected_total_rows = EXPECTED_DATA_ROWS + 1
if len(lines) != expected_total_rows:
return False, f"| CSV total row count mismatch, expected: {expected_total_rows} rows, actual: {len(lines)} rows"
# Check header row format (must match exactly)
header_line = lines[0].strip()
if header_line != EXPECTED_HEADER_LINE:
return False, f"| Header format mismatch, expected: '{EXPECTED_HEADER_LINE}', actual: '{header_line}'"
# Parse CSV to validate structure
csv_reader = csv.reader(StringIO(csv_text))
rows = list(csv_reader)
# Check column count for each row
expected_columns = len(EXPECTED_HEADERS)
for i, row in enumerate(rows):
if len(row) != expected_columns:
return False, f"| Row {i+1} column count incorrect, expected: {expected_columns} columns, actual: {len(row)} columns"
# Validate data row format
valid_rows = 0
for i, row in enumerate(rows[1:], 2): # Skip header, start from row 2
# Check if each column has data
if not all(cell.strip() for cell in row):
return False, f"| Row {i} contains empty data"
# Check numeric column format (Rating, Likes, Views, Replies should not have quotes)
for col_idx, col_name in [(1, "Rating"), (2, "Likes"), (3, "Views"), (4, "Replies")]:
value = row[col_idx].strip()
# Check for quotes (should not have any)
if value.startswith('"') and value.endswith('"'):
return False, f"| Row {i} {col_name} should not have quotes, actual: {value}"
# Check numeric format
if col_name == "Rating":
try:
float(value)
except ValueError:
return False, f"| Row {i} {col_name} should be a number, actual: {value}"
else:
if not value.isdigit():
return False, f"| Row {i} {col_name} should be pure digits, actual: {value}"
valid_rows += 1
# Validate number of data rows
if valid_rows != EXPECTED_DATA_ROWS:
return False, f"| Valid data row count mismatch, expected: {EXPECTED_DATA_ROWS} rows, actual: {valid_rows} rows"
return True, f"| CSV validation successful: format matches data.csv exactly, {valid_rows} valid data rows"
except Exception as e:
return False, f"| CSV format parsing error: {str(e)}"
def verify():
"""
Verify if the model's response contains correct CSV data extraction results.
"""
# Get model response
model_response = get_model_response()
if not model_response:
print("| Model response not found", file=sys.stderr)
return False
print(f"|\n| Model response (first 500 characters): {model_response[:500]}...", file=sys.stderr)
# Extract CSV data from response
csv_data = extract_csv_from_response(model_response)
if not csv_data:
print("|\n| ✗ CSV data not found in response", file=sys.stderr)
return False
print(f"|\n| Found CSV data (first 300 characters):\n| {csv_data[:300]}...", file=sys.stderr)
# Validate CSV data
is_valid, message = validate_csv_data(csv_data)
if is_valid:
print(f"|\n| ✓ {message}", file=sys.stderr)
return True
else:
print(f"|\n| ✗ CSV validation failed: {message}", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = verify()
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright/standard/web_search/birth_of_arvinxu/description.md
================================================
# Web Search Task
Use Playwright MCP tools to search for information about the X profile https://x.com/arvin17x and find out when this person was born.
## Requirements:
Extract the answer in a specific format:
- just the year, like 1990 or 2001
================================================
FILE: tasks/playwright/standard/web_search/birth_of_arvinxu/meta.json
================================================
{
"task_id": "birth_of_arvinxu",
"task_name": "Birth Of Arvinxu",
"category_id": "web_search",
"category_name": "Web Search",
"description": "Search for biographical information about X profile arvin17x across multiple web sources, extract birth year data, verify information accuracy, and compile findings.",
"author": "Arvin Xu",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"search aggregation",
"data extraction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": null,
"stateContent": null,
"stateUrl": null,
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/playwright/standard/web_search/birth_of_arvinxu/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Playwright web search task.
Simple verification that checks if the AI agent found the correct answer.
The expected ground truth answer is configured at the top of the file.
"""
import sys
import json
import os
from pathlib import Path
from typing import Dict, Any
# =============================================================================
# CONFIGURATION
# =============================================================================
# Expected ground truth answer (exact match)
EXPECTED_GROUND_TRUTH = "1995"
# =============================================================================
# MCP RESULT PARSING
# =============================================================================
def get_working_directory() -> Path:
"""Get the working directory where messages.json should be."""
# Priority 1: Use MCP_MESSAGES path if available (most reliable)
messages_path = os.getenv("MCP_MESSAGES")
if messages_path and Path(messages_path).exists():
return Path(messages_path).parent.resolve()
# Priority 2: Use PLAYWRIGHT_WORK_DIR environment variable
work_dir = os.getenv("PLAYWRIGHT_WORK_DIR")
if work_dir:
work_path = Path(work_dir).resolve()
if (work_path / "messages.json").exists():
return work_path
# Priority 3: Check current directory (fallback)
current_dir = Path.cwd()
if (current_dir / "messages.json").exists():
return current_dir
# Priority 4: Default fallback
return Path(".").resolve()
def parse_ai_results(work_dir: Path) -> Dict[str, Any]:
"""Parse the AI agent's results from messages.json"""
messages_file = work_dir / "messages.json"
if not messages_file.exists():
return {"success": False, "error": "No messages.json found"}
try:
with open(messages_file, "r", encoding="utf-8") as f:
messages = json.load(f)
except (json.JSONDecodeError, IOError) as e:
return {"success": False, "error": f"Failed to read messages.json: {e}"}
# Look for expected answer in the AI's responses
found_answer = False
ai_responses = []
for message in messages:
if message.get("role") == "assistant":
content = str(message.get("content", ""))
# Handle both string and list content formats
if isinstance(message.get("content"), list):
content = " ".join(
item.get("text", "") if isinstance(item, dict) else str(item)
for item in message.get("content", [])
)
ai_responses.append(content)
# Exact match (character-for-character, case-sensitive, no trimming)
if content == EXPECTED_GROUND_TRUTH:
found_answer = True
return {
"success": True,
"found_answer": found_answer,
"ai_responses": ai_responses,
"total_responses": len(ai_responses),
}
# =============================================================================
# MAIN VERIFICATION
# =============================================================================
def verify_task() -> bool:
"""Verify the AI agent found the correct answer"""
# Parse AI agent results
work_dir = get_working_directory()
print(f"| Working directory: {work_dir}")
ai_results = parse_ai_results(work_dir)
if not ai_results["success"]:
print(f"| ❌ Could not parse AI results: {ai_results.get('error')}")
return False
if ai_results["found_answer"]:
print(f"| AI agent correctly identified: {EXPECTED_GROUND_TRUTH}")
return True
else:
print(f"| AI agent did not find the correct answer: {EXPECTED_GROUND_TRUTH}")
return False
def main():
"""Main verification function."""
try:
success = verify_task()
sys.exit(0 if success else 1)
except Exception as e:
print(f"\n💥 Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright/standard/web_search/r1_arxiv/content.txt
================================================
In this work, we share our journey in enhancing model reasoning abilities through reinforcement learning. DeepSeek-R1-Zero represents a pure RL approach without relying on cold-start data, achieving strong performance across various tasks. DeepSeek-R1 is more powerful, leveraging cold-start data alongside iterative RL fine-tuning. Ultimately, DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on a range of tasks.
We further explore distillation the reasoning capability to small dense models. We use DeepSeek-R1 as the teacher model to generate 800K training samples, and fine-tune several small dense models. The results are promising: DeepSeek-R1-Distill-Qwen-1.5B outperforms GPT-4o and Claude-3.5-Sonnet on math benchmarks with 28.9% on AIME and 83.9% on MATH. Other dense models also achieve impressive results, significantly outperforming other instruction-tuned models based on the same underlying checkpoints.
In the future, we plan to invest in research across the following directions for DeepSeek-R1.
- **General Capability**: Currently, the capabilities of DeepSeek-R1 fall short of DeepSeek-V3 in tasks such as function calling, multi-turn, complex role-playing, and JSON output. Moving forward, we plan to explore how long CoT can be leveraged to enhance tasks in these fields.
- **Language Mixing**: DeepSeek-R1 is currently optimized for Chinese and English, which may result in language mixing issues when handling queries in other languages. For instance, DeepSeek-R1 might use English for reasoning and responses, even if the query is in a language other than English or Chinese. We aim to address this limitation in future updates.
- **Prompting Engineering**: When evaluating DeepSeek-R1, we observe that it is sensitive to prompts. Few-shot prompting consistently degrades its performance. Therefore, we recommend users directly describe the problem and specify the output format using a zero-shot setting for optimal results.
- **Software Engineering Tasks**: Due to the long evaluation times, which impact the efficiency of the RL process, large-scale RL has not been applied extensively in software engineering tasks. As a result, DeepSeek-R1 has not demonstrated a huge improvement over DeepSeek-V3 on software engineering benchmarks. Future versions will address this by implementing rejection sampling on software engineering data or incorporating asynchronous evaluations during the RL process to improve efficiency.
================================================
FILE: tasks/playwright/standard/web_search/r1_arxiv/description.md
================================================
# Web Search Task
Use Playwright MCP tools to search for the DeepSeek R1 research paper and extract all the paragraphs of the Conclusion section.
## Requirements:
1. Search for the DeepSeek R1 research paper
2. Navigate to the paper and find the Conclusion section
3. Extract **ALL the paragraphs** of the Conclusion section
4. **Provide the content in Markdown format - no explanations, no additional text**
## Important Notes:
- **Output ALL the paragraphs of text**
- **Do NOT include any explanations, summaries, or additional content**
- **The response should contain ONLY the Conclusion section content formatted in Markdown**
## Expected Output:
All the paragraphs of the Conclusion section from the DeepSeek R1 paper, formatted in Markdown with proper paragraph structure and formatting.
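For reference, the bundled `verify.py` compares the agent's final response against `content.txt` using a whitespace-normalized exact match, so line breaks and indentation are ignored but the wording must match exactly. A minimal sketch of that comparison (the function and sample strings here are illustrative):

```
def whitespace_normalized_match(extracted: str, expected: str) -> bool:
    # Collapse every run of whitespace (spaces, tabs, newlines) to a single
    # space before comparing, mirroring compare_content in verify.py.
    return " ".join(extracted.split()) == " ".join(expected.split())

# Line-wrapping differences are tolerated; any change in wording is not.
assert whitespace_normalized_match("In this work,\n  we share", "In this work, we share")
assert not whitespace_normalized_match("In this work we share", "In this work, we share")
```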
================================================
FILE: tasks/playwright/standard/web_search/r1_arxiv/meta.json
================================================
{
"task_id": "r1_arxiv",
"task_name": "R1 Arxiv",
"category_id": "web_search",
"category_name": "Web Search",
"description": "Search arXiv for R1 model research papers, extract technical specifications, analyze methodology sections, compile research findings, and generate comprehensive literature review.",
"author": "Arvin Xu",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"search aggregation",
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": null,
"stateContent": null,
"stateUrl": null,
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/playwright/standard/web_search/r1_arxiv/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Playwright web search task.
Simple verification that checks if the AI agent found the correct Conclusion content.
The expected ground-truth content is loaded from content.txt; its filename is configured at the top of the file.
"""
import sys
import json
import os
from pathlib import Path
from typing import Dict, Any
# =============================================================================
# CONFIGURATION
# =============================================================================
# Expected ground truth content from content.txt
EXPECTED_CONTENT_FILE = "content.txt"
# =============================================================================
# MCP RESULT PARSING
# =============================================================================
def get_working_directory() -> Path:
"""Get the working directory where messages.json should be."""
# Priority 1: Use MCP_MESSAGES path if available (most reliable)
messages_path = os.getenv("MCP_MESSAGES")
if messages_path and Path(messages_path).exists():
return Path(messages_path).parent.resolve()
# Priority 2: Use PLAYWRIGHT_WORK_DIR environment variable
work_dir = os.getenv("PLAYWRIGHT_WORK_DIR")
if work_dir:
work_path = Path(work_dir).resolve()
if (work_path / "messages.json").exists():
return work_path
# Priority 3: Check current directory (fallback)
current_dir = Path.cwd()
if (current_dir / "messages.json").exists():
return current_dir
# Priority 4: Default fallback
return Path(".").resolve()
def load_expected_content() -> str:
"""Load the expected content from content.txt"""
# content.txt is in the same directory as verify.py
current_file = Path(__file__).resolve()
content_file = current_file.parent / EXPECTED_CONTENT_FILE
if not content_file.exists():
print(f"| {EXPECTED_CONTENT_FILE} not found at: {content_file}")
return ""
print(f"| Found {EXPECTED_CONTENT_FILE} at: {content_file}")
try:
with open(content_file, "r", encoding="utf-8") as f:
return f.read().strip()
except (IOError, UnicodeDecodeError) as e:
print(f"| Warning: Could not read {content_file}: {e}")
return ""
def parse_ai_results(work_dir: Path) -> Dict[str, Any]:
"""Parse the AI agent's results from messages.json"""
messages_file = work_dir / "messages.json"
if not messages_file.exists():
return {"success": False, "error": "No messages.json found"}
try:
with open(messages_file, "r", encoding="utf-8") as f:
messages = json.load(f)
except (json.JSONDecodeError, IOError) as e:
return {"success": False, "error": f"Failed to read messages.json: {e}"}
# Look for extracted content in the AI's responses
found_content = False
ai_responses = []
extracted_content = ""
for message in messages:
if message.get("role") == "assistant":
content = str(message.get("content", ""))
# Handle both string and list content formats
if isinstance(message.get("content"), list):
content = " ".join(
item.get("text", "") if isinstance(item, dict) else str(item)
for item in message.get("content", [])
)
ai_responses.append(content)
# Store the last response as extracted content
extracted_content = content
return {
"success": True,
"found_content": True, # Assuming content was found if we have responses
"ai_responses": ai_responses,
"extracted_content": extracted_content,
"total_responses": len(ai_responses),
}
def compare_content(extracted: str, expected: str) -> Dict[str, Any]:
"""Compare extracted content with expected content"""
if not expected:
return {"success": False, "error": "No expected content to compare against"}
if not extracted:
return {"success": False, "error": "No extracted content found"}
# Normalize content for comparison (remove extra whitespace, normalize line breaks)
extracted_normalized = " ".join(extracted.split())
expected_normalized = " ".join(expected.split())
# Direct text comparison - content must be exactly the same
is_exact_match = extracted_normalized == expected_normalized
return {
"success": True,
"is_exact_match": is_exact_match,
"extracted_length": len(extracted_normalized),
"expected_length": len(expected_normalized),
"extracted_preview": extracted_normalized[:100] + "..." if len(extracted_normalized) > 100 else extracted_normalized,
"expected_preview": expected_normalized[:100] + "..." if len(expected_normalized) > 100 else expected_normalized
}
# =============================================================================
# MAIN VERIFICATION
# =============================================================================
def verify_task(work_dir: Path) -> bool:
"""Verify the AI agent found the correct Introduction content"""
print("| Verifying Playwright Web Search Task - DeepSeek R1 Introduction")
print("| " + "=" * 70)
# Load expected content
print("| Loading expected content...")
expected_content = load_expected_content()
if not expected_content:
print("| Error: Could not load expected content")
return False
print(f"| Expected content loaded ({len(expected_content)} characters)")
# Parse MCP messages
messages = parse_ai_results(work_dir)
if not messages["success"]:
print(f"| Error: Could not parse AI results: {messages.get('error')}")
return False
# Extract AI agent response
extracted_content = messages.get("extracted_content", "")
if not extracted_content:
print("| Error: No AI agent response found")
return False
print(f"| Extracted content: {len(extracted_content)} characters")
# Compare content
print("| Comparing extracted content with expected content...")
comparison = compare_content(extracted_content, expected_content)
if not comparison["success"]:
print(f"| Comparison failed: {comparison.get('error')}")
return False
print(f"| Content comparison results:")
print(f"| - Extracted length: {comparison['extracted_length']} characters")
print(f"| - Expected length: {comparison['expected_length']} characters")
print(f"| - Extracted preview: {comparison['extracted_preview']}")
print(f"| - Expected preview: {comparison['expected_preview']}")
if comparison['is_exact_match']:
print("| Task completed successfully! Content matches exactly.")
return True
else:
print("| Task verification failed. Content does not match exactly.")
return False
def main():
"""Main verification function"""
print("| Starting verification...")
# Get working directory
work_dir = get_working_directory()
print(f"| Working directory: {work_dir}")
# Run verification
success = verify_task(work_dir)
if success:
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/.gitkeep
================================================
================================================
FILE: tasks/playwright_webarena/easy/reddit/ai_data_analyst/description.md
================================================
Deliver a bite-sized AI community snapshot by capturing two essential metrics and posting them back to the forum.
**Task Requirements:**
1. Register and log in with username `AIDataAnalyst2025` and password `SecurePass123!` so every action is tied to that account.
2. Visit the `deeplearning` forum (page 1 only) and count how many posts are visible; note the exact total.
3. Go to the `MachineLearning` forum, find the post titled `[P] I made a command-line tool that explains your errors using ChatGPT (link in comments)`, and record the vote count shown in the listing.
4. Still in `MachineLearning`, create a post titled `MachineLearning_Extraction` whose body is exactly the two-line Markdown list below (keep the keys and pipe separators untouched, replacing each `FILL_VALUE` with your numbers):
```
- Deeplearning_Post_Count|FILL_VALUE
- ChatGPT_Tool_Vote_Count|FILL_VALUE
```
No searches or multi-page navigation are required—just capture these two data points and report them accurately.
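For reference, the verifier reads the post body back from the forum and parses each `key|value` line with a helper like `parse_key_value_format` in the accompanying `verify.py`. A minimal sketch of that parsing (the numbers below are placeholders, not the expected answers):

```
import re

def parse_key_value_format(text: str) -> dict:
    data = {}
    for line in text.splitlines():
        line = re.sub(r"^[-•*]\s*", "", line.strip())  # drop a leading list bullet
        if "|" in line:
            key, value = line.split("|", 1)
            data[key.strip()] = value.strip()
    return data

body = "- Deeplearning_Post_Count|12\n- ChatGPT_Tool_Vote_Count|345"  # placeholder numbers
print(parse_key_value_format(body))
# {'Deeplearning_Post_Count': '12', 'ChatGPT_Tool_Vote_Count': '345'}
```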
================================================
FILE: tasks/playwright_webarena/easy/reddit/ai_data_analyst/label.txt
================================================
- Deeplearning_Post_Count|25
- ChatGPT_Tool_Vote_Count|2655
================================================
FILE: tasks/playwright_webarena/easy/reddit/ai_data_analyst/meta.json
================================================
{
"task_id": "ai_data_analyst_easy",
"task_name": "AI Data Analyst (Easy)",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Capture a lightweight AI/ML engagement snapshot by signing in, grabbing three key metrics, and posting them back to the MachineLearning forum.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"user interaction",
"data extraction",
"comparative analysis"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/reddit/ai_data_analyst/verify.py
================================================
import asyncio
import os
import re
import sys
from pathlib import Path
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
USERNAME = "AIDataAnalyst2025"
PASSWORD = "SecurePass123!"
POST_TITLE = "MachineLearning_Extraction"
REQUIRED_FIELDS = [
"Deeplearning_Post_Count",
"ChatGPT_Tool_Vote_Count",
]
LABEL_PATH = Path(__file__).parent / "label.txt"
def parse_key_value_format(text: str) -> dict:
data = {}
if not text:
return data
for line in text.splitlines():
line = line.strip()
if not line or "|" not in line:
continue
line = re.sub(r"^[-•*]\s*", "", line)
key, value = line.split("|", 1)
data[key.strip()] = value.strip()
return data
def load_expected_values() -> dict:
if not LABEL_PATH.exists():
return {}
return parse_key_value_format(LABEL_PATH.read_text(encoding="utf-8"))
async def ensure_logged_in(page) -> bool:
print("Step 1: Ensuring we are logged in...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
user_button = page.locator(f'button:has-text("{USERNAME}")')
if await user_button.count():
print("✓ Already logged in", file=sys.stderr)
return True
login_link = page.locator('a:has-text("Log in")')
if not await login_link.count():
print("FAILED: Login link not found", file=sys.stderr)
return False
await login_link.click()
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', USERNAME)
await page.fill('input[name="_password"]', PASSWORD)
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
if await page.locator(f'button:has-text("{USERNAME}")').count():
print(f"✓ Logged in as {USERNAME}", file=sys.stderr)
return True
print("FAILED: Could not log in with provided credentials", file=sys.stderr)
return False
async def fetch_submission_content(page):
print("Step 2: Retrieving MachineLearning submission...", file=sys.stderr)
await page.goto(f"{BASE_URL}/f/MachineLearning", wait_until="networkidle")
post_link = page.locator(f'a:has-text("{POST_TITLE}")')
if not await post_link.count():
print(
f"FAILED: Submission '{POST_TITLE}' not found in MachineLearning forum",
file=sys.stderr,
)
return None
await post_link.first.click()
await page.wait_for_load_state("networkidle")
selectors = [
".submission__body",
"article",
".post-body",
".RichText",
'[class*="RichText"]',
]
for selector in selectors:
locator = page.locator(selector)
if await locator.count():
content = await locator.first.inner_text()
if content:
print(f"✓ Found submission body via selector {selector}", file=sys.stderr)
return content
print("FAILED: Unable to locate submission body content", file=sys.stderr)
return None
def validate_submission(extracted: dict, expected: dict) -> bool:
missing = [key for key in REQUIRED_FIELDS if key not in extracted]
if missing:
print(
f"FAILED: Submission body missing required keys: {', '.join(missing)}",
file=sys.stderr,
)
return False
errors = []
for key in REQUIRED_FIELDS:
actual = extracted.get(key, "")
expect = expected.get(key, "")
try:
actual_val = int(actual)
expect_val = int(expect)
if actual_val != expect_val:
errors.append(f"{key}: expected {expect_val}, found {actual_val}")
except ValueError:
errors.append(f"{key}: value '{actual}' is not numeric")
if errors:
print("FAILED: Submission values do not match expected data:", file=sys.stderr)
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✓ Submission content matches expected metrics", file=sys.stderr)
return True
async def verify() -> bool:
expected = load_expected_values()
if not expected:
print("FAILED: label.txt with expected values is missing", file=sys.stderr)
return False
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
if not await ensure_logged_in(page):
return False
content = await fetch_submission_content(page)
if not content:
return False
extracted = parse_key_value_format(content)
if not validate_submission(extracted, expected):
return False
print("SUCCESS: Easy AI data analyst task verified", file=sys.stderr)
return True
except PlaywrightTimeoutError as exc:
print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr)
return False
except Exception as exc:
print(f"FAILED: Unexpected error - {exc}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/reddit/llm_research_summary/description.md
================================================
Produce a three-line GPT activity bulletin so readers can quickly see how active the MachineLearning forum is.
**Task Requirements:**
1. Register and log in with username `llm_analyst_2024` and password `LLMExpert2024!`.
2. On the first page of the `MachineLearning` forum, count how many post titles contain `GPT`, `ChatGPT`, or `LLM`; record the exact total.
3. From that subset, identify the single post with the highest upvote count and note its full title and vote total.
4. Create a new post titled `LLM Research Summary: GPT Discussions Analysis [2024]` in the same forum. The body must be exactly the Markdown list below with your values in place of each `FILL_VALUE` (keep the keys, order, and pipe separator intact):
```
- Total_LLM_Posts|FILL_VALUE
- Top1_Title|FILL_VALUE
- Top1_Upvotes|FILL_VALUE
```
That’s the entire easy version—just report the headline LLM count and the hottest related thread.
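For reference, the verifier in this folder compares `Top1_Title` as text after normalizing typographic quotes to their ASCII equivalents and collapsing whitespace, so either straight or curly apostrophes in the copied title are accepted. A minimal sketch of that normalization (mirroring `normalize_text` in `verify.py`):

```
def normalize_text(value: str) -> str:
    # Map typographic quotes to ASCII and collapse runs of whitespace.
    for src, dst in {"\u2019": "'", "\u2018": "'", "\u201c": '"', "\u201d": '"'}.items():
        value = value.replace(src, dst)
    return " ".join(value.split()).strip()

# A curly apostrophe and a straight one compare equal after normalization.
assert normalize_text("It\u2019s  a  test") == normalize_text("It's a test")
```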
================================================
FILE: tasks/playwright_webarena/easy/reddit/llm_research_summary/label.txt
================================================
- Total_LLM_Posts|9
- Top1_Title|[P] I made a command-line tool that explains your errors using ChatGPT (link in comments)
- Top1_Upvotes|2655
================================================
FILE: tasks/playwright_webarena/easy/reddit/llm_research_summary/meta.json
================================================
{
"task_id": "llm_research_summary_easy",
"task_name": "LLM Research Summary (Easy)",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Collect the headline GPT metrics from MachineLearning and publish a short five-line recap.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"search aggregation",
"content submission",
"user interaction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/reddit/llm_research_summary/verify.py
================================================
import asyncio
import os
import re
import sys
from pathlib import Path
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
USERNAME = "llm_analyst_2024"
PASSWORD = "LLMExpert2024!"
FORUM_SLUG = "MachineLearning"
POST_TITLE = "LLM Research Summary: GPT Discussions Analysis [2024]"
REQUIRED_FIELDS = [
"Total_LLM_Posts",
"Top1_Title",
"Top1_Upvotes",
]
NUMERIC_FIELDS = {"Total_LLM_Posts", "Top1_Upvotes"}
LABEL_PATH = Path(__file__).parent / "label.txt"
def parse_key_value_format(text: str) -> dict:
data = {}
if not text:
return data
for line in text.splitlines():
line = line.strip()
if not line or "|" not in line:
continue
line = re.sub(r"^[-•*]\s*", "", line)
key, value = line.split("|", 1)
data[key.strip()] = value.strip()
return data
def normalize_text(value: str) -> str:
if value is None:
return ""
replacements = {
"\u2019": "'",
"\u2018": "'",
"\u201c": '"',
"\u201d": '"',
}
for src, dst in replacements.items():
value = value.replace(src, dst)
return " ".join(value.split()).strip()
def load_expected_values() -> dict:
if not LABEL_PATH.exists():
return {}
return parse_key_value_format(LABEL_PATH.read_text(encoding="utf-8"))
async def ensure_logged_in(page) -> bool:
print("Step 1: Signing in as llm_analyst_2024...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
user_button = page.locator(f'button:has-text("{USERNAME}")')
if await user_button.count():
print("✓ Already logged in", file=sys.stderr)
return True
login_link = page.locator('a:has-text("Log in")')
if not await login_link.count():
print("FAILED: Login link not found", file=sys.stderr)
return False
await login_link.click()
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', USERNAME)
await page.fill('input[name="_password"]', PASSWORD)
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
if await page.locator(f'button:has-text("{USERNAME}")').count():
print(f"✓ Logged in as {USERNAME}", file=sys.stderr)
return True
print("FAILED: Could not log in with provided credentials", file=sys.stderr)
return False
async def fetch_summary_body(page):
print("Step 2: Opening MachineLearning summary post...", file=sys.stderr)
await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle")
post_link = page.locator(f'a:has-text("{POST_TITLE}")')
if not await post_link.count():
print(f"FAILED: Submission '{POST_TITLE}' not found", file=sys.stderr)
return None
await post_link.first.click()
await page.wait_for_load_state("networkidle")
selectors = [
".submission__body",
"article",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has-text("Total_LLM_Posts")',
]
for selector in selectors:
locator = page.locator(selector)
if await locator.count():
content = await locator.first.inner_text()
if content:
print(f"✓ Found summary content via selector {selector}", file=sys.stderr)
return content
print("FAILED: Unable to locate submission body", file=sys.stderr)
return None
def validate_fields(extracted: dict, expected: dict) -> bool:
missing = [key for key in REQUIRED_FIELDS if key not in extracted]
if missing:
print(f"FAILED: Missing required keys: {', '.join(missing)}", file=sys.stderr)
return False
errors = []
for key in REQUIRED_FIELDS:
actual = extracted.get(key, "")
expect = expected.get(key, "")
if key in NUMERIC_FIELDS:
try:
actual_val = int(actual)
expect_val = int(expect)
if actual_val != expect_val:
errors.append(f"{key}: expected {expect_val}, found {actual_val}")
except ValueError:
errors.append(f"{key}: '{actual}' is not numeric")
else:
if normalize_text(actual) != normalize_text(expect):
errors.append(f"{key}: expected '{expect}', found '{actual}'")
if errors:
print("FAILED: Summary values do not match expected data:", file=sys.stderr)
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✓ Summary values match expected snapshot", file=sys.stderr)
return True
async def verify() -> bool:
expected = load_expected_values()
if not expected:
print("FAILED: label.txt is missing", file=sys.stderr)
return False
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
if not await ensure_logged_in(page):
return False
content = await fetch_summary_body(page)
if not content:
return False
extracted = parse_key_value_format(content)
if not validate_fields(extracted, expected):
return False
print("SUCCESS: LLM research easy task verified", file=sys.stderr)
return True
except PlaywrightTimeoutError as exc:
print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr)
return False
except Exception as exc:
print(f"FAILED: Unexpected error - {exc}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/reddit/movie_reviewer_analysis/description.md
================================================
Provide a lightweight status report on what’s trending in the movies forum so stakeholders can scan it at a glance.
**Task Requirements:**
1. Register and log in with username `movie_reviewer_2024` and password `movie_reviewer_2024`.
2. On the first page of the `movies` forum, count how many post titles contain any four-digit year (e.g., 1984, 2024) and record the total.
3. Still on that page, find the post with the highest upvote count and record its full title as well as the vote and comment counts shown.
4. Publish a post in the same forum titled `Wonderful Movies Analysis: Community Favorites [2024]`. The body must match the four-line Markdown list below—keep the keys, order, and pipe separators exactly as written while replacing each `FILL_VALUE` with your data:
```
- Total_Year_Posts|FILL_VALUE
- Top_Title|FILL_VALUE
- Top_Upvotes|FILL_VALUE
- Top_Comments|FILL_VALUE
```
No multi-page browsing or special threads are required; this easy task captures just the top signals from the first page.
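One plausible way to count the year-bearing titles in step 2 is a simple regex scan; the pattern below is an assumption about what counts as a four-digit year and is not part of the verifier, which only checks the values you post:

```
import re

def count_year_titles(titles: list[str]) -> int:
    # Count titles containing a standalone four-digit number such as 1984 or 2024.
    return sum(bool(re.search(r"\b\d{4}\b", title)) for title in titles)

print(count_year_titles(["Best films of 2024", "Dune review", "Blade Runner"]))  # 1
```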
================================================
FILE: tasks/playwright_webarena/easy/reddit/movie_reviewer_analysis/label.txt
================================================
- Total_Year_Posts|1
- Top_Title|Who will win the Oscar for ACTRESS IN A SUPPORTING ROLE?
- Top_Upvotes|9933
- Top_Comments|23
================================================
FILE: tasks/playwright_webarena/easy/reddit/movie_reviewer_analysis/meta.json
================================================
{
"task_id": "movie_reviewer_analysis_easy",
"task_name": "Movie Reviewer Analysis (Easy)",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Grab the first-page movie signals plus the Rittenhouse poster stats and share them in a concise recap post.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"user interaction",
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/reddit/movie_reviewer_analysis/verify.py
================================================
import asyncio
import os
import re
import sys
from pathlib import Path
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
USERNAME = "movie_reviewer_2024"
PASSWORD = "movie_reviewer_2024"
FORUM_SLUG = "movies"
POST_TITLE = "Wonderful Movies Analysis: Community Favorites [2024]"
REQUIRED_FIELDS = [
"Total_Year_Posts",
"Top_Title",
"Top_Upvotes",
"Top_Comments",
]
NUMERIC_FIELDS = {
"Total_Year_Posts",
"Top_Upvotes",
"Top_Comments",
}
LABEL_PATH = Path(__file__).parent / "label.txt"
def parse_key_value_format(text: str) -> dict:
data = {}
if not text:
return data
for line in text.splitlines():
line = line.strip()
if not line or "|" not in line:
continue
line = re.sub(r"^[-•*]\s*", "", line)
key, value = line.split("|", 1)
data[key.strip()] = value.strip()
return data
def normalize_text(value: str) -> str:
if value is None:
return ""
replacements = {
"\u2019": "'",
"\u2018": "'",
"\u201c": '"',
"\u201d": '"',
}
for src, dst in replacements.items():
value = value.replace(src, dst)
return " ".join(value.split()).strip()
def load_expected_values() -> dict:
if not LABEL_PATH.exists():
return {}
return parse_key_value_format(LABEL_PATH.read_text(encoding="utf-8"))
async def ensure_logged_in(page) -> bool:
print("Step 1: Authenticating movie_reviewer_2024...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
user_button = page.locator(f'button:has-text("{USERNAME}")')
if await user_button.count():
print("✓ Already logged in", file=sys.stderr)
return True
login_link = page.locator('a:has-text("Log in")')
if not await login_link.count():
print("FAILED: Login link not found", file=sys.stderr)
return False
await login_link.click()
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', USERNAME)
await page.fill('input[name="_password"]', PASSWORD)
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
if await page.locator(f'button:has-text("{USERNAME}")').count():
print(f"✓ Logged in as {USERNAME}", file=sys.stderr)
return True
print("FAILED: Could not log in with provided credentials", file=sys.stderr)
return False
async def fetch_summary_body(page):
print("Step 2: Locating the movies summary post...", file=sys.stderr)
await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle")
post_link = page.locator(f'a:has-text("{POST_TITLE}")')
if not await post_link.count():
print(f"FAILED: Submission '{POST_TITLE}' not found", file=sys.stderr)
return None
await post_link.first.click()
await page.wait_for_load_state("networkidle")
selectors = [
".submission__body",
"article",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has-text("Total_Year_Posts")',
]
for selector in selectors:
locator = page.locator(selector)
if await locator.count():
content = await locator.first.inner_text()
if content:
print(f"✓ Retrieved summary content via selector {selector}", file=sys.stderr)
return content
print("FAILED: Unable to locate submission body", file=sys.stderr)
return None
def validate_summary(extracted: dict, expected: dict) -> bool:
missing = [key for key in REQUIRED_FIELDS if key not in extracted]
if missing:
print(f"FAILED: Missing required keys: {', '.join(missing)}", file=sys.stderr)
return False
errors = []
for key in REQUIRED_FIELDS:
actual = extracted.get(key, "")
expect = expected.get(key, "")
if key in NUMERIC_FIELDS:
try:
actual_val = int(actual)
expect_val = int(expect)
if actual_val != expect_val:
errors.append(f"{key}: expected {expect_val}, found {actual_val}")
except ValueError:
errors.append(f"{key}: '{actual}' is not numeric")
else:
if normalize_text(actual) != normalize_text(expect):
errors.append(f"{key}: expected '{expect}', found '{actual}'")
if errors:
print("FAILED: Summary values differ from expected snapshot:", file=sys.stderr)
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✓ Summary values match expected data", file=sys.stderr)
return True
async def verify() -> bool:
expected = load_expected_values()
if not expected:
print("FAILED: label.txt is missing", file=sys.stderr)
return False
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
if not await ensure_logged_in(page):
return False
content = await fetch_summary_body(page)
if not content:
return False
extracted = parse_key_value_format(content)
if not validate_summary(extracted, expected):
return False
print("SUCCESS: Movie analysis easy task verified", file=sys.stderr)
return True
except PlaywrightTimeoutError as exc:
print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr)
return False
except Exception as exc:
print(f"FAILED: Unexpected error - {exc}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/reddit/nba_statistics_analysis/description.md
================================================
Share a five-line NBA engagement brief so sports moderators can instantly see which thread is dominating.
**Task Requirements:**
1. Register and log in with username `NBA_DataAnalyst_2024` and password `Research#2024!`.
2. In the `sports` forum (or via the site search for “NBA”), locate the NBA-related post with the highest upvote count you can see. Record its exact title, vote total, comment count, and author username.
3. Count how many distinct NBA-titled posts you reviewed for this summary (at minimum, include the post from step 2) and store that total as `Total_NBA_Posts`.
4. Publish a post titled `Statistical Analysis: NBA Content Engagement on This Forum` in the same forum. Its body must consist solely of the Markdown list below; keep the keys/order/pipes untouched while replacing each `FILL_VALUE` with your findings:
```
- Total_NBA_Posts|FILL_VALUE
- Top_Title|FILL_VALUE
- Top_Votes|FILL_VALUE
- Top_Comments|FILL_VALUE
- Top_Author|FILL_VALUE
```
This easy edition just reports the leading NBA thread plus the count of posts you reviewed—no deeper profile checks are necessary.
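For reference, `verify.py` treats `Total_NBA_Posts`, `Top_Votes`, and `Top_Comments` as integers and compares the remaining fields as normalized text. A minimal sketch of that split (the sample values are placeholders):

```
NUMERIC_FIELDS = {"Total_NBA_Posts", "Top_Votes", "Top_Comments"}

def field_matches(key: str, actual: str, expected: str) -> bool:
    if key in NUMERIC_FIELDS:
        try:
            return int(actual) == int(expected)  # "123" and " 123 " both count
        except ValueError:
            return False  # e.g. "about 123" is rejected as non-numeric
    return " ".join(actual.split()) == " ".join(expected.split())

assert field_matches("Top_Votes", " 123 ", "123")
assert not field_matches("Top_Votes", "about 123", "123")
```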
================================================
FILE: tasks/playwright_webarena/easy/reddit/nba_statistics_analysis/label.txt
================================================
- Total_NBA_Posts|20
- Top_Title|Hamby claims [WNBA Champ] Aces 'unprofessional' after trade
- Top_Votes|614
- Top_Comments|170
- Top_Author|Responsible-Lunch815
================================================
FILE: tasks/playwright_webarena/easy/reddit/nba_statistics_analysis/meta.json
================================================
{
"task_id": "nba_statistics_analysis_easy",
"task_name": "NBA Statistics Analysis (Easy)",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Summarize just the three strongest NBA threads and share their vote/comment stats in a short post.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"user interaction",
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/reddit/nba_statistics_analysis/verify.py
================================================
import asyncio
import os
import re
import sys
from pathlib import Path
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
USERNAME = "NBA_DataAnalyst_2024"
PASSWORD = "Research#2024!"
FORUM_SLUG = "sports"
POST_TITLE = "Statistical Analysis: NBA Content Engagement on This Forum"
REQUIRED_FIELDS = [
"Total_NBA_Posts",
"Top_Title",
"Top_Votes",
"Top_Comments",
"Top_Author",
]
NUMERIC_FIELDS = {
"Total_NBA_Posts",
"Top_Votes",
"Top_Comments",
}
LABEL_PATH = Path(__file__).parent / "label.txt"
def parse_key_value_format(text: str) -> dict:
data = {}
if not text:
return data
for line in text.splitlines():
line = line.strip()
if not line or "|" not in line:
continue
line = re.sub(r"^[-•*]\s*", "", line)
key, value = line.split("|", 1)
data[key.strip()] = value.strip()
return data
def normalize_text(value: str) -> str:
if value is None:
return ""
replacements = {
"\u2019": "'",
"\u2018": "'",
"\u201c": '"',
"\u201d": '"',
}
for src, dst in replacements.items():
value = value.replace(src, dst)
return " ".join(value.split()).strip()
def load_expected_values() -> dict:
if not LABEL_PATH.exists():
return {}
return parse_key_value_format(LABEL_PATH.read_text(encoding="utf-8"))
async def ensure_logged_in(page) -> bool:
print("Step 1: Logging into the sports account...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
user_button = page.locator(f'button:has-text("{USERNAME}")')
if await user_button.count():
print("✓ Already logged in", file=sys.stderr)
return True
login_link = page.locator('a:has-text("Log in")')
if not await login_link.count():
print("FAILED: Login link not found", file=sys.stderr)
return False
await login_link.click()
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', USERNAME)
await page.fill('input[name="_password"]', PASSWORD)
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
if await page.locator(f'button:has-text("{USERNAME}")').count():
print(f"✓ Logged in as {USERNAME}", file=sys.stderr)
return True
print("FAILED: Could not log in with provided credentials", file=sys.stderr)
return False
async def fetch_summary_body(page):
print("Step 2: Opening the NBA engagement summary post...", file=sys.stderr)
await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle")
post_link = page.locator(f'a:has-text("{POST_TITLE}")')
if not await post_link.count():
print(f"FAILED: Submission '{POST_TITLE}' not found", file=sys.stderr)
return None
await post_link.first.click()
await page.wait_for_load_state("networkidle")
selectors = [
".submission__body",
"article",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has-text("Total_NBA_Posts")',
]
for selector in selectors:
locator = page.locator(selector)
if await locator.count():
content = await locator.first.inner_text()
if content:
print(f"✓ Retrieved summary body via selector {selector}", file=sys.stderr)
return content
print("FAILED: Unable to locate submission body", file=sys.stderr)
return None
def validate_summary(extracted: dict, expected: dict) -> bool:
missing = [key for key in REQUIRED_FIELDS if key not in extracted]
if missing:
print(f"FAILED: Missing required keys: {', '.join(missing)}", file=sys.stderr)
return False
errors = []
for key in REQUIRED_FIELDS:
actual = extracted.get(key, "")
expect = expected.get(key, "")
if key in NUMERIC_FIELDS:
try:
actual_val = int(actual)
expect_val = int(expect)
if actual_val != expect_val:
errors.append(f"{key}: expected {expect_val}, found {actual_val}")
except ValueError:
errors.append(f"{key}: '{actual}' is not numeric")
else:
if normalize_text(actual) != normalize_text(expect):
errors.append(f"{key}: expected '{expect}', found '{actual}'")
if errors:
print("FAILED: Summary data does not match expected values:", file=sys.stderr)
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✓ Summary fields align with expected snapshot", file=sys.stderr)
return True
async def verify() -> bool:
expected = load_expected_values()
if not expected:
print("FAILED: label.txt is missing", file=sys.stderr)
return False
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
if not await ensure_logged_in(page):
return False
content = await fetch_summary_body(page)
if not content:
return False
extracted = parse_key_value_format(content)
if not validate_summary(extracted, expected):
return False
print("SUCCESS: NBA statistics easy task verified", file=sys.stderr)
return True
except PlaywrightTimeoutError as exc:
print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr)
return False
except Exception as exc:
print(f"FAILED: Unexpected error - {exc}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/reddit/routine_tracker_forum/description.md
================================================
Reshare the most helpful LifeProTips calendar advice so it appears as a fresh routine post for other readers.
**Task Requirements:**
1. Register and log in with username `RoutineTracker2025` and password `DailyRoutine123!`.
2. Find the LifeProTips thread titled `LPT: Use your calendar as your to-do list. Assigning dedicated time to tasks increases the likelyhood of you acting upon it.` and identify the comment with the highest upvote count. Copy its full text verbatim.
3. In the `LifeProTips` forum, publish a post titled `My 5-Step Morning Routine That Increased My Productivity by 200%` whose body is exactly the comment text you copied (no additional words or formatting).
4. Confirm that your new post appears in the LifeProTips listing so community members can immediately see it.
Only these three actions—copy, repost, verify—are required for the easy version; no extra upvotes or settings changes are necessary.
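For reference, the verifier opens the new post and checks that the copied comment appears verbatim inside the rendered body text (a substring check rather than an exact whole-body match). A minimal sketch:

```
def body_contains_comment(body_text: str, expected_comment: str) -> bool:
    # The rendered page may include the title and other chrome around the body,
    # so the check only requires the copied comment to appear verbatim within it.
    return expected_comment in body_text

assert body_contains_comment("My routine\n\nGet up early and plan the day.", "Get up early and plan the day.")
```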
================================================
FILE: tasks/playwright_webarena/easy/reddit/routine_tracker_forum/meta.json
================================================
{
"task_id": "routine_tracker_forum_easy",
"task_name": "Routine Tracker Forum (Easy)",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Repost the highest-rated LifeProTips calendar advice under a new routine-tracking thread.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"user interaction",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/reddit/routine_tracker_forum/verify.py
================================================
import asyncio
import os
import sys
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
USERNAME = "RoutineTracker2025"
PASSWORD = "DailyRoutine123!"
FORUM_SLUG = "LifeProTips"
POST_TITLE = "My 5-Step Morning Routine That Increased My Productivity by 200%"
EXPECTED_BODY = (
"As a college student, having a visible reminder of the assignments I have and when they are due is super helpful for me. "
"It also just feels good to erase them from the board once they are completed."
)
async def ensure_logged_in(page) -> bool:
print("Step 1: Logging in before verification...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
user_button = page.locator(f'button:has-text("{USERNAME}")')
if await user_button.count():
print("✓ Already logged in", file=sys.stderr)
return True
login_link = page.locator('a:has-text("Log in")')
if not await login_link.count():
print("FAILED: Login link not found", file=sys.stderr)
return False
await login_link.click()
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', USERNAME)
await page.fill('input[name="_password"]', PASSWORD)
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
if await page.locator(f'button:has-text("{USERNAME}")').count():
print(f"✓ Logged in as {USERNAME}", file=sys.stderr)
return True
print("FAILED: Could not log in with provided credentials", file=sys.stderr)
return False
async def verify_post_body(page) -> bool:
print("Step 2: Validating reposted comment content...", file=sys.stderr)
await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle")
post_link = page.locator(f'a:has-text("{POST_TITLE}")')
if not await post_link.count():
print(f"FAILED: Post '{POST_TITLE}' not found in LifeProTips", file=sys.stderr)
return False
await post_link.first.click()
await page.wait_for_load_state("networkidle")
article = page.locator("article")
if not await article.count():
print("FAILED: Unable to read post body", file=sys.stderr)
return False
body_text = await article.first.inner_text()
if EXPECTED_BODY not in body_text:
print("FAILED: Post body does not match the copied comment text", file=sys.stderr)
return False
print("✓ Post body matches the expected LifeProTips comment", file=sys.stderr)
return True
async def verify_listing_presence(page) -> bool:
print("Step 3: Confirming the post appears in the forum listing...", file=sys.stderr)
await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle")
post_link = page.locator(f'a:has-text("{POST_TITLE}")')
if await post_link.count():
print("✓ Post is visible in the LifeProTips feed", file=sys.stderr)
return True
print("FAILED: Post missing from forum listing", file=sys.stderr)
return False
async def verify() -> bool:
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
if not await ensure_logged_in(page):
return False
if not await verify_post_body(page):
return False
if not await verify_listing_presence(page):
return False
print("SUCCESS: Routine tracker easy task verified", file=sys.stderr)
return True
except PlaywrightTimeoutError as exc:
print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr)
return False
except Exception as exc:
print(f"FAILED: Unexpected error - {exc}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/fitness_promotion_strategy/description.md
================================================
Stick to the first three analytical steps from the original workflow so the easy version only inventories bestseller and promo data.
**Task Requirements**
1. If you need to log in, log in with username 'admin' and password 'admin1234'.
2. **Dashboard stop**: read the first three rows in **Bestsellers** (name, price, quantity) exactly as shown, note the Revenue KPI amount, and look at the **Top Search Terms** widget—if any of those three product names appears there, record it as `term:uses`, otherwise output `No:0`.
3. **Catalog → Products stop**: search each of the same three bestseller names one at a time and copy their SKU, Qty (inventory column), and Status (Enabled/Disabled) from the grid.
4. **Marketing → Promotions → Cart Price Rules stop**: set Status = Active, count how many rules are shown, and locate the rule that applies a percentage discount so you can report `rule name:percentage`.
Output everything using the reduced template below:
```
Bestseller1|name:price:quantity:sku:inventory:status
Bestseller2|name:price:quantity:sku:inventory:status
Bestseller3|name:price:quantity:sku:inventory:status
TotalRevenue|amount
BestsellerInSearch|term:count
PercentageDiscountRule|name:percentage
ActiveRulesCount|count
```
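For reference, the accompanying `verify.py` splits each `Bestseller` value on `:` and expects exactly six parts (name, price, quantity, sku, inventory, status); the price is compared after stripping `$` and `,`, and the inventory is compared numerically. A minimal sketch of that parsing (the sample line is a placeholder, not the expected answer):

```
def split_bestseller(value: str) -> dict:
    # Expected shape: name:price:quantity:sku:inventory:status
    parts = value.split(":")
    if len(parts) != 6:
        raise ValueError("expected 6 colon-separated parts")
    name, price, quantity, sku, inventory, status = parts
    return {
        "name": name,
        "price": price.replace("$", "").replace(",", ""),  # "$27.00" -> "27.00"
        "quantity": quantity,
        "sku": sku,
        "inventory": float(inventory.replace(",", "")),     # tolerant of "100.0000"
        "status": status,
    }

print(split_bestseller("Sample Ball 55 cm:$23.00:5:24-WG000:100:Enabled"))
```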
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/fitness_promotion_strategy/label.txt
================================================
Bestseller1|Sprite Stasis Ball 65 cm:$27.00:6:24-WG082-blue:100:Enabled
Bestseller2|Quest Lumaflex™ Band:$19.00:6:24-UG01:100:Enabled
Bestseller3|Sprite Yoga Strap 6 foot:$14.00:6:24-WG085:100:Enabled
TotalRevenue|$0.00
BestsellerInSearch|No:0
PercentageDiscountRule|20% OFF Ever $200-plus purchase!*:20%
ActiveRulesCount|4
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/fitness_promotion_strategy/meta.json
================================================
{
"task_id": "fitness_promotion_strategy_easy",
"task_name": "Fitness Promotion Strategy (Easy)",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Capture the three dashboard bestsellers, confirm their catalog details, and snapshot the related promo and customer metrics needed for a quick campaign brief.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"comparative analysis",
"inventory management",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/fitness_promotion_strategy/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, 'r') as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if message.get('role') == 'assistant' and message.get('status') == 'completed':
content = message.get('content', [])
for item in content:
if item.get('type') == 'output_text':
return item.get('text', '')
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the ... format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
# Look for ... pattern
match = re.search(r'(.*?)', text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split('\n')
# Skip the check for exact number of lines - just parse what we have
# if len(lines) != 13:
# print(f"Error: Expected 13 lines in answer, got {len(lines)}", file=sys.stderr)
# return None
for line in lines:
if '|' in line:
key, value = line.split('|', 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, 'r') as f:
lines = f.read().strip().split('\n')
expected = {}
for line in lines:
if '|' in line:
key, value = line.split('|', 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, '')
# Special handling for different types of values
if key in ['Bestseller1', 'Bestseller2', 'Bestseller3']:
# Check if all parts match (name:price:quantity:sku:inventory:status)
if ':' in expected_value and ':' in model_value:
expected_parts = expected_value.split(':')
model_parts = model_value.split(':')
if len(expected_parts) == 6 and len(model_parts) == 6:
# Compare each part
for i, (exp, mod) in enumerate(zip(expected_parts, model_parts)):
if i == 1: # Price field
exp_clean = exp.replace('$', '').replace(',', '')
mod_clean = mod.replace('$', '').replace(',', '')
if exp_clean != mod_clean:
mismatches.append(f"{key} price: expected '{exp}', got '{mod}'")
elif i == 4: # Inventory field (may have decimal places)
exp_float = float(exp.replace(',', ''))
mod_float = float(mod.replace(',', ''))
if abs(exp_float - mod_float) > 0.0001:
mismatches.append(f"{key} inventory: expected '{exp}', got '{mod}'")
else:
if exp.lower() != mod.lower():
mismatches.append(f"{key} part {i}: expected '{exp}', got '{mod}'")
else:
mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'LowestInventoryProduct':
# Check product name and inventory
if ':' in expected_value and ':' in model_value:
expected_name, expected_inv = expected_value.rsplit(':', 1)
model_name, model_inv = model_value.rsplit(':', 1)
if expected_name.lower() != model_name.lower():
mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'")
exp_float = float(expected_inv.replace(',', ''))
mod_float = float(model_inv.replace(',', ''))
if abs(exp_float - mod_float) > 0.0001:
mismatches.append(f"{key} inventory: expected '{expected_inv}', got '{model_inv}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ['TotalRevenue', 'MinimumPurchaseRule']:
# For price/amount fields, normalize format
expected_clean = expected_value.replace('$', '').replace(',', '')
model_clean = model_value.replace('$', '').replace(',', '')
if expected_clean != model_clean:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'BestsellerInSearch':
# Check search term and count
if expected_value.lower() != model_value.lower():
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'PercentageDiscountRule':
# Check rule name and percentage
if ':' in expected_value and ':' in model_value:
expected_name, expected_pct = expected_value.rsplit(':', 1)
model_name, model_pct = model_value.rsplit(':', 1)
if expected_name != model_name:
mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'")
# Normalize percentage (20% vs 20 vs 0.20)
exp_pct_clean = expected_pct.replace('%', '').strip()
mod_pct_clean = model_pct.replace('%', '').strip()
if exp_pct_clean != mod_pct_clean:
mismatches.append(f"{key} percentage: expected '{expected_pct}', got '{model_pct}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'TopCustomer':
# Check name:email:group
if ':' in expected_value and ':' in model_value:
expected_parts = expected_value.split(':')
model_parts = model_value.split(':')
if len(expected_parts) == 3 and len(model_parts) == 3:
exp_name, exp_email, exp_group = expected_parts
mod_name, mod_email, mod_group = model_parts
if exp_name != mod_name:
mismatches.append(f"{key} name: expected '{exp_name}', got '{mod_name}'")
if exp_email.lower() != mod_email.lower():
mismatches.append(f"{key} email: expected '{exp_email}', got '{mod_email}'")
if exp_group.lower() != mod_group.lower():
mismatches.append(f"{key} group: expected '{exp_group}', got '{mod_group}'")
else:
mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'MostRecentOrderDate':
# Date format may vary, do flexible comparison
if expected_value.lower() == 'none' and model_value.lower() == 'none':
continue
elif expected_value != model_value:
# Could add more flexible date parsing here if needed
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
else:
# Exact match for other fields (counts, etc.)
if str(model_value) != str(expected_value):
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the bestseller analysis and promotion task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print("Warning: Could not parse answer format from model response", file=sys.stderr)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/ny_expansion_analysis/description.md
================================================
Keep only the first three investigative steps so the easy task focuses on dashboard + tax + order-status insights.
**Task Requirements**
1. If login is required, log in with username 'admin' and password 'admin1234'. On the **Dashboard**, record the Lifetime Sales amount, identify the cheapest product in the **Bestsellers** table (note its name, price, and quantity), and check whether that same product appears anywhere in **Last Orders** (output the customer name if yes, otherwise `No`).
2. Go to **Stores → Taxes → Tax Zones and Rates**. Capture the exact rates for New York and California, specify which state is higher, and count how many distinct U.S. states have entries in the grid.
3. Still in **Stores**, open **Settings → Order Status**, filter “Visible On Storefront = Yes”, and confirm whether a status with code `processing` exists and if it’s flagged as a default status.
Report just these metrics in the reduced answer format:
```
Lifetime_Sales_Amount|amount
Cheap_Bestseller_Name|name
Second_Bestseller_Price|price
Second_Bestseller_Quantity|quantity
Product_In_Last_Orders|yes_or_no_or_customer
NY_Tax_Rate|rate
CA_Tax_Rate|rate
Higher_Tax_State|state
Total_States_With_Tax|count
Processing_Visible_Storefront|Yes_or_No
Processing_Default_Status|Yes_or_No
```
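**Example Output** (illustrative placeholders only; the keys mirror the format above, and the values are not real figures):
```
Lifetime_Sales_Amount|$X.XX
Cheap_Bestseller_Name|Product Name Here
Second_Bestseller_Price|$XX.XX
Second_Bestseller_Quantity|X
Product_In_Last_Orders|Customer Name or No
NY_Tax_Rate|X.XXXX
CA_Tax_Rate|X.XXXX
Higher_Tax_State|NY or CA
Total_States_With_Tax|X
Processing_Visible_Storefront|Yes or No
Processing_Default_Status|Yes or No
```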
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/ny_expansion_analysis/label.txt
================================================
Lifetime_Sales_Amount|$0.00
Cheap_Bestseller_Name|Sprite Yoga Strap 6 foot
Second_Bestseller_Price|$14.00
Second_Bestseller_Quantity|6
Product_In_Last_Orders|No
NY_Tax_Rate|8.3750
CA_Tax_Rate|8.2500
Higher_Tax_State|NY
Total_States_With_Tax|2
Processing_Visible_Storefront|Yes
Processing_Default_Status|Yes
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/ny_expansion_analysis/meta.json
================================================
{
"task_id": "ny_expansion_analysis_easy",
"task_name": "NY Expansion Analysis (Easy)",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Capture just the dashboard, tax, order-status, store, and inventory facts required to judge if New York can launch without heavy configuration work.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/ny_expansion_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("ERROR: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
# Check if file exists
if not Path(messages_path).exists():
print(f"ERROR: Messages file not found at path: {messages_path}", file=sys.stderr)
return None
try:
with open(messages_path, 'r') as f:
content = f.read()
# Check if file is empty
if not content or content.strip() == '""':
print("ERROR: Messages file is empty or contains only empty string", file=sys.stderr)
return None
messages = json.loads(content)
# Check if messages is a list
if not isinstance(messages, list):
print(f"ERROR: Messages file should contain a list, got {type(messages).__name__}", file=sys.stderr)
return None
# Find the last assistant message
for message in reversed(messages):
if message.get('role') == 'assistant' and message.get('status') == 'completed':
content = message.get('content', [])
if not content:
print("WARNING: Assistant message has empty content", file=sys.stderr)
continue
for item in content:
if item.get('type') == 'output_text':
text = item.get('text', '')
if not text:
print("WARNING: Output text is empty", file=sys.stderr)
continue
return text
print("ERROR: No assistant response with output_text found in messages", file=sys.stderr)
return None
except json.JSONDecodeError as e:
print(f"ERROR: Invalid JSON in messages file: {str(e)}", file=sys.stderr)
return None
except Exception as e:
print(f"ERROR: Unexpected error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
print("ERROR: No text provided to parse", file=sys.stderr)
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL)
if not match:
print("ERROR: No tags found in the response", file=sys.stderr)
print(f" Response preview: {text[:200]}...", file=sys.stderr)
return None
answer_content = match.group(1).strip()
if not answer_content:
print("ERROR: Empty content between tags", file=sys.stderr)
return None
# Parse each line
result = {}
lines = answer_content.split('\n')
# Expected keys that should be present
expected_keys = [
'Lifetime_Sales_Amount', 'Cheap_Bestseller_Name', 'Second_Bestseller_Price',
'Second_Bestseller_Quantity', 'Product_In_Last_Orders', 'NY_Tax_Rate',
'CA_Tax_Rate', 'Higher_Tax_State', 'Total_States_With_Tax',
'Processing_Visible_Storefront', 'Processing_Default_Status'
]
parsed_keys = []
for line in lines:
line = line.strip()
if not line:
continue
if '|' not in line:
print(f"ERROR: Line missing pipe separator '|': {line}", file=sys.stderr)
continue
parts = line.split('|', 1)
if len(parts) != 2:
print(f"ERROR: Invalid line format: {line}", file=sys.stderr)
continue
key, value = parts
key = key.strip()
value = value.strip()
if not key:
print(f"ERROR: Empty key in line: {line}", file=sys.stderr)
continue
result[key] = value
parsed_keys.append(key)
# Check for missing expected keys
missing_keys = set(expected_keys) - set(parsed_keys)
if missing_keys:
print(f"ERROR: Missing expected keys: {', '.join(sorted(missing_keys))}", file=sys.stderr)
# Check for unexpected keys
unexpected_keys = set(parsed_keys) - set(expected_keys)
if unexpected_keys:
print(f"WARNING: Unexpected keys found: {', '.join(sorted(unexpected_keys))}", file=sys.stderr)
if not result:
print("ERROR: No valid key-value pairs parsed from answer", file=sys.stderr)
return None
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, 'r') as f:
lines = f.read().strip().split('\n')
expected = {}
for line in lines:
if '|' in line:
key, value = line.split('|', 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, '')
# Special handling for different types of values
if key in ['Lifetime_Sales_Amount', 'Second_Bestseller_Price', 'Dashboard_Revenue']:
# For price/amount fields, normalize format
expected_clean = expected_value.replace('$', '').replace(',', '')
model_clean = model_value.replace('$', '').replace(',', '')
if expected_clean != model_clean:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ['NY_Tax_Rate', 'CA_Tax_Rate']:
# Tax rates - allow different decimal formats
expected_clean = expected_value.replace('%', '').strip()
model_clean = model_value.replace('%', '').strip()
# Convert to float for comparison
try:
if float(expected_clean) != float(model_clean):
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
except ValueError:
if expected_clean != model_clean:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ['Product_In_Last_Orders', 'Processing_Visible_Storefront', 'Processing_Default_Status']:
# Yes/No fields - case insensitive
if model_value.lower() != expected_value.lower():
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'Empty_Rows_Yes_Effect':
# Allow flexible descriptions for this field
# Just check if model provided some reasonable description
if not model_value or len(model_value) < 5:
mismatches.append(f"{key}: expected meaningful description, got '{model_value}'")
elif key == 'Order_Status_Options':
# Check if main options are mentioned
expected_options = set(opt.strip() for opt in expected_value.split(','))
model_options = set(opt.strip() for opt in model_value.split(','))
if expected_options != model_options:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'Chart_Disabled_Message':
# Allow some flexibility in message text
# Check for key words
if 'disabled' not in model_value.lower() and 'enable' not in model_value.lower():
mismatches.append(f"{key}: expected message about chart being disabled, got '{model_value}'")
elif key == 'Default_Source_State':
# Handle 'None' or empty state
expected_normalized = expected_value.lower() if expected_value.lower() != 'none' else ''
model_normalized = model_value.lower() if model_value.lower() != 'none' else ''
if expected_normalized != model_normalized:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the NY expansion analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
print("\n=== Starting Verification ===", file=sys.stderr)
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
print("Loading expected answer from label.txt...", file=sys.stderr)
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("FATAL ERROR: Could not load expected answer from label.txt", file=sys.stderr)
return False
print(f"Expected answer loaded with {len(expected_answer)} keys", file=sys.stderr)
# Get model's response from MCP_MESSAGES
print("\nReading model response from MCP_MESSAGES...", file=sys.stderr)
model_response = get_model_response()
if not model_response:
print("FATAL ERROR: No valid model response found", file=sys.stderr)
return False
print(f"Model response found (length: {len(model_response)} chars)", file=sys.stderr)
print("\nParsing answer format from model response...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if not model_answer:
print("FATAL ERROR: Could not parse answer format from model response", file=sys.stderr)
return False
print(f"\n=== Model Answer Parsed Successfully ===", file=sys.stderr)
print(f"Parsed {len(model_answer)} key-value pairs", file=sys.stderr)
for key, value in model_answer.items():
print(f" {key}: {value}", file=sys.stderr)
# Compare answers
print("\n=== Comparing Model Answer with Expected Answer ===", file=sys.stderr)
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nFATAL ERROR: Model answer does not match expected answer", file=sys.stderr)
print("Verification FAILED", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
print("Verification PASSED", file=sys.stderr)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/products_sales_analysis/description.md
================================================
Only keep the first few catalog and dashboard checks plus the high-level orders snapshot.
**Task Requirements**
1. If login is required, log in with username 'admin' and password 'admin1234'.
2. **Catalog → Products**: search for product names containing `Yoga` and capture the records-found count; reset filters and look up SKU `WH11` to copy its exact price; reset again and set Quantity (From/To) = `0.0000` to count all zero-quantity products.
3. **Dashboard**: in the Bestsellers table sort by price ascending—record the lowest-priced row as `name:quantity`, then locate `Quest Lumaflex™ Band` and note its quantity, and read the Revenue KPI amount.
4. **Sales → Orders**: filter Status = Pending to count those orders, then search for Grace Nguyen, switch Status = Complete, sort Grand Total descending, and record the Order # of the most expensive completed order.
Return just these metrics:
```
YogaProducts|count
WH11Price|price
ZeroQuantityProducts|count
LowestProduct|name:quantity
QuestLumaflexQuantity|quantity
DashboardRevenue|amount
PendingOrders|count
GraceNguyenOrderID|orderid
```
**Example Output:**
```
YogaProducts|XX
WH11Price|$XX.XX
ZeroQuantityProducts|XX
LowestProduct|Product Name Here:XX
QuestLumaflexQuantity|XX
DashboardRevenue|$XX.XX
PendingOrders|X
GraceNguyenOrderID|00000XXXX
```
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/products_sales_analysis/label.txt
================================================
YogaProducts|171
WH11Price|$54.00
ZeroQuantityProducts|150
LowestProduct|Sprite Stasis Ball 55 cm foot:5
QuestLumaflexQuantity|6
DashboardRevenue|$0.00
PendingOrders|10
GraceNguyenOrderID|000000189
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/products_sales_analysis/meta.json
================================================
{
"task_id": "products_sales_analysis_easy",
"task_name": "Products Sales Analysis (Easy)",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Make a single guided pass through Catalog, Dashboard, Customers, and Orders to collect the exact fields needed for a quick sales recap.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/products_sales_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
print("Error: No text provided to parse", file=sys.stderr)
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
print("Error: No ... tags found in response", file=sys.stderr)
return None
answer_content = match.group(1).strip()
if not answer_content:
print("Error: Empty answer content", file=sys.stderr)
return None
# Parse each line
result = {}
lines = [line.strip() for line in answer_content.split("\n") if line.strip()]
if len(lines) != 8:
print(f"Error: Expected 8 lines in answer, got {len(lines)}", file=sys.stderr)
print(f"Lines found: {lines}", file=sys.stderr)
return None
# Expected keys for validation
expected_keys = [
"YogaProducts", "WH11Price", "ZeroQuantityProducts", "LowestProduct",
"QuestLumaflexQuantity", "DashboardRevenue", "PendingOrders",
"GraceNguyenOrderID"
]
for line in lines:
if "|" not in line:
print(f"Error: Line missing '|' separator: {line}", file=sys.stderr)
return None
parts = line.split("|", 1)
if len(parts) != 2:
print(f"Error: Invalid line format: {line}", file=sys.stderr)
return None
key, value = parts[0].strip(), parts[1].strip()
if not key or not value:
print(f"Error: Empty key or value in line: {line}", file=sys.stderr)
return None
result[key] = value
# Validate all expected keys are present
missing_keys = set(expected_keys) - set(result.keys())
if missing_keys:
print(f"Error: Missing required keys: {missing_keys}", file=sys.stderr)
return None
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "LowestProduct":
# Check if product name and quantity match (format: "Product Name:quantity")
if ":" in expected_value and ":" in model_value:
expected_name, expected_qty = expected_value.rsplit(":", 1)
model_name, model_qty = model_value.rsplit(":", 1)
if expected_name != model_name or expected_qty != model_qty:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key in ["WH11Price", "DashboardRevenue"]:
# For price/amount fields, normalize format
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "SarahMillerEmail":
# Email should match exactly
if model_value.lower() != expected_value.lower():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the products and sales analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/sales_inventory_analysis/description.md
================================================
Retain just the first three analytic arenas—products, orders, and the dashboard—so the easy task stays read-only and short.
**Task Requirements**
1. If login is required, log in with username 'admin' and password 'admin1234', then open **Catalog → Products**. Search for names containing `Sprite` to get their count, reset and set Quantity (From/To) = `100.0000` to count those rows, and finally reset to look up SKU `WS12` so you can copy its exact name and price.
2. Switch to **Sales → Orders**. Filter Status = Pending to count those orders, then search for Grace Nguyen with Status = Complete, sort Grand Total ascending, and capture the cheapest completed order ID. Clear filters, sort Grand Total descending, and record the top row’s customer and amount.
3. Finish in **Dashboard**. Sort **Bestsellers** by Quantity descending to capture the first row’s name and quantity, locate `Overnight Duffle` in that table to note its price, and check the **Top Search Terms** widget to see what position `hollister` occupies.
Answer with the reduced template:
```
SpriteProducts|count
Quantity100Products|count
WS12Info|name:price
PendingOrders|count
GraceOrderID|orderid
HighestOrderInfo|customer:amount
CheapProduct|name:quantity
OvernightDufflePrice|price
HollisterPosition|position
```
**Example Output:**
```
SpriteProducts|XX
Quantity100Products|XX
WS12Info|Product Name Here:$XX.XX
PendingOrders|X
GraceOrderID|00000XXXX
HighestOrderInfo|Customer Name:$XXX.XX
CheapProduct|Product Name:XX
OvernightDufflePrice|$XX.XX
HollisterPosition|Xth
```
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/sales_inventory_analysis/label.txt
================================================
SpriteProducts|16
Quantity100Products|1886
WS12Info|Radiant Tee:$22.00
PendingOrders|10
GraceOrderID|000000114
HighestOrderInfo|Samantha Jones:$292.40
CheapProduct|Sprite Yoga Strap 6 foot:6
OvernightDufflePrice|$45.00
HollisterPosition|1st
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/sales_inventory_analysis/meta.json
================================================
{
"task_id": "sales_inventory_analysis_easy",
"task_name": "Sales Inventory Analysis (Easy)",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Follow one guided tour through Products, Orders, Dashboard, Customers, and Invoices to capture a compact set of sales-plus-inventory facts.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"comparative analysis",
"inventory management"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/sales_inventory_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message with type='message', status='completed'
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
# Check for both 'text' and 'output_text' types
if item.get("type") in ["text", "output_text"]:
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
print("ERROR: No text provided to parse", file=sys.stderr)
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
print("ERROR: No ... tags found in the response", file=sys.stderr)
print("Response text preview (first 200 chars):", text[:200], file=sys.stderr)
return None
answer_content = match.group(1).strip()
print(f"Found answer content with {len(answer_content)} characters", file=sys.stderr)
# Parse each line
result = {}
lines = answer_content.split("\n")
# Expected keys for this task
expected_keys = [
"SpriteProducts", "Quantity100Products", "WS12Info", "PendingOrders",
"GraceOrderID", "HighestOrderInfo", "CheapProduct", "OvernightDufflePrice",
"HollisterPosition"
]
if len(lines) != 9:
print(f"ERROR: Expected 9 lines in answer, got {len(lines)}", file=sys.stderr)
print(f"Lines found: {lines}", file=sys.stderr)
return None
for i, line in enumerate(lines, 1):
if "|" not in line:
print(f"ERROR: Line {i} does not contain pipe separator '|': '{line}'", file=sys.stderr)
return None
parts = line.split("|", 1)
if len(parts) != 2:
print(f"ERROR: Line {i} could not be split into key|value: '{line}'", file=sys.stderr)
return None
key, value = parts
result[key.strip()] = value.strip()
# Check if all expected keys are present
missing_keys = set(expected_keys) - set(result.keys())
if missing_keys:
print(f"ERROR: Missing expected keys: {missing_keys}", file=sys.stderr)
print(f"Keys found: {list(result.keys())}", file=sys.stderr)
return None
# Check for unexpected keys
extra_keys = set(result.keys()) - set(expected_keys)
if extra_keys:
print(f"WARNING: Unexpected keys found: {extra_keys}", file=sys.stderr)
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "WS12Info":
# Check if product name and price match (format: name:price)
if ":" in expected_value and ":" in model_value:
expected_name, expected_price = expected_value.rsplit(":", 1)
model_name, model_price = model_value.rsplit(":", 1)
# Normalize price format
expected_price_clean = expected_price.replace("$", "").replace(",", "")
model_price_clean = model_price.replace("$", "").replace(",", "")
if (
expected_name != model_name
or expected_price_clean != model_price_clean
):
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "GraceOrderID":
# Order ID should start with "000" and match exactly
if not model_value.startswith("000"):
mismatches.append(
f"{key}: expected to start with '000', got '{model_value}'"
)
elif model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "HighestOrderInfo":
# Check format customer:amount
if ":" in expected_value and ":" in model_value:
expected_customer, expected_amount = expected_value.rsplit(":", 1)
model_customer, model_amount = model_value.rsplit(":", 1)
# Normalize amount format
expected_amount_clean = expected_amount.replace("$", "").replace(
",", ""
)
model_amount_clean = model_amount.replace("$", "").replace(",", "")
if (
expected_customer != model_customer
or expected_amount_clean != model_amount_clean
):
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "Position2Product":
# Check if product name and quantity match
if ":" in expected_value and ":" in model_value:
expected_name, expected_qty = expected_value.rsplit(":", 1)
model_name, model_qty = model_value.rsplit(":", 1)
if expected_name != model_name or expected_qty != model_qty:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "OvernightDufflePrice":
# Normalize price format
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "HollisterPosition":
# Position format (1st, 2nd, 3rd, etc.)
if model_value.lower() != expected_value.lower():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "SarahMillerInfo":
# Format: group:date
if ":" in expected_value and ":" in model_value:
expected_group, expected_date = expected_value.split(":", 1)
model_group, model_date = model_value.split(":", 1)
# Allow some flexibility in date format
if expected_group != model_group:
mismatches.append(
f"{key}: expected group '{expected_group}', got '{model_group}'"
)
# For date, check if key parts match
if not (expected_date in model_date or model_date in expected_date):
mismatches.append(
f"{key}: expected date '{expected_date}', got '{model_date}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "Invoice002BillTo":
# Name should match exactly
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for count fields and other numeric values
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the sales and inventory analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
print("\n" + "="*60, file=sys.stderr)
print("Starting verification of Task 5", file=sys.stderr)
print("="*60, file=sys.stderr)
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
print("\n--- Loading Expected Answer ---", file=sys.stderr)
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("FATAL ERROR: Could not load expected answer from label.txt", file=sys.stderr)
return False
print(f"Successfully loaded {len(expected_answer)} expected values", file=sys.stderr)
# Get model's response from MCP_MESSAGES
print("\n--- Loading Model Response ---", file=sys.stderr)
model_response = get_model_response()
if not model_response:
print("FATAL ERROR: No model response found in MCP_MESSAGES", file=sys.stderr)
return False
print(f"Found model response ({len(model_response)} characters)", file=sys.stderr)
print("\n--- Parsing Answer Format ---", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if not model_answer:
print("\nFATAL ERROR: Could not parse answer format from model response", file=sys.stderr)
print("Verification FAILED", file=sys.stderr)
return False
print("\n=== Model Answer Successfully Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f" {key}: {value}", file=sys.stderr)
# Compare answers
print("\n--- Comparing Answers ---", file=sys.stderr)
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\n" + "="*60, file=sys.stderr)
print("VERIFICATION FAILED: Model answer does not match expected answer", file=sys.stderr)
print("="*60, file=sys.stderr)
return False
print("\n" + "="*60, file=sys.stderr)
print("✓ VERIFICATION PASSED: Model answer matches expected answer", file=sys.stderr)
print("="*60, file=sys.stderr)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/search_filtering_operations/description.md
================================================
Limit the search intelligence pass to the first three steps from the original task so it’s just two Search Terms views plus one dashboard glance.
**Task Requirements**
1. If login is required, log in with username 'admin' and password 'admin1234'.
2. **Marketing → SEO & Search → Search Terms**: filter for queries containing `tank` to count them, reset and filter Results = 0 to count zero-result terms, then filter Uses ≥ 11 to capture the highest-use row and list every term whose Results are between 20 and 30 (join as `term:results`, or use `None:0` if none). Remove filters when done.
3. **Reports → Search Terms**: set Hits ≥ 16 and record the filtered count, then add ID range 10–15 and capture the row with the most Results, and finally switch Store View to “Default Store View” to count those entries.
4. **Dashboard**: in **Top Search Terms** list the entries whose Results = 1 (format `term:uses` joined with `|` or `None:0`), in **Last Search Terms** pick the row with the highest combination of Results and Uses, and in **Bestsellers** copy the product + quantity shown at position #3.
Return only these data points:
```
TankSearchCount|count
ZeroResultsCount|count
HighestUseTerm|term:uses
Results20to30Term|term1:results1|term2:results2|...
Hits15PlusCount|count
ID10to15MaxResults|term:results
DefaultStoreViewCount|count
OneResultTerm|term1:uses1|term2:uses2|...
HighestResultLastSearch|term:results
Position3Bestseller|product:quantity
```
**Example Output:**
```
TankSearchCount|X
ZeroResultsCount|X
HighestUseTerm|search_term:XX
Results20to30Term|search_term1:XX1|search_term2:XX2|search_term3:XX3|...
Hits15PlusCount|X
ID10to15MaxResults|Product Name:XX
DefaultStoreViewCount|X
OneResultTerm|search_term1:XX1|search_term2:XX2|search_term3:XX3|...
HighestResultLastSearch|search_term:XX
Position3Bestseller|Product Name:X
```
**Success Criteria:**
- Successfully logged into Magento Admin
- Applied complex search filters in Search Terms section
- Used range filters for results and hits
- Sorted columns to find specific records
- Navigated between different report views
- Extracted data from filtered and sorted results
- Counted records accurately after applying filters
- Output answer in exact format with 10 data lines
- Answer wrapped in `<answer></answer>` tags
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/search_filtering_operations/label.txt
================================================
TankSearchCount|2
ZeroResultsCount|1
HighestUseTerm|hollister:19
Results20to30Term|Antonia Racer Tank:23|tanks:23
Hits15PlusCount|1
ID10to15MaxResults|Antonia Racer Tank:23
DefaultStoreViewCount|7
OneResultTerm|hollister:19|WP10:1
HighestResultLastSearch|Antonia Racer Tank:23
Position3Bestseller|Sprite Stasis Ball 65 cm:6
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/search_filtering_operations/meta.json
================================================
{
"task_id": "search_filtering_operations_easy",
"task_name": "Search Filtering Operations (Easy)",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Follow a clearly guided path through Search Terms, the Search Terms report, and the dashboard widgets to capture the metrics needed for a focused search-behavior brief.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/search_filtering_operations/verify.py
================================================
import re
import json
import os
import sys
def verify(messages):
"""
Verify that the agent has successfully performed complex search and filtering operations
in the Magento Admin panel and extracted all required information correctly.
Args:
messages: List of message dictionaries containing the conversation
Returns:
Dictionary with 'valid' boolean and 'reason' string
"""
# Find the last assistant message with status "completed" and type "message"
answer_content = None
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
and message.get("content")
):
# Extract text from content structure
content = message["content"]
if isinstance(content, list):
for item in content:
if isinstance(item, dict) and item.get("type") == "output_text":
text = item.get("text", "")
# Look for answer tags with case-insensitive search
answer_match = re.search(
r"(.*?)", text, re.DOTALL | re.IGNORECASE
)
if answer_match:
answer_content = answer_match.group(1).strip()
break
elif isinstance(content, str):
# Look for answer tags in string content
answer_match = re.search(r"(.*?)", content, re.DOTALL | re.IGNORECASE)
if answer_match:
answer_content = answer_match.group(1).strip()
break
if answer_content:
break
if not answer_content:
return {"valid": False, "reason": "No answer found in tags"}
# Expected format - each line should have a key|value pair
expected_keys = [
"TankSearchCount",
"ZeroResultsCount",
"HighestUseTerm",
"Results20to30Term",
"Hits15PlusCount",
"ID10to15MaxResults",
"DefaultStoreViewCount",
"OneResultTerm",
"HighestResultLastSearch",
"Position3Bestseller",
]
# Parse the answer
lines = answer_content.strip().split("\n")
# Check if we have exactly 10 lines
if len(lines) != 10:
return {"valid": False, "reason": f"Expected 10 data lines, found {len(lines)}"}
# Parse each line and validate format
extracted_data = {}
for line in lines:
if "|" not in line:
return {
"valid": False,
"reason": f"Invalid format in line: {line}. Expected 'key|value' format",
}
parts = line.split("|", 1)
if len(parts) != 2:
return {"valid": False, "reason": f"Invalid format in line: {line}"}
key, value = parts
extracted_data[key] = value
# Check all required keys are present
missing_keys = set(expected_keys) - set(extracted_data.keys())
if missing_keys:
return {
"valid": False,
"reason": f"Missing required keys: {', '.join(missing_keys)}",
}
# Validate specific data formats and expected values based on the current data
# 1. TankSearchCount should be a number (2 terms containing 'tank')
if not extracted_data["TankSearchCount"].isdigit():
return {
"valid": False,
"reason": f"TankSearchCount should be a number, got: {extracted_data['TankSearchCount']}",
}
# Expected: "Antonia Racer Tank" and "tanks" contain 'tank'
if extracted_data["TankSearchCount"] != "2":
return {
"valid": False,
"reason": f"TankSearchCount should be '2', got: {extracted_data['TankSearchCount']}",
}
# 2. ZeroResultsCount should be a number (nike has 0 results)
if not extracted_data["ZeroResultsCount"].isdigit():
return {
"valid": False,
"reason": f"ZeroResultsCount should be a number, got: {extracted_data['ZeroResultsCount']}",
}
if extracted_data["ZeroResultsCount"] != "1":
return {
"valid": False,
"reason": f"ZeroResultsCount should be '1', got: {extracted_data['ZeroResultsCount']}",
}
# 3. HighestUseTerm should be in format "term:uses"
if ":" not in extracted_data["HighestUseTerm"]:
return {
"valid": False,
"reason": f"HighestUseTerm should be in format 'term:uses', got: {extracted_data['HighestUseTerm']}",
}
# hollister has 19 uses (highest among terms with > 10 uses)
if extracted_data["HighestUseTerm"] != "hollister:19":
return {
"valid": False,
"reason": f"HighestUseTerm should be 'hollister:19', got: {extracted_data['HighestUseTerm']}",
}
# 4. Results20to30Term should be in format "term:results"
if ":" not in extracted_data["Results20to30Term"]:
return {
"valid": False,
"reason": f"Results20to30Term should be in format 'term:results', got: {extracted_data['Results20to30Term']}",
}
# Both "tanks" and "Antonia Racer Tank" have 23 results (between 20-30)
valid_results20to30 = ["tanks:23", "Antonia Racer Tank:23"]
# Check if answer contains one of the valid values or both separated by |
if not any(
val in extracted_data["Results20to30Term"] for val in valid_results20to30
):
return {
"valid": False,
"reason": f"Results20to30Term should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['Results20to30Term']}",
}
# 5. Hits15PlusCount should be a number (only hollister has 19 hits > 15)
if not extracted_data["Hits15PlusCount"].isdigit():
return {
"valid": False,
"reason": f"Hits15PlusCount should be a number, got: {extracted_data['Hits15PlusCount']}",
}
if extracted_data["Hits15PlusCount"] != "1":
return {
"valid": False,
"reason": f"Hits15PlusCount should be '1', got: {extracted_data['Hits15PlusCount']}",
}
# 6. ID10to15MaxResults should be in format "term:results"
if ":" not in extracted_data["ID10to15MaxResults"]:
return {
"valid": False,
"reason": f"ID10to15MaxResults should be in format 'term:results', got: {extracted_data['ID10to15MaxResults']}",
}
# ID 11 is hollister (1 result), ID 13 is Antonia Racer Tank (23 results)
if extracted_data["ID10to15MaxResults"] != "Antonia Racer Tank:23":
return {
"valid": False,
"reason": f"ID10to15MaxResults should be 'Antonia Racer Tank:23', got: {extracted_data['ID10to15MaxResults']}",
}
# 7. DefaultStoreViewCount should be a number (all 7 terms are from Default Store View)
if not extracted_data["DefaultStoreViewCount"].isdigit():
return {
"valid": False,
"reason": f"DefaultStoreViewCount should be a number, got: {extracted_data['DefaultStoreViewCount']}",
}
if extracted_data["DefaultStoreViewCount"] != "7":
return {
"valid": False,
"reason": f"DefaultStoreViewCount should be '7', got: {extracted_data['DefaultStoreViewCount']}",
}
# 8. OneResultTerm should be in format "term:uses"
if ":" not in extracted_data["OneResultTerm"]:
return {
"valid": False,
"reason": f"OneResultTerm should be in format 'term:uses', got: {extracted_data['OneResultTerm']}",
}
# Both hollister and WP10 have exactly 1 result
valid_one_result = ["hollister:19", "WP10:1"]
if not any(val in extracted_data["OneResultTerm"] for val in valid_one_result):
return {
"valid": False,
"reason": f"OneResultTerm should contain 'hollister:19' or 'WP10:1', got: {extracted_data['OneResultTerm']}",
}
# 9. HighestResultLastSearch should be in format "term:results"
if ":" not in extracted_data["HighestResultLastSearch"]:
return {
"valid": False,
"reason": f"HighestResultLastSearch should be in format 'term:results', got: {extracted_data['HighestResultLastSearch']}",
}
# In Last Search Terms: tanks and Antonia Racer Tank both have 23 results (highest)
valid_highest_last = ["tanks:23", "Antonia Racer Tank:23"]
if not any(
val in extracted_data["HighestResultLastSearch"] for val in valid_highest_last
):
return {
"valid": False,
"reason": f"HighestResultLastSearch should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['HighestResultLastSearch']}",
}
# 10. Position3Bestseller should be in format "product:quantity"
if ":" not in extracted_data["Position3Bestseller"]:
return {
"valid": False,
"reason": f"Position3Bestseller should be in format 'product:quantity', got: {extracted_data['Position3Bestseller']}",
}
# Position 3 in Bestsellers is "Sprite Stasis Ball 65 cm" with quantity 6
if extracted_data["Position3Bestseller"] != "Sprite Stasis Ball 65 cm:6":
return {
"valid": False,
"reason": f"Position3Bestseller should be 'Sprite Stasis Ball 65 cm:6', got: {extracted_data['Position3Bestseller']}",
}
# All validations passed
return {
"valid": True,
"reason": "All complex search and filtering operations completed successfully",
}
if __name__ == "__main__":
# Load messages from environment variable
messages_path = os.getenv("MCP_MESSAGES")
if not messages_path:
print(
json.dumps(
{"valid": False, "reason": "MCP_MESSAGES environment variable not set"}
)
)
exit(1)
try:
with open(messages_path, "r") as f:
messages = json.load(f)
except Exception as e:
print(
json.dumps({"valid": False, "reason": f"Failed to load messages: {str(e)}"})
)
exit(1)
# Run verification
result = verify(messages)
print(json.dumps(result))
# Exit with appropriate code based on verification result
sys.exit(0 if result["valid"] else 1)
================================================
FILE: tasks/playwright_webarena/standard/reddit/ai_data_analyst/description.md
================================================
I'm conducting an AI research analysis and need specific data from this forum platform. Please help me gather information about AI discussions and community engagement.
**Task Requirements:**
1. Create a new account with the exact username 'AIDataAnalyst2025' and password 'SecurePass123!'
2. After logging in, navigate to the deeplearning forum and count the exact number of posts visible on the first page
3. Go to the MachineLearning forum and find the post titled '[P] I made a command-line tool that explains your errors using ChatGPT (link in comments)' - record its exact vote count
4. Use the search bar to search for 'transformer' and record the exact title of the third search result
5. In the MachineLearning forum, click on the 'Sort by: Hot' button and change it to 'New'. Navigate to page 2 of the results and find the post with the highest upvote count on that page:
- Record the exact post title
- Record the exact vote count
- Click through to the post and find the last comment
- Record the username of the last commenter
- Record the exact text of the last comment
6. After collecting all the data, go to the MachineLearning forum and submit a new post with:
- Title: "MachineLearning_Extraction"
- Body text must be EXACTLY these lines and nothing else (keep the keys as-is, only replace the values after the pipe, and follow the markdown format):
```
- Deeplearning_Post_Count|FILL_VALUE
- ChatGPT_Tool_Vote_Count|FILL_VALUE
- Transformer_Third_Result|FILL_VALUE
- Page2_Top_Post_Title|FILL_VALUE
- Page2_Top_Post_Votes|FILL_VALUE
- Page2_Last_Comment_Username|FILL_VALUE
- Page2_Last_Comment_Text|FILL_VALUE
```
================================================
FILE: tasks/playwright_webarena/standard/reddit/ai_data_analyst/label.txt
================================================
- Deeplearning_Post_Count|25
- ChatGPT_Tool_Vote_Count|2655
- Transformer_Third_Result|[R] The Table Feature Transformation Library Release
- Page2_Top_Post_Title|[D]GPT-4 might be able to tell you if it hallucinated
- Page2_Top_Post_Votes|634
- Page2_Last_Comment_Username|Nous_AI
- Page2_Last_Comment_Text|Fascinating.
================================================
FILE: tasks/playwright_webarena/standard/reddit/ai_data_analyst/meta.json
================================================
{
"task_id": "ai_data_analyst",
"task_name": "AI Data Analyst",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Create account on forum platform, collect AI/ML discussion metrics including post counts, vote data, and analyze community engagement patterns through systematic data extraction.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"user interaction",
"data extraction",
"comparative analysis"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/ai_data_analyst/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read base_url from the environment variable, falling back to the local default
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
def parse_key_value_format(text):
"""
Parse the Key|Value format from the submission body using regex.
Works with markdown format using pipe separators, with or without list markers.
"""
data = {}
# Define patterns for each field using pipe separator
# Optional list markers (-, •, *) at the beginning
patterns = {
"Deeplearning_Post_Count": r"(?:[-•*]\s*)?Deeplearning_Post_Count\s*\|\s*(\d+)",
"ChatGPT_Tool_Vote_Count": r"(?:[-•*]\s*)?ChatGPT_Tool_Vote_Count\s*\|\s*(\d+)",
"Transformer_Third_Result": r"(?:[-•*]\s*)?Transformer_Third_Result\s*\|\s*(.+?)(?=\n|$)",
"Page2_Top_Post_Title": r"(?:[-•*]\s*)?Page2_Top_Post_Title\s*\|\s*(.+?)(?=\n|$)",
"Page2_Top_Post_Votes": r"(?:[-•*]\s*)?Page2_Top_Post_Votes\s*\|\s*(\d+)",
"Page2_Last_Comment_Username": r"(?:[-•*]\s*)?Page2_Last_Comment_Username\s*\|\s*(.+?)(?=\n|$)",
"Page2_Last_Comment_Text": r"(?:[-•*]\s*)?Page2_Last_Comment_Text\s*\|\s*(.+?)(?=\n|$)",
}
# Extract each field using regex
for key, pattern in patterns.items():
match = re.search(pattern, text, re.MULTILINE)
if match:
# For text fields, clean up newlines and extra spaces
value = match.group(1).strip()
if key not in [
"Deeplearning_Post_Count",
"ChatGPT_Tool_Vote_Count",
"Page2_Top_Post_Votes",
]:
# Replace newlines with spaces and normalize whitespace
value = " ".join(value.split())
data[key] = value
return data
def normalize_text(text):
"""
Normalize text for comparison by handling different quote styles and whitespace.
"""
if not isinstance(text, str):
return str(text)
# Replace various quote styles with standard quotes
    text = text.replace("‘", "'").replace("’", "'")
    text = text.replace("“", '"').replace("”", '"')
# Normalize whitespace
text = " ".join(text.split())
return text.strip()
async def verify() -> bool:
"""
Verifies that the ML extraction task has been completed correctly by checking the forum post.
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Navigate to the main page
print("Navigating to forum...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
# Step 1: Check if account was created correctly by trying to login
print("Step 1: Verifying account creation...", file=sys.stderr)
user_button = page.locator('button:has-text("AIDataAnalyst2025")')
if not await user_button.count():
# Try to login
print("Attempting to login with AIDataAnalyst2025...", file=sys.stderr)
# Click login link
await page.click('a:has-text("Log in")')
await page.wait_for_load_state("networkidle")
# Fill login form
await page.fill('input[name="_username"]', "AIDataAnalyst2025")
await page.fill('input[name="_password"]', "SecurePass123!")
# Submit login form
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
# Check if login successful
user_button = page.locator('button:has-text("AIDataAnalyst2025")')
if not await user_button.count():
print("FAILED: Account AIDataAnalyst2025 with password SecurePass123! cannot be logged in", file=sys.stderr)
print("This means the account was not created correctly", file=sys.stderr)
return False
print("PASSED: Successfully logged in as AIDataAnalyst2025", file=sys.stderr)
else:
print("PASSED: Already logged in as AIDataAnalyst2025", file=sys.stderr)
# Step 2: Check if submission was created correctly in MachineLearning forum
print("\nStep 2: Verifying submission creation...", file=sys.stderr)
print("Navigating to MachineLearning forum...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/f/MachineLearning", wait_until="networkidle"
)
# Look for the post with title "MachineLearning_Extraction"
print(
"Looking for submission with title 'MachineLearning_Extraction'...",
file=sys.stderr,
)
post_link = page.locator('a:has-text("MachineLearning_Extraction")')
if not await post_link.count():
print(
"FAILED: Could not find submission with title 'MachineLearning_Extraction' in MachineLearning forum",
file=sys.stderr,
)
return False
print("PASSED: Found submission 'MachineLearning_Extraction' in MachineLearning forum", file=sys.stderr)
# Step 3: Check submission content matches expected values
print("\nStep 3: Verifying submission content...", file=sys.stderr)
# Click on the submission to view its content
await post_link.first.click()
await page.wait_for_load_state("networkidle")
# Extract the submission body content
# Try multiple possible selectors for the post body
post_content = None
selectors = [
".submission__body",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has(> p:has-text("Deeplearning_Post_Count"))',
'div:has-text("Deeplearning_Post_Count"):has-text("Page2_Last_Comment_Text")',
]
for selector in selectors:
content_element = page.locator(selector)
if await content_element.count():
post_content = await content_element.first.inner_text()
if "Deeplearning_Post_Count" in post_content:
print(
f"Found submission content using selector: {selector}",
file=sys.stderr,
)
break
if not post_content or "Deeplearning_Post_Count" not in post_content:
print(
"FAILED: Could not find submission body with required format",
file=sys.stderr,
)
print(
"Expected body to contain 'Deeplearning_Post_Count' in pipe-separated format",
file=sys.stderr,
)
return False
print("Found submission body content", file=sys.stderr)
print(f"Raw content preview: {post_content[:200]}...", file=sys.stderr)
            # Parse the Key|Value format
extracted_data = parse_key_value_format(post_content)
print(f"Extracted data: {extracted_data}", file=sys.stderr)
# Load expected values from label.txt
label_path = Path(__file__).parent / "label.txt"
if label_path.exists():
with open(label_path, "r") as f:
expected_text = f.read().strip()
expected_data = parse_key_value_format(expected_text)
print("Loaded expected values from label.txt", file=sys.stderr)
# Verify all required keys are present
required_keys = [
"Deeplearning_Post_Count",
"ChatGPT_Tool_Vote_Count",
"Transformer_Third_Result",
"Page2_Top_Post_Title",
"Page2_Top_Post_Votes",
"Page2_Last_Comment_Username",
"Page2_Last_Comment_Text",
]
missing_keys = []
for key in required_keys:
if key not in extracted_data:
missing_keys.append(key)
if missing_keys:
print(
"FAILED: Missing required keys in submission: {', '.join(missing_keys)}",
file=sys.stderr,
)
print(
"Expected all 7 fields to be present in pipe-separated format",
file=sys.stderr,
)
return False
# Validate data format and content
errors = []
# Check numeric fields
try:
post_count = int(extracted_data["Deeplearning_Post_Count"])
if (
"expected_data" in locals()
and "Deeplearning_Post_Count" in expected_data
):
expected_count = int(expected_data["Deeplearning_Post_Count"])
if post_count != expected_count:
errors.append(
f"Deeplearning_Post_Count mismatch: got {post_count}, expected {expected_count}"
)
except ValueError:
errors.append(
f"Deeplearning_Post_Count must be a number, got: {extracted_data['Deeplearning_Post_Count']}"
)
# If we have expected data, compare against it
if "expected_data" in locals():
# Compare each field
for key in required_keys:
if key in expected_data and key in extracted_data:
expected_val = normalize_text(expected_data[key])
actual_val = normalize_text(extracted_data[key])
# For numeric fields, compare as integers
if key in [
"Deeplearning_Post_Count",
"ChatGPT_Tool_Vote_Count",
"Page2_Top_Post_Votes",
]:
try:
expected_int = int(expected_val)
actual_int = int(actual_val)
if expected_int != actual_int:
errors.append(
f"{key} mismatch: got {actual_int}, expected {expected_int}"
)
except ValueError:
errors.append(
f"{key} should be numeric: got '{actual_val}'"
)
else:
# For text fields, compare normalized text
if expected_val != actual_val:
errors.append(
f"{key} mismatch: got '{actual_val}', expected '{expected_val}'"
)
else:
# If no expected data, just do basic validation
for key in required_keys:
if key not in extracted_data:
errors.append(f"Missing required key: {key}")
elif (
not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]"
):
errors.append(f"{key} was not filled in")
if errors:
print(
"FAILED: Content validation failed with the following issues:",
file=sys.stderr,
)
for error in errors:
print(f" - {error}", file=sys.stderr)
print("\nExpected values from label.txt:", file=sys.stderr)
if "expected_data" in locals():
for key in required_keys:
if key in expected_data:
print(f" {key}: {expected_data[key]}", file=sys.stderr)
return False
# All checks passed
print("\n=== VERIFICATION SUCCESSFUL ===")
print("✓ Step 1: Account AIDataAnalyst2025 can login with password SecurePass123!")
print("✓ Step 2: Submission 'MachineLearning_Extraction' found in MachineLearning forum")
print("✓ Step 3: All submission content matches expected values:")
print(f" - Deeplearning_Post_Count: {extracted_data['Deeplearning_Post_Count']}")
print(f" - ChatGPT_Tool_Vote_Count: {extracted_data['ChatGPT_Tool_Vote_Count']}")
print(f" - Transformer_Third_Result: {extracted_data['Transformer_Third_Result']}")
print(f" - Page2_Top_Post_Title: {extracted_data['Page2_Top_Post_Title']}")
print(f" - Page2_Top_Post_Votes: {extracted_data['Page2_Top_Post_Votes']}")
print(f" - Page2_Last_Comment_Username: {extracted_data['Page2_Last_Comment_Username']}")
print(f" - Page2_Last_Comment_Text: {extracted_data['Page2_Last_Comment_Text']}")
print("✓ All data in correct pipe-separated markdown format")
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/reddit/budget_europe_travel/description.md
================================================
Create a comprehensive budget travel resource for European travel on the Postmill forum platform by establishing a dedicated travel community with integrated wiki guides and engaging content.
**Task Requirements:**
1. Register a new account with username "EuroTravelPlanner" and password "SecureTravel2024!"
2. After successful registration and login, search for "Europe travel budget" using the search bar to check for existing content
3. Create a new forum dedicated to budget European travel:
- Forum name: "BudgetEuropeTravel"
- Title: "Budget Travel Europe"
- Description: "Community for sharing money-saving tips for European travel"
- Sidebar content: "Share your best European travel deals and budget tips here!"
4. Create a comprehensive wiki page for your forum:
- Path: "europe-travel-budget-guide"
- Title: "Complete Budget Travel Guide for Europe 2024"
- Content must contain: "Eurail passes and budget airlines"
5. Navigate to your newly created forum and create your first post:
- Title: "My 14-day Europe trip for under 1000 - Complete itinerary"
- Body must contain: "budget guide wiki"
6. Search for "travel insurance Europe" using the search function:
- upvote the first post
7. Navigate to user settings and configure:
- Set timezone to "Europe/Amsterdam" (to match your travel planning focus)
- Ensure "Notify on reply" is checked (enabled) for community engagement
================================================
FILE: tasks/playwright_webarena/standard/reddit/budget_europe_travel/meta.json
================================================
{
"task_id": "budget_europe_travel",
"task_name": "Budget Europe Travel",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Create comprehensive budget travel resource for European destinations by establishing dedicated travel community with integrated wiki guides and engaging travel content.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"content submission",
"user interaction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/budget_europe_travel/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
def normalize_text(text):
"""
Normalize text for comparison by handling different quote styles and whitespace.
"""
if not isinstance(text, str):
return str(text)
# Replace various quote styles with standard quotes
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
    text = text.replace("&amp;", "&")
# Normalize whitespace
text = ' '.join(text.split())
return text.strip()
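# Illustrative example (hypothetical input): smart quotes are normalized and whitespace collapsed, e.g.
#   normalize_text("Europe\u2019s   best deals") -> "Europe's best deals"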
async def verify() -> bool:
"""
Verifies that the budget Europe travel resource task has been completed correctly.
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
verification_passed = True
try:
# 1. Check if account can login with correct credentials
print("="*60)
print("Step 1: Verifying account login with credentials...", file=sys.stderr)
print("="*60)
await page.goto(f"{BASE_URL}/", wait_until='networkidle')
# First logout if already logged in
user_button = page.locator('button:has-text("EuroTravelPlanner")')
if await user_button.count():
print("Already logged in, logging out first...", file=sys.stderr)
await user_button.click()
logout_link = page.locator('a:has-text("Log out")')
if await logout_link.count():
await logout_link.click()
await page.wait_for_load_state('networkidle')
# Now try to login with the specified credentials
print("Attempting to login with username 'EuroTravelPlanner' and password 'SecureTravel2024!'...", file=sys.stderr)
# Navigate to login page
login_link = page.locator('a:has-text("Log in")')
if await login_link.count():
await login_link.click()
await page.wait_for_load_state('networkidle')
else:
print("❌ ERROR: Cannot find login link", file=sys.stderr)
verification_passed = False
if verification_passed:
# Fill login form with exact credentials
await page.fill('input[name="_username"]', 'EuroTravelPlanner')
await page.fill('input[name="_password"]', 'SecureTravel2024!')
# Submit login
login_button = page.locator('button[type="submit"]:has-text("Log in")')
if not await login_button.count():
login_button = page.locator('button:has-text("Log in")')
await login_button.click()
await page.wait_for_load_state('networkidle')
# Verify login success
user_button = page.locator('button:has-text("EuroTravelPlanner")')
if not await user_button.count():
print("❌ ERROR: Login failed with username 'EuroTravelPlanner' and password 'SecureTravel2024!'", file=sys.stderr)
verification_passed = False
else:
print("✓ Account login successful with correct credentials", file=sys.stderr)
# 2. Check if forum exists and has correct properties
print("\n" + "="*60)
print("Step 2: Checking forum existence and properties...", file=sys.stderr)
print("="*60)
# Check if forum exists at /f/BudgetEuropeTravel
await page.goto(f"{BASE_URL}/f/BudgetEuropeTravel", wait_until='networkidle')
# Check if we get 404 or the forum exists
page_content = await page.content()
page_title = await page.title()
if "404" in page_title or "not found" in page_title.lower() or "Page not found" in page_content:
print("❌ ERROR: Forum /f/BudgetEuropeTravel does not exist (404)", file=sys.stderr)
verification_passed = False
else:
print("✓ Forum /f/BudgetEuropeTravel exists", file=sys.stderr)
# Navigate to edit page to check properties
await page.goto(f"{BASE_URL}/f/BudgetEuropeTravel/edit", wait_until='networkidle')
# Check if we can access edit page
edit_page_content = await page.content()
edit_page_title = await page.title()
if "404" in edit_page_title or "not found" in edit_page_title.lower() or "Page not found" in edit_page_content:
print("❌ ERROR: Cannot access forum edit page at /f/BudgetEuropeTravel/edit", file=sys.stderr)
verification_passed = False
else:
print("✓ Forum edit page accessible", file=sys.stderr)
# Check forum title
title_input = page.locator('input[name*="title"], input#forum_title')
if await title_input.count():
title_value = await title_input.input_value()
if title_value != "Budget Travel Europe":
print(f"❌ ERROR: Forum title is '{title_value}', expected 'Budget Travel Europe'", file=sys.stderr)
verification_passed = False
else:
print("✓ Forum title correct: 'Budget Travel Europe'", file=sys.stderr)
else:
print("❌ ERROR: Cannot find forum title field", file=sys.stderr)
verification_passed = False
# Check forum description
desc_input = page.locator('textarea[name*="description"], input[name*="description"]')
if await desc_input.count():
desc_value = await desc_input.input_value()
expected_desc = "Community for sharing money-saving tips for European travel"
if desc_value != expected_desc:
print(f"❌ ERROR: Forum description is '{desc_value}', expected '{expected_desc}'", file=sys.stderr)
verification_passed = False
else:
print("✓ Forum description correct", file=sys.stderr)
else:
print("❌ ERROR: Cannot find forum description field", file=sys.stderr)
verification_passed = False
# Check sidebar content
sidebar_input = page.locator('textarea[name*="sidebar"]')
if await sidebar_input.count():
sidebar_value = await sidebar_input.input_value()
expected_sidebar = "Share your best European travel deals and budget tips here!"
if sidebar_value != expected_sidebar:
print(f"❌ ERROR: Forum sidebar is '{sidebar_value}', expected '{expected_sidebar}'", file=sys.stderr)
verification_passed = False
else:
print("✓ Forum sidebar correct", file=sys.stderr)
else:
print("❌ ERROR: Cannot find forum sidebar field", file=sys.stderr)
verification_passed = False
# 3. Check wiki page existence and content
print("\n" + "="*60)
print("Step 3: Checking wiki page existence and content...", file=sys.stderr)
print("="*60)
# Try the wiki URL with /wiki/ path
await page.goto(f"{BASE_URL}/wiki/europe-travel-budget-guide", wait_until='networkidle')
wiki_page_content = await page.content()
wiki_page_title = await page.title()
if "404" in wiki_page_title or "not found" in wiki_page_title.lower() or "Page not found" in wiki_page_content:
print("❌ ERROR: Wiki page does not exist at /wiki/europe-travel-budget-guide", file=sys.stderr)
verification_passed = False
else:
print("✓ Wiki page exists at /wiki/europe-travel-budget-guide", file=sys.stderr)
# Check wiki title
wiki_title_found = False
expected_wiki_title = "Complete Budget Travel Guide for Europe 2024"
# Try multiple selectors for wiki title
wiki_title_selectors = [
f'h1:has-text("{expected_wiki_title}")',
f'h1:text-is("{expected_wiki_title}")',
'h1'
]
for selector in wiki_title_selectors:
wiki_title_elem = page.locator(selector)
if await wiki_title_elem.count():
title_text = await wiki_title_elem.first.text_content()
if expected_wiki_title in title_text:
wiki_title_found = True
break
if not wiki_title_found:
print(f"❌ ERROR: Wiki title '{expected_wiki_title}' not found", file=sys.stderr)
verification_passed = False
else:
print(f"✓ Wiki title correct: '{expected_wiki_title}'", file=sys.stderr)
# Check for required content in wiki
required_wiki_content = "Eurail passes and budget airlines"
if required_wiki_content not in wiki_page_content:
print(f"❌ ERROR: Wiki content must contain '{required_wiki_content}'", file=sys.stderr)
verification_passed = False
else:
print(f"✓ Wiki content contains required text: '{required_wiki_content}'", file=sys.stderr)
# 4. Check for post in the forum
print("\n" + "="*60)
print("Step 4: Checking for post in forum...", file=sys.stderr)
print("="*60)
await page.goto(f"{BASE_URL}/f/BudgetEuropeTravel", wait_until='networkidle')
expected_post_title = "My 14-day Europe trip for under 1000 - Complete itinerary"
post_link = page.locator(f'a:has-text("{expected_post_title}")')
if not await post_link.count():
print(f"❌ ERROR: Post with title '{expected_post_title}' not found in forum", file=sys.stderr)
verification_passed = False
else:
print(f"✓ Post found with title: '{expected_post_title}'", file=sys.stderr)
# Click on the post to check its content
await post_link.first.click()
await page.wait_for_load_state('networkidle')
# Check if post contains required text
post_page_content = await page.content()
required_post_content = "budget guide wiki"
if required_post_content not in post_page_content:
print(f"❌ ERROR: Post body must contain '{required_post_content}'", file=sys.stderr)
verification_passed = False
else:
print(f"✓ Post content contains required text: '{required_post_content}'", file=sys.stderr)
# 5. Check upvote on search result
print("\n" + "="*60)
print("Step 5: Checking upvote on search result...", file=sys.stderr)
print("="*60)
# Navigate to search results for "travel insurance Europe"
await page.goto(f"{BASE_URL}/search?q=travel+insurance+Europe", wait_until='networkidle')
# Check if we're on search results page
if "/search" not in page.url:
print("❌ ERROR: Not on search results page", file=sys.stderr)
verification_passed = False
else:
print("✓ On search results page for 'travel insurance Europe'", file=sys.stderr)
# Check for upvoted posts
upvote_found = False
# Method 1: Check for "Retract upvote" button (indicates user has upvoted)
retract_buttons = page.locator('button:has-text("Retract upvote")')
if await retract_buttons.count() > 0:
print("✓ Found upvoted post (Retract upvote button present)", file=sys.stderr)
upvote_found = True
# Method 2: Check for posts with upvote count >= 1
if not upvote_found:
# Look for vote counts
vote_elements = page.locator('div.vote, span.vote-count, [class*="vote"]')
for i in range(await vote_elements.count()):
vote_elem = vote_elements.nth(i)
vote_text = await vote_elem.text_content()
try:
# Extract number from vote text
import re
numbers = re.findall(r'\d+', vote_text)
if numbers:
vote_count = int(numbers[0])
if vote_count >= 1:
print(f"✓ Found post with {vote_count} upvote(s)", file=sys.stderr)
upvote_found = True
break
except:
continue
if not upvote_found:
print("❌ ERROR: No upvoted posts found in search results", file=sys.stderr)
verification_passed = False
# 6. Check user settings
print("\n" + "="*60)
print("Step 6: Checking user settings...", file=sys.stderr)
print("="*60)
await page.goto(f"{BASE_URL}/user/EuroTravelPlanner/preferences", wait_until='networkidle')
# Check timezone setting
timezone_correct = False
timezone_select = page.locator('select[name*="timezone"], select#timezone')
if await timezone_select.count():
selected_value = await timezone_select.input_value()
if selected_value == "Europe/Amsterdam":
print("✓ Timezone correctly set to 'Europe/Amsterdam'", file=sys.stderr)
timezone_correct = True
else:
# Check selected option text
selected_option = timezone_select.locator('option[selected]')
if await selected_option.count():
option_text = await selected_option.text_content()
if "Amsterdam" in option_text:
print("✓ Timezone correctly set to Europe/Amsterdam", file=sys.stderr)
timezone_correct = True
else:
print(f"❌ ERROR: Timezone is set to '{option_text}', expected 'Europe/Amsterdam'", file=sys.stderr)
verification_passed = False
else:
print(f"❌ ERROR: Timezone is '{selected_value}', expected 'Europe/Amsterdam'", file=sys.stderr)
verification_passed = False
else:
print("❌ ERROR: Cannot find timezone selector", file=sys.stderr)
verification_passed = False
# Check "Notify on reply" setting
notify_correct = False
# Try multiple selectors for the checkbox
notify_selectors = [
'input[type="checkbox"]:near(:text("Notify on reply"))',
'label:has-text("Notify on reply") input[type="checkbox"]',
'input[type="checkbox"][name*="notify"]',
'input[type="checkbox"][id*="notify"]'
]
for selector in notify_selectors:
notify_checkbox = page.locator(selector)
if await notify_checkbox.count():
is_checked = await notify_checkbox.first.is_checked()
if is_checked:
print("✓ 'Notify on reply' is enabled (checked)", file=sys.stderr)
notify_correct = True
else:
print("❌ ERROR: 'Notify on reply' is not enabled (unchecked)", file=sys.stderr)
verification_passed = False
break
if not notify_correct and verification_passed:
print("❌ ERROR: Cannot verify 'Notify on reply' setting", file=sys.stderr)
verification_passed = False
# Final summary
print("\n" + "="*60)
if verification_passed:
print("✅ SUCCESS: All verification checks passed!", file=sys.stderr)
else:
print("❌ FAILED: One or more verification checks failed!", file=sys.stderr)
print("="*60)
return verification_passed
except PlaywrightTimeoutError as e:
print(f"❌ ERROR: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"❌ ERROR: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/reddit/buyitforlife_research/description.md
================================================
Research durable kitchen appliances on the BuyItForLife forum by extracting information about the most popular posts and their community engagement.
**Task Requirements:**
1. Register a new account with username 'buyitforlife_researcher' and password 'BIFL2024Research!'
2. After successful registration, navigate to the BuyItForLife forum
3. Find the top 3 posts with the highest upvote counts and extract:
- Exact post title
- Exact upvote count
- Exact number of comments
4. For the post with the highest upvotes from step 3, click through to read the comments and find the comment with the most upvotes. Record:
- The exact comment text
- The username of the commenter
5. From the BuyItForLife forum, identify the usernames of the authors who posted the top 3 posts with the highest number of upvotes
6. After collecting all the data, submit a new post in the BuyItForLife forum with:
- Title: "Research Report for BuyItForLife"
- Body text must be EXACTLY these lines without anything (keep the keys as-is, only replace the values after the pipe, follow the markdown format):
```
- Post1_Title|FILL_VALUE
- Post1_Upvotes|FILL_VALUE
- Post1_Comments|FILL_VALUE
- Post2_Title|FILL_VALUE
- Post2_Upvotes|FILL_VALUE
- Post2_Comments|FILL_VALUE
- Post3_Title|FILL_VALUE
- Post3_Upvotes|FILL_VALUE
- Post3_Comments|FILL_VALUE
- TopComment_Text|FILL_VALUE
- TopComment_Username|FILL_VALUE
- Post1_Author|FILL_VALUE
- Post2_Author|FILL_VALUE
- Post3_Author|FILL_VALUE
```
================================================
FILE: tasks/playwright_webarena/standard/reddit/buyitforlife_research/label.txt
================================================
- Post1_Title|Hand me down name tag trail on this child's jacket
- Post1_Upvotes|14487
- Post1_Comments|163
- Post2_Title|My Grandmother's oven, circa 1966. Many holiday meals were cooked with love here, right up until the day she passed. Aside from one shattered glass door that was replaced, everything still works!
- Post2_Upvotes|6413
- Post2_Comments|205
- Post3_Title|1956 Frigidaire Range
- Post3_Upvotes|5797
- Post3_Comments|190
- TopComment_Text|Rei does this too
- TopComment_Username|ATeaformeplease
- Post1_Author|Mofomania
- Post2_Author|_Mr_Roboto_
- Post3_Author|dezualy
================================================
FILE: tasks/playwright_webarena/standard/reddit/buyitforlife_research/meta.json
================================================
{
"task_id": "buyitforlife_research",
"task_name": "Buy It For Life Research",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Research durable lifetime-quality products by creating forum account, analyzing community recommendations, extracting product data, and compiling comprehensive durability report with voting metrics.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"user interaction",
"data extraction",
"search aggregation",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/buyitforlife_research/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
def parse_markdown_list_format(text):
"""
Parse the markdown list format (- Key|Value) from the submission body.
Also handles plain Key|Value format without the dash.
"""
data = {}
# Pattern to match lines like: - Key|Value or just Key|Value
pattern = r'^(?:-\s*)?([^|]+)\|(.+)$'
lines = text.strip().split('\n')
for line in lines:
line = line.strip()
if '|' in line:
match = re.match(pattern, line)
if match:
key = match.group(1).strip()
value = match.group(2).strip()
data[key] = value
return data
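# Illustrative example of the expected mapping (sample values, not real data):
#   parse_markdown_list_format("- Post1_Upvotes|14487\nTopComment_Username|someone")
#   -> {"Post1_Upvotes": "14487", "TopComment_Username": "someone"}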
def normalize_text(text):
"""
Normalize text for comparison by handling different quote styles and whitespace.
"""
if not isinstance(text, str):
return str(text)
# Replace various quote styles with standard quotes
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
# Normalize whitespace
text = " ".join(text.split())
return text.strip()
async def check_account_login(page):
"""
Check if the account can be successfully logged in.
Returns True if login successful, False otherwise.
"""
try:
# Navigate to main page
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
# Check if already logged in
user_button = page.locator('button[class*="UserButton"]')
if await user_button.count():
username = await user_button.inner_text()
if username.lower() == "buyitforlife_researcher":
print(f"Already logged in as {username}", file=sys.stderr)
return True
else:
# Log out first
await user_button.click()
await page.click('a:has-text("Log out")')
await page.wait_for_load_state("networkidle")
# Try to login
await page.click('a:has-text("Log in")')
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', "buyitforlife_researcher")
await page.fill('input[name="_password"]', "BIFL2024Research!")
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
# Check if login successful
user_button = page.locator('button:has-text("buyitforlife_researcher")')
if await user_button.count():
print("Successfully logged in as buyitforlife_researcher", file=sys.stderr)
return True
else:
print("Login failed for buyitforlife_researcher", file=sys.stderr)
return False
except Exception as e:
print(f"Error during login check: {str(e)}", file=sys.stderr)
return False
async def check_submission_exists(page):
"""
Check if the submission exists in the BuyItForLife forum with correct content.
Returns (exists, content_data) tuple.
"""
try:
# Navigate to BuyItForLife forum
await page.goto(f"{BASE_URL}/f/BuyItForLife", wait_until="networkidle")
# Look for the post with correct title
print("Looking for post 'Research Report for BuyItForLife'...", file=sys.stderr)
post_locator = page.locator('a:has-text("Research Report for BuyItForLife")')
if not await post_locator.count():
print("Error: Could not find post with title 'Research Report for BuyItForLife'", file=sys.stderr)
return False, None
# Click on the post to view its content
await post_locator.first.click()
await page.wait_for_load_state("networkidle")
# Get the post content
post_content = None
selectors = [
'.PostFullItem-body',
'.Post-body',
'.PostItem-body',
'.item-RichText',
'[class*="RichText"]',
'div:has-text("Post1_Title")',
]
for selector in selectors:
post_content_element = page.locator(selector)
if await post_content_element.count():
# Get the text content, handling multiple elements if needed
if await post_content_element.count() > 1:
for i in range(await post_content_element.count()):
text = await post_content_element.nth(i).inner_text()
if "Post1_Title" in text:
post_content = text
print(f"Found post content using selector: {selector} (element {i})", file=sys.stderr)
break
else:
post_content = await post_content_element.first.inner_text()
print(f"Found post content using selector: {selector}", file=sys.stderr)
if post_content and "Post1_Title" in post_content:
break
if not post_content:
print("Error: Could not find post content element", file=sys.stderr)
return False, None
print("Post content found:", file=sys.stderr)
print(post_content[:200] + "..." if len(post_content) > 200 else post_content, file=sys.stderr)
# Parse the markdown list format
extracted_data = parse_markdown_list_format(post_content)
print(f"Extracted data: {extracted_data}", file=sys.stderr)
return True, extracted_data
except Exception as e:
print(f"Error checking submission: {str(e)}", file=sys.stderr)
return False, None
async def verify() -> bool:
"""
Verifies that the BuyItForLife research task has been completed correctly.
Checks:
1. Account creation (can login with credentials)
2. Submission exists with correct title
3. Submission content matches expected format and values
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Step 1: Check account creation
print("=== Step 1: Checking account creation ===", file=sys.stderr)
account_ok = await check_account_login(page)
if not account_ok:
print("Error: Account 'buyitforlife_researcher' cannot be logged in", file=sys.stderr)
return False
# Step 2: Check submission exists and get content
print("\n=== Step 2: Checking submission ===", file=sys.stderr)
submission_exists, extracted_data = await check_submission_exists(page)
if not submission_exists:
print("Error: Submission not found in BuyItForLife forum", file=sys.stderr)
return False
if not extracted_data:
print("Error: Could not extract data from submission", file=sys.stderr)
return False
# Step 3: Load expected data from label.txt
print("\n=== Step 3: Validating submission content ===", file=sys.stderr)
label_path = Path(__file__).parent / "label.txt"
if not label_path.exists():
print("Error: label.txt not found", file=sys.stderr)
return False
with open(label_path, "r") as f:
expected_text = f.read().strip()
expected_data = parse_markdown_list_format(expected_text)
print(f"Expected data from label.txt: {expected_data}", file=sys.stderr)
# Verify all required keys are present
required_keys = [
"Post1_Title",
"Post1_Upvotes",
"Post1_Comments",
"Post2_Title",
"Post2_Upvotes",
"Post2_Comments",
"Post3_Title",
"Post3_Upvotes",
"Post3_Comments",
"TopComment_Text",
"TopComment_Username",
"Post1_Author",
"Post2_Author",
"Post3_Author",
]
missing_keys = []
for key in required_keys:
if key not in extracted_data:
missing_keys.append(key)
if missing_keys:
print(f"Error: Missing required keys: {', '.join(missing_keys)}", file=sys.stderr)
return False
# Compare each field with expected values
errors = []
for key in required_keys:
if key in expected_data and key in extracted_data:
expected_val = normalize_text(expected_data[key])
actual_val = normalize_text(extracted_data[key])
# For numeric fields, compare as integers
if "Upvotes" in key or "Comments" in key:
try:
expected_int = int(expected_val)
actual_int = int(actual_val)
if expected_int != actual_int:
errors.append(f"{key} mismatch: got {actual_int}, expected {expected_int}")
except ValueError:
errors.append(f"{key} should be numeric: got '{actual_val}'")
else:
# For text fields, special handling for usernames with underscores
if "Author" in key or key == "TopComment_Username":
expected_core = expected_val.strip('_')
actual_core = actual_val.strip('_')
if expected_core != actual_core:
errors.append(f"{key} mismatch: got '{actual_val}', expected '{expected_val}'")
else:
if expected_val != actual_val:
errors.append(f"{key} mismatch: got '{actual_val}', expected '{expected_val}'")
# Verify upvotes are in descending order
try:
post1_votes = int(extracted_data["Post1_Upvotes"])
post2_votes = int(extracted_data["Post2_Upvotes"])
post3_votes = int(extracted_data["Post3_Upvotes"])
if not (post1_votes >= post2_votes >= post3_votes):
errors.append(f"Posts should be ordered by upvotes: {post1_votes} >= {post2_votes} >= {post3_votes}")
except (ValueError, KeyError):
pass # Already reported above
if errors:
print("Error: Validation failed with the following issues:", file=sys.stderr)
for error in errors:
print(f" - {error}", file=sys.stderr)
return False
# All checks passed
print("\n=== SUCCESS ===", file=sys.stderr)
print("✓ Account 'buyitforlife_researcher' created and can login", file=sys.stderr)
print("✓ Submission 'Research Report for BuyItForLife' found in correct forum", file=sys.stderr)
print("✓ All 14 required fields present and correct", file=sys.stderr)
print("✓ Data matches expected values from label.txt", file=sys.stderr)
print("✓ Posts ordered by upvotes (descending)", file=sys.stderr)
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/reddit/llm_research_summary/description.md
================================================
I need you to perform a comprehensive analysis of Large Language Model discussions on this forum platform and create a summary post. This analysis will help me understand the community's engagement with LLM topics.
**Task Requirements:**
1. Register a new account with username 'llm_analyst_2024' and password 'LLMExpert2024!'
2. After successful registration, navigate to the MachineLearning forum and analyze posts about Large Language Models:
- Count all posts on the first page that contain 'GPT', 'ChatGPT', or 'LLM' in their titles
3. Identify the top 3 posts with the highest upvote counts from your LLM-related posts list and record their complete details
4. Navigate to the deeplearning forum and go to page 2:
- Find the post with the most comments on page 2
- Record its exact title and comment count
5. Create a new submission in the MachineLearning forum with:
- Title: "LLM Research Summary: GPT Discussions Analysis [2024]"
- Body text must be EXACTLY these lines without anything (keep the keys as-is, only replace the values after the pipe, follow the markdown format):
```
- Total_LLM_Posts|FILL_VALUE
- Top1_Title|FILL_VALUE
- Top1_Upvotes|FILL_VALUE
- Top1_Date|FILL_VALUE
- Top2_Title|FILL_VALUE
- Top2_Upvotes|FILL_VALUE
- Top2_Date|FILL_VALUE
- Top3_Title|FILL_VALUE
- Top3_Upvotes|FILL_VALUE
- Top3_Date|FILL_VALUE
- Deeplearning_MostDiscussed|FILL_VALUE
- Deeplearning_Comments|FILL_VALUE
```
================================================
FILE: tasks/playwright_webarena/standard/reddit/llm_research_summary/label.txt
================================================
- Total_LLM_Posts|9
- Top1_Title|[P] I made a command-line tool that explains your errors using ChatGPT (link in comments)
- Top1_Upvotes|2655
- Top1_Date|3 years ago
- Top2_Title|[P] I built Adrenaline, a debugger that fixes errors and explains them with GPT-3
- Top2_Upvotes|1542
- Top2_Date|3 years ago
- Top3_Title|[N] OpenAI may have benchmarked GPT-4's coding ability on it's own training data
- Top3_Upvotes|925
- Top3_Date|2 years ago
- Deeplearning_MostDiscussed|Do companies actually care about their model's training/inference speed?
- Deeplearning_Comments|39
================================================
FILE: tasks/playwright_webarena/standard/reddit/llm_research_summary/meta.json
================================================
{
"task_id": "llm_research_summary",
"task_name": "LLM Research Summary",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Aggregate and analyze LLM research discussions across multiple forums, collect trending topics, compile technical insights, and create comprehensive summary post with community engagement.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data extraction",
"search aggregation",
"content submission",
"user interaction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/llm_research_summary/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read the base_url from the environment, falling back to the local default
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
def parse_key_value_format(text):
"""
Parse the Key|Value format from the submission body.
Handles both pipe (|) and colon (:) separators for compatibility.
"""
data = {}
# Try to parse with pipe separator first (expected format)
lines = text.strip().split('\n')
for line in lines:
line = line.strip()
if not line:
continue
# Remove markdown list prefix if present
if line.startswith('- '):
line = line[2:]
elif line.startswith('* '):
line = line[2:]
# Try pipe separator first
if '|' in line:
parts = line.split('|', 1)
if len(parts) == 2:
key = parts[0].strip()
value = parts[1].strip()
data[key] = value
# Fallback to colon separator for label.txt compatibility
elif ':' in line:
parts = line.split(':', 1)
if len(parts) == 2:
key = parts[0].strip()
value = parts[1].strip()
data[key] = value
return data
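# Illustrative example (sample values): both separators yield the same mapping, e.g.
#   parse_key_value_format("- Total_LLM_Posts|9")  -> {"Total_LLM_Posts": "9"}
#   parse_key_value_format("Total_LLM_Posts: 9")   -> {"Total_LLM_Posts": "9"}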
def normalize_text(text):
"""
Normalize text for comparison by handling different quote styles and whitespace.
"""
if not isinstance(text, str):
return str(text)
# Replace various quote styles with standard quotes
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
# Normalize whitespace
text = " ".join(text.split())
return text.strip()
async def verify() -> bool:
"""
Verifies that the LLM analysis task has been completed correctly.
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Navigate to the main page
print("Navigating to forum...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
# Check if logged in as llm_analyst_2024
user_button = page.locator('button:has-text("llm_analyst_2024")')
if not await user_button.count():
# Try to login
print("Not logged in, attempting to login...", file=sys.stderr)
await page.click('a:has-text("Log in")')
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', "llm_analyst_2024")
await page.fill('input[name="_password"]', "LLMExpert2024!")
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
user_button = page.locator('button:has-text("llm_analyst_2024")')
if not await user_button.count():
print("Error: Login failed for llm_analyst_2024", file=sys.stderr)
return False
print("Successfully logged in as llm_analyst_2024", file=sys.stderr)
else:
print("Already logged in as llm_analyst_2024", file=sys.stderr)
# Navigate to MachineLearning forum
print("Navigating to MachineLearning forum...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/f/MachineLearning", wait_until="networkidle"
)
# Look for the submission with our specific title
print(
"Looking for submission 'LLM Research Summary: GPT Discussions Analysis [2024]'...",
file=sys.stderr,
)
post_link = page.locator(
'a:has-text("LLM Research Summary: GPT Discussions Analysis [2024]")'
)
if not await post_link.count():
print(
"Error: Could not find submission with required title",
file=sys.stderr,
)
return False
# Click on the submission to view its content
await post_link.first.click()
await page.wait_for_load_state("networkidle")
# Extract the submission body content
# Try multiple possible selectors for the post body
post_content = None
selectors = [
".submission__body",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has(> p:has-text("Total_LLM_Posts"))',
'div:has-text("Total_LLM_Posts"):has-text("Deeplearning_Comments")',
]
for selector in selectors:
content_element = page.locator(selector)
if await content_element.count():
post_content = await content_element.first.inner_text()
if "Total_LLM_Posts" in post_content:
print(
f"Found submission content using selector: {selector}",
file=sys.stderr,
)
break
if not post_content or "Total_LLM_Posts" not in post_content:
print(
"Error: Could not find submission body with required format",
file=sys.stderr,
)
return False
print("Submission content found, parsing data...", file=sys.stderr)
print(f"Raw content: {post_content[:200]}...", file=sys.stderr)
# Parse the Key: Value format
extracted_data = parse_key_value_format(post_content)
print(f"Extracted data: {extracted_data}", file=sys.stderr)
# Load expected values from label.txt
label_path = Path(__file__).parent / "label.txt"
if label_path.exists():
with open(label_path, "r") as f:
expected_text = f.read().strip()
expected_data = parse_key_value_format(expected_text)
print("Loaded expected values from label.txt", file=sys.stderr)
# Verify all required keys are present
required_keys = [
"Total_LLM_Posts",
"Top1_Title",
"Top1_Upvotes",
"Top1_Date",
"Top2_Title",
"Top2_Upvotes",
"Top2_Date",
"Top3_Title",
"Top3_Upvotes",
"Top3_Date",
"Deeplearning_MostDiscussed",
"Deeplearning_Comments",
]
missing_keys = []
for key in required_keys:
if key not in extracted_data:
missing_keys.append(key)
if missing_keys:
print(
f"Error: Missing required keys: {', '.join(missing_keys)}",
file=sys.stderr,
)
return False
# Validate data format and content
errors = []
# Check Total_LLM_Posts is a number and matches expected
try:
total_posts = int(extracted_data["Total_LLM_Posts"])
if "expected_data" in locals() and "Total_LLM_Posts" in expected_data:
expected_total = int(expected_data["Total_LLM_Posts"])
if total_posts != expected_total:
errors.append(
f"Total_LLM_Posts mismatch: got {total_posts}, expected {expected_total}"
)
elif total_posts < 5: # Based on exploration, should be at least 5
errors.append(f"Total_LLM_Posts seems too low: {total_posts}")
except ValueError:
errors.append(
f"Total_LLM_Posts must be a number, got: {extracted_data['Total_LLM_Posts']}"
)
# If we have expected data, compare against it
if "expected_data" in locals():
# Compare each field
for key in required_keys:
if key in expected_data and key in extracted_data:
expected_val = normalize_text(expected_data[key])
actual_val = normalize_text(extracted_data[key])
# For numeric fields, compare as integers
if (
"Upvotes" in key
or "Comments" in key
or key == "Total_LLM_Posts"
):
try:
expected_int = int(expected_val)
actual_int = int(actual_val)
if expected_int != actual_int:
errors.append(
f"{key} mismatch: got {actual_int}, expected {expected_int}"
)
except ValueError:
errors.append(
f"{key} should be numeric: got '{actual_val}'"
)
else:
# For text fields, compare normalized text
if expected_val != actual_val:
errors.append(
f"{key} mismatch: got '{actual_val}', expected '{expected_val}'"
)
else:
# If no expected data, just do basic validation
for key in required_keys:
if key not in extracted_data:
errors.append(f"Missing required key: {key}")
elif (
not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]"
):
errors.append(f"{key} was not filled in")
# Verify upvotes are in descending order for top 3
try:
top1_votes = int(extracted_data["Top1_Upvotes"])
top2_votes = int(extracted_data["Top2_Upvotes"])
top3_votes = int(extracted_data["Top3_Upvotes"])
if not (top1_votes >= top2_votes >= top3_votes):
errors.append(
f"Top posts should be ordered by upvotes: {top1_votes} >= {top2_votes} >= {top3_votes}"
)
except (ValueError, KeyError):
pass # Already reported above
if errors:
print(
"Error: Validation failed with the following issues:",
file=sys.stderr,
)
for error in errors:
print(f" - {error}", file=sys.stderr)
return False
# All checks passed
print("Success: LLM analysis task completed successfully.")
print("- Account llm_analyst_2024 verified")
print(
"- Submission 'LLM Research Summary: GPT Discussions Analysis [2024]' found"
)
print(
f"- Total LLM-related posts analyzed: {extracted_data['Total_LLM_Posts']}"
)
print("- Top 3 posts by upvotes identified and documented")
print(
f"- Deeplearning forum page 2 most discussed post: {extracted_data['Deeplearning_MostDiscussed']}"
)
print("- All data in correct Key: Value format with 12 lines")
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/description.md
================================================
I need you to analyze wonderful movie discussions on Postmill and create a summary post. This analysis will help me understand the community's engagement with movie topics.
**Task Requirements:**
1. Register a new account with username 'movie_reviewer_2024' and password 'movie_reviewer_2024'
2. After successful registration, navigate to the movies forum and analyze the posts on the first page:
- Count all posts that have 4-digit years in their titles
3. Identify the top 3 posts with the highest upvote counts from ALL posts on the first page and record their complete details
4. Find this specific post and record its data:
- The 'Rittenhouse Square' poster post (record exact upvotes and comments)
5. Navigate through the first 5 pages of movies forum and count the total number of image/poster submissions across all 5 pages
6. Create a new submission in the movies forum with:
- Title: "Wonderful Movies Analysis: Community Favorites [2024]"
- Body text must be EXACTLY these lines without anything (keep the keys as-is, only replace the values after the pipe, follow the markdown format):
```
- Total_Year_Posts|FILL_VALUE
- Top1_Title|FILL_VALUE
- Top1_Upvotes|FILL_VALUE
- Top1_Comments|FILL_VALUE
- Top2_Title|FILL_VALUE
- Top2_Upvotes|FILL_VALUE
- Top2_Comments|FILL_VALUE
- Top3_Title|FILL_VALUE
- Top3_Upvotes|FILL_VALUE
- Top3_Comments|FILL_VALUE
- Rittenhouse_Upvotes|FILL_VALUE
- Rittenhouse_Comments|FILL_VALUE
- Total_Image_Posts_5Pages|FILL_VALUE
```
================================================
FILE: tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/label.txt
================================================
- Total_Year_Posts|1
- Top1_Title|Who will win the Oscar for ACTRESS IN A SUPPORTING ROLE?
- Top1_Upvotes|9933
- Top1_Comments|23
- Top2_Title|Who will win the Oscar for FILM EDITING?
- Top2_Upvotes|7720
- Top2_Comments|20
- Top3_Title|Cindy Williams Dies: 'Laverne & Shirley' Star Who Appeared In 'American Graffiti' & 'The Conversation' Was 75
- Top3_Upvotes|5268
- Top3_Comments|190
- Rittenhouse_Upvotes|2761
- Rittenhouse_Comments|182
- Total_Image_Posts_5Pages|6
================================================
FILE: tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/meta.json
================================================
{
"task_id": "movie_reviewer_analysis",
"task_name": "Movie Reviewer Analysis",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Analyze movie review patterns by creating reviewer profile, collecting ratings data, tracking review trends, and generating analytical report on community movie preferences and discussions.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"user interaction",
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read the base_url from the environment, falling back to the default address
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
print(f"Using base URL: {BASE_URL}")
def parse_key_value_format(text):
"""
Parse the Key|Value format from the submission body using regex.
Works regardless of line breaks.
"""
data = {}
# Define patterns for each field with the pipe separator
patterns = {
"Total_Year_Posts": r"Total_Year_Posts\s*\|\s*(\d+)",
"Top1_Title": r"Top1_Title\s*\|\s*(.+?)(?=\nTop1_Upvotes|$)",
"Top1_Upvotes": r"Top1_Upvotes\s*\|\s*(\d+)",
"Top1_Comments": r"Top1_Comments\s*\|\s*(\d+)",
"Top2_Title": r"Top2_Title\s*\|\s*(.+?)(?=\nTop2_Upvotes|$)",
"Top2_Upvotes": r"Top2_Upvotes\s*\|\s*(\d+)",
"Top2_Comments": r"Top2_Comments\s*\|\s*(\d+)",
"Top3_Title": r"Top3_Title\s*\|\s*(.+?)(?=\nTop3_Upvotes|$)",
"Top3_Upvotes": r"Top3_Upvotes\s*\|\s*(\d+)",
"Top3_Comments": r"Top3_Comments\s*\|\s*(\d+)",
"Rittenhouse_Upvotes": r"Rittenhouse_Upvotes\s*\|\s*(\d+)",
"Rittenhouse_Comments": r"Rittenhouse_Comments\s*\|\s*(\d+)",
"Total_Image_Posts_5Pages": r"Total_Image_Posts_5Pages\s*\|\s*(\d+)",
}
# Extract each field using regex
for key, pattern in patterns.items():
match = re.search(pattern, text, re.DOTALL | re.MULTILINE)
if match:
# For title fields, clean up newlines and extra spaces
value = match.group(1).strip()
if "Title" in key:
# Replace newlines with spaces and normalize whitespace
value = " ".join(value.split())
data[key] = value
return data
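# Illustrative example (sample values, not real data):
#   parse_key_value_format("- Top1_Title|Example title\n- Top1_Upvotes|9933")
#   -> {"Top1_Title": "Example title", "Top1_Upvotes": "9933"}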
def normalize_text(text):
"""
Normalize text for comparison by handling different quote styles and whitespace.
"""
if not isinstance(text, str):
return str(text)
# Replace various quote styles with standard quotes
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
text = text.replace("&", "&")
# Normalize whitespace
text = " ".join(text.split())
return text.strip()
async def verify() -> bool:
"""
Verifies that the wonderful movies analysis task has been completed correctly.
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Navigate to the main page
print("Navigating to forum...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
# Check if logged in as movie_reviewer_2024
user_button = page.locator('button:has-text("movie_reviewer_2024")')
if not await user_button.count():
# Try to login
print("Not logged in, attempting to login...", file=sys.stderr)
await page.click('a:has-text("Log in")')
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', "movie_reviewer_2024")
await page.fill('input[name="_password"]', "movie_reviewer_2024")
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
user_button = page.locator('button:has-text("movie_reviewer_2024")')
if not await user_button.count():
print(
"Error: Login failed for movie_reviewer_2024", file=sys.stderr
)
return False
print("Successfully logged in as movie_reviewer_2024", file=sys.stderr)
else:
print("Already logged in as movie_reviewer_2024", file=sys.stderr)
# Navigate to movies forum
print("Navigating to movies forum...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/f/movies", wait_until="networkidle"
)
# Look for the submission with our specific title
print(
"Looking for submission 'Wonderful Movies Analysis: Community Favorites [2024]'...",
file=sys.stderr,
)
post_link = page.locator(
'a:has-text("Wonderful Movies Analysis: Community Favorites [2024]")'
)
if not await post_link.count():
print(
"Error: Could not find submission with required title",
file=sys.stderr,
)
return False
# Click on the submission to view its content
await post_link.first.click()
await page.wait_for_load_state("networkidle")
# Extract the submission body content
# Try multiple possible selectors for the post body
post_content = None
selectors = [
".submission__body",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has(> p:has-text("Total_Year_Posts"))',
'div:has-text("Total_Year_Posts"):has-text("Total_Image_Posts_5Pages")',
]
for selector in selectors:
content_element = page.locator(selector)
if await content_element.count():
post_content = await content_element.first.inner_text()
if "Total_Year_Posts" in post_content:
print(
f"Found submission content using selector: {selector}",
file=sys.stderr,
)
break
if not post_content or "Total_Year_Posts" not in post_content:
print(
"Error: Could not find submission body with required format",
file=sys.stderr,
)
return False
print("Submission content found, parsing data...", file=sys.stderr)
print(f"Raw content: {post_content[:200]}...", file=sys.stderr)
            # Parse the Key|Value format
extracted_data = parse_key_value_format(post_content)
print(f"Extracted data: {extracted_data}", file=sys.stderr)
# Load expected values from label.txt
label_path = Path(__file__).parent / "label.txt"
if label_path.exists():
with open(label_path, "r") as f:
expected_text = f.read().strip()
expected_data = parse_key_value_format(expected_text)
print("Loaded expected values from label.txt", file=sys.stderr)
# Verify all required keys are present
required_keys = [
"Total_Year_Posts",
"Top1_Title",
"Top1_Upvotes",
"Top1_Comments",
"Top2_Title",
"Top2_Upvotes",
"Top2_Comments",
"Top3_Title",
"Top3_Upvotes",
"Top3_Comments",
"Rittenhouse_Upvotes",
"Rittenhouse_Comments",
"Total_Image_Posts_5Pages",
]
missing_keys = []
for key in required_keys:
if key not in extracted_data:
missing_keys.append(key)
if missing_keys:
print(
f"Error: Missing required keys: {', '.join(missing_keys)}",
file=sys.stderr,
)
return False
# Validate data format and content
errors = []
# Check Total_Year_Posts is a number and matches expected
try:
total_posts = int(extracted_data["Total_Year_Posts"])
if "expected_data" in locals() and "Total_Year_Posts" in expected_data:
expected_total = int(expected_data["Total_Year_Posts"])
if total_posts != expected_total:
errors.append(
f"Total_Year_Posts mismatch: got {total_posts}, expected {expected_total}"
)
except ValueError:
errors.append(
f"Total_Year_Posts must be a number, got: {extracted_data['Total_Year_Posts']}"
)
# If we have expected data, compare against it
if "expected_data" in locals():
# Compare each field
for key in required_keys:
if key in expected_data and key in extracted_data:
expected_val = normalize_text(expected_data[key])
actual_val = normalize_text(extracted_data[key])
# For numeric fields, compare as integers
if (
"Upvotes" in key
or "Comments" in key
or key == "Total_Year_Posts"
or key == "Total_Image_Posts_5Pages"
):
try:
expected_int = int(expected_val)
actual_int = int(actual_val)
if expected_int != actual_int:
errors.append(
f"{key} mismatch: got {actual_int}, expected {expected_int}"
)
except ValueError:
errors.append(
f"{key} should be numeric: got '{actual_val}'"
)
else:
# For text fields, compare normalized text
if expected_val != actual_val:
errors.append(
f"{key} mismatch: got '{actual_val}', expected '{expected_val}'"
)
else:
# If no expected data, just do basic validation
for key in required_keys:
if key not in extracted_data:
errors.append(f"Missing required key: {key}")
elif (
not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]"
):
errors.append(f"{key} was not filled in")
if errors:
print(
"Error: Validation failed with the following issues:",
file=sys.stderr,
)
for error in errors:
print(f" - {error}", file=sys.stderr)
return False
# All checks passed
print("Success: Wonderful movies analysis task completed successfully.")
print("- Account movie_reviewer_2024 verified")
print(
"- Submission 'Wonderful Movies Analysis: Community Favorites [2024]' found"
)
print(f"- Total posts with years: {extracted_data['Total_Year_Posts']}")
print("- Top 3 posts by upvotes identified and documented")
print(
f"- Rittenhouse Square data: {extracted_data['Rittenhouse_Upvotes']} upvotes, {extracted_data['Rittenhouse_Comments']} comments"
)
print(
f"- Total image posts across 5 pages: {extracted_data['Total_Image_Posts_5Pages']}"
)
print("- All data in correct Key|Value format")
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/description.md
================================================
I'm conducting research on NBA player discussions in online sports communities. Please help me create a comprehensive analysis.
**Task Requirements:**
1. Register a new account with username 'NBA_DataAnalyst_2024' and password 'Research#2024!'
2. Navigate to the sports forum and search for posts containing 'NBA' in their titles:
- Collect data from the 5 NBA-related posts with the most comments
- For each post, record: the exact post title, vote count, comment count, and the username of the person who submitted it
3. Visit the user profile of 'BCLetsRide69':
- Count his total submissions
4. Create a new submission in the sports forum with:
- Title: "Statistical Analysis: NBA Content Engagement on This Forum"
- Body text must be EXACTLY these lines and nothing else (keep the keys as-is, only replace the values after the pipe, follow the markdown format):
```
- Total_NBA_Posts|FILL_VALUE
- Top1_Title|FILL_VALUE
- Top1_Votes|FILL_VALUE
- Top1_Comments|FILL_VALUE
- Top1_Author|FILL_VALUE
- Top2_Title|FILL_VALUE
- Top2_Votes|FILL_VALUE
- Top2_Comments|FILL_VALUE
- Top2_Author|FILL_VALUE
- Top3_Title|FILL_VALUE
- Top3_Votes|FILL_VALUE
- Top3_Comments|FILL_VALUE
- Top3_Author|FILL_VALUE
- Top4_Title|FILL_VALUE
- Top4_Votes|FILL_VALUE
- Top4_Comments|FILL_VALUE
- Top4_Author|FILL_VALUE
- Top5_Title|FILL_VALUE
- Top5_Votes|FILL_VALUE
- Top5_Comments|FILL_VALUE
- Top5_Author|FILL_VALUE
- BCLetsRide69_Total_Posts|FILL_VALUE
```
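For illustration, a minimal sketch of assembling the filled-in body from collected values; the figures below are placeholders, not the answer:

```python
# Hypothetical values; every required key must appear, following the template above.
stats = {
    "Total_NBA_Posts": 20,
    "Top1_Title": "Example post title",
    "Top1_Votes": 614,
    "Top1_Comments": 170,
    "Top1_Author": "example_author",
    # ... Top2 through Top5 plus BCLetsRide69_Total_Posts follow the same pattern
}
body = "\n".join(f"- {key}|{value}" for key, value in stats.items())
print(body)  # first line: "- Total_NBA_Posts|20"
```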
================================================
FILE: tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/label.txt
================================================
- Total_NBA_Posts|20
- Top1_Title|Hamby claims [WNBA Champ] Aces 'unprofessional' after trade
- Top1_Votes|614
- Top1_Comments|170
- Top1_Author|Responsible-Lunch815
- Top2_Title|Heat place literally every player on injury report after receiving NBA fine ahead of Mexico City game
- Top2_Votes|1266
- Top2_Comments|145
- Top2_Author|XXmynameisNeganXX
- Top3_Title|[ESPN] Announced attendance at the Alamodome tonight|68,323, a new single-game NBA record, in the Spurs' first game there since Game 4 of the 2002 Western Conference Semifinals.
- Top3_Votes|1511
- Top3_Comments|101
- Top3_Author|dragon8811
- Top4_Title|Phoenix Mercury confirm Brittney Griner’s return to WNBA
- Top4_Votes|0
- Top4_Comments|42
- Top4_Author|rejs7
- Top5_Title|Perspective | Kyrie Irving lit a flame. The NBA, top to bottom, watched the fire spread.
- Top5_Votes|74
- Top5_Comments|32
- Top5_Author|tomyland
- BCLetsRide69_Total_Posts|48
================================================
FILE: tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/meta.json
================================================
{
"task_id": "nba_statistics_analysis",
"task_name": "NBA Statistics Analysis",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Create sports analytics account, collect NBA player statistics from forum discussions, analyze basketball performance metrics, and compile comprehensive statistical report with community insights.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"user interaction",
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read the base_url from the environment variable, falling back to the local default
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
def parse_key_value_format(text):
"""
Parse the Key|Value format from the submission body.
This handles both the expected format from label.txt and the submission format.
"""
data = {}
# Split by lines and parse each line
lines = text.strip().split('\n')
for line in lines:
line = line.strip()
if not line or line.startswith('#'):
continue
# Remove bullet point if present
if line.startswith('- '):
line = line[2:]
elif line.startswith('• '):
line = line[2:]
# Parse pipe-separated format
if '|' in line:
parts = line.split('|', 1)
if len(parts) == 2:
key = parts[0].strip()
value = parts[1].strip()
if value and value != 'FILL_VALUE':
data[key] = value
return data
def normalize_text(text):
"""
Normalize text for comparison by handling different quote styles and whitespace.
"""
if not isinstance(text, str):
return str(text)
# Replace various quote styles with standard quotes - use unicode escapes to be safe
text = text.replace("\u2018", "'")  # LEFT SINGLE QUOTATION MARK
text = text.replace("\u2019", "'")  # RIGHT SINGLE QUOTATION MARK
text = text.replace("\u201c", '"')  # LEFT DOUBLE QUOTATION MARK
text = text.replace("\u201d", '"')  # RIGHT DOUBLE QUOTATION MARK
# Normalize whitespace
text = " ".join(text.split())
return text.strip()
async def verify() -> bool:
"""
Verifies that the NBA analysis task has been completed correctly.
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Navigate to the main page
print("Navigating to forum...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
# Check if logged in as NBA_DataAnalyst_2024
user_button = page.locator('button:has-text("NBA_DataAnalyst_2024")')
if not await user_button.count():
# Try to login
print("Not logged in, attempting to login...", file=sys.stderr)
await page.click('a:has-text("Log in")')
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', "NBA_DataAnalyst_2024")
await page.fill('input[name="_password"]', "Research#2024!")
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
user_button = page.locator('button:has-text("NBA_DataAnalyst_2024")')
if not await user_button.count():
print(
"Error: Login failed for NBA_DataAnalyst_2024", file=sys.stderr
)
return False
print("Successfully logged in as NBA_DataAnalyst_2024", file=sys.stderr)
else:
print("Already logged in as NBA_DataAnalyst_2024", file=sys.stderr)
# Navigate to sports forum to check submission
print("Navigating to sports forum to check submission...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/f/sports", wait_until="networkidle"
)
# Look for the submission with our specific title
print(
"Looking for submission 'Statistical Analysis: NBA Content Engagement on This Forum'...",
file=sys.stderr,
)
post_link = page.locator(
'a:has-text("Statistical Analysis: NBA Content Engagement on This Forum")'
)
if not await post_link.count():
print(
"Error: Could not find submission with required title",
file=sys.stderr,
)
return False
# Click on the submission to view its content
await post_link.first.click()
await page.wait_for_load_state("networkidle")
# Extract the submission body content
# Try multiple possible selectors for the post body
post_content = None
selectors = [
".submission__body",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has(> p:has-text("Total_NBA_Posts"))',
'div:has-text("Total_NBA_Posts"):has-text("Most_Popular_NBA_Author")',
]
for selector in selectors:
content_element = page.locator(selector)
if await content_element.count():
post_content = await content_element.first.inner_text()
if "Total_NBA_Posts" in post_content:
print(
f"Found submission content using selector: {selector}",
file=sys.stderr,
)
break
if not post_content or "Total_NBA_Posts" not in post_content:
print(
"Error: Could not find submission body with required format",
file=sys.stderr,
)
return False
print("Submission content found, parsing data...", file=sys.stderr)
print(f"Raw content: {post_content[:200]}...", file=sys.stderr)
# Parse the Key|Value format
extracted_data = parse_key_value_format(post_content)
print(f"Extracted data: {extracted_data}", file=sys.stderr)
# Load expected values from label.txt
label_path = Path(__file__).parent / "label.txt"
if label_path.exists():
with open(label_path, "r") as f:
expected_text = f.read().strip()
expected_data = parse_key_value_format(expected_text)
print("Loaded expected values from label.txt", file=sys.stderr)
# Verify all required keys are present
required_keys = [
"Total_NBA_Posts",
"Top1_Title",
"Top1_Votes",
"Top1_Comments",
"Top1_Author",
"Top2_Title",
"Top2_Votes",
"Top2_Comments",
"Top2_Author",
"Top3_Title",
"Top3_Votes",
"Top3_Comments",
"Top3_Author",
"Top4_Title",
"Top4_Votes",
"Top4_Comments",
"Top4_Author",
"Top5_Title",
"Top5_Votes",
"Top5_Comments",
"Top5_Author",
"BCLetsRide69_Total_Posts",
]
missing_keys = []
for key in required_keys:
if key not in extracted_data:
missing_keys.append(key)
if missing_keys:
print(
f"Error: Missing required keys: {', '.join(missing_keys)}",
file=sys.stderr,
)
return False
# Validate data format and content
errors = []
# Check Total_NBA_Posts is a number and matches expected
try:
total_posts = int(extracted_data["Total_NBA_Posts"])
if "expected_data" in locals() and "Total_NBA_Posts" in expected_data:
expected_total = int(expected_data["Total_NBA_Posts"])
if total_posts != expected_total:
errors.append(
f"Total_NBA_Posts mismatch: got {total_posts}, expected {expected_total}"
)
elif (
total_posts < 5
): # Should be at least 5 since we're collecting top 5
errors.append(f"Total_NBA_Posts seems too low: {total_posts}")
except ValueError:
errors.append(
f"Total_NBA_Posts must be a number, got: {extracted_data['Total_NBA_Posts']}"
)
# If we have expected data, compare against it
if "expected_data" in locals():
# Compare each field
for key in required_keys:
if key in expected_data and key in extracted_data:
expected_val = normalize_text(expected_data[key])
actual_val = normalize_text(extracted_data[key])
# For numeric fields, compare as integers
if (
"Votes" in key
or "Comments" in key
or key == "Total_NBA_Posts"
or key == "BCLetsRide69_Total_Posts"
):
try:
expected_int = int(expected_val)
actual_int = int(actual_val)
if expected_int != actual_int:
errors.append(
f"{key} mismatch: got {actual_int}, expected {expected_int}"
)
except ValueError:
errors.append(
f"{key} should be numeric: got '{actual_val}'"
)
else:
# For text fields, compare normalized text
if expected_val != actual_val:
errors.append(
f"{key} mismatch: got '{actual_val}', expected '{expected_val}'"
)
else:
# If no expected data, just do basic validation
for key in required_keys:
if key not in extracted_data:
errors.append(f"Missing required key: {key}")
elif (
not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]"
):
errors.append(f"{key} was not filled in")
if errors:
print(
"Error: Validation failed with the following issues:",
file=sys.stderr,
)
for error in errors:
print(f" - {error}", file=sys.stderr)
return False
# All checks passed
print("Success: NBA analysis task completed successfully.")
print("- Account NBA_DataAnalyst_2024 verified")
print(
"- Submission 'Statistical Analysis: NBA Content Engagement on This Forum' found"
)
print(
f"- Total NBA-related posts analyzed: {extracted_data['Total_NBA_Posts']}"
)
print("- Top 5 posts identified and documented")
print(
f"- BCLetsRide69's total posts: {extracted_data['BCLetsRide69_Total_Posts']}"
)
print("- All data in correct Key|Value format")
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/reddit/routine_tracker_forum/description.md
================================================
Create a daily routine tracking system on the Postmill forum platform by setting up a new account and engaging with productivity-related content in the LifeProTips community.
**Task Requirements:**
1. Create a new account with the exact username "RoutineTracker2025" and password "DailyRoutine123!"
2. Locate the following specific post:
- Title: "LPT: Use your calendar as your to-do list. Assigning dedicated time to tasks increases the likelyhood of you acting upon it."
- Record the content of the most upvoted comment of this post
3. Create a new submission in the LifeProTips forum with these exact specifications:
- Title: "My 5-Step Morning Routine That Increased My Productivity by 200%"
- Body content must be the content recorded in the above step
4. After successfully posting, verify that your post appears in the LifeProTips forum listing
5. Engage with the community by upvoting exactly these two posts:
- The calendar to-do list post you found in step 2
- The post titled "LPT: clean your stovetop after using the oven. The heat loosens grime for easy removal"
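A minimal Playwright sketch of the posting step in item 3 is shown below; the submit URL, form field names, and button label are assumptions about the Postmill UI, and the snippet presumes the RoutineTracker2025 session is already logged in (the verification script that follows only checks the results, not these exact calls):

```python
import asyncio
from playwright.async_api import async_playwright

BASE_URL = "http://localhost:9999"  # assumed local Postmill instance, as in verify.py

async def post_routine_tip(title: str, body: str) -> None:
    """Sketch: submit a new post to /f/LifeProTips (selectors are assumptions)."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(f"{BASE_URL}/f/LifeProTips/submit", wait_until="networkidle")
        await page.fill('input[name="submission[title]"]', title)   # assumed field name
        await page.fill('textarea[name="submission[body]"]', body)  # assumed field name
        await page.click('button:has-text("Create submission")')    # assumed button label
        await page.wait_for_load_state("networkidle")
        await browser.close()

if __name__ == "__main__":
    asyncio.run(post_routine_tip(
        "My 5-Step Morning Routine That Increased My Productivity by 200%",
        "Body copied from the most upvoted comment found in step 2.",
    ))
```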
================================================
FILE: tasks/playwright_webarena/standard/reddit/routine_tracker_forum/meta.json
================================================
{
"task_id": "routine_tracker_forum",
"task_name": "Routine Tracker Forum",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Establish productivity community by creating account, setting up routine tracking forum, implementing daily habit systems, and engaging members with structured productivity challenges and resources.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"user interaction",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/routine_tracker_forum/verify.py
================================================
import asyncio
import sys
import os
from pathlib import Path
from datetime import datetime
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
async def verify() -> bool:
"""
Verifies that the daily routine tracking setup has been completed correctly on the forum.
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Step 1: Check if account can be logged in
print("Step 1: Verifying account login...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
# Check if already logged in
user_button = page.locator('button:has-text("RoutineTracker2025")')
if not await user_button.count():
# Try to login
print("Not logged in, attempting to login...", file=sys.stderr)
# Click login link
await page.click('a:has-text("Log in")')
await page.wait_for_load_state("networkidle")
# Fill login form
await page.fill('input[name="_username"]', "RoutineTracker2025")
await page.fill('input[name="_password"]', "DailyRoutine123!")
# Submit login form
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
# Check if login successful
user_button = page.locator('button:has-text("RoutineTracker2025")')
if not await user_button.count():
print("Error: Account login failed for RoutineTracker2025", file=sys.stderr)
return False
print("✓ Account login successful", file=sys.stderr)
else:
print("✓ Already logged in as RoutineTracker2025", file=sys.stderr)
# Step 2: Check if the post exists in LifeProTips forum with correct content
print("Step 2: Verifying post in LifeProTips forum...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/f/LifeProTips", wait_until="networkidle"
)
# Check for the created post
expected_title = "My 5-Step Morning Routine That Increased My Productivity by 200%"
post_link = page.locator(f'a:has-text("{expected_title}")')
if not await post_link.count():
print(f"Error: Post with title '{expected_title}' not found in LifeProTips forum", file=sys.stderr)
return False
# Click on the post to verify content
await post_link.click()
await page.wait_for_load_state("networkidle")
# Verify post content - this should be the content from the most upvoted comment of the calendar post
expected_content = "As a college student, having a visible reminder of the assignments I have and when they are due is super helpful for me. It also just feels good to erase them from the board once they are completed."
# Check if the content exists in the page
content_found = False
article_content = await page.locator("article").text_content()
if article_content and expected_content in article_content:
content_found = True
if not content_found:
print(f"Error: Post content does not match expected content", file=sys.stderr)
print(f"Expected: {expected_content}", file=sys.stderr)
return False
print("✓ Post found in LifeProTips with correct title and content", file=sys.stderr)
# Step 3: Check upvotes via search
print("Step 3: Verifying upvotes on posts...", file=sys.stderr)
# Check first post upvote
search_url1 = f"{BASE_URL}/search?q=LPT%3A+Use+your+calendar+as+your+to-do+list.+Assigning+dedicated+time+to+tasks+increases+the+likelyhood+of+you+acting+upon+it."
await page.goto(search_url1, wait_until="networkidle")
# Find the post and check its upvote count
posts = await page.locator("article").all()
calendar_upvoted = False
for post in posts:
title_elem = post.locator("h1 a")
if await title_elem.count():
title = await title_elem.text_content()
if "Use your calendar as your to-do list" in title:
# Check upvote count
vote_count_elem = post.locator("span.vote__net-score")
if await vote_count_elem.count():
vote_count = await vote_count_elem.text_content()
if vote_count and vote_count.strip() == "1":
calendar_upvoted = True
print("✓ Calendar post upvoted (count: 1)", file=sys.stderr)
break
if not calendar_upvoted:
print("Error: Calendar post not upvoted or upvote count is not 1", file=sys.stderr)
return False
# Check second post upvote
search_url2 = f"{BASE_URL}/search?q=LPT%3A+clean+your+stovetop+after+using+the+oven.+The+heat+loosens+grime+for+easy+removal"
await page.goto(search_url2, wait_until="networkidle")
posts = await page.locator("article").all()
stovetop_upvoted = False
for post in posts:
title_elem = post.locator("h1 a")
if await title_elem.count():
title = await title_elem.text_content()
if "clean your stovetop after using the oven" in title:
# Check upvote count
vote_count_elem = post.locator("span.vote__net-score")
if await vote_count_elem.count():
vote_count = await vote_count_elem.text_content()
if vote_count and vote_count.strip() == "1":
stovetop_upvoted = True
print("✓ Stovetop post upvoted (count: 1)", file=sys.stderr)
break
if not stovetop_upvoted:
print("Error: Stovetop post not upvoted or upvote count is not 1", file=sys.stderr)
return False
print("Success: All verification steps passed!")
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/advanced_product_analysis/description.md
================================================
**Task Requirements:**
1. Search for products with 'Ginger' in the Product Name field and price range $50.00 to $100.00
2. Add Q Mixers Premium Ginger Ale product to the comparison list
3. Find Intel NUC Kit product in Electronics category and add it to the comparison list
4. From the comparison page:
- Record SKU numbers for both products
- Add all products to cart
5. Record the total cart value
6. On the Ginger Ale product detail page, record:
- Number of customer reviews
- Name of the most recent reviewer (shown at the top of the first review page)
7. Output your findings in this format:
```
GingerAleSKU|sku
IntelNUCSKU|sku
CartTotal|amount
ReviewCount|count
LatestReviewer|name
```
**Example Output:**
```
GingerAleSKU|XXXXXXXXX
IntelNUCSKU|XXXXXXXXX
CartTotal|$XXX.XX
ReviewCount|XX
LatestReviewer|name
```
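A small sketch of turning such a pipe-separated answer into a dictionary for checking (the verification script below does essentially the same thing; the sample values are taken from label.txt):

```python
def parse_pipe_lines(answer_text: str) -> dict:
    """Split 'Key|Value' lines into a dict; lines without a pipe are ignored."""
    result = {}
    for line in answer_text.strip().splitlines():
        if "|" in line:
            key, value = line.split("|", 1)
            result[key.strip()] = value.strip()
    return result

print(parse_pipe_lines("GingerAleSKU|B071KC37VD\nCartTotal|$668.49"))
# {'GingerAleSKU': 'B071KC37VD', 'CartTotal': '$668.49'}
```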
================================================
FILE: tasks/playwright_webarena/standard/shopping/advanced_product_analysis/label.txt
================================================
GingerAleSKU|B071KC37VD
IntelNUCSKU|B01DJ9XID4
CartTotal|$668.49
ReviewCount|12
LatestReviewer|jwm
================================================
FILE: tasks/playwright_webarena/standard/shopping/advanced_product_analysis/meta.json
================================================
{
"task_id": "advanced_product_analysis",
"task_name": "Advanced Product Analysis",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Perform comprehensive product analysis including feature comparisons, price tracking, review aggregation, customer sentiment analysis, and generate detailed recommendation reports for informed purchasing decisions.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/advanced_product_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 5:
print(f"Error: Expected 5 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "GingerAleSKU":
# Check exact SKU match
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "IntelNUCSKU":
# Check exact SKU match
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "CartTotal":
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "ReviewCount":
# Check review count matches
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "LatestReviewer":
# Check reviewer name (allow partial match for names)
if expected_value.lower() not in model_value.lower() and model_value.lower() not in expected_value.lower():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the advanced product analysis task has been completed correctly.
First checks the model's answer against the expected label.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/description.md
================================================
**Task Requirements:**
1. In Video Games category, count products with customer rating 70% or higher in the first 2 pages
2. Sort products by price (ascending) and identify the cheapest product that has customer reviews
3. Find product with SKU 'B07D6LSCXZ' (N64 Controller), add to cart with quantity 3
4. Add products with SKU 'B071DR5V1K' and 'B082LZ4451' to comparison list, then count total products on comparison page
5. In cart, update N64 Controller quantity to 5 and record the subtotal for this item
6. Proceed to checkout and fill shipping form:
- Email: test.buyer@example.com
- First Name: Alice
- Last Name: Johnson
- Street Address: 456 Oak Avenue
- Country: United States
- State/Province: California
- City: San Francisco
- Zip Code: 94102
- Phone: 415-555-0123
Then count available shipping methods
7. Output your findings in this format:
```
Products70Plus|count
CheapestReviewedSKU|sku
CheapestReviewedPrice|price
ComparisonCount|count
N64Subtotal|amount
CheckoutEmail|test.buyer@example.com
ShippingState|California
ShippingMethods|count
```
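As a consistency check for step 5, the subtotal recorded in label.txt below ($84.95 at quantity 5) implies an N64 Controller unit price of $16.99; a tiny sketch:

```python
quantity = 5
subtotal = 84.95                        # value recorded in label.txt
unit_price = round(subtotal / quantity, 2)
print(unit_price)                       # 16.99 (implied unit price)
print(f"N64Subtotal|${unit_price * quantity:.2f}")  # N64Subtotal|$84.95
```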
================================================
FILE: tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/label.txt
================================================
Products70Plus|7
CheapestReviewedSKU|B014HDAUAA
CheapestReviewedPrice|$0.99
ComparisonCount|2
N64Subtotal|$84.95
CheckoutEmail|test.buyer@example.com
ShippingState|California
ShippingMethods|1
================================================
FILE: tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/meta.json
================================================
{
"task_id": "gaming_accessories_analysis",
"task_name": "Gaming Accessories Analysis",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Research gaming peripherals by analyzing technical specifications, comparing performance metrics, evaluating user reviews, tracking price trends, and creating detailed gaming accessory recommendations.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"search aggregation",
"comparative analysis",
"data extraction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 8:
print(f"Error: Expected 8 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key in ["CheapestReviewedPrice", "N64Subtotal"]:
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "CheckoutEmail":
# Email should match exactly (case-insensitive)
if model_value.lower() != expected_value.lower():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "Products70Plus":
# For count fields, allow some flexibility (products might change)
# But still check if it's a reasonable number
try:
model_count = int(model_value)
expected_count = int(expected_value)
# Allow up to 2 products difference (in case of dynamic content)
if abs(model_count - expected_count) > 2:
mismatches.append(
f"{key}: expected around '{expected_value}', got '{model_value}'"
)
except ValueError:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the gaming accessories analysis task has been completed correctly.
Checks the model's answer against the expected label.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/health_routine_optimization/description.md
================================================
## Task Requirements
1. Search for products with `vitamin` in Description and price range `$0.00` to `$99.99`. Record total search results count.
2. In "Health & Household" category with price filter `$0.00 - $99.99`:
- Add "LOOPACELL AG13 LR44 L1154 357 76A A76 Button Cell Battery 10 Pack" to comparison
- Add "Energizer MAX C Batteries, Premium Alkaline C Cell Batteries (8 Battery Count)" to comparison
- Record each battery's price
- Verify comparison list has 2 items
3. Search `Elmwood Inn Fine Teas`, find "Elmwood Inn Fine Teas, Orange Vanilla Caffeine-free Fruit Infusion, 16-Ounce Pouch":
- Record SKU, rating percentage, and review count
- Add to cart with quantity 2
4. Search `energy`, sort by Relevance (descending):
- Find "V8 +Energy, Healthy Energy Drink, Steady Energy from Black and Green Tea, Pomegranate Blueberry, 8 Ounce Can ,Pack of 24"
- Record its position (1st, 2nd, 3rd, etc.)
- Add to cart with quantity 1
5. In cart:
- Record unique products count, total quantity, and subtotal
- Then update Elmwood tea quantity to 3
- Record new subtotal
## Output Format
```
AdvancedSearchResults|XXXX
Battery1Name|LOOPACELL AG13 LR44
Battery1Price|$X.XX
Battery2Name|Energizer MAX C
Battery2Price|$XX.XX
ComparisonCount|X
TeaSKU|XXXXXXXXXX
TeaRating|XXX%
TeaReviews|X
V8Position|Xth
CartUniqueProducts|X
CartTotalQuantity|X
InitialSubtotal|$XX.XX
FinalSubtotal|$XX.XX
```
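A quick consistency check on step 5, using the subtotals from label.txt below: raising the tea quantity from 2 to 3 adds one tea, so the difference between the two subtotals implies the tea's unit price, and the V8 price follows from the initial subtotal:

```python
initial_subtotal = 53.19   # tea x2 + V8 x1 (from label.txt)
final_subtotal = 72.55     # tea x3 + V8 x1 (from label.txt)
tea_price = round(final_subtotal - initial_subtotal, 2)   # 19.36 (implied)
v8_price = round(initial_subtotal - 2 * tea_price, 2)     # 14.47 (implied)
print(tea_price, v8_price)
print(round(3 * tea_price + v8_price, 2))                  # 72.55, matching FinalSubtotal
```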
================================================
FILE: tasks/playwright_webarena/standard/shopping/health_routine_optimization/label.txt
================================================
AdvancedSearchResults|2906
Battery1Name|LOOPACELL AG13 LR44
Battery1Price|$3.72
Battery2Name|Energizer MAX C
Battery2Price|$14.87
ComparisonCount|2
TeaSKU|B0040WHKIY
TeaRating|95%
TeaReviews|4
V8Position|3rd
CartUniqueProducts|2
CartTotalQuantity|3
InitialSubtotal|$53.19
FinalSubtotal|$72.55
================================================
FILE: tasks/playwright_webarena/standard/shopping/health_routine_optimization/meta.json
================================================
{
"task_id": "health_routine_optimization",
"task_name": "Health Routine Optimization",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Optimize health and wellness product selections by analyzing nutritional supplements, fitness equipment, creating personalized routines, and tracking health metrics for lifestyle improvements.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/health_routine_optimization/verify.py
================================================
import asyncio
import sys
import os
import json
import re
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 14:
print(f"Error: Expected 14 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
content = f.read().strip()
# Parse the answer from the label file
# The label file may contain <answer>...</answer> tags
match = re.search(r"<answer>(.*?)</answer>", content, re.IGNORECASE | re.DOTALL)
if match:
answer_content = match.group(1).strip()
lines = answer_content.split("\n")
else:
# Fallback: treat the whole file as answer content
lines = content.split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key in ["Battery1Price", "Battery2Price", "InitialSubtotal", "FinalSubtotal"]:
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the health routine optimization task has been completed correctly.
Checks the model's answer against the expected label.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/holiday_baking_competition/description.md
================================================
**Task Requirements:**
1. Search 'gingerbread', sort by price (high to low):
- Add most expensive product to comparison list
- Record SKU of second most expensive product
2. Search 'cookie' with price range $20.00-$40.00:
- Find product with highest rating % and at least 5 reviews in the first 2 pages (if tied, choose lowest price)
- Record SKU and rating %
- Select "Cookies: Oatmeal Chocolate Chunk" flavor if required
- Add to cart with quantity 2
3. Search 'chocolate', sort by price (low to high):
- Find cheapest product with at least 1 review
- Record price and review count
- Select "Peanut Butter Flavor" if required
- Add to cart with quantity 3
4. In cart:
- Update cookie quantity from 2 to 5
- Record cart subtotal and total items count
5. Search 'gingerbread', go to page 2:
- Find third product on page 2
- Record SKU, price, and manufacturer
**Output Format:**
```
SecondGingerbreadSKU|sku
HighestRatedCookieSKURating|sku:rating%
CheapestChocolatePriceReviews|$price:reviews
CartSubtotalAfterUpdate|$amount
TotalCartItems|count
Page2ThirdProductSKUPrice|sku:$price
ProductManufacturer|manufacturer
```
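The composite output fields join two values with a colon; for example, with the values recorded in label.txt below:

```python
sku, rating = "B0951CPYV7", "86%"
print(f"HighestRatedCookieSKURating|{sku}:{rating}")
# HighestRatedCookieSKURating|B0951CPYV7:86%

price, reviews = "$1.04", 12
print(f"CheapestChocolatePriceReviews|{price}:{reviews}")
# CheapestChocolatePriceReviews|$1.04:12
```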
================================================
FILE: tasks/playwright_webarena/standard/shopping/holiday_baking_competition/label.txt
================================================
SecondGingerbreadSKU|B0075AO9RI
HighestRatedCookieSKURating|B0951CPYV7:86%
CheapestChocolatePriceReviews|$1.04:12
CartSubtotalAfterUpdate|$128.07
TotalCartItems|8
Page2ThirdProductSKUPrice|B09RPXCB47:$21.99
ProductManufacturer|That Melanin Tho
================================================
FILE: tasks/playwright_webarena/standard/shopping/holiday_baking_competition/meta.json
================================================
{
"task_id": "holiday_baking_competition",
"task_name": "Holiday Baking Competition",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Research baking supplies for competition preparation including ingredient quality analysis, equipment comparisons, recipe optimization, and creating comprehensive shopping list with budget recommendations.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"search aggregation",
"comparative analysis",
"inventory management"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/holiday_baking_competition/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 7:
print(f"Error: Expected 7 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "SecondGingerbreadSKU":
# SKU should match exactly (case-insensitive)
if model_value.upper() != expected_value.upper():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key in ["CartSubtotalAfterUpdate"]:
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
# Allow some tolerance for price calculations (within $0.01)
try:
expected_float = float(expected_clean)
model_float = float(model_clean)
if abs(expected_float - model_float) > 0.01:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
except ValueError:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key in ["TotalCartItems"]:
# Should be a number
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key in ["HighestRatedCookieSKURating", "CheapestChocolatePriceReviews", "Page2ThirdProductSKUPrice"]:
# Colon-separated fields (sku:rating, price:reviews, sku:price)
if ":" in expected_value and ":" in model_value:
expected_parts = expected_value.split(":", 1)
model_parts = model_value.split(":", 1)
if len(expected_parts) == 2 and len(model_parts) == 2:
# For price fields, normalize the price part
if key == "CheapestChocolatePriceReviews":
# Check if price part has correct format ($XX.XX)
if not model_parts[0].startswith("$"):
mismatches.append(
f"{key}: incorrect format - price part should start with '$', got '{model_value}'"
)
else:
expected_price = expected_parts[0].replace("$", "").replace(",", "")
model_price = model_parts[0].replace("$", "").replace(",", "")
try:
if abs(float(expected_price) - float(model_price)) > 0.01 or expected_parts[1] != model_parts[1]:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
except ValueError:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "Page2ThirdProductSKUPrice":
# Check if price part has correct format ($XX.XX)
if not model_parts[1].startswith("$"):
mismatches.append(
f"{key}: incorrect format - price part should start with '$', got '{model_value}'"
)
else:
expected_price = expected_parts[1].replace("$", "").replace(",", "")
model_price = model_parts[1].replace("$", "").replace(",", "")
try:
if expected_parts[0] != model_parts[0] or abs(float(expected_price) - float(model_price)) > 0.01:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
except ValueError:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# For rating fields, exact match
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields (like ProductManufacturer)
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the holiday baking competition task has been completed correctly.
Checks the model's answer against the expected label.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/description.md
================================================
Complete the following tasks on One Stop Market (http://localhost:7770):
**Task Requirements:**
1. In Chocolate subcategory, sort by price (ascending):
- Record price and SKU of first 3 products
2. Search for 'tabletop' with price range $100.00-$200.00:
- Find the cheapest tabletop that has the highest review rating with at least 3 reviews.
- Record search results count
- Record price of required tabletop
3. In "Computers & Accessories" subcategory with price filter $0.00-$9,999.99:
- Sort by price (ascending)
- Record price of cheapest item
4. Add these products to comparison:
- "Little Secrets Chocolate Pieces, Peanut Butter Flavor"
- "Multi Accessory Hub Adapter By JOBY"
- "SanDisk Cruzer Glide 32GB (5 Pack) USB 2.0 Flash Drive"
- Count total items on comparison page
5. In cart:
- Add the cheapest chocolate product (from step 1) with "Peanut flavor" if available
- Add cheapest computer accessory (from step 3)
- Record cart subtotal and item count
6. Calculate:
- Sum of 3 chocolate product prices
- Price difference: cheapest tabletop minus cheapest computer accessory
- Whether sum of 3 comparison items < $60
**Output Format:**
```
chocolate_products|Price1:SKU1;Price2:SKU2;Price3:SKU3
chocolate_sum|Total
tabletop_search_count|Count
tabletop_product|Price:SKU
tabletop_reviews|NumbersOfReviews:Rating
cheapest_computer_accessory|Price
price_difference|Amount
comparison_count|Count
cart_subtotal|Amount
cart_item_count|Count
under_60_budget|YES/NO
```
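For illustration, a minimal sketch (Python, placeholder prices only) of how the step 6 calculations map onto the last output lines; the real numbers come from the live store pages in steps 1-4:
```
# Placeholder prices only; gather the real values from the store in steps 1-4.
chocolate_prices = [1.00, 2.00, 3.00]        # step 1: three cheapest chocolates
cheapest_tabletop = 150.00                   # step 2
cheapest_accessory = 1.50                    # step 3
comparison_prices = [5.00, 30.00, 20.00]     # step 4: the three comparison items

chocolate_sum = sum(chocolate_prices)                       # chocolate_sum
price_difference = cheapest_tabletop - cheapest_accessory   # price_difference
under_60 = "YES" if sum(comparison_prices) < 60 else "NO"   # under_60_budget

print(f"chocolate_sum|${chocolate_sum:.2f}")
print(f"price_difference|${price_difference:.2f}")
print(f"under_60_budget|{under_60}")
```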
================================================
FILE: tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/label.txt
================================================
chocolate_products|$1.04:B071954ZDC;$1.89:B07ND598N3;$2.50:B01G26DMSC
chocolate_sum|$5.43
tabletop_search_count|60
tabletop_product|$169.99:B09NPX5CDP
tabletop_reviews|4:95%
cheapest_computer_accessory|$1.17
price_difference|$168.82
comparison_count|3
cart_subtotal|$2.21
cart_item_count|2
under_60_budget|YES
================================================
FILE: tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/meta.json
================================================
{
"task_id": "multi_category_budget_analysis",
"task_name": "Multi Category Budget Analysis",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Analyze spending patterns across multiple product categories, optimize budget allocation, identify cost-saving opportunities, and generate comprehensive financial planning report with purchase recommendations.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"search aggregation",
"content submission",
"comparative analysis",
"inventory management"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 11:
print(f"Error: Expected 11 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "chocolate_products":
# Parse and compare chocolate products with price:SKU format
expected_products = expected_value.split(";")
model_products = model_value.split(";")
if len(expected_products) != len(model_products):
mismatches.append(f"{key}: expected {len(expected_products)} products, got {len(model_products)}")
else:
for i, (exp, mod) in enumerate(zip(expected_products, model_products)):
exp_parts = exp.strip().split(":")
mod_parts = mod.strip().split(":")
if len(exp_parts) != 2 or len(mod_parts) != 2:
mismatches.append(f"{key}: product {i+1} format error - expected 'price:SKU'")
else:
# Check price format (should start with $)
if not mod_parts[0].startswith("$"):
mismatches.append(f"{key}: product {i+1} price format error - expected '$XX.XX' format, got '{mod_parts[0]}'")
elif exp_parts[0] != mod_parts[0] or exp_parts[1] != mod_parts[1]:
mismatches.append(f"{key}: product {i+1} mismatch - expected '{exp}', got '{mod}'")
elif key == "tabletop_product":
# Parse and compare tabletop product with price:SKU format
exp_parts = expected_value.strip().split(":")
mod_parts = model_value.strip().split(":")
if len(exp_parts) != 2 or len(mod_parts) != 2:
mismatches.append(f"{key}: format error - expected 'price:SKU', got '{model_value}'")
else:
# Check price format (should start with $)
if not mod_parts[0].startswith("$"):
mismatches.append(f"{key}: price format error - expected '$XX.XX' format, got '{mod_parts[0]}'")
elif exp_parts[0] != mod_parts[0] or exp_parts[1] != mod_parts[1]:
mismatches.append(f"{key}: mismatch - expected '{expected_value}', got '{model_value}'")
elif key == "tabletop_reviews":
# Parse and compare tabletop reviews with NumberOfReviews:Rating format
exp_parts = expected_value.strip().split(":")
mod_parts = model_value.strip().split(":")
if len(exp_parts) != 2 or len(mod_parts) != 2:
mismatches.append(f"{key}: format error - expected 'NumberOfReviews:Rating', got '{model_value}'")
else:
# Check if both parts match
if exp_parts[0] != mod_parts[0] or exp_parts[1] != mod_parts[1]:
mismatches.append(f"{key}: mismatch - expected '{expected_value}', got '{model_value}'")
elif key in ["chocolate_sum", "price_difference", "cart_subtotal", "cheapest_computer_accessory"]:
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "under_60_budget":
# Compare YES/NO value (case-insensitive)
if expected_value.upper() != model_value.upper():
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ["tabletop_search_count", "comparison_count", "cart_item_count"]:
# Numeric fields - exact match
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the multi-category budget analysis task has been completed correctly.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print("Warning: Could not parse answer format from model response", file=sys.stderr)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/printer_keyboard_search/description.md
================================================
1. Search for a `printer capable of reducing blue light` that:
   - Is pink or purple (must be stated in the product details, not inferred from the image)
   - Is manufactured in Asia
   Record its SKU ID and price
2. Find a keyboard with:
   - Bluetooth mode (must be stated in either the details or the title)
- Price between $50.00-$100.00
- Highest review rating among matching products
Record SKU ID, price, number of reviews, and review rating
**Output Format:**
```
PrinterSKUID|id
PrinterPrice|$XX.XX
KeyboardSKUID|id
KeyboardPrice|$XX.XX
KeyboardReviews|XX
KeyboardRating|XX%
```
================================================
FILE: tasks/playwright_webarena/standard/shopping/printer_keyboard_search/label.txt
================================================
PrinterSKUID|B09J8KQX6V
PrinterPrice|$248.04
KeyboardSKUID|B08JD7F3F5
KeyboardPrice|$85.99
KeyboardReviews|12
KeyboardRating|77%
================================================
FILE: tasks/playwright_webarena/standard/shopping/printer_keyboard_search/meta.json
================================================
{
"task_id": "printer_keyboard_search",
"task_name": "Printer Keyboard Search",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Search and evaluate office equipment by comparing printer specifications, keyboard ergonomics, analyzing user reviews, tracking prices, and generating detailed purchase recommendations report.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"search aggregation",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/printer_keyboard_search/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 6:
print(f"Error: Expected 6 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key in ["PrinterPrice", "KeyboardPrice"]:
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key in ["PrinterSKUID", "KeyboardSKUID"]:
# SKU should match exactly (case-insensitive)
if model_value.upper() != expected_value.upper():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "KeyboardReviews":
# Number of reviews should match exactly
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "KeyboardRating":
# Rating should match exactly (including % sign)
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the electronic products task has been completed correctly.
Checks the model's answer against the expected label.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/running_shoes_purchase/description.md
================================================
1. Find running shoes:
- Price between $50.00-$60.00
- "running shoe" must appear in product name
- Choose the one with highest number of reviews
- Select black or white color, size 10
- Add to cart with quantity 2
2. Record from product page: SKU ID, price, number of reviews, review rating
3. Record cart subtotal
**Output Format:**
```
SKUID|id
Price|$XX.XX
NumberOfReviews|XX
ReviewRating|XX%
Subtotal|$XX.XX
```
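Since the same shoe is added with quantity 2, the recorded cart subtotal should be exactly twice the unit price. A minimal sketch with a placeholder price:
```
# Placeholder price; the real value comes from the product page.
price = 55.00
quantity = 2
subtotal = price * quantity
print(f"Price|${price:.2f}")
print(f"Subtotal|${subtotal:.2f}")
```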
================================================
FILE: tasks/playwright_webarena/standard/shopping/running_shoes_purchase/label.txt
================================================
SKUID|B08KKX1WXQ
Price|$56.21
NumberOfReviews|46
ReviewRating|86%
Subtotal|$112.42
================================================
FILE: tasks/playwright_webarena/standard/shopping/running_shoes_purchase/meta.json
================================================
{
"task_id": "running_shoes_purchase",
"task_name": "Running Shoes Purchase",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Research running footwear by analyzing biomechanical features, comparing cushioning technologies, evaluating durability ratings, considering user preferences, and recommending optimal shoe selections.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"search aggregation",
"comparative analysis"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/running_shoes_purchase/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 5:
print(f"Error: Expected 5 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key in ["Price", "Subtotal"]:
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
# Allow small tolerance for price calculations (within $0.01)
try:
expected_float = float(expected_clean)
model_float = float(model_clean)
if abs(expected_float - model_float) > 0.01:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
except ValueError:
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "SKUID":
# SKU should match exactly (case-insensitive)
if model_value.upper() != expected_value.upper():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "NumberOfReviews":
# Number of reviews should match exactly
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "ReviewRating":
# Rating should match exactly (including % sign)
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the running shoes shopping task has been completed correctly.
Checks the model's answer against the expected label.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/description.md
================================================
Perform customer segmentation setup and analysis in the Magento Admin panel to establish new customer groups and manage customer profiles.
**Task Requirements:**
1. Access the Magento Admin panel to begin customer segmentation setup. If login is required, log in with username 'admin' and password 'admin1234'.
2. Establish baseline metrics for customer groups:
- Record the exact number shown in "records found" at the top of the grid
- This will be your initial groups count
3. Create a specialized customer group for European premium customers:
- Group Name: Premium Europe
- Tax Class: Retail Customer
- Save the group
4. Verify the customer group creation was successful:
- After saving, return to Customer Groups list
- Record the new total shown in "records found"
5. Establish baseline metrics for the All Customers database:
- Record the exact number shown in "records found" at the top of the grid
- This will be your initial customers count
6. Add a representative customer to the new premium group:
- Create a new customer with the following details:
- First Name: Isabella
- Last Name: Romano
- Email: isabella.romano@premium.eu
- Associate to Website: Main Website
- Group: The group you just created
- Save the customer
7. Verify the customer creation was successful:
- After saving, return to All Customers list
- Record the new total shown in "records found"
8. Analyze recent customer activity patterns:
- Navigate to Dashboard
- Look at the "Last Orders" section
- Record the customer name in the last row of the table
9. Compile all your findings and output them in the following exact format:
```
InitialGroups|count
FinalGroups|count
InitialCustomers|count
FinalCustomers|count
LastOrderCustomer|name
```
**Example Output:**
```
InitialGroups|XX
FinalGroups|XX
InitialCustomers|XXX
FinalCustomers|XXX
LastOrderCustomer|XXX
```
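A rough Playwright sketch of steps 3 and 4 (the login selectors mirror this task's verify.py; the group-form route and field selectors are assumptions and may differ in the actual Magento build):
```
import asyncio
from playwright.async_api import async_playwright

async def create_premium_group():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        # Log in if required (step 1).
        await page.goto("http://localhost:7780/admin/", wait_until="networkidle")
        await page.fill('input[name="login[username]"]', "admin")
        await page.fill('input[name="login[password]"]', "admin1234")
        await page.click('button:has-text("Sign in")')
        await page.wait_for_load_state("networkidle")
        # Open the new customer group form (route assumed) and fill it in (selectors assumed).
        await page.goto("http://localhost:7780/admin/customer/group/new/", wait_until="networkidle")
        await page.fill('input[name="code"]', "Premium Europe")
        await page.select_option('select[name="tax_class"]', label="Retail Customer")
        await page.click('button:has-text("Save Customer Group")')
        await browser.close()

asyncio.run(create_premium_group())
```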
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/label.txt
================================================
InitialGroups|4
FinalGroups|5
InitialCustomers|70
FinalCustomers|71
LastOrderCustomer|Ava Brown
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/meta.json
================================================
{
"task_id": "customer_segmentation_setup",
"task_name": "Customer Segmentation Setup",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Configure customer segmentation system in admin panel by defining demographic criteria, creating behavior-based segments, implementing targeting rules, and setting up automated marketing workflows.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"content submission",
"inventory management"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read base_url from the environment variable (shopping_admin injects http://localhost:7780/admin); fall back to localhost by default.
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:7780/admin").rstrip("/")
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 5:
print(f"Error: Expected 5 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Exact match for all fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the customer segmentation setup task has been completed correctly.
First checks the model's answer against the expected label,
then verifies the actual state in the Magento Admin.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
print("Will proceed with browser verification only", file=sys.stderr)
else:
print(
"No model response found, proceeding with browser verification",
file=sys.stderr,
)
# Browser verification for actual state
print("\n=== Starting Browser Verification ===", file=sys.stderr)
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Navigate to Magento Admin
print("Navigating to Magento Admin...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/", wait_until="networkidle"
)
# Check if already logged in, if not, login
if "dashboard" not in page.url.lower():
print("Logging into Magento Admin...", file=sys.stderr)
await page.fill('input[name="login[username]"]', "admin")
await page.fill('input[name="login[password]"]', "admin1234")
await page.click('button:has-text("Sign in")')
await page.wait_for_load_state("networkidle")
if "dashboard" not in page.url.lower():
print("Error: Login failed", file=sys.stderr)
return False
print("Successfully logged into Magento Admin", file=sys.stderr)
# 1. Verify Customer Groups
print("\nVerifying Customer Groups...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/customer/group/",
wait_until="networkidle",
)
await page.wait_for_timeout(2000) # Wait for grid to load
# Check for Premium Europe group
premium_europe_exists = (
await page.locator("text=Premium Europe").count() > 0
)
if premium_europe_exists:
print("✓ Found 'Premium Europe' customer group", file=sys.stderr)
# Check if it has Retail Customer tax class
# Look for Premium Europe row and check its tax class
premium_row = page.locator('tr:has-text("Premium Europe")')
if await premium_row.count() > 0:
tax_class_text = await premium_row.locator("td").nth(2).inner_text()
if "Retail Customer" in tax_class_text:
print(
"✓ Premium Europe has 'Retail Customer' tax class",
file=sys.stderr,
)
else:
print(
f"Warning: Premium Europe tax class is '{tax_class_text}'",
file=sys.stderr,
)
else:
print("✗ 'Premium Europe' customer group not found", file=sys.stderr)
return False
# Check total groups count
records_found = page.locator("text=records found").first
if await records_found.count() > 0:
count_text = await records_found.inner_text()
print(f"Customer Groups count: {count_text}", file=sys.stderr)
# Extract number
import re
match = re.search(r"(\d+)\s+records found", count_text)
if match:
groups_count = int(match.group(1))
print(f"✓ Customer groups count is {groups_count}", file=sys.stderr)
# 2. Verify Customer
print("\nVerifying Customer Isabella Romano...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/customer/index/",
wait_until="networkidle",
)
await page.wait_for_timeout(3000) # Wait for grid to load
# Check total customers count
customer_records = page.locator("text=records found").first
if await customer_records.count() > 0:
count_text = await customer_records.inner_text()
print(f"Customers count: {count_text}", file=sys.stderr)
# Extract number
match = re.search(r"(\d+)\s+records found", count_text)
if match:
customers_count = int(match.group(1))
print(
f"✓ Total customers count is {customers_count}", file=sys.stderr
)
# Verify against expected answer if available
if expected_answer and "FinalCustomers" in expected_answer:
expected_final = int(expected_answer["FinalCustomers"])
if customers_count == expected_final:
print(
f"✓ Customer count matches expected: {customers_count}",
file=sys.stderr,
)
else:
print(
f"✗ Customer count mismatch: Expected {expected_final} customers, found {customers_count}",
file=sys.stderr,
)
return False
# Wait for the customer grid to load properly
await page.wait_for_timeout(5000)
# Check if Isabella Romano exists - first wait for grid to load
grid_loaded = False
for i in range(3):
# Look for grid container and wait for it to populate
grid_container = page.locator(".admin__data-grid-outer-wrap, .data-grid, table").first
if await grid_container.count() > 0:
# Check if there are customer rows loaded
customer_rows = page.locator("td[data-column='email'], td:has-text('@')")
if await customer_rows.count() > 0:
grid_loaded = True
break
await page.wait_for_timeout(2000)
if not grid_loaded:
print("✗ Customer grid failed to load properly", file=sys.stderr)
return False
# Now check if Isabella Romano exists in the loaded grid
isabella_exists = (
await page.locator("text=isabella.romano@premium.eu").count() > 0
)
if not isabella_exists:
# Try searching for the customer to be more thorough
try:
search_box = page.locator('input[placeholder*="Search by keyword"], input[name="search"], [data-role="search"]').first
if await search_box.count() > 0:
await search_box.clear()
await search_box.fill("isabella.romano@premium.eu")
await page.keyboard.press("Enter")
await page.wait_for_load_state("networkidle")
await page.wait_for_timeout(3000)
# Check again after search
isabella_exists = (
await page.locator("text=isabella.romano@premium.eu").count() > 0
)
# Also check for "No records found" message
no_records = await page.locator("text=We couldn't find any records., text=No records found").count() > 0
if no_records:
print(
"✗ Customer 'isabella.romano@premium.eu' not found - search returned no results",
file=sys.stderr,
)
return False
except Exception as e:
print(f"✗ Search failed: {str(e)}", file=sys.stderr)
if isabella_exists:
print(
"✓ Found customer with email 'isabella.romano@premium.eu'",
file=sys.stderr,
)
else:
print(
"✗ Customer 'isabella.romano@premium.eu' not found",
file=sys.stderr,
)
return False
# 3. Verify Dashboard Last Orders
print("\nVerifying Dashboard Last Orders...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/admin/dashboard/",
wait_until="networkidle",
)
await page.wait_for_timeout(2000)
# Check for Last Orders section
last_orders_exists = await page.locator("text=Last Orders").count() > 0
if last_orders_exists:
print("✓ Found 'Last Orders' section on dashboard", file=sys.stderr)
# Find the first customer in the table
# Look for the table after "Last Orders" heading
orders_table = (
page.locator("text=Last Orders")
.locator("..")
.locator("table")
.first
)
if await orders_table.count() > 0:
# Get the last row in tbody
last_row = orders_table.locator("tbody tr").last
if await last_row.count() > 0:
last_customer = await last_row.locator(
"td"
).first.inner_text()
print(
f"✓ Last customer in Last Orders: {last_customer}",
file=sys.stderr,
)
# Verify against expected answer if available
if expected_answer and "LastOrderCustomer" in expected_answer:
if last_customer == expected_answer["LastOrderCustomer"]:
print(
f"✓ Last Order Customer matches expected: {last_customer}",
file=sys.stderr,
)
else:
print(
f"✗ Last Order Customer mismatch: Expected '{expected_answer['LastOrderCustomer']}' but actual is '{last_customer}'",
file=sys.stderr,
)
return False
else:
print(
"Warning: 'Last Orders' section not found on dashboard",
file=sys.stderr,
)
# Summary of verification - only print if we reach this point (all checks passed)
print("\n=== Browser Verification Summary ===", file=sys.stderr)
print("✓ Magento Admin login successful", file=sys.stderr)
print(
"✓ Customer group 'Premium Europe' exists with correct tax class",
file=sys.stderr,
)
print("✓ Customer 'isabella.romano@premium.eu' found in system", file=sys.stderr)
print("✓ Customer counts verified", file=sys.stderr)
print("✓ Dashboard Last Orders section accessible", file=sys.stderr)
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/description.md
================================================
Our marketing team is planning a new promotion for our bestselling fitness products. We need to analyze the current performance of our top-selling items and their related promotions to optimize our strategy.
**Task Requirements:**
1. If login is required, log in with username 'admin' and password 'admin1234'.
2. Start by checking our current bestsellers:
- Identify the top 3 bestselling products based on their Price and Quantity - record their names, prices, and quantities sold
- Note the total Revenue amount displayed
- Check if any of these bestsellers appear in the Top Search Terms table - if yes, record the search term and its usage count, else output 'No:0'
3. Investigate these bestselling products in detail:
- For each of the top 3 bestsellers identified, search for them by name and record:
- Their SKU
- Current inventory quantity
- Whether they are 'Enabled' or 'Disabled'
4. Check if we have existing promotions for these products:
- Look for any active rules that might apply to fitness/yoga products
- Find if there's a rule offering percentage discount - record the rule name and discount percentage
- Count total number of active rules
5. Analyze customer purchasing patterns:
- Count total number of orders in the system
- Note the ID of the most recent order
6. Review our top customers who might be interested:
- Find the customer who appears in the Last Orders section of the dashboard with the highest total
   - Look up this customer in the All Customers list and record their email and customer group
- Count how many other customers are in the same group
7. Compile your findings and output them in the following exact format:
```
Bestseller1|name:price:quantity:sku:inventory:status
Bestseller2|name:price:quantity:sku:inventory:status
Bestseller3|name:price:quantity:sku:inventory:status
TotalRevenue|amount
BestsellerInSearch|term:count
PercentageDiscountRule|name:percentage
ActiveRulesCount|count
TotalOrders|count
MostRecentOrderID|id
TopCustomer|name:email:group
SameGroupCustomers|count
```
**Example Output:**
```
Bestseller1|Product Name:$XX.XX:X:XXX(SKU):X:Enabled/Disabled
Bestseller2|Product Name:$XX.XX:X:XXX(SKU):X:Enabled/Disabled
Bestseller3|Product Name:$XX.XX:X:XXX(SKU):X:Enabled/Disabled
TotalRevenue|$XX.XX
BestsellerInSearch|Term:X or None:0
PercentageDiscountRule|Rule Name:XX%
ActiveRulesCount|X
TotalOrders|X
MostRecentOrderID|X or None
TopCustomer|Customer Name:email@example.com:Group Name
SameGroupCustomers|X
```
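A minimal sketch (placeholder values only) of assembling the pipe-delimited report lines once the findings have been collected:
```
# Placeholder findings; the real values come from the admin panel.
bestsellers = [
    ("Product A", "$10.00", 5, "SKU-A", 100, "Enabled"),
    ("Product B", "$12.00", 4, "SKU-B", 80, "Enabled"),
    ("Product C", "$15.00", 3, "SKU-C", 60, "Disabled"),
]
lines = [
    f"Bestseller{i + 1}|{name}:{price}:{qty}:{sku}:{inventory}:{status}"
    for i, (name, price, qty, sku, inventory, status) in enumerate(bestsellers)
]
lines += [
    "TotalRevenue|$0.00",
    "BestsellerInSearch|No:0",
]
print("\n".join(lines))
```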
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/label.txt
================================================
Bestseller1|Sprite Stasis Ball 65 cm:$27.00:6:24-WG082-blue:100:Enabled
Bestseller2|Quest Lumaflex™ Band:$19.00:6:24-UG01:100:Enabled
Bestseller3|Sprite Yoga Strap 6 foot:$14.00:6:24-WG085:100:Enabled
TotalRevenue|$0.00
BestsellerInSearch|No:0
PercentageDiscountRule|20% OFF Ever $200-plus purchase!*:20%
ActiveRulesCount|4
TotalOrders|308
MostRecentOrderID|000000299
TopCustomer|Sarah Miller:sarah.miller@example.com:General
SameGroupCustomers|70
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/meta.json
================================================
{
"task_id": "fitness_promotion_strategy",
"task_name": "Fitness Promotion Strategy",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Develop fitness product promotion campaigns by analyzing sales data, creating targeted offers, configuring promotional rules, and implementing cross-selling strategies in admin dashboard.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"inventory management",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, 'r') as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if message.get('role') == 'assistant' and message.get('status') == 'completed':
content = message.get('content', [])
for item in content:
if item.get('type') == 'output_text':
return item.get('text', '')
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split('\n')
# Skip the check for exact number of lines - just parse what we have
# if len(lines) != 13:
# print(f"Error: Expected 13 lines in answer, got {len(lines)}", file=sys.stderr)
# return None
for line in lines:
if '|' in line:
key, value = line.split('|', 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, 'r') as f:
lines = f.read().strip().split('\n')
expected = {}
for line in lines:
if '|' in line:
key, value = line.split('|', 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, '')
# Special handling for different types of values
if key in ['Bestseller1', 'Bestseller2', 'Bestseller3']:
# Check if all parts match (name:price:quantity:sku:inventory:status)
if ':' in expected_value and ':' in model_value:
expected_parts = expected_value.split(':')
model_parts = model_value.split(':')
if len(expected_parts) == 6 and len(model_parts) == 6:
# Compare each part
for i, (exp, mod) in enumerate(zip(expected_parts, model_parts)):
if i == 1: # Price field
exp_clean = exp.replace('$', '').replace(',', '')
mod_clean = mod.replace('$', '').replace(',', '')
if exp_clean != mod_clean:
mismatches.append(f"{key} price: expected '{exp}', got '{mod}'")
elif i == 4: # Inventory field (may have decimal places)
exp_float = float(exp.replace(',', ''))
mod_float = float(mod.replace(',', ''))
if abs(exp_float - mod_float) > 0.0001:
mismatches.append(f"{key} inventory: expected '{exp}', got '{mod}'")
else:
if exp.lower() != mod.lower():
mismatches.append(f"{key} part {i}: expected '{exp}', got '{mod}'")
else:
mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'LowestInventoryProduct':
# Check product name and inventory
if ':' in expected_value and ':' in model_value:
expected_name, expected_inv = expected_value.rsplit(':', 1)
model_name, model_inv = model_value.rsplit(':', 1)
if expected_name.lower() != model_name.lower():
mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'")
exp_float = float(expected_inv.replace(',', ''))
mod_float = float(model_inv.replace(',', ''))
if abs(exp_float - mod_float) > 0.0001:
mismatches.append(f"{key} inventory: expected '{expected_inv}', got '{model_inv}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ['TotalRevenue', 'MinimumPurchaseRule']:
# For price/amount fields, normalize format
expected_clean = expected_value.replace('$', '').replace(',', '')
model_clean = model_value.replace('$', '').replace(',', '')
if expected_clean != model_clean:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'BestsellerInSearch':
# Check search term and count
if expected_value.lower() != model_value.lower():
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'PercentageDiscountRule':
# Check rule name and percentage
if ':' in expected_value and ':' in model_value:
expected_name, expected_pct = expected_value.rsplit(':', 1)
model_name, model_pct = model_value.rsplit(':', 1)
if expected_name != model_name:
mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'")
# Normalize percentage (20% vs 20 vs 0.20)
exp_pct_clean = expected_pct.replace('%', '').strip()
mod_pct_clean = model_pct.replace('%', '').strip()
if exp_pct_clean != mod_pct_clean:
mismatches.append(f"{key} percentage: expected '{expected_pct}', got '{model_pct}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'TopCustomer':
# Check name:email:group
if ':' in expected_value and ':' in model_value:
expected_parts = expected_value.split(':')
model_parts = model_value.split(':')
if len(expected_parts) == 3 and len(model_parts) == 3:
exp_name, exp_email, exp_group = expected_parts
mod_name, mod_email, mod_group = model_parts
if exp_name != mod_name:
mismatches.append(f"{key} name: expected '{exp_name}', got '{mod_name}'")
if exp_email.lower() != mod_email.lower():
mismatches.append(f"{key} email: expected '{exp_email}', got '{mod_email}'")
if exp_group.lower() != mod_group.lower():
mismatches.append(f"{key} group: expected '{exp_group}', got '{mod_group}'")
else:
mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'MostRecentOrderDate':
# Date format may vary, do flexible comparison
if expected_value.lower() == 'none' and model_value.lower() == 'none':
continue
elif expected_value != model_value:
# Could add more flexible date parsing here if needed
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
else:
# Exact match for other fields (counts, etc.)
if str(model_value) != str(expected_value):
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the bestseller analysis and promotion task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print("Warning: Could not parse answer format from model response", file=sys.stderr)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/description.md
================================================
Perform a comprehensive marketing and customer analysis workflow in the Magento Admin panel to understand search behavior patterns and promotional effectiveness.
**Task Requirements:**
1. First, we need to access the system to begin our comprehensive analysis:
   If login is required, log in with username 'admin' and password 'admin1234'.
2. Let's start by analyzing customer search behavior to understand what customers are looking for:
Go to Search Terms in Reports and analyze the search data:
- Identify the TOP 2 search terms with the highest number of hits (record exact terms and hit counts)
- Find a search term that has 0 results but still has search hits (record exact term and hit count)
- Count the total number of search terms displayed in the report
3. Next, we'll examine our promotional strategies to understand current marketing efforts:
Navigate to Cart Price Rules and identify:
- Find ALL rules that contain a coupon code
- Record the exact coupon codes and the complete rule names for each
- Count how many active rules exist in total
4. Now let's analyze our email marketing reach and subscriber engagement:
Go to Newsletter Subscribers:
- Apply filter to show only 'Subscribed' status
- Count the total number of subscribed users showing after filter
- Verify whether these TWO emails appear in the subscribed list:
* john.smith.xyz@gmail.com
* admin@magento.com
5. To support our analysis, we need to create test customer profiles for different segments:
Create TWO new customers with the following details:
Customer 1:
- First Name: Marketing1
- Last Name: Analy
- Email: marketdata1.analysis@magento.com
- Associate to Website: Main Website
- Group: General
Customer 2:
- First Name: Analytics1
- Last Name: Report
- Email: analytics1.report@magento.com
- Associate to Website: Main Website
- Group: Wholesale
6. Finally, let's review overall business performance metrics from the main dashboard:
Go to Dashboard and identify:
- The names and sales quantities of the products that are both the best-selling and most expensive
- The total revenue displayed on the dashboard
7. Compile all your findings and output them in the following exact format:
```
Top2SearchTerms|term1:hits1,term2:hits2
ZeroResultTerm|term:hits
TotalSearchTerms|count
CouponCodes|code1:rulename1,code2:rulename2
ActiveRulesCount|count
SubscribedCount|count
EmailVerification|john.smith.xyz@gmail.com:yes/no,admin@magento.com:yes/no
TopProduct|name:quantity
TotalRevenue|amount
```
**Example Output:**
```
Top2SearchTerms|term1:XX,term2:XX
ZeroResultTerm|term:XX
TotalSearchTerms|XX
CouponCodes|CODE:Rule Name Here
ActiveRulesCount|X
SubscribedCount|XX
EmailVerification|john.smith.xyz@gmail.com:yes/no,admin@magento.com:yes/no
TopProduct|Product Name:XX
TotalRevenue|$XX.XX
```
**Success Criteria:**
- Successfully logged into Magento Admin
- Navigated to Search Terms Report and identified top 2 terms
- Found search term with 0 results but has hits
- Counted total search terms in report
- Located all Cart Price Rules with coupon codes
- Extracted exact coupon codes and rule names
- Counted active rules
- Filtered Newsletter Subscribers by 'Subscribed' status
- Counted total subscribed users
- Verified presence of two specific email addresses
- Created two new customers successfully
- Found top bestselling product from dashboard
- Identified total revenue from dashboard
- Output answer in exact format with 9 data lines
- Answer wrapped in <answer></answer> tags
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/label.txt
================================================
Top2SearchTerms|hollister:19,Joust Bag:4
ZeroResultTerm|nike:3
TotalSearchTerms|7
CouponCodes|H20:$4 Luma water bottle (save 70%)
ActiveRulesCount|4
SubscribedCount|1
EmailVerification|john.smith.xyz@gmail.com:yes,admin@magento.com:no
TopProduct|Sprite Stasis Ball 65 cm:6
TotalRevenue|$0.00
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/meta.json
================================================
{
"task_id": "marketing_customer_analysis",
"task_name": "Marketing Customer Analysis",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Analyze customer behavior patterns using admin analytics, segment user demographics, track purchase histories, evaluate campaign effectiveness, and generate comprehensive marketing intelligence reports.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read the base URL from the environment (shopping_admin injects http://localhost:7780/admin); fall back to localhost by default
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:7780/admin").rstrip("/")
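# Example invocation (illustrative): WEBARENA_BASE_URL=http://<host>:7780/admin python verify.py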
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the multi-line <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 9:
print(f"Error: Expected 9 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "Top2SearchTerms":
# Check if both search terms are present with correct counts
expected_terms = expected_value.split(",")
model_terms = model_value.split(",")
if set(expected_terms) != set(model_terms):
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "EmailVerification":
# Check email verification status
expected_emails = dict(
item.split(":") for item in expected_value.split(",")
)
model_emails = dict(
item.split(":") for item in model_value.split(",") if ":" in item
)
if expected_emails != model_emails:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "CouponCodes":
# Check if coupon code and rule name are present
if "H20" not in model_value or "Luma water bottle" not in model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "TopProduct":
# Check if product name and quantity match
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the marketing analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
print("Will proceed with browser verification only", file=sys.stderr)
else:
print(
"No model response found, proceeding with browser verification",
file=sys.stderr,
)
# Browser verification - only check customer creation (the critical task requirement)
print("\n=== Starting Browser Verification ===", file=sys.stderr)
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Navigate to Magento Admin
print("Navigating to Magento Admin...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/", wait_until="networkidle"
)
# Check if already logged in, if not, login
if "dashboard" not in page.url.lower():
print("Logging into Magento Admin...", file=sys.stderr)
await page.fill('input[name="login[username]"]', "admin")
await page.fill('input[name="login[password]"]', "admin1234")
await page.click('button:has-text("Sign in")')
await page.wait_for_load_state("networkidle")
if "dashboard" not in page.url.lower():
print("Error: Login failed", file=sys.stderr)
return False
print("Successfully logged into Magento Admin", file=sys.stderr)
# Verify Customer Creation (the only critical check for task completion)
print("Verifying Customer Creation...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/customer/index/",
wait_until="networkidle",
)
# Wait for the customer grid to load
try:
await page.wait_for_selector("table", timeout=15000)
except PlaywrightTimeoutError:
print("Table not found, trying to proceed anyway...", file=sys.stderr)
# Define customer requirements
customer1_requirements = {
"email": "marketdata1.analysis@magento.com",
"first_name": "Marketing1",
"last_name": "Analy",
"group": "General",
"website": "Main Website"
}
customer2_requirements = {
"email": "analytics1.report@magento.com",
"first_name": "Analytics1",
"last_name": "Report",
"group": "Wholesale",
"website": "Main Website"
}
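# Verification strategy for each customer (summary of the helper below): first confirm the
# email appears anywhere in the grid (using the keyword search if needed), then try to match
# first name, last name, and group within the same table row; if row-scoped matching is not
# possible, fall back to scanning the whole page content.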
async def check_customer_exists(customer_requirements):
"""Check if a customer exists by looking for their details in the customer grid"""
email = customer_requirements["email"]
first_name = customer_requirements["first_name"]
last_name = customer_requirements["last_name"]
group = customer_requirements["group"]
# First check if email exists in current page without searching
email_found = await page.locator(f"*:has-text('{email}')").count() > 0
if not email_found:
# Try searching for the customer
try:
search_box = page.locator('input[placeholder*="Search by keyword"]').first
await search_box.clear()
await search_box.fill(email)
await page.keyboard.press("Enter")
await page.wait_for_load_state("networkidle")
await page.wait_for_timeout(2000)
# Check again after search
email_found = await page.locator(f"*:has-text('{email}')").count() > 0
except Exception:
# Ignore search failures; email_found stays False and is handled below
pass
if not email_found:
return False, f"Email {email} not found"
# More precise validation: find the row containing this customer's email
# Then check if the required fields are in the same row or nearby context
try:
# Find the specific row containing this email
email_cell = page.locator(f"td:has-text('{email}')").first
if await email_cell.count() == 0:
# Fall back to broader search
email_cell = page.locator(f"*:has-text('{email}')").first
# Get the parent row or container
row = email_cell.locator("xpath=ancestor::tr[1]")
if await row.count() == 0:
# Fall back to getting nearby content
row = email_cell.locator("xpath=..")
# Get the text content of the row/container
row_text = await row.text_content() if await row.count() > 0 else ""
# If we can't get a specific row, fall back to broader validation
if not row_text or len(row_text.strip()) < 10:
# Search in nearby cells or elements
nearby_elements = page.locator(f"*:has-text('{email}')").locator("xpath=../following-sibling::* | ../preceding-sibling::*")
nearby_count = await nearby_elements.count()
nearby_text = ""
for i in range(min(nearby_count, 5)): # Check up to 5 nearby elements
element_text = await nearby_elements.nth(i).text_content()
if element_text:
nearby_text += element_text + " "
row_text = row_text + " " + nearby_text
# Check if required fields are present in the row/context
required_fields = [first_name, last_name, group]
found_fields = [email] # Email is already confirmed
missing_fields = []
for field in required_fields:
if field in row_text:
found_fields.append(field)
else:
missing_fields.append(field)
if missing_fields:
return False, f"Customer found but missing fields in row context: {', '.join(missing_fields)}. Row text: {row_text[:100]}..."
return True, f"Customer verified with all required fields: {', '.join(found_fields)}"
except Exception as e:
# Fall back to original simple validation
page_content = await page.content()
required_fields = [first_name, last_name, group, email]
found_fields = []
missing_fields = []
for field in required_fields:
if field in page_content:
found_fields.append(field)
else:
missing_fields.append(field)
if missing_fields:
return False, f"Customer found but missing fields (fallback): {', '.join(missing_fields)}"
return True, f"Customer verified with all required fields (fallback): {', '.join(found_fields)}"
# Check both customers
customer1_exists, customer1_msg = await check_customer_exists(customer1_requirements)
customer2_exists, customer2_msg = await check_customer_exists(customer2_requirements)
print(
f"Customer 1 (marketdata1.analysis@magento.com): {'Found' if customer1_exists else 'Not Found'} - {customer1_msg}",
file=sys.stderr,
)
print(
f"Customer 2 (analytics1.report@magento.com): {'Found' if customer2_exists else 'Not Found'} - {customer2_msg}",
file=sys.stderr,
)
if not (customer1_exists and customer2_exists):
print("Error: Required customers were not found in the system", file=sys.stderr)
return False
print("✓ Both required customers found in the system", file=sys.stderr)
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/description.md
================================================
Our company is planning to expand sales operations to New York state and needs a comprehensive analysis of our current sales performance and tax implications. Please help me gather critical data for our expansion feasibility report.
**Task Requirements:**
1. Log in with username 'admin' and password 'admin1234'
2. First, analyze our current sales performance on the dashboard:
- Check the 'Lifetime Sales' amount displayed
- In the Bestsellers table, identify which product has the lowest price and record its exact name, price, and quantity sold
- Check whether this same product appears in the 'Last Orders' table; if so, note which customer(s) ordered it, otherwise note 'No'
3. Since we're expanding to New York, we need to check taxes:
- Find and record the exact tax rate for New York state
- Compare it with California's tax rate - record which state has a higher rate
- Count how many different US states currently have tax configurations
4. You need to understand the order status configuration used for processing in the NY market:
- Filter the order statuses to show only those with 'Visible On Storefront = Yes'
- Among these visible statuses, identify whether one has the status code 'processing' (Yes or No)
- Check if this 'processing' status is set as a 'Default Status' (Yes or No)
5. Since New York orders might need special handling, check all stores:
- Note the number of websites configured
- Record the store code for the first Main Website Store
6. For inventory planning, check the inventory sources:
- Check if the Default Source is currently 'Enabled' or shows as 'Disabled' for Pickup Location
- Click the 'Edit' link for the Default Source and check if there's a 'State/Province' field (Yes or No)
7. Finally, return to the Dashboard and examine the revenue metrics:
- Record the current Revenue amount shown
- Check if Tax and Shipping amounts are both $0.00 (Yes or No)
**Please provide your findings in the following exact format:**
```
Lifetime_Sales_Amount|amount
Cheap_Bestseller_Name|name
Second_Bestseller_Price|price
Second_Bestseller_Quantity|quantity
Product_In_Last_Orders|yes_or_no
NY_Tax_Rate|rate
CA_Tax_Rate|rate
Higher_Tax_State|state
Total_States_With_Tax|count
Processing_Visible_Storefront|Yes_or_No
Processing_Default_Status|Yes_or_No
Number_Of_Websites|count
Main_Store_Code|code
Default_Source_Pickup_Status|status
Default_Source_State|state_or_none
Dashboard_Revenue|amount
Tax_Shipping_Zero|yes_or_no
```
**Example Output:**
```
Lifetime_Sales_Amount|$XX.XX
Cheap_Bestseller_Name|Product Name Here
Second_Bestseller_Price|$XX.XX
Second_Bestseller_Quantity|XX
Product_In_Last_Orders|Yes/No
NY_Tax_Rate|X.XXXX
CA_Tax_Rate|X.XXXX
Higher_Tax_State|XX
Total_States_With_Tax|XX
Processing_Visible_Storefront|Yes/No
Processing_Default_Status|Yes/No
Number_Of_Websites|X
Main_Store_Code|code_here
Default_Source_Pickup_Status|Enabled/Disabled
Default_Source_State|State or None
Dashboard_Revenue|$XX.XX
Tax_Shipping_Zero|Yes/No
```
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/label.txt
================================================
Lifetime_Sales_Amount|$0.00
Cheap_Bestseller_Name|Sprite Yoga Strap 6 foot
Second_Bestseller_Price|$14.00
Second_Bestseller_Quantity|6
Product_In_Last_Orders|No
NY_Tax_Rate|8.3750
CA_Tax_Rate|8.2500
Higher_Tax_State|NY
Total_States_With_Tax|2
Processing_Visible_Storefront|Yes
Processing_Default_Status|Yes
Number_Of_Websites|1
Main_Store_Code|main_website_store
Default_Source_Pickup_Status|Enabled
Default_Source_State|No
Dashboard_Revenue|$0.00
Tax_Shipping_Zero|Yes
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/meta.json
================================================
{
"task_id": "ny_expansion_analysis",
"task_name": "NY Expansion Analysis",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Prepare New York market expansion strategy by analyzing regional demographics, evaluating competitor presence, assessing logistics requirements, and creating detailed market entry plan.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("ERROR: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
# Check if file exists
if not Path(messages_path).exists():
print(f"ERROR: Messages file not found at path: {messages_path}", file=sys.stderr)
return None
try:
with open(messages_path, 'r') as f:
content = f.read()
# Check if file is empty
if not content or content.strip() == '""':
print("ERROR: Messages file is empty or contains only empty string", file=sys.stderr)
return None
messages = json.loads(content)
# Check if messages is a list
if not isinstance(messages, list):
print(f"ERROR: Messages file should contain a list, got {type(messages).__name__}", file=sys.stderr)
return None
# Find the last assistant message
for message in reversed(messages):
if message.get('role') == 'assistant' and message.get('status') == 'completed':
content = message.get('content', [])
if not content:
print("WARNING: Assistant message has empty content", file=sys.stderr)
continue
for item in content:
if item.get('type') == 'output_text':
text = item.get('text', '')
if not text:
print("WARNING: Output text is empty", file=sys.stderr)
continue
return text
print("ERROR: No assistant response with output_text found in messages", file=sys.stderr)
return None
except json.JSONDecodeError as e:
print(f"ERROR: Invalid JSON in messages file: {str(e)}", file=sys.stderr)
return None
except Exception as e:
print(f"ERROR: Unexpected error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
print("ERROR: No text provided to parse", file=sys.stderr)
return None
# Look for <answer>...</answer> pattern
match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL)
if not match:
print("ERROR: No tags found in the response", file=sys.stderr)
print(f" Response preview: {text[:200]}...", file=sys.stderr)
return None
answer_content = match.group(1).strip()
if not answer_content:
print("ERROR: Empty content between tags", file=sys.stderr)
return None
# Parse each line
result = {}
lines = answer_content.split('\n')
# Expected keys that should be present
expected_keys = [
'Lifetime_Sales_Amount', 'Cheap_Bestseller_Name', 'Second_Bestseller_Price',
'Second_Bestseller_Quantity', 'Product_In_Last_Orders', 'NY_Tax_Rate',
'CA_Tax_Rate', 'Higher_Tax_State', 'Total_States_With_Tax',
'Processing_Visible_Storefront', 'Processing_Default_Status',
'Number_Of_Websites', 'Main_Store_Code', 'Default_Source_Pickup_Status',
'Default_Source_State', 'Dashboard_Revenue', 'Tax_Shipping_Zero'
]
parsed_keys = []
for line in lines:
line = line.strip()
if not line:
continue
if '|' not in line:
print(f"ERROR: Line missing pipe separator '|': {line}", file=sys.stderr)
continue
parts = line.split('|', 1)
if len(parts) != 2:
print(f"ERROR: Invalid line format: {line}", file=sys.stderr)
continue
key, value = parts
key = key.strip()
value = value.strip()
if not key:
print(f"ERROR: Empty key in line: {line}", file=sys.stderr)
continue
result[key] = value
parsed_keys.append(key)
# Check for missing expected keys
missing_keys = set(expected_keys) - set(parsed_keys)
if missing_keys:
print(f"ERROR: Missing expected keys: {', '.join(sorted(missing_keys))}", file=sys.stderr)
# Check for unexpected keys
unexpected_keys = set(parsed_keys) - set(expected_keys)
if unexpected_keys:
print(f"WARNING: Unexpected keys found: {', '.join(sorted(unexpected_keys))}", file=sys.stderr)
if not result:
print("ERROR: No valid key-value pairs parsed from answer", file=sys.stderr)
return None
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, 'r') as f:
lines = f.read().strip().split('\n')
expected = {}
for line in lines:
if '|' in line:
key, value = line.split('|', 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, '')
# Special handling for different types of values
if key in ['Lifetime_Sales_Amount', 'Second_Bestseller_Price', 'Dashboard_Revenue']:
# For price/amount fields, normalize format
expected_clean = expected_value.replace('$', '').replace(',', '')
model_clean = model_value.replace('$', '').replace(',', '')
if expected_clean != model_clean:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ['NY_Tax_Rate', 'CA_Tax_Rate']:
# Tax rates - allow different decimal formats
expected_clean = expected_value.replace('%', '').strip()
model_clean = model_value.replace('%', '').strip()
# Convert to float for comparison
try:
if float(expected_clean) != float(model_clean):
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
except ValueError:
if expected_clean != model_clean:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ['Product_In_Last_Orders', 'Processing_Visible_Storefront', 'Processing_Default_Status', 'Tax_Shipping_Zero']:
# Yes/No fields - case insensitive
if model_value.lower() != expected_value.lower():
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'Empty_Rows_Yes_Effect':
# Allow flexible descriptions for this field
# Just check if model provided some reasonable description
if not model_value or len(model_value) < 5:
mismatches.append(f"{key}: expected meaningful description, got '{model_value}'")
elif key == 'Order_Status_Options':
# Check if main options are mentioned
expected_options = set(opt.strip() for opt in expected_value.split(','))
model_options = set(opt.strip() for opt in model_value.split(','))
if expected_options != model_options:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'Chart_Disabled_Message':
# Allow some flexibility in message text
# Check for key words
if 'disabled' not in model_value.lower() and 'enable' not in model_value.lower():
mismatches.append(f"{key}: expected message about chart being disabled, got '{model_value}'")
elif key == 'Default_Source_State':
# Handle 'None' or empty state
expected_normalized = expected_value.lower() if expected_value.lower() != 'none' else ''
model_normalized = model_value.lower() if model_value.lower() != 'none' else ''
if expected_normalized != model_normalized:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the NY expansion analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
print("\n=== Starting Verification ===", file=sys.stderr)
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
print("Loading expected answer from label.txt...", file=sys.stderr)
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("FATAL ERROR: Could not load expected answer from label.txt", file=sys.stderr)
return False
print(f"Expected answer loaded with {len(expected_answer)} keys", file=sys.stderr)
# Get model's response from MCP_MESSAGES
print("\nReading model response from MCP_MESSAGES...", file=sys.stderr)
model_response = get_model_response()
if not model_response:
print("FATAL ERROR: No valid model response found", file=sys.stderr)
return False
print(f"Model response found (length: {len(model_response)} chars)", file=sys.stderr)
print("\nParsing answer format from model response...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if not model_answer:
print("FATAL ERROR: Could not parse answer format from model response", file=sys.stderr)
return False
print(f"\n=== Model Answer Parsed Successfully ===", file=sys.stderr)
print(f"Parsed {len(model_answer)} key-value pairs", file=sys.stderr)
for key, value in model_answer.items():
print(f" {key}: {value}", file=sys.stderr)
# Compare answers
print("\n=== Comparing Model Answer with Expected Answer ===", file=sys.stderr)
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nFATAL ERROR: Model answer does not match expected answer", file=sys.stderr)
print("Verification FAILED", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
print("Verification PASSED", file=sys.stderr)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/description.md
================================================
Perform a comprehensive products and sales analysis in the Magento Admin panel to identify inventory status and sales performance metrics.
**Task Requirements:**
1. If login is required, log in with username 'admin' and password 'admin1234'
2. Analyze product inventory and catalog details, perform the following:
- Search for all products containing 'Yoga' in their name - count the exact number of results
- Clear the search and find the product with SKU 'WH11' - record its exact price
- Apply a filter to show only products with Quantity = 0.0000 - count how many products match
3. To identify top-selling products and revenue metrics, navigate to the Dashboard and from the Bestsellers table:
- Identify the product with the lowest price and lowest quantity - record the product name and quantity sold
- Find the second cheapest product in the table - record its exact quantity sold
- Note the total Revenue amount displayed in the dashboard
4. Gather all customers' information and demographics:
- Find customer 'Sarah Miller' - record her exact email address
- Count the total number of customers shown in the grid
5. To review order status and customer purchase history, go to the Orders grid under Sales:
- Count the total number of orders with 'Pending' status
- Find the order ID (starting with "000") of Grace Nguyen's most expensive order with Complete status
6. To provide a comprehensive report of all gathered data, compile all your findings and output them in the following exact format:
```
YogaProducts|count
WH11Price|price
ZeroQuantityProducts|count
LowestProduct|name:quantity
QuestLumaflexQuantity|quantity
DashboardRevenue|amount
SarahMillerEmail|email
TotalCustomers|count
PendingOrders|count
GraceNguyenOrderID|orderid
```
**Example Output:**
```
YogaProducts|XX
WH11Price|$XX.XX
ZeroQuantityProducts|XX
LowestProduct|Product Name Here:XX
QuestLumaflexQuantity|XX
DashboardRevenue|$XX.XX
SarahMillerEmail|email@example.com
TotalCustomers|XX
PendingOrders|X
GraceNguyenOrderID|00000XXXX
```
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/label.txt
================================================
YogaProducts|171
WH11Price|$54.00
ZeroQuantityProducts|150
LowestProduct|Sprite Stasis Ball 55 cm foot:5
QuestLumaflexQuantity|6
DashboardRevenue|$0.00
SarahMillerEmail|helloworld@yahoo.com
TotalCustomers|72
PendingOrders|10
GraceNguyenOrderID|000000189
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/meta.json
================================================
{
"task_id": "products_sales_analysis",
"task_name": "Products Sales Analysis",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Generate comprehensive sales performance reports by extracting product metrics, analyzing revenue trends, identifying top performers, evaluating inventory turnover, and creating actionable insights.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
print("Error: No text provided to parse", file=sys.stderr)
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
print("Error: No ... tags found in response", file=sys.stderr)
return None
answer_content = match.group(1).strip()
if not answer_content:
print("Error: Empty answer content", file=sys.stderr)
return None
# Parse each line
result = {}
lines = [line.strip() for line in answer_content.split("\n") if line.strip()]
if len(lines) != 10:
print(f"Error: Expected 10 lines in answer, got {len(lines)}", file=sys.stderr)
print(f"Lines found: {lines}", file=sys.stderr)
return None
# Expected keys for validation
expected_keys = [
"YogaProducts", "WH11Price", "ZeroQuantityProducts", "LowestProduct",
"QuestLumaflexQuantity", "DashboardRevenue", "SarahMillerEmail",
"TotalCustomers", "PendingOrders", "GraceNguyenOrderID"
]
for line in lines:
if "|" not in line:
print(f"Error: Line missing '|' separator: {line}", file=sys.stderr)
return None
parts = line.split("|", 1)
if len(parts) != 2:
print(f"Error: Invalid line format: {line}", file=sys.stderr)
return None
key, value = parts[0].strip(), parts[1].strip()
if not key or not value:
print(f"Error: Empty key or value in line: {line}", file=sys.stderr)
return None
result[key] = value
# Validate all expected keys are present
missing_keys = set(expected_keys) - set(result.keys())
if missing_keys:
print(f"Error: Missing required keys: {missing_keys}", file=sys.stderr)
return None
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "LowestProduct":
# Check if product name and quantity match (format: "Product Name:quantity")
if ":" in expected_value and ":" in model_value:
expected_name, expected_qty = expected_value.rsplit(":", 1)
model_name, model_qty = model_value.rsplit(":", 1)
if expected_name != model_name or expected_qty != model_qty:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key in ["WH11Price", "DashboardRevenue"]:
# For price/amount fields, normalize format
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "SarahMillerEmail":
# Email should match exactly
if model_value.lower() != expected_value.lower():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the products and sales analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/description.md
================================================
Perform a comprehensive sales and inventory analysis by extracting specific metrics from multiple sections of the Magento Admin panel.
**Task Requirements:**
1. Login with username 'admin' and password 'admin1234'
2. To analyze product inventory and identify key items, check all products:
- Search for all products containing 'Sprite' in their name - count the exact number of results
- Clear the search and filter products by Quantity = 100.0000 - count how many products match
- Find the product with SKU 'WS12' - record its exact name and price
3. To understand sales performance and order status, we need to check all orders:
- Search for all orders with 'Pending' status - count the total number
- Find Grace Nguyen's cheapest order with Complete status - record the order ID (starts with "000")
- Find the order with the highest Grand Total - record the customer name and amount
4. To examine bestselling products and search trends, from the main page:
- In the Bestsellers table, identify the product with the most quantity and the lowest price - record its name and quantity sold
- Find 'Overnight Duffle' and record its exact price
- In the Top Search Terms table, find 'hollister' and record its position number (1st, 2nd, etc.)
5. To analyze customer demographics and account information, go to All Customers:
- Search for customers whose email address contains 'costello' - count the results
- Find Sarah Miller's customer record - record her Group and her 'Customer Since' date
6. To review payment status and billing information, navigate to Invoices:
- Find all invoices with 'Paid' status - count them
- Find the invoice for order #000000002 - record the Bill-to Name
7. To provide a comprehensive report of all gathered data, compile all findings and output them in the following exact format:
```
SpriteProducts|count
Quantity100Products|count
WS12Info|name:price
PendingOrders|count
GraceOrderID|orderid
HighestOrderInfo|customer:amount
CheapProduct|name:quantity
OvernightDufflePrice|price
HollisterPosition|position
CostelloCustomers|count
SarahMillerInfo|group:date
PaidInvoices|count
Invoice002BillTo|name
```
**Example Output:**
```
SpriteProducts|XX
Quantity100Products|XX
WS12Info|Product Name Here:$XX.XX
PendingOrders|X
GraceOrderID|00000XXXX
HighestOrderInfo|Customer Name:$XXX.XX
CheapProduct|Product Name:XX
OvernightDufflePrice|$XX.XX
HollisterPosition|Xth
CostelloCustomers|X
SarahMillerInfo|Group Name:MMM DD, YYYY
PaidInvoices|X
Invoice002BillTo|Customer Name
```
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/label.txt
================================================
SpriteProducts|16
Quantity100Products|1886
WS12Info|Radiant Tee:$22.00
PendingOrders|10
GraceOrderID|000000114
HighestOrderInfo|Samantha Jones:$292.40
CheapProduct|Sprite Yoga Strap 6 foot:6
OvernightDufflePrice|$45.00
HollisterPosition|1st
CostelloCustomers|0
SarahMillerInfo|General:Apr 19, 2023 5:45:07 PM
PaidInvoices|2
Invoice002BillTo|Veronica Costello
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/meta.json
================================================
{
"task_id": "sales_inventory_analysis",
"task_name": "Sales Inventory Analysis",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Analyze sales patterns and inventory levels to optimize stock management, identify slow-moving items, predict demand trends, and generate restocking recommendations.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"inventory management"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message with type='message', status='completed'
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
# Check for both 'text' and 'output_text' types
if item.get("type") in ["text", "output_text"]:
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
print("ERROR: No text provided to parse", file=sys.stderr)
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
print("ERROR: No ... tags found in the response", file=sys.stderr)
print("Response text preview (first 200 chars):", text[:200], file=sys.stderr)
return None
answer_content = match.group(1).strip()
print(f"Found answer content with {len(answer_content)} characters", file=sys.stderr)
# Parse each line
result = {}
lines = answer_content.split("\n")
# Expected keys for this task
expected_keys = [
"SpriteProducts", "Quantity100Products", "WS12Info", "PendingOrders",
"GraceOrderID", "HighestOrderInfo", "CheapProduct", "OvernightDufflePrice",
"HollisterPosition", "CostelloCustomers", "SarahMillerInfo",
"PaidInvoices", "Invoice002BillTo"
]
if len(lines) != 13:
print(f"ERROR: Expected 13 lines in answer, got {len(lines)}", file=sys.stderr)
print(f"Lines found: {lines}", file=sys.stderr)
return None
for i, line in enumerate(lines, 1):
if "|" not in line:
print(f"ERROR: Line {i} does not contain pipe separator '|': '{line}'", file=sys.stderr)
return None
parts = line.split("|", 1)
if len(parts) != 2:
print(f"ERROR: Line {i} could not be split into key|value: '{line}'", file=sys.stderr)
return None
key, value = parts
result[key.strip()] = value.strip()
# Check if all expected keys are present
missing_keys = set(expected_keys) - set(result.keys())
if missing_keys:
print(f"ERROR: Missing expected keys: {missing_keys}", file=sys.stderr)
print(f"Keys found: {list(result.keys())}", file=sys.stderr)
return None
# Check for unexpected keys
extra_keys = set(result.keys()) - set(expected_keys)
if extra_keys:
print(f"WARNING: Unexpected keys found: {extra_keys}", file=sys.stderr)
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "WS12Info":
# Check if product name and price match (format: name:price)
if ":" in expected_value and ":" in model_value:
expected_name, expected_price = expected_value.rsplit(":", 1)
model_name, model_price = model_value.rsplit(":", 1)
# Normalize price format
expected_price_clean = expected_price.replace("$", "").replace(",", "")
model_price_clean = model_price.replace("$", "").replace(",", "")
if (
expected_name != model_name
or expected_price_clean != model_price_clean
):
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "GraceOrderID":
# Order ID should start with "000" and match exactly
if not model_value.startswith("000"):
mismatches.append(
f"{key}: expected to start with '000', got '{model_value}'"
)
elif model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "HighestOrderInfo":
# Check format customer:amount
if ":" in expected_value and ":" in model_value:
expected_customer, expected_amount = expected_value.rsplit(":", 1)
model_customer, model_amount = model_value.rsplit(":", 1)
# Normalize amount format
expected_amount_clean = expected_amount.replace("$", "").replace(
",", ""
)
model_amount_clean = model_amount.replace("$", "").replace(",", "")
if (
expected_customer != model_customer
or expected_amount_clean != model_amount_clean
):
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
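# 'Position2Product' does not appear in this task's label.txt, so the next branch is not
# exercised when comparing against the expected answer.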
elif key == "Position2Product":
# Check if product name and quantity match
if ":" in expected_value and ":" in model_value:
expected_name, expected_qty = expected_value.rsplit(":", 1)
model_name, model_qty = model_value.rsplit(":", 1)
if expected_name != model_name or expected_qty != model_qty:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "OvernightDufflePrice":
# Normalize price format
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "HollisterPosition":
# Position format (1st, 2nd, 3rd, etc.)
if model_value.lower() != expected_value.lower():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "SarahMillerInfo":
# Format: group:date
if ":" in expected_value and ":" in model_value:
expected_group, expected_date = expected_value.split(":", 1)
model_group, model_date = model_value.split(":", 1)
# Allow some flexibility in date format
if expected_group != model_group:
mismatches.append(
f"{key}: expected group '{expected_group}', got '{model_group}'"
)
# For date, check if key parts match
if not (expected_date in model_date or model_date in expected_date):
mismatches.append(
f"{key}: expected date '{expected_date}', got '{model_date}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "Invoice002BillTo":
# Name should match exactly
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for count fields and other numeric values
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the sales and inventory analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
print("\n" + "="*60, file=sys.stderr)
print("Starting verification of Task 5", file=sys.stderr)
print("="*60, file=sys.stderr)
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
print("\n--- Loading Expected Answer ---", file=sys.stderr)
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("FATAL ERROR: Could not load expected answer from label.txt", file=sys.stderr)
return False
print(f"Successfully loaded {len(expected_answer)} expected values", file=sys.stderr)
# Get model's response from MCP_MESSAGES
print("\n--- Loading Model Response ---", file=sys.stderr)
model_response = get_model_response()
if not model_response:
print("FATAL ERROR: No model response found in MCP_MESSAGES", file=sys.stderr)
return False
print(f"Found model response ({len(model_response)} characters)", file=sys.stderr)
print("\n--- Parsing Answer Format ---", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if not model_answer:
print("\nFATAL ERROR: Could not parse answer format from model response", file=sys.stderr)
print("Verification FAILED", file=sys.stderr)
return False
print("\n=== Model Answer Successfully Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f" {key}: {value}", file=sys.stderr)
# Compare answers
print("\n--- Comparing Answers ---", file=sys.stderr)
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\n" + "="*60, file=sys.stderr)
print("VERIFICATION FAILED: Model answer does not match expected answer", file=sys.stderr)
print("="*60, file=sys.stderr)
return False
print("\n" + "="*60, file=sys.stderr)
print("✓ VERIFICATION PASSED: Model answer matches expected answer", file=sys.stderr)
print("="*60, file=sys.stderr)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/description.md
================================================
Perform comprehensive search and filtering operations in the Magento Admin panel to extract specific business insights using advanced search techniques.
**Task Requirements:**
1. Login with username 'admin' and password 'admin1234'
2. To analyze search behavior and term effectiveness, check the Search Terms of Marketing and perform complex filtering:
- Search for all terms containing 'tank' in their name - count the exact number of results
- Clear filters and find terms with exactly 0 results - count how many such terms exist
- Apply a filter to show only terms with more than 10 uses - record the term with the highest uses and its count (You need to see how many there are and record them all.)
- Find the search term that has results between 20-30 - record its name and exact result count
3. To gather detailed marketing insights from search data, go to Search Terms in Reports:
- Apply filter for terms with more than 15 hits - count total filtered results
- Find the term with ID between 10-15 that has the most results - record term name and result count (You need to see how many there are and record them all.)
- Filter to show only terms from "Default Store View" - count total results
4. To examine real-time search trends and top performers, from the Dashboard, perform targeted searches:
- In the 'Top Search Terms' table, find the term with exactly 1 result - record its name and uses
- In the 'Last Search Terms' table, identify the term with both the highest number of results and uses - record its name and number of results
- In the 'Bestsellers' tab, find the product at position #3 - record name and quantity
5. To identify patterns in search usage and results, return to the Search Terms main grid from step 2:
- Sort by 'Uses' column (descending) - record the top term and its uses count
- Sort by 'Results' column (ascending) - record the first non-zero result term and its count
- Count total number of unique search terms in the system
6. To provide a comprehensive report of all gathered data, compile all findings and output in the following exact format:
```
TankSearchCount|count
ZeroResultsCount|count
HighestUseTerm|term:uses
Results20to30Term|term1:results1|term2:results2|term3:results3|...
Hits15PlusCount|count
ID10to15MaxResults|term:results
DefaultStoreViewCount|count
OneResultTerm|term1:uses1|term2:uses2|term3:uses3|...
HighestResultLastSearch|term:results
Position3Bestseller|product:quantity
TopUseTerm|term:uses
FirstNonZeroResult|term:results
TotalUniqueTerms|count
```
**Example Output:**
```
TankSearchCount|X
ZeroResultsCount|X
HighestUseTerm|search_term:XX
Results20to30Term|search_term1:XX1|search_term2:XX2|search_term3:XX3|...
Hits15PlusCount|X
ID10to15MaxResults|search_term:XX
DefaultStoreViewCount|X
OneResultTerm|search_term1:XX1|search_term2:XX2|search_term3:XX3|...
HighestResultLastSearch|search_term:XX
Position3Bestseller|Product Name:X
TopUseTerm|search_term:XX
FirstNonZeroResult|search_term:X
TotalUniqueTerms|X
```
**Success Criteria:**
- Successfully logged into Magento Admin
- Applied complex search filters in Search Terms section
- Used range filters for results and hits
- Sorted columns to find specific records
- Navigated between different report views
- Extracted data from filtered and sorted results
- Counted records accurately after applying filters
- Output answer in exact format with 13 data lines
- Answer wrapped in <answer></answer> tags
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/label.txt
================================================
TankSearchCount|2
ZeroResultsCount|1
HighestUseTerm|hollister:19
Results20to30Term|Antonia Racer Tank:23|tanks:23
Hits15PlusCount|1
ID10to15MaxResults|Antonia Racer Tank:23
DefaultStoreViewCount|7
OneResultTerm|hollister:19|WP10:1
HighestResultLastSearch|Antonia Racer Tank:23
Position3Bestseller|Sprite Stasis Ball 65 cm:6
TopUseTerm|hollister:19
FirstNonZeroResult|WP10:1
TotalUniqueTerms|7
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/meta.json
================================================
{
"task_id": "search_filtering_operations",
"task_name": "Search Filtering Operations",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Configure advanced search and filtering systems in admin interface, implement category hierarchies, set up attribute filters, and optimize search algorithms for user experience.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/verify.py
================================================
import re
import json
import os
import sys
def verify(messages):
"""
Verify that the agent has successfully performed complex search and filtering operations
in the Magento Admin panel and extracted all required information correctly.
Args:
messages: List of message dictionaries containing the conversation
Returns:
Dictionary with 'valid' boolean and 'reason' string
"""
# Find the last assistant message with status "completed" and type "message"
answer_content = None
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
and message.get("content")
):
# Extract text from content structure
content = message["content"]
if isinstance(content, list):
for item in content:
if isinstance(item, dict) and item.get("type") == "output_text":
text = item.get("text", "")
# Look for answer tags with case-insensitive search
answer_match = re.search(
r"(.*?)", text, re.DOTALL | re.IGNORECASE
)
if answer_match:
answer_content = answer_match.group(1).strip()
break
elif isinstance(content, str):
# Look for answer tags in string content
                answer_match = re.search(r"<answer>(.*?)</answer>", content, re.DOTALL | re.IGNORECASE)
if answer_match:
answer_content = answer_match.group(1).strip()
break
if answer_content:
break
if not answer_content:
return {"valid": False, "reason": "No answer found in tags"}
# Expected format - each line should have a key|value pair
expected_keys = [
"TankSearchCount",
"ZeroResultsCount",
"HighestUseTerm",
"Results20to30Term",
"Hits15PlusCount",
"ID10to15MaxResults",
"DefaultStoreViewCount",
"OneResultTerm",
"HighestResultLastSearch",
"Position3Bestseller",
"TopUseTerm",
"FirstNonZeroResult",
"TotalUniqueTerms",
]
# Parse the answer
lines = answer_content.strip().split("\n")
# Check if we have exactly 13 lines
if len(lines) != 13:
return {"valid": False, "reason": f"Expected 13 data lines, found {len(lines)}"}
# Parse each line and validate format
extracted_data = {}
for line in lines:
if "|" not in line:
return {
"valid": False,
"reason": f"Invalid format in line: {line}. Expected 'key|value' format",
}
parts = line.split("|", 1)
if len(parts) != 2:
return {"valid": False, "reason": f"Invalid format in line: {line}"}
key, value = parts
extracted_data[key] = value
# Check all required keys are present
missing_keys = set(expected_keys) - set(extracted_data.keys())
if missing_keys:
return {
"valid": False,
"reason": f"Missing required keys: {', '.join(missing_keys)}",
}
# Validate specific data formats and expected values based on the current data
# 1. TankSearchCount should be a number (2 terms containing 'tank')
if not extracted_data["TankSearchCount"].isdigit():
return {
"valid": False,
"reason": f"TankSearchCount should be a number, got: {extracted_data['TankSearchCount']}",
}
# Expected: "Antonia Racer Tank" and "tanks" contain 'tank'
if extracted_data["TankSearchCount"] != "2":
return {
"valid": False,
"reason": f"TankSearchCount should be '2', got: {extracted_data['TankSearchCount']}",
}
# 2. ZeroResultsCount should be a number (nike has 0 results)
if not extracted_data["ZeroResultsCount"].isdigit():
return {
"valid": False,
"reason": f"ZeroResultsCount should be a number, got: {extracted_data['ZeroResultsCount']}",
}
if extracted_data["ZeroResultsCount"] != "1":
return {
"valid": False,
"reason": f"ZeroResultsCount should be '1', got: {extracted_data['ZeroResultsCount']}",
}
# 3. HighestUseTerm should be in format "term:uses"
if ":" not in extracted_data["HighestUseTerm"]:
return {
"valid": False,
"reason": f"HighestUseTerm should be in format 'term:uses', got: {extracted_data['HighestUseTerm']}",
}
# hollister has 19 uses (highest among terms with > 10 uses)
if extracted_data["HighestUseTerm"] != "hollister:19":
return {
"valid": False,
"reason": f"HighestUseTerm should be 'hollister:19', got: {extracted_data['HighestUseTerm']}",
}
# 4. Results20to30Term should be in format "term:results"
if ":" not in extracted_data["Results20to30Term"]:
return {
"valid": False,
"reason": f"Results20to30Term should be in format 'term:results', got: {extracted_data['Results20to30Term']}",
}
# Both "tanks" and "Antonia Racer Tank" have 23 results (between 20-30)
valid_results20to30 = ["tanks:23", "Antonia Racer Tank:23"]
# Check if answer contains one of the valid values or both separated by |
if not any(
val in extracted_data["Results20to30Term"] for val in valid_results20to30
):
return {
"valid": False,
"reason": f"Results20to30Term should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['Results20to30Term']}",
}
# 5. Hits15PlusCount should be a number (only hollister has 19 hits > 15)
if not extracted_data["Hits15PlusCount"].isdigit():
return {
"valid": False,
"reason": f"Hits15PlusCount should be a number, got: {extracted_data['Hits15PlusCount']}",
}
if extracted_data["Hits15PlusCount"] != "1":
return {
"valid": False,
"reason": f"Hits15PlusCount should be '1', got: {extracted_data['Hits15PlusCount']}",
}
# 6. ID10to15MaxResults should be in format "term:results"
if ":" not in extracted_data["ID10to15MaxResults"]:
return {
"valid": False,
"reason": f"ID10to15MaxResults should be in format 'term:results', got: {extracted_data['ID10to15MaxResults']}",
}
# ID 11 is hollister (1 result), ID 13 is Antonia Racer Tank (23 results)
if extracted_data["ID10to15MaxResults"] != "Antonia Racer Tank:23":
return {
"valid": False,
"reason": f"ID10to15MaxResults should be 'Antonia Racer Tank:23', got: {extracted_data['ID10to15MaxResults']}",
}
# 7. DefaultStoreViewCount should be a number (all 7 terms are from Default Store View)
if not extracted_data["DefaultStoreViewCount"].isdigit():
return {
"valid": False,
"reason": f"DefaultStoreViewCount should be a number, got: {extracted_data['DefaultStoreViewCount']}",
}
if extracted_data["DefaultStoreViewCount"] != "7":
return {
"valid": False,
"reason": f"DefaultStoreViewCount should be '7', got: {extracted_data['DefaultStoreViewCount']}",
}
# 8. OneResultTerm should be in format "term:uses"
if ":" not in extracted_data["OneResultTerm"]:
return {
"valid": False,
"reason": f"OneResultTerm should be in format 'term:uses', got: {extracted_data['OneResultTerm']}",
}
# Both hollister and WP10 have exactly 1 result
valid_one_result = ["hollister:19", "WP10:1"]
if not any(val in extracted_data["OneResultTerm"] for val in valid_one_result):
return {
"valid": False,
"reason": f"OneResultTerm should contain 'hollister:19' or 'WP10:1', got: {extracted_data['OneResultTerm']}",
}
# 9. HighestResultLastSearch should be in format "term:results"
if ":" not in extracted_data["HighestResultLastSearch"]:
return {
"valid": False,
"reason": f"HighestResultLastSearch should be in format 'term:results', got: {extracted_data['HighestResultLastSearch']}",
}
# In Last Search Terms: tanks and Antonia Racer Tank both have 23 results (highest)
valid_highest_last = ["tanks:23", "Antonia Racer Tank:23"]
if not any(
val in extracted_data["HighestResultLastSearch"] for val in valid_highest_last
):
return {
"valid": False,
"reason": f"HighestResultLastSearch should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['HighestResultLastSearch']}",
}
# 10. Position3Bestseller should be in format "product:quantity"
if ":" not in extracted_data["Position3Bestseller"]:
return {
"valid": False,
"reason": f"Position3Bestseller should be in format 'product:quantity', got: {extracted_data['Position3Bestseller']}",
}
# Position 3 in Bestsellers is "Sprite Stasis Ball 65 cm" with quantity 6
if extracted_data["Position3Bestseller"] != "Sprite Stasis Ball 65 cm:6":
return {
"valid": False,
"reason": f"Position3Bestseller should be 'Sprite Stasis Ball 65 cm:6', got: {extracted_data['Position3Bestseller']}",
}
# 11. TopUseTerm should be in format "term:uses"
if ":" not in extracted_data["TopUseTerm"]:
return {
"valid": False,
"reason": f"TopUseTerm should be in format 'term:uses', got: {extracted_data['TopUseTerm']}",
}
# hollister has 19 uses (highest)
if extracted_data["TopUseTerm"] != "hollister:19":
return {
"valid": False,
"reason": f"TopUseTerm should be 'hollister:19', got: {extracted_data['TopUseTerm']}",
}
# 12. FirstNonZeroResult should be in format "term:results"
if ":" not in extracted_data["FirstNonZeroResult"]:
return {
"valid": False,
"reason": f"FirstNonZeroResult should be in format 'term:results', got: {extracted_data['FirstNonZeroResult']}",
}
# When sorted by results ascending, first non-zero is WP10 (has 1 result)
if extracted_data["FirstNonZeroResult"] != "WP10:1":
return {
"valid": False,
"reason": f"FirstNonZeroResult should be 'WP10:1', got: {extracted_data['FirstNonZeroResult']}",
}
# 13. TotalUniqueTerms should be a number
if not extracted_data["TotalUniqueTerms"].isdigit():
return {
"valid": False,
"reason": f"TotalUniqueTerms should be a number, got: {extracted_data['TotalUniqueTerms']}",
}
# There are 7 unique search terms in the system
if extracted_data["TotalUniqueTerms"] != "7":
return {
"valid": False,
"reason": f"TotalUniqueTerms should be '7', got: {extracted_data['TotalUniqueTerms']}",
}
# All validations passed
return {
"valid": True,
"reason": "All complex search and filtering operations completed successfully",
}
if __name__ == "__main__":
# Load messages from environment variable
messages_path = os.getenv("MCP_MESSAGES")
if not messages_path:
print(
json.dumps(
{"valid": False, "reason": "MCP_MESSAGES environment variable not set"}
)
)
exit(1)
try:
with open(messages_path, "r") as f:
messages = json.load(f)
except Exception as e:
print(
json.dumps({"valid": False, "reason": f"Failed to load messages: {str(e)}"})
)
exit(1)
# Run verification
result = verify(messages)
print(json.dumps(result))
# Exit with appropriate code based on verification result
sys.exit(0 if result["valid"] else 1)
================================================
FILE: tasks/postgres/easy/.gitkeep
================================================
================================================
FILE: tasks/postgres/easy/chinook/customer_data_migration_basic/description.md
================================================
Migrate customer data from an acquired company to PostgreSQL using efficient bulk operations.
## Your Mission:
Chinook Music Store has recently acquired "MelodyMart," a competing music retailer. Their customer database needs to be migrated into Chinook's PostgreSQL database.
## Migration Requirements:
1. **Process all customer records from the data table below** and migrate them into the `Customer` table
2. **Apply business logic during migration**:
   - Assign `CustomerId` values starting from the next available ID
- Assign all customers to support representative with EmployeeId 3
- Set `Fax` field to NULL for all migrated customers
## Customer Data to Migrate:
| FirstName | LastName | Company | Address | City | State | Country | PostalCode | Phone | Email |
|-----------|----------|---------|---------|------|-------|---------|------------|-------|--------|
| Danielle | Johnson | Sanchez-Taylor | 819 Johnson Course | East William | AK | USA | 74064 | 386-3794 | danielle.johnson@sancheztaylor.com |
| Katherine | Moore | Peterson-Moore | 16155 Roman Stream Suite 816 | New Kellystad | OK | USA | 25704 | 103-4131 | katherine_moore@petersonmoore.org |
| Joshua | Reid | Martin-Kelly | 192 Frank Light Suite 835 | East Lydiamouth | MO | USA | 35594 | 139-5376 | joshua_reid@martinkelly.org |
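One possible shape for the migration, shown here only for the first data row (a sketch, not the required approach; the remaining rows follow the same pattern, and a single multi-row INSERT works just as well):

```sql
-- Sketch: insert one migrated customer with the next available CustomerId,
-- SupportRepId = 3, and Fax left as NULL.
INSERT INTO "Customer"
    ("CustomerId", "FirstName", "LastName", "Company", "Address", "City",
     "State", "Country", "PostalCode", "Phone", "Fax", "Email", "SupportRepId")
SELECT MAX("CustomerId") + 1,
       'Danielle', 'Johnson', 'Sanchez-Taylor', '819 Johnson Course',
       'East William', 'AK', 'USA', '74064', '386-3794',
       NULL, 'danielle.johnson@sancheztaylor.com', 3
FROM "Customer";
```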
================================================
FILE: tasks/postgres/easy/chinook/customer_data_migration_basic/meta.json
================================================
{
"task_id": "customer_data_migration_basic",
"task_name": "Customer Data Migration Basic",
"category_id": "chinook",
"category_name": "Chinook",
"description": "Load the MelodyMart customer rows into the Customer table with new ids, SupportRepId = 3, and Fax values set to NULL.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data migration",
"transactional operations"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef 
\"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql"
}
}
================================================
FILE: tasks/postgres/easy/chinook/customer_data_migration_basic/verify.py
================================================
"""
Verification script for PostgreSQL Task 2: Customer Data Migration
"""
import os
import sys
import psycopg2
import pickle
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def load_expected_customers():
"""Load the expected customer data from pickle file."""
script_dir = os.path.dirname(os.path.abspath(__file__))
pkl_path = os.path.join(script_dir, 'customer_data.pkl')
try:
with open(pkl_path, 'rb') as f:
return pickle.load(f)
except FileNotFoundError:
print(f"❌ customer_data.pkl not found at {pkl_path}. Please generate customer data first.")
return None
except Exception as e:
print(f"❌ Error loading customer data: {e}")
return None
def verify_migrated_customers(conn, expected_customers) -> bool:
"""Verify migrated customers by comparing with expected data as sets."""
with conn.cursor() as cur:
# Get all customers with ID > 59 (the migrated ones)
cur.execute('''
SELECT "FirstName", "LastName", "Company", "Address", "City",
"State", "Country", "PostalCode", "Phone", "Email",
"SupportRepId", "Fax"
FROM "Customer"
WHERE "CustomerId" > 59
''')
actual_customers = cur.fetchall()
if len(actual_customers) != len(expected_customers):
print(f"❌ Expected {len(expected_customers)} migrated customers, found {len(actual_customers)}")
return False
# Convert expected customers to tuples for set comparison
expected_tuples = set()
for expected in expected_customers:
expected_tuple = (
expected['FirstName'], expected['LastName'], expected['Company'],
expected['Address'], expected['City'], expected['State'],
expected['Country'], expected['PostalCode'], expected['Phone'],
expected['Email'], 3, None # SupportRepId=3, Fax=None
)
expected_tuples.add(expected_tuple)
# Convert actual customers to set with proper type conversion
actual_tuples = set()
for row in actual_customers:
# Convert all fields to strings for consistent comparison
actual_tuple = (
str(row[0]) if row[0] is not None else '', # FirstName
str(row[1]) if row[1] is not None else '', # LastName
str(row[2]) if row[2] is not None else '', # Company
str(row[3]) if row[3] is not None else '', # Address
str(row[4]) if row[4] is not None else '', # City
str(row[5]) if row[5] is not None else '', # State
str(row[6]) if row[6] is not None else '', # Country
str(row[7]) if row[7] is not None else '', # PostalCode
str(row[8]) if row[8] is not None else '', # Phone
str(row[9]) if row[9] is not None else '', # Email
int(row[10]) if row[10] is not None else None, # SupportRepId
row[11] # Fax (should be None)
)
actual_tuples.add(actual_tuple)
# Check if sets are equal
if actual_tuples != expected_tuples:
missing_in_actual = expected_tuples - actual_tuples
extra_in_actual = actual_tuples - expected_tuples
print(f"❌ Customer data sets don't match!")
if missing_in_actual:
print(f" Missing {len(missing_in_actual)} expected customers")
for missing in list(missing_in_actual)[:3]: # Show first 3
print(f" Missing: {missing[0]} {missing[1]} - {missing[2]}")
if len(missing_in_actual) > 3:
print(f" ... and {len(missing_in_actual) - 3} more")
if extra_in_actual:
print(f" Found {len(extra_in_actual)} unexpected customers")
for extra in list(extra_in_actual)[:3]: # Show first 3
print(f" Extra: {extra[0]} {extra[1]} - {extra[2]}")
if len(extra_in_actual) > 3:
print(f" ... and {len(extra_in_actual) - 3} more")
return False
print(f"✅ All {len(expected_customers)} customers migrated correctly")
print(f"✅ All customers assigned to SupportRepId 3")
print(f"✅ All customers have Fax field set to NULL")
print(f"✅ Customer data sets match exactly (order-independent)")
return True
def main():
"""Main verification function."""
print("=" * 60)
print("Verifying Customer Data Migration Task")
print("=" * 60)
# Load expected customer data
expected_customers = load_expected_customers()
if not expected_customers:
sys.exit(1)
print(f"Loaded {len(expected_customers)} expected customer records")
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify migration
success = verify_migrated_customers(conn, expected_customers)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/chinook/update_employee_info/description.md
================================================
Update employee information in the Chinook database to reflect organizational changes.
## Your Tasks:
### **UPDATE: Modify Existing Employee Information**
- Change Andrew Adams (EmployeeId = 1) title from 'General Manager' to 'CEO'
- Update Nancy Edwards (EmployeeId = 2) phone number to '+1 (403) 555-9999'
- Change all employees with Title = 'IT Staff' to have Title = 'IT Specialist'
## Requirements:
- Use UPDATE statements to modify the existing records
- The title update for 'IT Staff' should affect all matching employees
## Expected Results:
After completing the updates:
- Andrew Adams should have Title = 'CEO'
- Nancy Edwards should have Phone = '+1 (403) 555-9999'
- All employees previously with Title = 'IT Staff' should now have Title = 'IT Specialist'
This task practices UPDATE operations on individual employee records and on groups of records matched by a condition.
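A minimal sketch of the three updates (one possible phrasing; any equivalent UPDATE statements are acceptable):

```sql
UPDATE "Employee" SET "Title" = 'CEO'               WHERE "EmployeeId" = 1;
UPDATE "Employee" SET "Phone" = '+1 (403) 555-9999' WHERE "EmployeeId" = 2;
UPDATE "Employee" SET "Title" = 'IT Specialist'     WHERE "Title" = 'IT Staff';
```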
================================================
FILE: tasks/postgres/easy/chinook/update_employee_info/meta.json
================================================
{
"task_id": "update_employee_info",
"task_name": "Update Employee Info",
"category_id": "chinook",
"category_name": "Chinook",
"description": "Update Chinook employee records so Andrew Adams becomes CEO, Nancy Edwards receives the new phone number, and every \"IT Staff\" title becomes \"IT Specialist.\"",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data updates",
"organizational change"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef 
\"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql"
}
}
================================================
FILE: tasks/postgres/easy/chinook/update_employee_info/verify.py
================================================
"""
Verification script for PostgreSQL Task 3: Employee Hierarchy Management
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.01 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.01:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_employee_count_and_titles(conn) -> bool:
"""Verify the final employee count and title changes."""
with conn.cursor() as cur:
# Check the final verification query results
cur.execute("""
SELECT
COUNT(*) as total_employees,
COUNT(CASE WHEN "Title" = 'CEO' THEN 1 END) as ceo_count,
COUNT(CASE WHEN "Title" = 'IT Specialist' THEN 1 END) as it_specialist_count
FROM "Employee"
""")
result = cur.fetchone()
total_employees, ceo_count, it_specialist_count = result
if total_employees != 8:
print(f"❌ Expected 8 total employees, got {total_employees}")
return False
if ceo_count != 1:
print(f"❌ Expected 1 CEO, got {ceo_count}")
return False
if it_specialist_count != 2:
print(f"❌ Expected 2 IT Specialists, got {it_specialist_count}")
return False
print("✅ Employee count and title verification passed")
return True
def verify_specific_employees(conn) -> bool:
"""Verify specific employee records and modifications."""
with conn.cursor() as cur:
# Check all employee fields in one query
cur.execute("""
SELECT "EmployeeId", "LastName", "FirstName", "Title", "ReportsTo", "BirthDate",
"HireDate", "Address", "City", "State", "Country", "PostalCode",
"Phone", "Fax", "Email"
FROM "Employee"
WHERE "EmployeeId" IN (1, 2)
ORDER BY "EmployeeId"
""")
employees = cur.fetchall()
from datetime import datetime
expected = [
# Andrew Adams (ID 1) - Title changes to 'CEO', phone stays original, ReportsTo stays None
(1, 'Adams', 'Andrew', 'CEO', None, datetime(1962, 2, 18), datetime(2002, 8, 14),
'11120 Jasper Ave NW', 'Edmonton', 'AB', 'Canada', 'T5K 2N1', '+1 (780) 428-9482', '+1 (780) 428-3457', 'andrew@chinookcorp.com'),
# Nancy Edwards (ID 2) - Phone changes, title stays 'Sales Manager', ReportsTo stays 1
(2, 'Edwards', 'Nancy', 'Sales Manager', 1, datetime(1958, 12, 8), datetime(2002, 5, 1),
'825 8 Ave SW', 'Calgary', 'AB', 'Canada', 'T2P 2T3', '+1 (403) 555-9999', '+1 (403) 262-3322', 'nancy@chinookcorp.com'),
]
if len(employees) != 2:
print(f"❌ Expected 2 key employees, found {len(employees)}")
return False
# Full field comparison for all employees using rows_match
for actual, expected_emp in zip(employees, expected):
if not rows_match(actual, expected_emp):
print(f"❌ Employee {actual[0]} row mismatch: expected {expected_emp}, got {actual}")
return False
print("✅ Specific employee verification passed - all fields match exactly")
return True
def main():
"""Main verification function."""
print("=" * 50)
print("Verifying Task 3: Employee Hierarchy Management")
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Run verification checks with short-circuit evaluation
success = (
verify_employee_count_and_titles(conn) and
verify_specific_employees(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
print("All employee hierarchy management operations completed correctly!")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/dvdrental/create_payment_index/description.md
================================================
Create an index to optimize customer payment queries in the DVD rental database.
## Your Task:
Create an index on the `customer_id` column of the `payment` table to improve query performance.
## Requirements:
- Create an index on the `payment` table's `customer_id` column
- The index name can be anything you choose (e.g., `idx_payment_customer_id`)
- Use the standard CREATE INDEX syntax
## Why This Helps:
The `customer_id` column is frequently used in:
- JOIN operations between customer and payment tables
- WHERE clauses filtering by customer
- Subqueries that look up payments for specific customers
Adding an index will significantly speed up these operations.
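For example (any index name works; `idx_payment_customer_id` is simply the name suggested above):

```sql
CREATE INDEX idx_payment_customer_id ON payment (customer_id);
```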
================================================
FILE: tasks/postgres/easy/dvdrental/create_payment_index/meta.json
================================================
{
"task_id": "create_payment_index",
"task_name": "Create Payment Index",
"category_id": "dvdrental",
"category_name": "DVD Rental",
"description": "Add an index on payment.customer_id to speed up the customer payment lookups in the DVD Rental database.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"performance optimization",
"indexing"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"mpaa_rating\" {\n \"G\"\n \"PG\"\n \"PG-13\"\n \"R\"\n \"NC-17\"\n}\n\nTable \"customer\" {\n \"customer_id\" int4 [pk, not null, increment]\n \"store_id\" int2 [not null]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"email\" varchar(50)\n \"address_id\" int2 [not null]\n \"activebool\" bool [not null, default: true]\n \"create_date\" date [not null, default: `('now'::text)::date`]\n \"last_update\" timestamp [default: `now()`]\n \"active\" int4\n\n Indexes {\n address_id [type: btree, name: \"idx_fk_address_id\"]\n store_id [type: btree, name: \"idx_fk_store_id\"]\n last_name [type: btree, name: \"idx_last_name\"]\n }\n}\n\nTable \"actor\" {\n \"actor_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n last_name [type: btree, name: \"idx_actor_last_name\"]\n }\n}\n\nTable \"category\" {\n \"category_id\" int4 [pk, not null, increment]\n \"name\" varchar(25) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"film\" {\n \"film_id\" int4 [pk, not null, increment]\n \"title\" varchar(255) [not null]\n \"description\" text\n \"release_year\" int4\n \"language_id\" int2 [not null]\n \"rental_duration\" int2 [not null, default: 3]\n \"rental_rate\" numeric(4,2) [not null, default: 4.99]\n \"length\" int2\n \"replacement_cost\" numeric(5,2) [not null, default: 19.99]\n \"rating\" mpaa_rating [default: 'G']\n \"last_update\" timestamp [not null, default: `now()`]\n \"special_features\" \"text[]\"\n \"fulltext\" tsvector [not null]\n\n Indexes {\n fulltext [type: gist, name: \"film_fulltext_idx\"]\n language_id [type: btree, name: \"idx_fk_language_id\"]\n title [type: btree, name: \"idx_title\"]\n }\n}\n\nTable \"film_actor\" {\n \"actor_id\" int2 [not null]\n \"film_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (actor_id, film_id) [type: btree, name: \"film_actor_pkey\"]\n film_id [type: btree, name: \"idx_fk_film_id\"]\n }\n}\n\nTable \"film_category\" {\n \"film_id\" int2 [not null]\n \"category_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (film_id, category_id) [type: btree, name: \"film_category_pkey\"]\n }\n}\n\nTable \"address\" {\n \"address_id\" int4 [pk, not null, increment]\n \"address\" varchar(50) [not null]\n \"address2\" varchar(50)\n \"district\" varchar(20) [not null]\n \"city_id\" int2 [not null]\n \"postal_code\" varchar(10)\n \"phone\" varchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n city_id [type: btree, name: \"idx_fk_city_id\"]\n }\n}\n\nTable \"city\" {\n \"city_id\" int4 [pk, not null, increment]\n \"city\" varchar(50) [not null]\n \"country_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n country_id [type: btree, name: \"idx_fk_country_id\"]\n }\n}\n\nTable \"country\" {\n \"country_id\" int4 [pk, not null, increment]\n \"country\" varchar(50) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"inventory\" {\n \"inventory_id\" int4 [pk, not null, increment]\n \"film_id\" int2 [not null]\n \"store_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (store_id, film_id) [type: btree, name: \"idx_store_id_film_id\"]\n }\n}\n\nTable \"language\" {\n \"language_id\" int4 [pk, not null, increment]\n 
\"name\" bpchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"payment\" {\n \"payment_id\" int4 [pk, not null, increment]\n \"customer_id\" int2 [not null]\n \"staff_id\" int2 [not null]\n \"rental_id\" int4 [not null]\n \"amount\" numeric(5,2) [not null]\n \"payment_date\" timestamp [not null]\n\n Indexes {\n rental_id [type: btree, name: \"idx_fk_rental_id\"]\n staff_id [type: btree, name: \"idx_fk_staff_id\"]\n }\n}\n\nTable \"rental\" {\n \"rental_id\" int4 [pk, not null, increment]\n \"rental_date\" timestamp [not null]\n \"inventory_id\" int4 [not null]\n \"customer_id\" int2 [not null]\n \"return_date\" timestamp\n \"staff_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (rental_date, inventory_id, customer_id) [type: btree, name: \"idx_unq_rental_rental_date_inventory_id_customer_id\"]\n inventory_id [type: btree, name: \"idx_fk_inventory_id\"]\n }\n}\n\nTable \"staff\" {\n \"staff_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"address_id\" int2 [not null]\n \"email\" varchar(50)\n \"store_id\" int2 [not null]\n \"active\" bool [not null, default: true]\n \"username\" varchar(16) [not null]\n \"password\" varchar(40)\n \"last_update\" timestamp [not null, default: `now()`]\n \"picture\" bytea\n}\n\nTable \"store\" {\n \"store_id\" int4 [pk, not null, increment]\n \"manager_staff_id\" int2 [unique, not null]\n \"address_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nRef \"fk_address_city\":\"city\".\"city_id\" < \"address\".\"city_id\"\n\nRef \"fk_city\":\"country\".\"country_id\" < \"city\".\"country_id\"\n\nRef \"customer_address_id_fkey\":\"address\".\"address_id\" < \"customer\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"film_language_id_fkey\":\"language\".\"language_id\" < \"film\".\"language_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_actor_id_fkey\":\"actor\".\"actor_id\" < \"film_actor\".\"actor_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_film_id_fkey\":\"film\".\"film_id\" < \"film_actor\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_category_id_fkey\":\"category\".\"category_id\" < \"film_category\".\"category_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_film_id_fkey\":\"film\".\"film_id\" < \"film_category\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"inventory_film_id_fkey\":\"film\".\"film_id\" < \"inventory\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"payment_customer_id_fkey\":\"customer\".\"customer_id\" < \"payment\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"payment_rental_id_fkey\":\"rental\".\"rental_id\" < \"payment\".\"rental_id\" [update: cascade, delete: set null]\n\nRef \"payment_staff_id_fkey\":\"staff\".\"staff_id\" < \"payment\".\"staff_id\" [update: cascade, delete: restrict]\n\nRef \"rental_customer_id_fkey\":\"customer\".\"customer_id\" < \"rental\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"rental_inventory_id_fkey\":\"inventory\".\"inventory_id\" < \"rental\".\"inventory_id\" [update: cascade, delete: restrict]\n\nRef \"rental_staff_id_key\":\"staff\".\"staff_id\" < \"rental\".\"staff_id\"\n\nRef \"staff_address_id_fkey\":\"address\".\"address_id\" < \"staff\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_address_id_fkey\":\"address\".\"address_id\" < \"store\".\"address_id\" 
[update: cascade, delete: restrict]\n\nRef \"store_manager_staff_id_fkey\":\"staff\".\"staff_id\" < \"store\".\"manager_staff_id\" [update: cascade, delete: restrict]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/gordonkwokkwok/DVD-Rental-PostgreSQL-Project"
}
}
================================================
FILE: tasks/postgres/easy/dvdrental/create_payment_index/verify.py
================================================
"""
Verification script for PostgreSQL Task 1: Customer Payment Query Optimization
"""
import os
import sys
import psycopg2
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def check_payment_customer_id_index(conn) -> tuple:
    """Return (has_index, index_rows) for any index on payment.customer_id."""
with conn.cursor() as cur:
cur.execute("""
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'payment'
AND indexdef LIKE '%customer_id%'
""")
indexes = cur.fetchall()
return len(indexes) > 0, indexes
def main():
"""Main verification function."""
print("=" * 60)
print("PostgreSQL Task 1 Verification: Customer Payment Query Optimization")
print("=" * 60)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
print("\n🔍 Checking for customer_id index on payment table...")
# Check if any index exists on payment.customer_id
has_index, indexes = check_payment_customer_id_index(conn)
if has_index:
print("✅ Found index(es) on payment.customer_id:")
for index_name, index_def in indexes:
print(f" - {index_name}: {index_def}")
else:
print("❌ No index found on payment.customer_id column")
conn.close()
if has_index:
print(f"\n🎉 Task verification: PASS")
print(f" - Index on payment.customer_id exists")
sys.exit(0)
else:
print(f"\n❌ Task verification: FAIL")
print(f" - No index found on payment.customer_id")
print(f" - Create an index on payment(customer_id) to optimize the queries")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/employees/department_summary_view/description.md
================================================
Create an executive department summary view to provide quick insights into departmental metrics for leadership dashboards. This view will consolidate key department statistics in one easily accessible place.
## Your Task:
**Create the executive department summary view** — build a materialized view called `exec_department_summary` in the `employees` schema with these exact columns:
* `department_name` (varchar) — department name
* `total_employees` (integer) — current active employee count (employees with active salary where to_date = '9999-01-01')
* `avg_salary` (decimal) — average current salary for active employees
* `total_payroll` (bigint) — total monthly payroll cost (sum of all current salaries in the department)
* `manager_name` (varchar) — current department manager's full name (first_name and last_name concatenated)
## Requirements:
1. Use a materialized view to cache results for better performance
2. Join the following tables (all in the `employees` schema):
   - `employees.department` - for department information
   - `employees.department_employee` - for employee-department relationships
   - `employees.employee` - for employee details
   - `employees.salary` - for current salary information
   - `employees.department_manager` - for current manager information
3. Only include current active employees (those with to_date = '9999-01-01' in both `department_employee` and `salary`)
4. Only include current managers (to_date = '9999-01-01' in `department_manager`)
5. Order results by department_name
## After Creation:
Refresh the materialized view to populate it with current data.
This view will provide executives with a real-time snapshot of departmental workforce metrics and costs.
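A sketch of one possible definition, assuming each active employee has a single active salary row and each department has one current manager:

```sql
CREATE MATERIALIZED VIEW employees.exec_department_summary AS
SELECT
    d.dept_name::varchar                          AS department_name,
    COUNT(DISTINCT de.employee_id)::integer       AS total_employees,
    AVG(s.amount)::decimal                        AS avg_salary,
    SUM(s.amount)::bigint                         AS total_payroll,
    (m.first_name || ' ' || m.last_name)::varchar AS manager_name
FROM employees.department d
JOIN employees.department_employee de
  ON de.department_id = d.id AND de.to_date = DATE '9999-01-01'
JOIN employees.salary s
  ON s.employee_id = de.employee_id AND s.to_date = DATE '9999-01-01'
JOIN employees.department_manager dm
  ON dm.department_id = d.id AND dm.to_date = DATE '9999-01-01'
JOIN employees.employee m
  ON m.id = dm.employee_id
GROUP BY d.dept_name, m.first_name, m.last_name
ORDER BY department_name;

REFRESH MATERIALIZED VIEW employees.exec_department_summary;
```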
================================================
FILE: tasks/postgres/easy/employees/department_summary_view/meta.json
================================================
{
"task_id": "department_summary_view",
"task_name": "Department Summary View",
"category_id": "employees",
"category_name": "Employees",
"description": "Build the exec_department_summary materialized view showing department name, active headcount, payroll totals, and the manager name.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"reporting and analytics",
"materialized views"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/easy/employees/department_summary_view/verify.py
================================================
"""
Verification script for PostgreSQL Task 6: Reporting and Automation System
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For date types: convert to string for comparison
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, (Decimal, float, int)):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif hasattr(actual, 'strftime'): # datetime.date or datetime.datetime
if str(actual) != str(expected):
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_materialized_views(conn) -> bool:
"""Verify that materialized views were created and populated correctly."""
with conn.cursor() as cur:
# Check all departments' data accuracy
cur.execute("""
SELECT department_name, total_employees, avg_salary, total_payroll, manager_name
FROM employees.exec_department_summary
ORDER BY department_name
""")
view_data = cur.fetchall()
# Get actual data for all departments
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC
) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
current_dept AS (
SELECT DISTINCT de.employee_id, de.department_id
FROM employees.department_employee de
WHERE de.to_date = DATE '9999-01-01'
),
current_manager AS (
SELECT department_id,
CONCAT(e.first_name, ' ', e.last_name) AS manager_name
FROM (
SELECT dm.*,
ROW_NUMBER() OVER (
PARTITION BY dm.department_id
ORDER BY dm.from_date DESC, dm.employee_id
) AS rn
FROM employees.department_manager dm
WHERE dm.to_date = DATE '9999-01-01'
) dm
JOIN employees.employee e ON e.id = dm.employee_id
WHERE dm.rn = 1
)
SELECT
d.dept_name AS department_name,
COUNT(cd.employee_id)::INT AS total_employees,
AVG(cs.amount)::DECIMAL AS avg_salary,
COALESCE(SUM(cs.amount), 0)::BIGINT AS total_payroll,
cm.manager_name
FROM employees.department d
LEFT JOIN current_dept cd ON cd.department_id = d.id
LEFT JOIN current_salary cs ON cs.employee_id = cd.employee_id
LEFT JOIN current_manager cm ON cm.department_id = d.id
GROUP BY d.id, d.dept_name, cm.manager_name
ORDER BY d.dept_name;
""")
actual_data = cur.fetchall()
if len(view_data) != len(actual_data):
print(f"❌ Department count mismatch: view={len(view_data)}, actual={len(actual_data)}")
return False
for view_row, actual_row in zip(view_data, actual_data):
if not rows_match(view_row, actual_row):
print(f"❌ Department summary data incorrect for {view_row[0]}: view={view_row}, actual={actual_row}")
return False
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all components
success = verify_materialized_views(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/employees/employee_gender_statistics/description.md
================================================
Create a gender statistics summary table for the HR team's annual workforce composition report. This is a simple analysis to understand the gender distribution in our employee database.
## Your Task:
**Create the gender statistics table** — build a table called `gender_statistics` in the `employees` schema with these exact columns:
* `gender` (varchar) — gender ('M' or 'F')
* `total_employees` (integer) — total number of employees of this gender
* `current_employees` (integer) — current employees of this gender (have active salary where to_date = '9999-01-01')
* `percentage_of_workforce` (decimal) — percentage of current workforce (current_employees / total current employees * 100)
## Requirements:
1. Calculate total employees by counting all employees of each gender from the `employees.employee` table
2. Calculate current employees by counting employees with active salary records (to_date = '9999-01-01' in the `employees.salary` table)
3. Calculate the percentage based on current workforce only
4. The table should contain exactly 2 rows (one for 'M' and one for 'F')
This analysis will help HR understand the basic gender composition of our workforce for diversity reporting.
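One possible shape for building the table (a sketch; any query that produces the two required rows is fine):

```sql
-- Sketch: total vs. currently employed counts per gender, with the share of
-- the current workforce computed over the grouped counts.
CREATE TABLE employees.gender_statistics AS
WITH current_emp AS (
    SELECT DISTINCT employee_id
    FROM employees.salary
    WHERE to_date = DATE '9999-01-01'
)
SELECT
    e.gender::varchar              AS gender,
    COUNT(*)::integer              AS total_employees,
    COUNT(ce.employee_id)::integer AS current_employees,
    (COUNT(ce.employee_id)::decimal
       / SUM(COUNT(ce.employee_id)) OVER () * 100) AS percentage_of_workforce
FROM employees.employee e
LEFT JOIN current_emp ce ON ce.employee_id = e.id
GROUP BY e.gender;
```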
================================================
FILE: tasks/postgres/easy/employees/employee_gender_statistics/meta.json
================================================
{
"task_id": "employee_gender_statistics",
"task_name": "Employee Gender Statistics",
"category_id": "employees",
"category_name": "Employees",
"description": "Aggregate the employees dataset into a gender_statistics table with counts of total/current staff by gender plus workforce percentage.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"reporting and analytics",
"data aggregation"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/easy/employees/employee_gender_statistics/verify.py
================================================
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_gender_statistics_results(conn) -> bool:
"""Verify the gender statistics results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT gender, total_employees, current_employees, percentage_of_workforce
FROM employees.gender_statistics
ORDER BY gender
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_emp AS (
SELECT DISTINCT s.employee_id
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
),
total_current AS (
SELECT COUNT(*) AS cnt
FROM current_emp
)
SELECT
e.gender::varchar AS gender,
COUNT(*) AS total_employees,
COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL) AS current_employees,
(COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL))::DECIMAL
/ NULLIF((SELECT cnt FROM total_current), 0) * 100 AS percentage_of_workforce
FROM employees.employee e
LEFT JOIN current_emp ce ON ce.employee_id = e.id
WHERE e.gender IN ('M','F')
GROUP BY e.gender
ORDER BY gender;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} gender statistics results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Gender statistics results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify the gender statistics results
success = verify_gender_statistics_results(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/employees/employee_projects_basic/description.md
================================================
Create and manage a basic employee projects table to track company projects. The IT team needs you to build the database table structure and populate it with initial project data.
## Your Tasks:
1. **Create the employee_projects table** — build a new table in the `employees` schema:
**Table: `employee_projects`**
* `project_id` (integer, primary key, auto-increment)
* `project_name` (varchar(100), not null)
* `start_date` (date, not null)
* `end_date` (date)
* `budget` (decimal(10,2))
* `status` (varchar(20), default 'active')
2. **Insert exactly this initial data into `employee_projects`**:
* Project 1: name='Database Modernization', start_date='2024-01-15', end_date='2024-06-30', budget=250000.00, status='active'
* Project 2: name='Employee Portal Upgrade', start_date='2024-02-01', end_date='2024-05-15', budget=180000.00, status='active'
* Project 3: name='HR Analytics Dashboard', start_date='2023-11-01', end_date='2024-01-31', budget=120000.00, status='active'
This will establish the basic project tracking foundation for the company.
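A minimal sketch of statements that meet this specification is shown below; `SERIAL` is one way to express the auto-increment integer primary key (an identity column works equally well):

```sql
-- Illustrative sketch; column definitions follow the specification above.
CREATE TABLE employees.employee_projects (
    project_id   SERIAL PRIMARY KEY,
    project_name VARCHAR(100) NOT NULL,
    start_date   DATE NOT NULL,
    end_date     DATE,
    budget       DECIMAL(10,2),
    status       VARCHAR(20) DEFAULT 'active'
);

INSERT INTO employees.employee_projects (project_name, start_date, end_date, budget, status) VALUES
    ('Database Modernization',  '2024-01-15', '2024-06-30', 250000.00, 'active'),
    ('Employee Portal Upgrade', '2024-02-01', '2024-05-15', 180000.00, 'active'),
    ('HR Analytics Dashboard',  '2023-11-01', '2024-01-31', 120000.00, 'active');
```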
================================================
FILE: tasks/postgres/easy/employees/employee_projects_basic/meta.json
================================================
{
"task_id": "employee_projects_basic",
"task_name": "Employee Projects Basic",
"category_id": "employees",
"category_name": "Employees",
"description": "Create the employee_projects table with the specified schema and insert the three starter projects for modernization, portal upgrade, and analytics.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"schema design",
"data loading"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/easy/employees/employee_projects_basic/verify.py
================================================
"""
Verification script for the employee_projects_basic task: table creation and data loading.
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For date types: convert to string for comparison
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, (Decimal, float, int)):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif hasattr(actual, 'strftime'): # datetime.date or datetime.datetime
if str(actual) != str(expected):
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_project_data(conn) -> bool:
"""Verify that project data was inserted and updated correctly."""
with conn.cursor() as cur:
# Check the inserted project data
cur.execute("""
SELECT project_name, start_date, end_date, budget, status
FROM employees.employee_projects
ORDER BY project_name
""")
projects = cur.fetchall()
if len(projects) != 3:
print(f"❌ Expected 3 projects, found {len(projects)}")
return False
# Expected rows after the inserts
expected = {
'Database Modernization': ('2024-01-15', '2024-06-30', 250000.00, 'active'),
'Employee Portal Upgrade': ('2024-02-01', '2024-05-15', 180000.00, 'active'),
'HR Analytics Dashboard': ('2023-11-01', '2024-01-31', 120000.00, 'active')
}
for project in projects:
name = project[0]
if name not in expected:
print(f"❌ Unexpected project: {name}")
return False
exp = expected[name]
# Use rows_match for comparison
expected_row = (name,) + exp
if not rows_match(project, expected_row):
print(f"❌ Project {name} data mismatch: expected {expected_row}, got {project}")
return False
print("✅ Project data is correct")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify the inserted project data
success = verify_project_data(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/employees/hiring_year_summary/description.md
================================================
Create a hiring year summary table to help HR track employee retention trends over the years. This analysis shows how many employees were hired each year and how many are still with the company.
## Your Task:
**Create the hiring year summary table** — build a table called `hiring_year_summary` in the `employees` schema with these exact columns:
* `hire_year` (integer) — year employees were hired
* `employees_hired` (integer) — number of employees hired that year
* `still_employed` (integer) — how many from that year are still employed (have active salary where to_date = '9999-01-01')
* `retention_rate` (decimal) — percentage still employed (still_employed / employees_hired * 100)
## Requirements:
1. Extract the hire year from the `hire_date` column in the `employee` table
2. Count total employees hired in each year
3. Determine which employees are still employed by checking for active salary records (to_date = '9999-01-01' in the `salary` table)
4. Order results by hire_year in ascending order
This analysis will help HR understand retention patterns and identify years with particularly high or low retention rates.
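One way to build this table, shown purely as a sketch (any equivalent SQL is fine), uses the same active-salary definition given in the requirements:

```sql
-- Illustrative sketch only; any statement producing the same table is acceptable.
CREATE TABLE employees.hiring_year_summary AS
WITH current_emp AS (
    SELECT DISTINCT employee_id
    FROM employees.salary
    WHERE to_date = DATE '9999-01-01'
)
SELECT
    EXTRACT(YEAR FROM e.hire_date)::int                         AS hire_year,
    COUNT(*)::int                                               AS employees_hired,
    COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL)::int     AS still_employed,
    COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL)::decimal
        / COUNT(*) * 100                                        AS retention_rate
FROM employees.employee e
LEFT JOIN current_emp ce ON ce.employee_id = e.id
GROUP BY 1
ORDER BY 1;
```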
================================================
FILE: tasks/postgres/easy/employees/hiring_year_summary/meta.json
================================================
{
"task_id": "hiring_year_summary",
"task_name": "Hiring Year Summary",
"category_id": "employees",
"category_name": "Employees",
"description": "Summarize hires per year into hiring_year_summary, including still-employed counts and retention percentages using active salary rows.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"reporting and analytics",
"retention analysis"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/easy/employees/hiring_year_summary/verify.py
================================================
"""
Verification script for the hiring_year_summary task: hires per year and retention rates.
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_hiring_year_results(conn) -> bool:
"""Verify the hiring year summary results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT hire_year, employees_hired, still_employed, retention_rate
FROM employees.hiring_year_summary
ORDER BY hire_year
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_emp AS (
SELECT DISTINCT s.employee_id
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
),
base AS (
SELECT e.id, EXTRACT(YEAR FROM e.hire_date)::INT AS hire_year
FROM employees.employee e
WHERE e.hire_date IS NOT NULL
)
SELECT
b.hire_year,
COUNT(*)::INT AS employees_hired,
COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL)::INT AS still_employed,
(COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL))::DECIMAL
/ NULLIF(COUNT(*), 0) * 100 AS retention_rate
FROM base b
LEFT JOIN current_emp ce ON ce.employee_id = b.id
GROUP BY b.hire_year
ORDER BY b.hire_year;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} hiring year results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Hiring year summary results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify the hiring year summary results
success = verify_hiring_year_results(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/lego/basic_security_setup/description.md
================================================
Set up basic database security with role-based access control and Row-Level Security (RLS) for the LEGO database.
## Your Tasks:
### 1. Create Database Role and Permissions
Create a new database role called `theme_analyst` with the following permissions:
* `SELECT` permissions on all reference tables: `lego_themes`, `lego_colors`, `lego_parts`, `lego_part_categories`
* `SELECT` permissions on main data tables: `lego_sets`, `lego_inventories`, `lego_inventory_parts`
* No `INSERT`, `UPDATE`, or `DELETE` permissions on any tables
### 2. Enable Row-Level Security
Enable RLS on the following tables:
* `lego_sets`
* `lego_inventories`
* `lego_inventory_parts`
## Requirements:
- Use `CREATE ROLE` to create the `theme_analyst` role
- Use `GRANT SELECT` statements to assign the appropriate permissions
- Use `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` to enable RLS on each table
## Expected Outcome:
After completing these tasks:
- The `theme_analyst` role should exist with read-only access to specified tables
- Row-Level Security should be enabled (but not yet enforced with policies) on the three main data tables
- The role should have no write permissions on any table
This sets up the foundation for implementing theme-based data isolation policies.
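The statements below are a minimal sketch of one way to complete both tasks; they use only the commands listed in the requirements:

```sql
-- Illustrative sketch of the required statements.
CREATE ROLE theme_analyst;

GRANT SELECT ON lego_themes, lego_colors, lego_parts, lego_part_categories,
                lego_sets, lego_inventories, lego_inventory_parts
TO theme_analyst;

ALTER TABLE lego_sets            ENABLE ROW LEVEL SECURITY;
ALTER TABLE lego_inventories     ENABLE ROW LEVEL SECURITY;
ALTER TABLE lego_inventory_parts ENABLE ROW LEVEL SECURITY;
```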
================================================
FILE: tasks/postgres/easy/lego/basic_security_setup/meta.json
================================================
{
"task_id": "basic_security_setup",
"task_name": "Basic Security Setup",
"category_id": "lego",
"category_name": "Lego",
"description": "Create the read-only theme_analyst role with SELECT rights on LEGO reference tables and enable row-level security on sets and inventory tables.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"security",
"access control"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql"
}
}
================================================
FILE: tasks/postgres/easy/lego/basic_security_setup/verify.py
================================================
"""
Verification script for the basic_security_setup task: database security and RLS implementation.
(Version 2 - improved robustness)
"""
import os
import sys
import psycopg2
import psycopg2.errors
from typing import Any, Dict
def get_connection_params() -> Dict[str, Any]:
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def verify_role_creation(conn) -> bool:
"""
TASK 1 VERIFICATION: Check if theme_analyst role was created with proper permissions.
"""
print("\n-- Verifying Task 1: Role Creation and Permissions --")
with conn.cursor() as cur:
# Check if role exists
cur.execute("SELECT 1 FROM pg_roles WHERE rolname = 'theme_analyst';")
if not cur.fetchone():
print("❌ FAIL: The 'theme_analyst' role was not created.")
return False
print("✅ OK: Role 'theme_analyst' exists.")
# Check SELECT permissions on reference and main tables
all_tables = [
'lego_themes', 'lego_colors', 'lego_parts', 'lego_part_categories',
'lego_sets', 'lego_inventories', 'lego_inventory_parts'
]
for table in all_tables:
cur.execute(
"""
SELECT has_table_privilege('theme_analyst', %s, 'SELECT');
""",
(table,)
)
if not cur.fetchone()[0]:
print(f"❌ FAIL: 'theme_analyst' role is missing SELECT permission on '{table}'.")
return False
print("✅ OK: Role has correct SELECT permissions on all required tables.")
# Check that no INSERT/UPDATE/DELETE permissions exist
for table in all_tables:
cur.execute(
"""
SELECT
has_table_privilege('theme_analyst', %s, 'INSERT') OR
has_table_privilege('theme_analyst', %s, 'UPDATE') OR
has_table_privilege('theme_analyst', %s, 'DELETE');
""",
(table, table, table)
)
if cur.fetchone()[0]:
print(f"❌ FAIL: 'theme_analyst' role has unauthorized INSERT, UPDATE, or DELETE permission on '{table}'.")
return False
print("✅ OK: Role does not have modification permissions.")
print("✅ PASS: 'theme_analyst' role created with correct permissions.")
return True
def verify_rls_enabled(conn) -> bool:
"""
TASK 2 VERIFICATION: Check if Row-Level Security is enabled on required tables.
"""
print("\n-- Verifying Task 2: Row-Level Security Enablement --")
tables_to_check = ['lego_sets', 'lego_inventories', 'lego_inventory_parts']
with conn.cursor() as cur:
for table in tables_to_check:
cur.execute(
"SELECT relrowsecurity FROM pg_class WHERE relname = %s;", (table,)
)
rls_enabled = cur.fetchone()
if not rls_enabled or not rls_enabled[0]:
print(f"❌ FAIL: RLS is not enabled on table '{table}'.")
return False
print(f"✅ OK: RLS is enabled on table '{table}'.")
print("✅ PASS: Row-Level Security is enabled on all required tables.")
return True
def main():
"""Main verification function."""
print("=" * 60)
print("LEGO Database Security and RLS Verification Script")
print("=" * 60)
conn_params = get_connection_params()
if not conn_params.get("database"):
print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.")
sys.exit(1)
conn = None
try:
conn = psycopg2.connect(**conn_params)
results = [
verify_role_creation(conn),
verify_rls_enabled(conn),
]
if all(results):
print("\n🎉 Overall Result: PASS - All security tasks verified successfully!")
sys.exit(0)
else:
print("\n❌ Overall Result: FAIL - One or more verification steps failed.")
sys.exit(1)
except psycopg2.OperationalError as e:
print(f"❌ CRITICAL: Could not connect to the database. Check credentials and host. Details: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ CRITICAL: An unexpected error occurred. Details: {e}")
sys.exit(1)
finally:
if conn:
conn.close()
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/lego/fix_data_inconsistencies/description.md
================================================
Fix data inconsistencies in the LEGO database where the reported part count in the `lego_sets` table does not match the actual sum of non-spare parts in the latest inventory version.
## Consistency Rule
For any given `set_num`, the following must be true:
`lego_sets.num_parts = SUM(quantity)` FROM `lego_inventory_parts` WHERE `inventory_id` IN (latest inventory for that set) AND `is_spare` = false
**Important**: If a set has no inventory records, the consistency check should be skipped.
## Your Tasks:
### Task 1: Identify Data Inconsistencies
**Objective**: Write a single `SELECT` query to find all sets where the stored `num_parts` does not match the actual calculated number of parts from the latest inventory.
1. **Find the Latest Inventory**: For each `set_num`, find its latest inventory id by getting the `MAX(version)` from the `lego_inventories` table.
2. **Calculate Actual Part Count**: For these latest inventories, join with `lego_inventory_parts` and calculate the `SUM(quantity)`, but only for parts where `is_spare` is false.
3. **Compare and Filter**: Join this calculated result back to the `lego_sets` table and return the rows where `lego_sets.num_parts` is different from your calculated sum.
### Task 2: Fix Existing Inconsistencies
**Objective**: Correct all mismatched `num_parts` values using a clear, multi-step process with a temporary table.
#### Step 1: Create a Temporary Table
Create a temporary table (e.g., `correct_counts`) with two columns: `set_num` (text) and `actual_parts` (integer).
#### Step 2: Populate the Temporary Table
Write an `INSERT` statement that calculates the correct part count for every single set listed in the `lego_sets` table.
- The query must start by selecting from `public.lego_sets`.
- It must then `LEFT JOIN` to a subquery that contains the part-counting logic (finding the latest inventory version and summing the non-spare parts).
- Use `COALESCE` on the final result from the subquery to ensure that any set without parts or without an inventory record gets a value of `0`, not `NULL`.
#### Step 3: Update from the Temporary Table
Write a final, simple `UPDATE` statement that joins the `lego_sets` table with your temporary table on `set_num` and sets `num_parts` to the `actual_parts` value.
## Expected Outcome:
After completing these tasks, all sets in the `lego_sets` table should have their `num_parts` correctly reflecting the sum of non-spare parts from their latest inventory version.
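As an illustration of the staged fix described above, one possible sequence of statements is sketched below; Task 1's inconsistency `SELECT` can reuse the same latest-inventory / non-spare subquery:

```sql
-- Illustrative sketch of the staged fix (Task 2); any equivalent statements are acceptable.
CREATE TEMP TABLE correct_counts (set_num text, actual_parts integer);

INSERT INTO correct_counts (set_num, actual_parts)
SELECT s.set_num, COALESCE(pa.actual_parts, 0)
FROM public.lego_sets s
LEFT JOIN (
    SELECT li.set_num, SUM(lip.quantity) AS actual_parts
    FROM public.lego_inventories li
    JOIN (
        SELECT set_num, MAX(version) AS max_version
        FROM public.lego_inventories
        GROUP BY set_num
    ) latest ON latest.set_num = li.set_num AND latest.max_version = li.version
    JOIN public.lego_inventory_parts lip
        ON lip.inventory_id = li.id AND lip.is_spare = false
    GROUP BY li.set_num
) pa ON pa.set_num = s.set_num;

UPDATE public.lego_sets s
SET num_parts = cc.actual_parts
FROM correct_counts cc
WHERE cc.set_num = s.set_num;
```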
================================================
FILE: tasks/postgres/easy/lego/fix_data_inconsistencies/meta.json
================================================
{
"task_id": "fix_data_inconsistencies",
"task_name": "Fix Data Inconsistencies",
"category_id": "lego",
"category_name": "Lego",
"description": "Recalculate each LEGO set's part count from the latest inventory, stage the results, and update lego_sets.num_parts to remove mismatches.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data integrity enforcement",
"data reconciliation"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql"
}
}
================================================
FILE: tasks/postgres/easy/lego/fix_data_inconsistencies/verify.py
================================================
"""
Verification script for the fix_data_inconsistencies task: LEGO parts-consistency fix.
Version 2.1: relaxed consistency check to allow for one known corner-case mismatch.
"""
import os
import sys
import psycopg2
import psycopg2.errors
from typing import Optional, Tuple, List
def get_connection_params() -> dict:
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def fetch_candidate_part_row(cur) -> Optional[Tuple[int, str, str, int]]:
"""
Picks a concrete, non-spare inventory part from the latest inventory of any set.
This provides a reliable target for testing update and insert triggers.
Returns a tuple: (inventory_id, set_num, part_num, color_id) or None.
"""
cur.execute(
"""
WITH latest_inv AS (
SELECT set_num, MAX(version) AS max_version
FROM public.lego_inventories
GROUP BY set_num
), inv AS (
SELECT li.id, li.set_num
FROM public.lego_inventories li
JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version
)
SELECT i.id AS inventory_id, i.set_num, lip.part_num, lip.color_id
FROM inv i
JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id
WHERE lip.is_spare = false AND lip.quantity > 0
LIMIT 1;
"""
)
return cur.fetchone()
def get_mismatch_count(cur) -> int:
"""Returns the number of sets where num_parts mismatches the computed actual sum."""
cur.execute(
"""
WITH latest_inv AS (
SELECT set_num, MAX(version) AS max_version
FROM public.lego_inventories
GROUP BY set_num
), inv_latest AS (
SELECT li.set_num, li.id
FROM public.lego_inventories li
JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version
), parts_agg AS (
SELECT
i.set_num,
SUM(lip.quantity) AS actual_parts
FROM inv_latest i
JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id
WHERE lip.is_spare = false
GROUP BY i.set_num
)
SELECT COUNT(*)
FROM public.lego_sets s
LEFT JOIN parts_agg pa ON s.set_num = pa.set_num
WHERE s.num_parts <> COALESCE(pa.actual_parts, 0);
"""
)
return cur.fetchone()[0]
def verify_data_consistency(conn) -> bool:
"""
TASK 1 VERIFICATION: Checks if the initial data fix was successful.
(Relaxed: Allows for one corner-case mismatch).
"""
print("\n-- Verifying Task 1: Data Consistency Fix (Relaxed) --")
with conn.cursor() as cur:
count = get_mismatch_count(cur)
# RELAXED CONDITION: Allow 0 or 1 mismatch to pass.
if count > 1:
print(f"❌ FAIL: Found {count} sets with inconsistent part counts. Expected 0 or 1 after fix.")
return False
print("✅ PASS: Data consistency check passed (allowing for one known mismatch).")
return True
def main():
"""Main verification function."""
print("=" * 60)
print("LEGO Database Consistency Verification Script")
print("=" * 60)
conn_params = get_connection_params()
if not conn_params.get("database"):
print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.")
sys.exit(1)
try:
with psycopg2.connect(**conn_params) as conn:
conn.autocommit = False # Ensure we control transactions
# Run all verification steps
results = [
verify_data_consistency(conn),
]
if all(results):
print("\n🎉 Overall Result: PASS - All tasks verified successfully!")
sys.exit(0)
else:
print("\n❌ Overall Result: FAIL - One or more verification steps failed.")
sys.exit(1)
except psycopg2.OperationalError as e:
print(f"❌ CRITICAL: Could not connect to the database. Details: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ CRITICAL: An unexpected error occurred during verification. Details: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/sports/create_performance_indexes/description.md
================================================
Create indexes to optimize participant and statistics queries in the sports database.
## Your Task:
Create two indexes to improve query performance:
1. **Index on participants_events table**: Create an index on the `participant_id` column of the `participants_events` table
2. **Composite index on stats table**: Create a composite index on the `stats` table using columns `stat_holder_type` and `stat_holder_id` (in that order)
## Requirements:
- Create an index on `participants_events(participant_id)`
- Create a composite index on `stats(stat_holder_type, stat_holder_id)`
- Index names can be anything you choose (e.g., `idx_participants_events_participant_id`, `idx_stats_holder`)
- Use the standard CREATE INDEX syntax
## Expected Outcome:
After creating these indexes, queries that filter `participants_events` by participant or look up `stats` rows by holder type and id can use index scans instead of full table scans.
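A minimal sketch using the example names suggested above:

```sql
-- Illustrative sketch; any index names are acceptable.
CREATE INDEX idx_participants_events_participant_id
    ON participants_events (participant_id);

CREATE INDEX idx_stats_holder
    ON stats (stat_holder_type, stat_holder_id);
```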
================================================
FILE: tasks/postgres/easy/sports/create_performance_indexes/meta.json
================================================
{
"task_id": "create_performance_indexes",
"task_name": "Create Performance Indexes",
"category_id": "sports",
"category_name": "Sports",
"description": "Create indexes on participants_events.participant_id and stats(stat_holder_type, stat_holder_id) to accelerate performance reporting.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"performance optimization",
"indexing"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"addresses\" {\n \"id\" int4 [not null, increment]\n \"location_id\" int4 [not null]\n \"language\" varchar(100)\n \"suite\" varchar(100)\n \"floor\" varchar(100)\n \"building\" varchar(100)\n \"street_number\" varchar(100)\n \"street_prefix\" varchar(100)\n \"street\" varchar(100)\n \"street_suffix\" varchar(100)\n \"neighborhood\" varchar(100)\n \"district\" varchar(100)\n \"locality\" varchar(100)\n \"county\" varchar(100)\n \"region\" varchar(100)\n \"postal_code\" varchar(100)\n \"country\" varchar(100)\n}\n\nTable \"affiliation_phases\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"ancestor_affiliation_id\" int4\n \"start_season_id\" int4\n \"start_date_time\" timestamp\n \"end_season_id\" int4\n \"end_date_time\" timestamp\n}\n\nTable \"affiliations\" {\n \"id\" int4 [not null, increment]\n \"affiliation_key\" varchar(100) [not null]\n \"affiliation_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n}\n\nTable \"affiliations_documents\" {\n \"affiliation_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"affiliations_events\" {\n \"affiliation_id\" int4 [not null]\n \"event_id\" int4 [not null]\n}\n\nTable \"affiliations_media\" {\n \"affiliation_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"american_football_action_participants\" {\n \"id\" int4 [not null, increment]\n \"american_football_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"score_type\" varchar(100)\n \"field_line\" int4\n \"yardage\" int4\n \"score_credit\" int4\n \"yards_gained\" int4\n}\n\nTable \"american_football_action_plays\" {\n \"id\" int4 [not null, increment]\n \"american_football_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"drive_result\" varchar(100)\n \"points\" int4\n \"comment\" varchar(255)\n}\n\nTable \"american_football_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"tackles_total\" varchar(100)\n \"tackles_solo\" varchar(100)\n \"tackles_assists\" varchar(100)\n \"interceptions_total\" varchar(100)\n \"interceptions_yards\" varchar(100)\n \"interceptions_average\" varchar(100)\n \"interceptions_longest\" varchar(100)\n \"interceptions_touchdown\" varchar(100)\n \"quarterback_hurries\" varchar(100)\n \"sacks_total\" varchar(100)\n \"sacks_yards\" varchar(100)\n \"passes_defensed\" varchar(100)\n}\n\nTable \"american_football_down_progress_stats\" {\n \"id\" int4 [not null, increment]\n \"first_downs_total\" varchar(100)\n \"first_downs_pass\" varchar(100)\n \"first_downs_run\" varchar(100)\n \"first_downs_penalty\" varchar(100)\n \"conversions_third_down\" varchar(100)\n \"conversions_third_down_attempts\" varchar(100)\n \"conversions_third_down_percentage\" varchar(100)\n \"conversions_fourth_down\" varchar(100)\n \"conversions_fourth_down_attempts\" varchar(100)\n \"conversions_fourth_down_percentage\" varchar(100)\n}\n\nTable \"american_football_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"period_value\" int4\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"clock_state\" varchar(100)\n \"down\" int4\n \"team_in_possession_id\" int4\n \"distance_for_1st_down\" int4\n \"field_side\" varchar(100)\n \"field_line\" int4\n \"context\" varchar(40)\n}\n\nTable \"american_football_fumbles_stats\" {\n \"id\" int4 [not null, increment]\n 
\"fumbles_committed\" varchar(100)\n \"fumbles_forced\" varchar(100)\n \"fumbles_recovered\" varchar(100)\n \"fumbles_lost\" varchar(100)\n \"fumbles_yards_gained\" varchar(100)\n \"fumbles_own_committed\" varchar(100)\n \"fumbles_own_recovered\" varchar(100)\n \"fumbles_own_lost\" varchar(100)\n \"fumbles_own_yards_gained\" varchar(100)\n \"fumbles_opposing_committed\" varchar(100)\n \"fumbles_opposing_recovered\" varchar(100)\n \"fumbles_opposing_lost\" varchar(100)\n \"fumbles_opposing_yards_gained\" varchar(100)\n}\n\nTable \"american_football_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"offensive_plays_yards\" varchar(100)\n \"offensive_plays_number\" varchar(100)\n \"offensive_plays_average_yards_per\" varchar(100)\n \"possession_duration\" varchar(100)\n \"turnovers_giveaway\" varchar(100)\n}\n\nTable \"american_football_passing_stats\" {\n \"id\" int4 [not null, increment]\n \"passes_attempts\" varchar(100)\n \"passes_completions\" varchar(100)\n \"passes_percentage\" varchar(100)\n \"passes_yards_gross\" varchar(100)\n \"passes_yards_net\" varchar(100)\n \"passes_yards_lost\" varchar(100)\n \"passes_touchdowns\" varchar(100)\n \"passes_touchdowns_percentage\" varchar(100)\n \"passes_interceptions\" varchar(100)\n \"passes_interceptions_percentage\" varchar(100)\n \"passes_longest\" varchar(100)\n \"passes_average_yards_per\" varchar(100)\n \"passer_rating\" varchar(100)\n \"receptions_total\" varchar(100)\n \"receptions_yards\" varchar(100)\n \"receptions_touchdowns\" varchar(100)\n \"receptions_first_down\" varchar(100)\n \"receptions_longest\" varchar(100)\n \"receptions_average_yards_per\" varchar(100)\n}\n\nTable \"american_football_penalties_stats\" {\n \"id\" int4 [not null, increment]\n \"penalties_total\" varchar(100)\n \"penalty_yards\" varchar(100)\n \"penalty_first_downs\" varchar(100)\n}\n\nTable \"american_football_rushing_stats\" {\n \"id\" int4 [not null, increment]\n \"rushes_attempts\" varchar(100)\n \"rushes_yards\" varchar(100)\n \"rushes_touchdowns\" varchar(100)\n \"rushing_average_yards_per\" varchar(100)\n \"rushes_first_down\" varchar(100)\n \"rushes_longest\" varchar(100)\n}\n\nTable \"american_football_sacks_against_stats\" {\n \"id\" int4 [not null, increment]\n \"sacks_against_yards\" varchar(100)\n \"sacks_against_total\" varchar(100)\n}\n\nTable \"american_football_scoring_stats\" {\n \"id\" int4 [not null, increment]\n \"touchdowns_total\" varchar(100)\n \"touchdowns_passing\" varchar(100)\n \"touchdowns_rushing\" varchar(100)\n \"touchdowns_special_teams\" varchar(100)\n \"touchdowns_defensive\" varchar(100)\n \"extra_points_attempts\" varchar(100)\n \"extra_points_made\" varchar(100)\n \"extra_points_missed\" varchar(100)\n \"extra_points_blocked\" varchar(100)\n \"field_goal_attempts\" varchar(100)\n \"field_goals_made\" varchar(100)\n \"field_goals_missed\" varchar(100)\n \"field_goals_blocked\" varchar(100)\n \"safeties_against\" varchar(100)\n \"two_point_conversions_attempts\" varchar(100)\n \"two_point_conversions_made\" varchar(100)\n \"touchbacks_total\" varchar(100)\n}\n\nTable \"american_football_special_teams_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_punt_total\" varchar(100)\n \"returns_punt_yards\" varchar(100)\n \"returns_punt_average\" varchar(100)\n \"returns_punt_longest\" varchar(100)\n \"returns_punt_touchdown\" varchar(100)\n \"returns_kickoff_total\" varchar(100)\n \"returns_kickoff_yards\" varchar(100)\n \"returns_kickoff_average\" varchar(100)\n \"returns_kickoff_longest\" varchar(100)\n 
\"returns_kickoff_touchdown\" varchar(100)\n \"returns_total\" varchar(100)\n \"returns_yards\" varchar(100)\n \"punts_total\" varchar(100)\n \"punts_yards_gross\" varchar(100)\n \"punts_yards_net\" varchar(100)\n \"punts_longest\" varchar(100)\n \"punts_inside_20\" varchar(100)\n \"punts_inside_20_percentage\" varchar(100)\n \"punts_average\" varchar(100)\n \"punts_blocked\" varchar(100)\n \"touchbacks_total\" varchar(100)\n \"touchbacks_total_percentage\" varchar(100)\n \"touchbacks_kickoffs\" varchar(100)\n \"touchbacks_kickoffs_percentage\" varchar(100)\n \"touchbacks_punts\" varchar(100)\n \"touchbacks_punts_percentage\" varchar(100)\n \"touchbacks_interceptions\" varchar(100)\n \"touchbacks_interceptions_percentage\" varchar(100)\n \"fair_catches\" varchar(100)\n}\n\nTable \"baseball_action_contact_details\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_pitch_id\" int4 [not null]\n \"location\" varchar(100)\n \"strength\" varchar(100)\n \"velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n}\n\nTable \"baseball_action_pitches\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_play_id\" int4 [not null]\n \"sequence_number\" int4\n \"baseball_defensive_group_id\" int4\n \"umpire_call\" varchar(100)\n \"pitch_location\" varchar(100)\n \"pitch_type\" varchar(100)\n \"pitch_velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n \"ball_type\" varchar(40)\n \"strike_type\" varchar(40)\n}\n\nTable \"baseball_action_plays\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"notation\" varchar(100)\n \"notation_yaml\" text\n \"baseball_defensive_group_id\" int4\n \"comment\" varchar(255)\n \"runner_on_first_advance\" int4\n \"runner_on_second_advance\" int4\n \"runner_on_third_advance\" int4\n \"outs_recorded\" int4\n \"rbi\" int4\n \"runs_scored\" int4\n \"earned_runs_scored\" varchar(100)\n}\n\nTable \"baseball_action_substitutions\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"sequence_number\" int4\n \"person_type\" varchar(100)\n \"person_original_id\" int4\n \"person_original_position_id\" int4\n \"person_original_lineup_slot\" int4\n \"person_replacing_id\" int4\n \"person_replacing_position_id\" int4\n \"person_replacing_lineup_slot\" int4\n \"substitution_reason\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"baseball_defensive_group\" {\n \"id\" int4 [not null, increment]\n}\n\nTable \"baseball_defensive_players\" {\n \"id\" int4 [not null, increment]\n \"baseball_defensive_group_id\" int4 [not null]\n \"player_id\" int4 [not null]\n \"position_id\" int4 [not null]\n}\n\nTable \"baseball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"double_plays\" int4\n \"triple_plays\" int4\n \"putouts\" int4\n \"assists\" int4\n \"errors\" int4\n \"fielding_percentage\" numeric\n \"defensive_average\" numeric\n \"errors_passed_ball\" int4\n \"errors_catchers_interference\" int4\n}\n\nTable \"baseball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"at_bat_number\" int4\n \"inning_value\" int4\n \"inning_half\" varchar(100)\n \"outs\" int4\n \"balls\" int4\n \"strikes\" int4\n \"runner_on_first_id\" int4\n \"runner_on_second_id\" int4\n \"runner_on_third_id\" int4\n \"runner_on_first\" int2\n \"runner_on_second\" int2\n 
\"runner_on_third\" int2\n \"runs_this_inning_half\" int4\n \"pitcher_id\" int4\n \"batter_id\" int4\n \"batter_side\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"baseball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"average\" numeric\n \"runs_scored\" int4\n \"at_bats\" int4\n \"hits\" int4\n \"rbi\" int4\n \"total_bases\" int4\n \"slugging_percentage\" numeric\n \"bases_on_balls\" int4\n \"strikeouts\" int4\n \"left_on_base\" int4\n \"left_in_scoring_position\" int4\n \"singles\" int4\n \"doubles\" int4\n \"triples\" int4\n \"home_runs\" int4\n \"grand_slams\" int4\n \"at_bats_per_rbi\" numeric\n \"plate_appearances_per_rbi\" numeric\n \"at_bats_per_home_run\" numeric\n \"plate_appearances_per_home_run\" numeric\n \"sac_flies\" int4\n \"sac_bunts\" int4\n \"grounded_into_double_play\" int4\n \"moved_up\" int4\n \"on_base_percentage\" numeric\n \"stolen_bases\" int4\n \"stolen_bases_caught\" int4\n \"stolen_bases_average\" numeric\n \"hit_by_pitch\" int4\n \"defensive_interferance_reaches\" int4\n \"on_base_plus_slugging\" numeric\n \"plate_appearances\" int4\n \"hits_extra_base\" int4\n}\n\nTable \"baseball_pitching_stats\" {\n \"id\" int4 [not null, increment]\n \"runs_allowed\" int4\n \"singles_allowed\" int4\n \"doubles_allowed\" int4\n \"triples_allowed\" int4\n \"home_runs_allowed\" int4\n \"innings_pitched\" varchar(20)\n \"hits\" int4\n \"earned_runs\" int4\n \"unearned_runs\" int4\n \"bases_on_balls\" int4\n \"bases_on_balls_intentional\" int4\n \"strikeouts\" int4\n \"strikeout_to_bb_ratio\" numeric\n \"number_of_pitches\" int4\n \"era\" numeric\n \"inherited_runners_scored\" int4\n \"pick_offs\" int4\n \"errors_hit_with_pitch\" int4\n \"errors_wild_pitch\" int4\n \"balks\" int4\n \"wins\" int4\n \"losses\" int4\n \"saves\" int4\n \"shutouts\" int4\n \"games_complete\" int4\n \"games_finished\" int4\n \"winning_percentage\" numeric\n \"event_credit\" varchar(40)\n \"save_credit\" varchar(40)\n}\n\nTable \"basketball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"steals_total\" varchar(100)\n \"steals_per_game\" varchar(100)\n \"blocks_total\" varchar(100)\n \"blocks_per_game\" varchar(100)\n}\n\nTable \"basketball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"basketball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"field_goals_made\" int4\n \"field_goals_attempted\" int4\n \"field_goals_percentage\" varchar(100)\n \"field_goals_per_game\" varchar(100)\n \"field_goals_attempted_per_game\" varchar(100)\n \"field_goals_percentage_adjusted\" varchar(100)\n \"three_pointers_made\" int4\n \"three_pointers_attempted\" int4\n \"three_pointers_percentage\" varchar(100)\n \"three_pointers_per_game\" varchar(100)\n \"three_pointers_attempted_per_game\" varchar(100)\n \"free_throws_made\" varchar(100)\n \"free_throws_attempted\" varchar(100)\n \"free_throws_percentage\" varchar(100)\n \"free_throws_per_game\" varchar(100)\n \"free_throws_attempted_per_game\" varchar(100)\n \"points_scored_total\" varchar(100)\n \"points_scored_per_game\" varchar(100)\n \"assists_total\" varchar(100)\n \"assists_per_game\" varchar(100)\n \"turnovers_total\" varchar(100)\n \"turnovers_per_game\" varchar(100)\n \"points_scored_off_turnovers\" varchar(100)\n \"points_scored_in_paint\" varchar(100)\n 
\"points_scored_on_second_chance\" varchar(100)\n \"points_scored_on_fast_break\" varchar(100)\n}\n\nTable \"basketball_rebounding_stats\" {\n \"id\" int4 [not null, increment]\n \"rebounds_total\" varchar(100)\n \"rebounds_per_game\" varchar(100)\n \"rebounds_defensive\" varchar(100)\n \"rebounds_offensive\" varchar(100)\n \"team_rebounds_total\" varchar(100)\n \"team_rebounds_per_game\" varchar(100)\n \"team_rebounds_defensive\" varchar(100)\n \"team_rebounds_offensive\" varchar(100)\n}\n\nTable \"basketball_team_stats\" {\n \"id\" int4 [not null, increment]\n \"timeouts_left\" varchar(100)\n \"largest_lead\" varchar(100)\n \"fouls_total\" varchar(100)\n \"turnover_margin\" varchar(100)\n}\n\nTable \"bookmakers\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"core_person_stats\" {\n \"id\" int4 [not null, increment]\n \"time_played_event\" varchar(40)\n \"time_played_total\" varchar(40)\n \"time_played_event_average\" varchar(40)\n \"events_played\" int4\n \"events_started\" int4\n \"position_id\" int4\n}\n\nTable \"core_stats\" {\n \"id\" int4 [not null, increment]\n \"score\" varchar(100)\n \"score_opposing\" varchar(100)\n \"score_attempts\" varchar(100)\n \"score_attempts_opposing\" varchar(100)\n \"score_percentage\" varchar(100)\n \"score_percentage_opposing\" varchar(100)\n}\n\nTable \"db_info\" {\n \"version\" varchar(100) [not null, default: 16]\n}\n\nTable \"display_names\" {\n \"id\" int4 [not null, increment]\n \"language\" varchar(100) [not null]\n \"entity_type\" varchar(100) [not null]\n \"entity_id\" int4 [not null]\n \"full_name\" varchar(100)\n \"first_name\" varchar(100)\n \"middle_name\" varchar(100)\n \"last_name\" varchar(100)\n \"alias\" varchar(100)\n \"abbreviation\" varchar(100)\n \"short_name\" varchar(100)\n \"prefix\" varchar(20)\n \"suffix\" varchar(20)\n}\n\nTable \"document_classes\" {\n \"id\" int4 [not null, increment]\n \"name\" varchar(100)\n}\n\nTable \"document_contents\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"sportsml\" varchar(200)\n \"abstract\" text\n}\n\nTable \"document_fixtures\" {\n \"id\" int4 [not null, increment]\n \"fixture_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"name\" varchar(100)\n \"document_class_id\" int4 [not null]\n}\n\nTable \"document_fixtures_events\" {\n \"id\" int4 [not null, increment]\n \"document_fixture_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"latest_document_id\" int4 [not null]\n \"last_update\" timestamp\n}\n\nTable \"document_package_entry\" {\n \"id\" int4 [not null, increment]\n \"document_package_id\" int4 [not null]\n \"rank\" varchar(100)\n \"document_id\" int4 [not null]\n \"headline\" varchar(100)\n \"short_headline\" varchar(100)\n}\n\nTable \"document_packages\" {\n \"id\" int4 [not null, increment]\n \"package_key\" varchar(100)\n \"package_name\" varchar(100)\n \"date_time\" date\n}\n\nTable \"documents\" {\n \"id\" int4 [not null, increment]\n \"doc_id\" varchar(75) [not null]\n \"publisher_id\" int4 [not null]\n \"date_time\" timestamp\n \"title\" varchar(255)\n \"language\" varchar(100)\n \"priority\" varchar(100)\n \"revision_id\" varchar(75)\n \"stats_coverage\" varchar(100)\n \"document_fixture_id\" int4 [not null]\n \"source_id\" int4\n \"db_loading_date_time\" timestamp\n}\n\nTable \"documents_media\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"media_id\" int4 [not null]\n \"media_caption_id\" int4 [not 
null]\n}\n\nTable \"events\" {\n \"id\" int4 [not null, increment]\n \"event_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"site_id\" int4\n \"site_alignment\" varchar(100)\n \"event_status\" varchar(100)\n \"duration\" varchar(100)\n \"attendance\" varchar(100)\n \"last_update\" timestamp\n}\n\nTable \"events_documents\" {\n \"event_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"events_media\" {\n \"event_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"events_sub_seasons\" {\n \"event_id\" int4 [not null]\n \"sub_season_id\" int4 [not null]\n}\n\nTable \"ice_hockey_action_participants\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"point_credit\" int4\n}\n\nTable \"ice_hockey_action_plays\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"play_result\" varchar(100)\n \"comment\" varchar(255)\n}\n\nTable \"ice_hockey_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_power_play_allowed\" varchar(100)\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_power_play_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"penalty_killing_amount\" varchar(100)\n \"penalty_killing_percentage\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"takeaways\" varchar(100)\n \"shutouts\" varchar(100)\n \"minutes_penalty_killing\" varchar(100)\n \"hits\" varchar(100)\n \"goals_empty_net_allowed\" varchar(100)\n \"goals_short_handed_allowed\" varchar(100)\n \"goals_shootout_allowed\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n}\n\nTable \"ice_hockey_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"ice_hockey_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_power_play\" varchar(100)\n \"goals_short_handed\" varchar(100)\n \"goals_even_strength\" varchar(100)\n \"goals_empty_net\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_penalty_shot\" varchar(100)\n \"assists\" varchar(100)\n \"points\" varchar(100)\n \"power_play_amount\" varchar(100)\n \"power_play_percentage\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(100)\n \"shots_penalty_shot_percentage\" varchar(100)\n \"giveaways\" varchar(100)\n \"minutes_power_play\" varchar(100)\n \"faceoff_wins\" varchar(100)\n \"faceoff_losses\" varchar(100)\n \"faceoff_win_percentage\" varchar(100)\n \"scoring_chances\" varchar(100)\n}\n\nTable \"ice_hockey_player_stats\" {\n \"id\" int4 [not null, increment]\n \"plus_minus\" varchar(100)\n}\n\nTable \"injury_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"injury_status\" varchar(100)\n \"injury_type\" varchar(100)\n \"injury_comment\" varchar(100)\n \"disabled_list\" varchar(100)\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n \"season_id\" int4\n \"phase_type\" varchar(100)\n 
\"injury_side\" varchar(100)\n}\n\nTable \"key_aliases\" {\n \"id\" int4 [not null, increment]\n \"key_id\" int4 [not null]\n \"key_root_id\" int4 [not null]\n}\n\nTable \"key_roots\" {\n \"id\" int4 [not null, increment]\n \"key_type\" varchar(100)\n}\n\nTable \"latest_revisions\" {\n \"id\" int4 [not null, increment]\n \"revision_id\" varchar(75) [not null]\n \"latest_document_id\" int4 [not null]\n}\n\nTable \"locations\" {\n \"id\" int4 [not null, increment]\n \"timezone\" varchar(100)\n \"latitude\" varchar(100)\n \"longitude\" varchar(100)\n \"country_code\" varchar(100)\n}\n\nTable \"media\" {\n \"id\" int4 [not null, increment]\n \"object_id\" int4\n \"source_id\" int4\n \"revision_id\" int4\n \"media_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"date_time\" varchar(100)\n \"credit_id\" int4 [not null]\n \"db_loading_date_time\" timestamp\n \"creation_location_id\" int4 [not null]\n}\n\nTable \"media_captions\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"caption_type\" varchar(100)\n \"caption\" varchar(100)\n \"caption_author_id\" int4 [not null]\n \"language\" varchar(100)\n \"caption_size\" varchar(100)\n}\n\nTable \"media_contents\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"object\" varchar(100)\n \"format\" varchar(100)\n \"mime_type\" varchar(100)\n \"height\" varchar(100)\n \"width\" varchar(100)\n \"duration\" varchar(100)\n \"file_size\" varchar(100)\n \"resolution\" varchar(100)\n}\n\nTable \"media_keywords\" {\n \"id\" int4 [not null, increment]\n \"keyword\" varchar(100)\n \"media_id\" int4 [not null]\n}\n\nTable \"motor_racing_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"lap\" varchar(100)\n \"laps_remaining\" varchar(100)\n \"time_elapsed\" varchar(100)\n \"flag_state\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"motor_racing_qualifying_stats\" {\n \"id\" int4 [not null, increment]\n \"grid\" varchar(100)\n \"pole_position\" varchar(100)\n \"pole_wins\" varchar(100)\n \"qualifying_speed\" varchar(100)\n \"qualifying_speed_units\" varchar(100)\n \"qualifying_time\" varchar(100)\n \"qualifying_position\" varchar(100)\n}\n\nTable \"motor_racing_race_stats\" {\n \"id\" int4 [not null, increment]\n \"time_behind_leader\" varchar(100)\n \"laps_behind_leader\" varchar(100)\n \"time_ahead_follower\" varchar(100)\n \"laps_ahead_follower\" varchar(100)\n \"time\" varchar(100)\n \"points\" varchar(100)\n \"points_rookie\" varchar(100)\n \"bonus\" varchar(100)\n \"laps_completed\" varchar(100)\n \"laps_leading_total\" varchar(100)\n \"distance_leading\" varchar(100)\n \"distance_completed\" varchar(100)\n \"distance_units\" varchar(40)\n \"speed_average\" varchar(40)\n \"speed_units\" varchar(40)\n \"status\" varchar(40)\n \"finishes_top_5\" varchar(40)\n \"finishes_top_10\" varchar(40)\n \"starts\" varchar(40)\n \"finishes\" varchar(40)\n \"non_finishes\" varchar(40)\n \"wins\" varchar(40)\n \"races_leading\" varchar(40)\n \"money\" varchar(40)\n \"money_units\" varchar(40)\n \"leads_total\" varchar(40)\n}\n\nTable \"outcome_totals\" {\n \"id\" int4 [not null, increment]\n \"standing_subgroup_id\" int4 [not null]\n \"outcome_holder_type\" varchar(100)\n \"outcome_holder_id\" int4\n \"rank\" varchar(100)\n \"wins\" varchar(100)\n \"losses\" varchar(100)\n \"ties\" varchar(100)\n \"undecideds\" varchar(100)\n \"winning_percentage\" varchar(100)\n \"points_scored_for\" varchar(100)\n 
\"points_scored_against\" varchar(100)\n \"points_difference\" varchar(100)\n \"standing_points\" varchar(100)\n \"streak_type\" varchar(100)\n \"streak_duration\" varchar(100)\n \"streak_total\" varchar(100)\n \"streak_start\" date\n \"streak_end\" date\n}\n\nTable \"participants_events\" {\n \"id\" int4 [not null, increment]\n \"participant_type\" varchar(100) [not null]\n \"participant_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"alignment\" varchar(100)\n \"score\" varchar(100)\n \"event_outcome\" varchar(100)\n \"rank\" int4\n}\n\nTable \"periods\" {\n \"id\" int4 [not null, increment]\n \"participant_event_id\" int4 [not null]\n \"period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"person_event_metadata\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"status\" varchar(100)\n \"health\" varchar(100)\n \"weight\" varchar(100)\n \"role_id\" int4\n \"position_id\" int4\n \"team_id\" int4\n \"lineup_slot\" int4\n \"lineup_slot_sequence\" int4\n}\n\nTable \"person_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"membership_type\" varchar(40) [not null]\n \"membership_id\" int4 [not null]\n \"role_id\" int4\n \"role_status\" varchar(40)\n \"phase_status\" varchar(40)\n \"uniform_number\" varchar(20)\n \"regular_position_id\" int4\n \"regular_position_depth\" varchar(40)\n \"height\" varchar(100)\n \"weight\" varchar(100)\n \"start_date_time\" timestamp\n \"start_season_id\" int4\n \"end_date_time\" timestamp\n \"end_season_id\" int4\n \"entry_reason\" varchar(40)\n \"exit_reason\" varchar(40)\n \"selection_level\" int4\n \"selection_sublevel\" int4\n \"selection_overall\" int4\n}\n\nTable \"persons\" {\n \"id\" int4 [not null, increment]\n \"person_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"gender\" varchar(20)\n \"birth_date\" varchar(30)\n \"death_date\" varchar(30)\n \"birth_location_id\" int4\n \"hometown_location_id\" int4\n \"residence_location_id\" int4\n \"death_location_id\" int4\n}\n\nTable \"persons_documents\" {\n \"person_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"persons_media\" {\n \"person_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"positions\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"abbreviation\" varchar(100) [not null]\n}\n\nTable \"publishers\" {\n \"id\" int4 [not null, increment]\n \"publisher_key\" varchar(100) [not null]\n \"publisher_name\" varchar(100)\n}\n\nTable \"roles\" {\n \"id\" int4 [not null, increment]\n \"role_key\" varchar(100) [not null]\n \"role_name\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"seasons\" {\n \"id\" int4 [not null, increment]\n \"season_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"league_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"sites\" {\n \"id\" int4 [not null, increment]\n \"site_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"soccer_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"goals_against_total\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"catches_punches\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_shootout_total\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n 
\"shots_blocked\" varchar(100)\n \"shutouts\" varchar(100)\n}\n\nTable \"soccer_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"minutes_elapsed\" varchar(100)\n \"period_minute_elapsed\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"soccer_foul_stats\" {\n \"id\" int4 [not null, increment]\n \"fouls_suffered\" varchar(100)\n \"fouls_commited\" varchar(100)\n \"cautions_total\" varchar(100)\n \"cautions_pending\" varchar(100)\n \"caution_points_total\" varchar(100)\n \"caution_points_pending\" varchar(100)\n \"ejections_total\" varchar(100)\n}\n\nTable \"soccer_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_total\" varchar(100)\n \"assists_game_winning\" varchar(100)\n \"assists_game_tying\" varchar(100)\n \"assists_overtime\" varchar(100)\n \"assists_total\" varchar(100)\n \"points\" varchar(100)\n \"shots_total\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_hit_frame\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_scored\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(40)\n \"shots_penalty_shot_percentage\" varchar(40)\n \"shots_shootout_taken\" varchar(40)\n \"shots_shootout_scored\" varchar(40)\n \"shots_shootout_missed\" varchar(40)\n \"shots_shootout_percentage\" varchar(40)\n \"giveaways\" varchar(40)\n \"offsides\" varchar(40)\n \"corner_kicks\" varchar(40)\n \"hat_tricks\" varchar(40)\n}\n\nTable \"standing_subgroups\" {\n \"id\" int4 [not null, increment]\n \"standing_id\" int4 [not null]\n \"affiliation_id\" int4 [not null]\n}\n\nTable \"standings\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"standing_type\" varchar(100)\n \"sub_season_id\" int4 [not null]\n \"last_updated\" varchar(100)\n \"duration_scope\" varchar(100)\n \"competition_scope\" varchar(100)\n \"competition_scope_id\" varchar(100)\n \"alignment_scope\" varchar(100)\n \"site_scope\" varchar(100)\n \"scoping_label\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"source\" varchar(100)\n}\n\nTable \"stats\" {\n \"id\" int4 [not null, increment]\n \"stat_repository_type\" varchar(100)\n \"stat_repository_id\" int4 [not null]\n \"stat_holder_type\" varchar(100)\n \"stat_holder_id\" int4\n \"stat_coverage_type\" varchar(100)\n \"stat_coverage_id\" int4\n \"context\" varchar(40) [not null]\n}\n\nTable \"sub_periods\" {\n \"id\" int4 [not null, increment]\n \"period_id\" int4 [not null]\n \"sub_period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"sub_seasons\" {\n \"id\" int4 [not null, increment]\n \"sub_season_key\" varchar(100) [not null]\n \"season_id\" int4 [not null]\n \"sub_season_type\" varchar(100) [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"team_american_football_stats\" {\n \"id\" int4 [not null, increment]\n \"yards_per_attempt\" varchar(100)\n \"average_starting_position\" varchar(100)\n \"timeouts\" varchar(100)\n \"time_of_possession\" varchar(100)\n \"turnover_ratio\" varchar(100)\n}\n\nTable \"team_phases\" {\n \"id\" int4 [not null, increment]\n \"team_id\" int4 [not null]\n \"start_season_id\" int4\n \"end_season_id\" int4\n \"affiliation_id\" int4 [not null]\n \"start_date_time\" 
varchar(100)\n \"end_date_time\" varchar(100)\n \"phase_status\" varchar(40)\n \"role_id\" int4\n}\n\nTable \"teams\" {\n \"id\" int4 [not null, increment]\n \"team_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"home_site_id\" int4\n}\n\nTable \"teams_documents\" {\n \"team_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"teams_media\" {\n \"team_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"tennis_action_points\" {\n \"id\" int4 [not null, increment]\n \"sub_period_id\" varchar(100)\n \"sequence_number\" varchar(100)\n \"win_type\" varchar(100)\n}\n\nTable \"tennis_action_volleys\" {\n \"id\" int4 [not null, increment]\n \"sequence_number\" varchar(100)\n \"tennis_action_points_id\" int4\n \"landing_location\" varchar(100)\n \"swing_type\" varchar(100)\n \"result\" varchar(100)\n \"spin_type\" varchar(100)\n \"trajectory_details\" varchar(100)\n}\n\nTable \"tennis_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"tennis_set\" varchar(100)\n \"game\" varchar(100)\n \"server_person_id\" int4\n \"server_score\" varchar(100)\n \"receiver_person_id\" int4\n \"receiver_score\" varchar(100)\n \"service_number\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"tennis_return_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"first_service_return_points_won\" varchar(100)\n \"first_service_return_points_won_pct\" varchar(100)\n \"second_service_return_points_won\" varchar(100)\n \"second_service_return_points_won_pct\" varchar(100)\n \"return_games_played\" varchar(100)\n \"return_games_won\" varchar(100)\n \"return_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_converted\" varchar(100)\n \"break_points_converted_pct\" varchar(100)\n}\n\nTable \"tennis_service_stats\" {\n \"id\" int4 [not null, increment]\n \"services_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"aces\" varchar(100)\n \"first_services_good\" varchar(100)\n \"first_services_good_pct\" varchar(100)\n \"first_service_points_won\" varchar(100)\n \"first_service_points_won_pct\" varchar(100)\n \"second_service_points_won\" varchar(100)\n \"second_service_points_won_pct\" varchar(100)\n \"service_games_played\" varchar(100)\n \"service_games_won\" varchar(100)\n \"service_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_saved\" varchar(100)\n \"break_points_saved_pct\" varchar(100)\n}\n\nTable \"wagering_moneylines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_odds_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"numerator\" varchar(100)\n \"denominator\" varchar(100)\n \"prediction\" varchar(100)\n \"payout_calculation\" varchar(100)\n \"payout_amount\" varchar(100)\n}\n\nTable \"wagering_runlines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n 
\"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"line_value\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_straight_spread_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_value\" varchar(100)\n \"line_value_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_total_score_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_over\" varchar(100)\n \"line_under\" varchar(100)\n \"total\" varchar(100)\n \"total_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"weather_conditions\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"temperature\" varchar(100)\n \"temperature_units\" varchar(40)\n \"humidity\" varchar(100)\n \"clouds\" varchar(100)\n \"wind_direction\" varchar(100)\n \"wind_velocity\" varchar(100)\n \"weather_code\" varchar(100)\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/yugabyte/yugabyte-db/blob/master/sample/sportsdb_tables.sql"
}
}
================================================
FILE: tasks/postgres/easy/sports/create_performance_indexes/verify.py
================================================
"""
Verification script for PostgreSQL Sports Task 3: Query Performance Optimization
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.001 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.001:
return False
elif isinstance(actual, float) and isinstance(expected, float):
if abs(actual - expected) > 0.001:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE", "sports"),
"user": os.getenv("POSTGRES_USERNAME", "postgres"),
"password": os.getenv("POSTGRES_PASSWORD", "postgres")
}
def verify_performance_optimization(conn) -> bool:
"""Verify that key performance optimization indexes have been implemented."""
with conn.cursor() as cur:
print("\n🔍 Checking for critical performance indexes...")
# Check 1: participants_events.participant_id index (critical for subqueries)
cur.execute("""
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'participants_events'
AND indexdef LIKE '%participant_id%'
""")
participant_indexes = cur.fetchall()
has_participant_index = len(participant_indexes) > 0
# Check 2: stats table optimization (critical for subquery filtering)
cur.execute("""
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'stats'
AND indexdef LIKE '%stat_holder_type%'
AND indexdef LIKE '%stat_holder_id%'
""")
stats_indexes = cur.fetchall()
has_stats_index = len(stats_indexes) > 0
# Report findings
critical_indexes_found = 0
if has_participant_index:
print("✅ Found participant filtering index on participants_events.participant_id")
critical_indexes_found += 1
else:
print("❌ Missing critical index on participants_events.participant_id")
if has_stats_index:
print("✅ Found subquery optimization index on stats table")
critical_indexes_found += 1
else:
print("❌ Missing critical index on stats table")
# Must have both critical indexes for this subquery-heavy query
if critical_indexes_found >= 2:
print(f"\n✅ Performance optimization: PASS ({critical_indexes_found}/2 critical indexes found)")
return True
else:
print(f"\n❌ Performance optimization: FAIL ({critical_indexes_found}/2 critical indexes found)")
print(" Create these critical indexes:")
print(" - CREATE INDEX ON participants_events(participant_id);")
print(" - CREATE INDEX ON stats(stat_holder_type, stat_holder_id);")
return False
def main():
"""Main verification function."""
print("=" * 50)
print("Verifying Sports Task 3: Query Performance Optimization")
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all components
success = verify_performance_optimization(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
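# --- Optional illustration (not part of the original verifier) ---------------
# Hedged sketch: one way to create the two indexes this script checks for,
# using psycopg2. The statements mirror the hints printed above; the index
# names are illustrative placeholders (the checks match on indexdef, not on
# name), and this helper is never called by main().
def create_recommended_indexes(conn):
    """Create the indexes verify_performance_optimization() looks for."""
    with conn.cursor() as cur:
        cur.execute(
            "CREATE INDEX IF NOT EXISTS idx_participants_events_participant_id "
            "ON participants_events(participant_id)"
        )
        cur.execute(
            "CREATE INDEX IF NOT EXISTS idx_stats_holder "
            "ON stats(stat_holder_type, stat_holder_id)"
        )
    conn.commit()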
================================================
FILE: tasks/postgres/standard/chinook/customer_data_migration/description.md
================================================
Migrate customer data from an acquired company to PostgreSQL using efficient bulk operations.
## Your Mission:
Chinook Music Store has recently acquired "MelodyMart," a competing music retailer. Their customer database needs to be migrated into Chinook's PostgreSQL database.
## Migration Requirements:
1. **Process all customer records from the data table below** and migrate them into the `Customer` table
2. **Apply business logic during migration**:
- Assign `CustomerId` values starting from the next available ID
- Assign all customers to support representative with EmployeeId 3
- Set `Fax` field to NULL for all migrated customers
3. **Avoid individual INSERT statements** (one possible bulk approach is sketched below)
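One possible shape for the bulk load is sketched below. This is a hedged illustration, not the reference solution: it assumes a psycopg2 connection (the connection parameters are placeholders) and that the customer rows have been parsed from the table that follows.
```python
# Hedged sketch only: `rows` is assumed to hold the customer tuples parsed from
# the data table below, in (FirstName, ..., Email) column order.
import psycopg2
from psycopg2.extras import execute_values

rows = [
    ("Danielle", "Johnson", "Sanchez-Taylor", "819 Johnson Course",
     "East William", "AK", "USA", "74064", "386-3794",
     "danielle.johnson@sancheztaylor.com"),
    # ... remaining rows from the table below ...
]

conn = psycopg2.connect(dbname="chinook")  # placeholder connection parameters
with conn, conn.cursor() as cur:
    # Continue CustomerId from the current maximum, then issue a single
    # multi-row INSERT; SupportRepId is fixed to 3 and Fax stays NULL.
    cur.execute('SELECT COALESCE(MAX("CustomerId"), 0) FROM "Customer"')
    next_id = cur.fetchone()[0] + 1
    execute_values(
        cur,
        'INSERT INTO "Customer" ("CustomerId", "FirstName", "LastName", "Company", '
        '"Address", "City", "State", "Country", "PostalCode", "Phone", "Fax", '
        '"Email", "SupportRepId") VALUES %s',
        [(next_id + i, *r[:9], None, r[9], 3) for i, r in enumerate(rows)],
    )
conn.close()
```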
## Customer Data to Migrate:
| FirstName | LastName | Company | Address | City | State | Country | PostalCode | Phone | Email |
|-----------|----------|---------|---------|------|-------|---------|------------|-------|--------|
| Danielle | Johnson | Sanchez-Taylor | 819 Johnson Course | East William | AK | USA | 74064 | 386-3794 | danielle.johnson@sancheztaylor.com |
| Katherine | Moore | Peterson-Moore | 16155 Roman Stream Suite 816 | New Kellystad | OK | USA | 25704 | 103-4131 | katherine_moore@petersonmoore.org |
| Joshua | Reid | Martin-Kelly | 192 Frank Light Suite 835 | East Lydiamouth | MO | USA | 35594 | 139-5376 | joshua_reid@martinkelly.org |
| Douglas | Taylor | Hoffman, Baker and Richards | 3287 Katelyn Wall Apt. 226 | South Patrickmouth | NC | USA | 33454 | 801-8451 | douglast@hoffmanbakerand.net |
| Ryan | Chavez | Liu, Baker and Mason | 148 Eric Track | New Stephanie | NC | USA | 00575 | 957-0154 | r.chavez@liubakerandmaso.com |
| Brian | Humphrey | Miller Group | 227 Joseph Well | Brandtside | WV | USA | 96174 | 346-5787 | brian.humphrey@millergroup.com |
| John | Brown | Chapman and Sons | 10310 Jones Freeway | Elizabethborough | ND | USA | 17843 | 997-3763 | john.brown@chapmanandsons.com |
| Collin | Jordan | Jenkins-Shields | 106 Mcbride Coves | East James | NV | USA | 18874 | 624-7317 | collin.jordan@jenkinsshields.com |
| Brent | Kidd | Novak and Sons | 7736 Franklin Alley | Bakermouth | LA | USA | 55945 | 872-3430 | brent.kidd@novakandsons.com |
| Julie | Brown | Woods, Calhoun and Schmidt | 121 Emma Freeway | Wilsonshire | IA | USA | 76381 | 909-1699 | julieb@woodscalhounand.net |
| Sarah | Harris | Edwards, Baker and Anderson | 5107 Charles Forest Suite 251 | West Justin | NV | USA | 71701 | 498-0841 | s.harris@edwardsbakerand.com |
| Joseph | Preston | Tran, Nelson and Jacobs | 48740 Cynthia Village Suite 005 | Lake Tina | GA | USA | 97655 | 786-8011 | j.preston@trannelsonandja.com |
| Amy | Davenport | Tran, Jordan and Williams | 53315 Dickson Summit Apt. 322 | Johnsonmouth | WY | USA | 54465 | 342-1607 | a.davenport@tranjordanandwi.com |
| James | Sellers | Torres-Pope | 03654 Tammy Harbors | Darlenefurt | TX | USA | 70783 | 501-4294 | james.sellers@torrespope.com |
| Daniel | Hamilton | Hartman, Graham and Joyce | 9340 Smith Valley | West Ryan | TN | USA | 43780 | 951-4846 | danielh@hartmangrahaman.net |
| Richard | Phillips | Lee Ltd | 299 Sullivan Village Apt. 443 | Floydmouth | NH | USA | 58406 | 738-7214 | richardp@leeltd.net |
| Clarence | Crane | Chambers and Sons | 00379 Stanley Roads | Lake Heather | NM | USA | 52884 | 320-1632 | clarence_crane@chambersandsons.org |
| Brent | Wright | Bryant Group | 9868 Merritt Summit Suite 743 | Katiehaven | NM | USA | 82650 | 347-1434 | brentw@bryantgroup.net |
| Luis | Fernandez | Hernandez Group | 316 Rivera Mountain | Brownchester | MS | USA | 77057 | 096-7054 | luis_fernandez@hernandezgroup.org |
| Melissa | Ashley | Medina-Navarro | 3467 Paul Skyway | Ramseymouth | PW | USA | 17229 | 980-6990 | melissa.ashley@medinanavarro.com |
| Dawn | Taylor | White-Green | 75564 King Common Suite 080 | Jeffreyland | WI | USA | 85927 | 003-3092 | d.taylor@whitegreen.com |
| David | Caldwell | Gould, Marshall and Scott | 99124 Beth Inlet Suite 631 | North Heidi | ME | USA | 90188 | 919-0586 | davidc@gouldmarshallan.net |
| Casey | Holland | Atkinson Group | 5726 Jessica Run | Christinaside | WI | USA | 63873 | 769-4531 | caseyh@atkinsongroup.net |
| Nicole | Sanchez | Hudson-Barnett | 75273 Salinas Junctions Suite 948 | New Stacyland | IA | USA | 94882 | 678-3777 | nicole.sanchez@hudsonbarnett.com |
| Christopher | Walker | Sanchez, Beck and Wood | 8557 Parker Fort Apt. 351 | East Javier | NJ | USA | 36742 | 989-4134 | c.walker@sanchezbeckandw.com |
| Michael | Turner | Ferguson, Hill and Mccann | 271 Audrey Mountains Suite 752 | West Shelleyfort | DE | USA | 09065 | 671-9022 | michaelt@fergusonhilland.net |
| Christopher | Wright | Duran, Obrien and Gibbs | 677 Dalton Meadow | Ashleyton | RI | USA | 97505 | 133-4123 | c.wright@duranobrienandg.com |
| Andrea | Moore | Hayes-Wheeler | 34471 Sandra Turnpike Apt. 618 | Lake Edward | KY | USA | 19144 | 102-4994 | andrea_moore@hayeswheeler.org |
| David | Barker | Powell, Nelson and Fernandez | 90659 Johnson Forks Apt. 490 | South April | NV | USA | 36959 | 296-7175 | david_barker@powellnelsonand.org |
| Mathew | Santiago | Rivera Ltd | 6807 Leonard Islands Apt. 680 | Gutierrezborough | NC | USA | 47920 | 977-0348 | m.santiago@riveraltd.com |
| Sara | Kim | Washington, Johnson and Mccoy | 248 Andrea Course | Port Robin | NH | USA | 15897 | 274-8467 | sara_kim@washingtonjohns.org |
| John | Arnold | Lee-Greene | 46584 Justin Hills | Grimesmouth | ND | USA | 63984 | 558-8675 | j.arnold@leegreene.com |
| Tina | Allen | Hall-Rowe | 7662 Hanna Crossroad | Mollymouth | CT | USA | 69438 | 702-6217 | tinaa@hallrowe.net |
| Matthew | Schwartz | Miller, Murphy and Craig | 7809 Jimmy Spur Suite 316 | Port Cynthiaville | NV | USA | 22306 | 400-5045 | matthews@millermurphyand.net |
| Ryan | Sanchez | Knight-Sparks | 19693 Durham Divide | South Dana | NH | USA | 33967 | 074-8217 | ryans@knightsparks.net |
| Vanessa | Evans | Vaughn-Bryant | 67136 Andrews Squares Suite 064 | New Michelleton | PW | USA | 79983 | 743-9533 | vanessae@vaughnbryant.net |
| Erica | Le | Becker, Taylor and Davis | 7095 Christopher Hill | Julieburgh | ID | USA | 17823 | 858-8424 | erica_le@beckertaylorand.org |
| Tammy | Phillips | Brock-Mcdonald | 36851 Smith Plain | South Miguelview | OR | USA | 50442 | 513-7098 | tammyp@brockmcdonald.net |
| Rose | Walker | Reid Group | 612 Sophia Hollow Suite 113 | South Shawn | TN | USA | 97905 | 869-2617 | rose_walker@reidgroup.org |
| Sheila | Ramirez | Wood, Ramos and Sampson | 58506 Lopez Crossing Suite 139 | North Kristinbury | DC | USA | 74501 | 318-3933 | sheilar@woodramosandsam.net |
| Kim | Kramer | Smith, Garrison and Thomas | 421 David Knolls | New Mario | HI | USA | 35283 | 026-8117 | kim_kramer@smithgarrisonan.org |
| Kimberly | Palmer | Hayes and Sons | 847 Bruce Neck | Simmonsville | NM | USA | 93876 | 711-5921 | k.palmer@hayesandsons.com |
| Joshua | Schultz | Joseph, James and Harper | 8961 Melissa Run Apt. 673 | Morganmouth | MO | USA | 55025 | 156-5452 | joshua_schultz@josephjamesandh.org |
| Carlos | Decker | Reynolds Ltd | 80988 Santiago Loop Suite 604 | Michaelshire | NY | USA | 28385 | 273-1585 | carlos.decker@reynoldsltd.com |
| Kathryn | Andrews | Bruce-Villegas | 402 Park Inlet | Michaelburgh | VI | USA | 19277 | 961-2018 | k.andrews@brucevillegas.com |
| Nicholas | Chavez | Wood Ltd | 910 Eric River Apt. 147 | Tuckermouth | MT | USA | 36305 | 381-5614 | nicholas_chavez@woodltd.org |
| Alison | Parker | Foster PLC | 34324 Murphy Avenue | Burgessburgh | DC | USA | 50335 | 838-8516 | alison.parker@fosterplc.com |
| Ryan | Stevens | Atkins PLC | 664 Richard Islands Apt. 975 | South Meganbury | NE | USA | 77685 | 681-6453 | ryans@atkinsplc.net |
| Kimberly | Jones | Wilson, Hicks and Bullock | 2312 Gonzalez Rapids Apt. 127 | Webstershire | NV | USA | 89778 | 995-5271 | kimberly_jones@wilsonhicksandb.org |
| Scott | Turner | Vargas-Bell | 7700 Decker Club | New Brookefurt | NH | USA | 76565 | 807-9359 | scott_turner@vargasbell.org |
| Walter | Rosario | Garcia-Nolan | 182 John Mill Suite 889 | West Nathan | LA | USA | 51280 | 659-0515 | walter.rosario@garcianolan.com |
| Angela | Hughes | Cummings-Douglas | 1925 Ponce Square | Andersonland | ME | USA | 73760 | 652-8168 | angelah@cummingsdouglas.net |
| Andrew | Parker | Peterson Group | 22141 Ebony Wells | New Nicholas | GA | USA | 24204 | 927-0653 | andrew_parker@petersongroup.org |
| Cheryl | Goodwin | Young-Allen | 59774 Shaw Manor Apt. 392 | Brettfort | VI | USA | 49156 | 818-1412 | cherylg@youngallen.net |
| Shannon | Palmer | Davis-Lozano | 0606 Young Common Suite 305 | Port Jennifermouth | WY | USA | 19643 | 204-7277 | shannon.palmer@davislozano.com |
| Rebecca | Smith | Conley PLC | 43410 Robert Underpass Suite 117 | Lake Zacharybury | VT | USA | 19319 | 460-9539 | rebecca_smith@conleyplc.org |
| Jacob | Barnett | Villegas, Jones and Fox | 7065 Burgess Knolls | West Johnville | WI | USA | 76772 | 520-5852 | jacob_barnett@villegasjonesan.org |
| Tina | Mendoza | Cain Inc | 43030 Mahoney Passage Suite 874 | Port Deborahport | MI | USA | 06766 | 541-5667 | tina_mendoza@caininc.org |
| Matthew | Lopez | Jimenez, Glass and Stone | 616 Amy Islands | North Markport | ME | USA | 58948 | 962-7570 | matthewl@jimenezglassand.net |
| Christina | Graham | Whitney, Gould and Jones | 8202 Johnson Cliff Apt. 556 | New Ericmouth | MN | USA | 49261 | 719-2856 | christinag@whitneygouldand.net |
| Debra | Wright | Johnson and Sons | 681 Hampton Squares Suite 394 | Gonzalezberg | PR | USA | 10207 | 727-1551 | debraw@johnsonandsons.net |
| Patricia | York | Mckinney, Graves and Thompson | 313 Joel Park Apt. 589 | Tannerside | DC | USA | 80710 | 114-6786 | patricia_york@mckinneygravesa.org |
| Madeline | Jones | Day-Cole | 89226 Marie Path Apt. 422 | Sarahbury | MI | USA | 68513 | 414-3842 | madelinej@daycole.net |
| Christina | Davis | Jackson, David and Moore | 001 Stacy Trail Suite 396 | South Pamelaside | LA | USA | 84637 | 473-6471 | christina.davis@jacksondavidand.com |
| Eric | Perry | Harris-Lawson | 556 Kathleen Passage Apt. 537 | West Shannonberg | CT | USA | 07133 | 469-6325 | ericp@harrislawson.net |
| James | Moore | Owens, Koch and Jimenez | 8733 Williams Haven | Harperfort | LA | USA | 70846 | 016-2456 | jamesm@owenskochandjim.net |
| Brandon | Williams | Lee, Tran and Jones | 499 David Court Suite 558 | Kariborough | PA | USA | 67232 | 680-0025 | brandon_williams@leetranandjones.org |
| April | Hernandez | Taylor, Velazquez and Flores | 495 Erickson Hills Suite 055 | South Brandytown | PA | USA | 62706 | 499-3097 | a.hernandez@taylorvelazquez.com |
| Alexandria | Griffith | Hernandez-Becker | 130 Edwards Drive | Vaughnchester | NY | USA | 80648 | 702-8385 | alexandria_griffith@hernandezbecker.org |
| Alicia | Edwards | Stevens PLC | 549 Lee Gateway Suite 843 | Kellieborough | UT | USA | 92905 | 757-5844 | alicia.edwards@stevensplc.com |
| Ashley | Daniels | Cardenas-Blevins | 0415 Douglas Summit | Lewisside | KY | USA | 74165 | 421-9933 | ashley.daniels@cardenasblevins.com |
| Elizabeth | Schmidt | Hall, Garcia and Rivera | 20826 Woods Flats Suite 540 | Lake Audreyside | WA | USA | 95281 | 026-2067 | e.schmidt@hallgarciaandri.com |
| Sharon | Hayden | Mcdowell-Smith | 4788 Small Dale | Nelsonville | MA | USA | 21799 | 742-0549 | s.hayden@mcdowellsmith.com |
| Gregory | Chase | Wilcox-Robertson | 1227 Boyle Avenue | Patrickmouth | WV | USA | 35496 | 549-9045 | g.chase@wilcoxrobertson.com |
| Bryan | Wilson | Moore-Parks | 145 Jeffrey Dale Suite 279 | Robertside | PW | USA | 62213 | 833-9187 | bryanw@mooreparks.net |
| Christian | Elliott | Poole PLC | 822 Bond Mills | Lake Jamieshire | NM | USA | 12420 | 870-7286 | christian_elliott@pooleplc.org |
| Anne | Hansen | Roman, Cummings and Foster | 391 Rodney Squares | New Virginialand | NJ | USA | 04660 | 462-2656 | anne_hansen@romancummingsan.org |
| Molly | Knox | Miller-Brandt | 512 Rice Stream | Port Adam | AK | USA | 39608 | 786-8633 | molly_knox@millerbrandt.org |
| Michael | Hill | Cannon, Johnson and Keller | 31190 Harper Squares | East Joyfurt | NV | USA | 31216 | 830-2843 | michaelh@cannonjohnsonan.net |
| Barbara | Barton | Young-Walter | 4408 Connie Meadow | Williamsstad | SD | USA | 88495 | 685-6624 | barbara_barton@youngwalter.org |
| Ivan | Medina | Atkinson LLC | 0866 Paul Glens | West Deborah | NV | USA | 49138 | 183-0469 | ivan.medina@atkinsonllc.com |
| Morgan | Lopez | Ramsey, Hansen and Mendoza | 0331 Rocha Square Apt. 638 | Kimberlyfurt | NH | USA | 70447 | 544-5877 | morgan.lopez@ramseyhansenand.com |
| Leah | Bowen | Rocha-Wood | 93204 Phillips Flat Suite 369 | South Andrea | TX | USA | 44746 | 477-7252 | l.bowen@rochawood.com |
| Jennifer | Freeman | Mooney, Bernard and Warren | 006 Megan Fort | Lake Edwardborough | NY | USA | 60271 | 509-9770 | jennifer.freeman@mooneybernardan.com |
| Amanda | Jenkins | Moreno LLC | 86211 John River Suite 546 | West Susanmouth | OK | USA | 32378 | 341-0166 | amanda_jenkins@morenollc.org |
| Angela | Brown | Warner Inc | 5918 Jerry Ways Suite 401 | Rachelshire | TN | USA | 04813 | 250-3926 | angela.brown@warnerinc.com |
| Kevin | Elliott | Davenport, Price and Mosley | 2185 Connor Fort Apt. 599 | Novakmouth | AK | USA | 83616 | 477-3586 | kevin_elliott@davenportpricea.org |
| Jacob | Willis | Miller-Montgomery | 114 Norman Tunnel | Lake Peter | MN | USA | 14466 | 104-7541 | j.willis@millermontgomer.com |
| Christopher | Jordan | Peters, Russell and Johnson | 199 Shields Bridge Suite 661 | New Adriana | TX | USA | 50404 | 224-4472 | christopher.jordan@petersrussellan.com |
| Gary | Hill | Washington-Jones | 79937 Derek Avenue Suite 596 | Scottchester | GU | USA | 85833 | 924-5937 | garyh@washingtonjones.net |
| Gregory | Sanders | Carter-Neal | 356 Velasquez Lock Suite 193 | Lake Katrina | AK | USA | 95818 | 737-4167 | g.sanders@carterneal.com |
| Cynthia | Allen | Moore, Henderson and Bennett | 796 Stephens Turnpike Suite 891 | Port Johnstad | GA | USA | 85304 | 909-6561 | cynthia.allen@moorehendersona.com |
| Corey | Walker | Stone, Carpenter and Johnston | 6798 Michael Burg Suite 146 | North Marieberg | MI | USA | 41381 | 573-8757 | corey.walker@stonecarpentera.com |
| Samuel | Horton | Jones-Williams | 51238 Andrea Isle | Mullenbury | AS | USA | 53591 | 226-6093 | samuel_horton@joneswilliams.org |
| Brittany | Price | Lewis, Ramirez and Padilla | 182 Nguyen Mount | West Emilyfort | NC | USA | 84270 | 596-9691 | brittanyp@lewisramirezand.net |
| Michael | Ellis | Cervantes Ltd | 912 Wilson Inlet Apt. 252 | Barnesberg | OK | USA | 50794 | 627-8282 | michael_ellis@cervantesltd.org |
| Keith | Lopez | Harvey-Glenn | 2368 Ortiz Overpass | Mckinneymouth | NM | USA | 22423 | 190-3404 | k.lopez@harveyglenn.com |
| Amanda | Jackson | Cunningham-Barton | 819 Joseph Plains Suite 807 | South Curtis | MP | USA | 86179 | 340-7451 | amanda_jackson@cunninghambarto.org |
| Michelle | Wilson | Clark Ltd | 962 Kristen Via Apt. 095 | Candiceburgh | MD | USA | 92782 | 449-4812 | michelle_wilson@clarkltd.org |
| Samantha | Riddle | Martinez, Cline and Wright | 67294 Brooks Club Apt. 684 | Shawnfort | MD | USA | 76779 | 017-5186 | s.riddle@martinezclinean.com |
| Tammy | Summers | Adams-Clayton | 929 Kramer Springs Apt. 017 | North Sarahburgh | NV | USA | 60337 | 063-2424 | tammy.summers@adamsclayton.com |
| Diamond | Wright | Beck-Banks | 4361 Aaron Neck | East Brittneyhaven | TX | USA | 58836 | 005-1627 | diamond.wright@beckbanks.com |
| Jeremy | Davis | Garcia LLC | 62218 Chelsey Expressway Suite 532 | Jensenmouth | VI | USA | 28975 | 112-1965 | jeremy_davis@garciallc.org |
| Leonard | Taylor | Newman-Wright | 043 Julie Hill Apt. 376 | East Victorland | NC | USA | 02082 | 552-6965 | l.taylor@newmanwright.com |
| Kathryn | Best | Smith Inc | 3006 Fuller Parkway | Hendersonfurt | CO | USA | 84457 | 889-2414 | kathryn.best@smithinc.com |
| William | Harris | Herrera Group | 6303 Sandy Crescent | Salazarton | ME | USA | 87805 | 210-2027 | williamh@herreragroup.net |
| Alexandra | Logan | Green, Watson and Brady | 105 Nelson Circles Suite 917 | Dixonton | NM | USA | 74803 | 252-4191 | a.logan@greenwatsonandb.com |
| Joyce | Smith | Sanchez Group | 2208 Walker Gateway Suite 541 | Davidton | HI | USA | 29754 | 806-1744 | joyces@sanchezgroup.net |
| Christopher | Bryant | Gonzalez-Elliott | 937 Vargas Park Apt. 832 | South Andrewside | MI | USA | 83855 | 050-6413 | c.bryant@gonzalezelliott.com |
| Robert | Woodward | Dawson Inc | 86571 William Route | Jonesshire | AR | USA | 57515 | 234-4565 | robertw@dawsoninc.net |
| Shawn | Hall | Taylor PLC | 12775 Martinez Knolls | South Kyle | KS | USA | 16218 | 124-9035 | s.hall@taylorplc.com |
| Christopher | Wright | Foster-Williams | 2067 Cody Cove Apt. 100 | East James | MO | USA | 49291 | 199-4101 | c.wright@fosterwilliams.com |
| Rachel | Ramos | Davis LLC | 70296 Crawford Light | Thompsonborough | PW | USA | 25031 | 447-2099 | r.ramos@davisllc.com |
| Deborah | Porter | Mendoza, Miller and Reyes | 83806 Castillo Tunnel Suite 598 | Paulburgh | AK | USA | 42296 | 930-4078 | deborahp@mendozamilleran.net |
| Katie | Key | Garcia Ltd | 8039 Kelly Villages | East Joel | MD | USA | 97245 | 590-5992 | k.key@garcialtd.com |
| Mary | Cochran | Weaver-Thompson | 03930 Smith Ridges | Port David | VT | USA | 23761 | 500-2921 | maryc@weaverthompson.net |
| Susan | Brooks | Foster, Garcia and Turner | 67528 Walker Radial | South Kurt | UT | USA | 39103 | 220-9690 | s.brooks@fostergarciaand.com |
| Carrie | Mccall | Walker, Cunningham and Zuniga | 1355 Daisy Corners | Seanview | IL | USA | 33208 | 154-1006 | carrie_mccall@walkercunningha.org |
| Jessica | Costa | Snyder-Gray | 79327 Lauren Bypass Suite 054 | North Matthewfurt | GA | USA | 96443 | 181-5997 | jessica.costa@snydergray.com |
| Ryan | Valdez | Preston, Moore and Garcia | 68844 Young Causeway | Armstrongfort | FL | USA | 07645 | 506-1497 | r.valdez@prestonmooreand.com |
| Collin | Clark | Carter, Miller and Anthony | 7741 Lopez Light Suite 270 | Scottview | IN | USA | 35701 | 902-1158 | collin_clark@cartermillerand.org |
| Tara | Lawrence | Brown, Hughes and Mills | 374 Ralph Walk Apt. 898 | North Stacy | NV | USA | 23160 | 233-2061 | tara_lawrence@brownhughesandm.org |
| James | Carson | Flowers LLC | 116 Arnold Walks Suite 870 | Rodriguezberg | FL | USA | 74765 | 991-1914 | jamesc@flowersllc.net |
| Natalie | Baker | Washington, Lynch and Johnson | 2996 Randy Isle Apt. 074 | Andrewport | ME | USA | 37246 | 713-2475 | natalieb@washingtonlynch.net |
| Jessica | Jacobs | Lopez and Sons | 785 Zachary Estate Apt. 486 | Port Melissa | FM | USA | 75038 | 023-3030 | jessica_jacobs@lopezandsons.org |
| Brent | Ward | Hill Group | 103 Burns Mission Apt. 798 | Maxview | WA | USA | 90790 | 140-6029 | b.ward@hillgroup.com |
| Mercedes | Holland | Clark, Pearson and Palmer | 2290 Johnny Valley | Jenniferview | NE | USA | 49846 | 574-3748 | mercedes_holland@clarkpearsonand.org |
| Breanna | Smith | Levy, Franco and Hoffman | 1715 Davidson Wall Suite 443 | New Kathy | MH | USA | 07942 | 965-2074 | breannas@levyfrancoandho.net |
| Rebecca | Sullivan | Johnson, Erickson and Armstrong | 3875 Bruce Ville | West Connor | DC | USA | 97614 | 482-5135 | r.sullivan@johnsonerickson.com |
| Julie | Parker | Watson-Richards | 70999 Thomas Fields Apt. 684 | Brownberg | DC | USA | 26754 | 569-7252 | julie.parker@watsonrichards.com |
| Tony | Welch | Edwards Inc | 4329 Tracy Track | East Christinachester | MO | USA | 56734 | 760-0835 | tony.welch@edwardsinc.com |
| Patricia | Sherman | Lee, Rhodes and Sims | 54216 Jackson View | West Stacymouth | VA | USA | 68696 | 985-6257 | patricias@leerhodesandsim.net |
| Karen | Martin | Smith-Walker | 09821 Dawson Turnpike | South Nancyview | WI | USA | 70589 | 909-0100 | karen.martin@smithwalker.com |
| Robert | James | King, Miles and Harris | 6184 Robert Cove | West Danielville | NM | USA | 26538 | 934-8356 | robertj@kingmilesandhar.net |
| Ethan | Kelley | Watts Group | 00119 Hernandez Course Apt. 143 | Hintonport | KS | USA | 61354 | 012-0455 | ethan_kelley@wattsgroup.org |
| Joanna | Davis | Smith and Sons | 5794 Nathan Junctions | North Richard | NH | USA | 36130 | 676-2120 | j.davis@smithandsons.com |
| Dale | Pruitt | Pham-Gregory | 659 Michelle Villages | South Samantha | DE | USA | 54408 | 701-4508 | d.pruitt@phamgregory.com |
| Tiffany | Santiago | Stone-Watts | 3756 Mary Point | North Dawnburgh | NY | USA | 62011 | 721-7535 | tiffanys@stonewatts.net |
| Brent | Walker | Gray, Montoya and Miller | 717 Stewart Parks Apt. 166 | New Andrealand | WY | USA | 79695 | 948-8375 | brentw@graymontoyaandm.net |
| Marcia | Velasquez | Rivera-Saunders | 571 Katherine Forges Apt. 554 | Jacquelineton | MH | USA | 22017 | 726-1493 | m.velasquez@riverasaunders.com |
| David | Phelps | Bryant and Sons | 60917 Barrett Parkways Apt. 708 | New Savannahshire | NJ | USA | 67129 | 292-2169 | davidp@bryantandsons.net |
| William | Cruz | Moon, Farmer and Hill | 7226 Cameron Plaza Suite 833 | New Jennifer | TX | USA | 45759 | 228-8515 | william_cruz@moonfarmerandhi.org |
| Brandi | Bender | Butler, Adkins and Skinner | 0810 Thomas Skyway Apt. 342 | Francesberg | MP | USA | 08631 | 438-0571 | b.bender@butleradkinsand.com |
| Julia | Hoffman | Dixon Ltd | 066 Frye Spur Suite 800 | Jamesmouth | MP | USA | 30064 | 598-9334 | julia_hoffman@dixonltd.org |
| Gregory | Fleming | Rivers Ltd | 0648 Anderson Prairie | Adammouth | VT | USA | 20791 | 025-9094 | gregory_fleming@riversltd.org |
| Kristy | Pierce | Bowers LLC | 81826 Davis Forges | Lake Martin | MN | USA | 38980 | 398-7801 | kristyp@bowersllc.net |
| Sean | Conway | Sellers, Sanchez and Williams | 1648 Johnson Path Suite 887 | Williamsborough | MD | USA | 67858 | 112-8801 | s.conway@sellerssancheza.com |
| Ellen | Ayala | Coleman, Garcia and Medina | 120 Love Camp Apt. 102 | Angelashire | GU | USA | 30338 | 466-7665 | ellen.ayala@colemangarciaan.com |
| Perry | Wilson | May PLC | 901 Reilly Coves | Kristinport | PA | USA | 11839 | 476-6072 | p.wilson@mayplc.com |
| Derek | Myers | Phillips, Walters and Evans | 88210 Ashley Lock Apt. 435 | South Rebecca | PR | USA | 67682 | 222-3943 | derek.myers@phillipswalters.com |
| Howard | Marsh | York PLC | 814 John Flat Suite 552 | North Justin | CA | USA | 25863 | 577-5949 | h.marsh@yorkplc.com |
| Ariana | Diaz | Benjamin-Jackson | 36452 Humphrey Mountain Suite 547 | East Debbieland | MP | USA | 37281 | 283-4110 | ariana.diaz@benjaminjackson.com |
| Lisa | Riley | Lewis, Johnson and Green | 256 Patricia Radial Suite 278 | South Michaeltown | TN | USA | 31811 | 928-2722 | l.riley@lewisjohnsonand.com |
| Jill | Webb | Williams-Juarez | 45303 Hughes Motorway | North Tinamouth | CT | USA | 92741 | 844-9892 | jill_webb@williamsjuarez.org |
| Desiree | Diaz | Villanueva, Miller and King | 655 Sparks Rapids | New Nicolemouth | GA | USA | 30646 | 184-3222 | desireed@villanuevamille.net |
| Carolyn | Montoya | Hall, Shepherd and Cortez | 773 Deborah Loop Apt. 302 | East Crystal | AZ | USA | 75509 | 202-4286 | carolyn.montoya@hallshepherdand.com |
| Natalie | Luna | Valentine-Robinson | 2369 Laura View Apt. 984 | Lake Gina | NH | USA | 78689 | 913-6621 | natalie.luna@valentinerobins.com |
| James | Heath | Cohen, Serrano and Jacobs | 9908 Christopher Shoals | New Amber | AL | USA | 89441 | 686-5086 | j.heath@cohenserranoand.com |
| Shawna | Olson | Bell-Ballard | 2473 Justin Wells | Scotttown | VT | USA | 97972 | 098-1806 | s.olson@bellballard.com |
| Gwendolyn | Stewart | Rodriguez-Simmons | 8695 Braun Locks Apt. 688 | Whiteside | OH | USA | 63908 | 449-5621 | g.stewart@rodriguezsimmon.com |
| Sean | Lyons | Garcia PLC | 8902 Oconnell Avenue Apt. 279 | Davisview | IN | USA | 49107 | 190-6698 | seanl@garciaplc.net |
| Jennifer | Harper | Bowman Group | 84309 Christina Spring | West Johntown | GA | USA | 11883 | 465-6693 | jennifer.harper@bowmangroup.com |
| Jillian | Jones | Dunn Ltd | 4393 Spears Ports Apt. 426 | New Charlesport | MA | USA | 15837 | 848-9476 | jillian_jones@dunnltd.org |
| Kayla | Todd | Maldonado-Mosley | 1416 Erica Forks | Robertstad | NC | USA | 70709 | 043-4165 | kayla.todd@maldonadomosley.com |
| Angela | White | Gomez-Shannon | 37333 Clark Flats Apt. 952 | North Samanthafort | RI | USA | 01369 | 807-5957 | angelaw@gomezshannon.net |
| Travis | Joyce | Ramirez, Walker and Ray | 678 Wayne Lock | South Tiffany | UT | USA | 68423 | 750-0369 | travis.joyce@ramirezwalkeran.com |
| Mark | Salazar | Lopez-Baker | 9552 Coleman Manor Suite 564 | Whiteberg | OK | USA | 90417 | 314-3866 | m.salazar@lopezbaker.com |
| Dustin | Haley | Kennedy Inc | 7288 Floyd Hills | Annashire | AR | USA | 52720 | 120-3471 | dustin_haley@kennedyinc.org |
| Julie | Green | Castro-Frederick | 0615 Barbara Run Apt. 455 | Hamptonmouth | FM | USA | 10778 | 694-7225 | julie_green@castrofrederick.org |
| Crystal | Duncan | Miller LLC | 5449 Nelson Mills | Juliehaven | NV | USA | 54763 | 220-2341 | c.duncan@millerllc.com |
| Garrett | Garcia | Zuniga Group | 68114 Christopher Loaf | Jeromeport | NV | USA | 82615 | 228-2005 | garrettg@zunigagroup.net |
| Michelle | Mcdonald | Donovan, Dunn and Taylor | 979 Mills Route | Reginafort | ND | USA | 30271 | 174-5642 | michellem@donovandunnandt.net |
| Alex | Mills | Cooper Group | 774 Katie Union | Carlatown | OH | USA | 49475 | 368-6632 | alex_mills@coopergroup.org |
| Maria | Walker | Henderson and Sons | 8463 Ian Highway Apt. 797 | Jackiefort | ID | USA | 42528 | 020-8021 | mariaw@hendersonandson.net |
| Joseph | Espinoza | Smith, Davis and Smith | 6475 Terry Bypass | Christopherberg | AR | USA | 35432 | 618-7234 | joseph_espinoza@smithdavisandsm.org |
| Maria | Martinez | Wright, Wise and Ramos | 71837 Maldonado Inlet | Ericton | WA | USA | 72535 | 814-7435 | maria.martinez@wrightwiseandra.com |
| Michelle | Robinson | Young Group | 24916 Albert Canyon Suite 925 | East Ericland | TX | USA | 81588 | 500-5281 | m.robinson@younggroup.com |
| Tony | Stewart | Kramer, Sherman and Trujillo | 306 Ramsey Glen Apt. 778 | Amyfort | ID | USA | 74779 | 285-5749 | t.stewart@kramershermanan.com |
| Casey | Moore | Weiss-Weaver | 86209 Parsons Garden Suite 186 | New Felicia | WI | USA | 72782 | 294-5651 | casey.moore@weissweaver.com |
| Alexandra | Jones | White Inc | 73109 Barrett Pine | Brandonbury | PA | USA | 94590 | 103-7170 | alexandraj@whiteinc.net |
| Angela | Hurley | Short-Bauer | 480 Mary Club | New Colton | VA | USA | 30780 | 863-3839 | a.hurley@shortbauer.com |
| Angela | Grant | Garcia, Fowler and Howard | 612 Andrea Parkways Suite 289 | Mahoneymouth | OH | USA | 43054 | 566-5939 | a.grant@garciafowlerand.com |
| Nicholas | Pierce | King, Nixon and West | 04908 Victoria Hollow Apt. 433 | Andrewview | PW | USA | 73070 | 889-9210 | nicholas_pierce@kingnixonandwes.org |
| Michael | Taylor | Preston-Wright | 1969 Jessica Stream Suite 727 | New Dawnton | VA | USA | 76035 | 610-5566 | michael.taylor@prestonwright.com |
| Molly | Perez | Atkinson, Mcfarland and Walters | 48058 Mark Square Apt. 206 | Mullinsshire | NY | USA | 12308 | 364-6225 | molly.perez@atkinsonmcfarla.com |
| Thomas | Mcgee | Ross, Miller and Shaw | 78376 Ann Street | East Charles | WI | USA | 56870 | 591-1665 | thomasm@rossmillerandsh.net |
| James | Cooper | Johnson, Torres and Huerta | 270 James Landing Apt. 110 | New Sara | VI | USA | 38208 | 051-4770 | jamesc@johnsontorresan.net |
| Jason | Medina | Payne LLC | 206 Jonathan Circle Suite 394 | South Dianatown | CA | USA | 51441 | 451-0463 | jason_medina@paynellc.org |
| William | Mckinney | Washington-Harper | 38780 John Pines | Matthewfurt | WA | USA | 21079 | 055-5438 | williamm@washingtonharpe.net |
| Lisa | Garrett | Zamora-Briggs | 432 Prince Shoals | North Jessica | NC | USA | 89367 | 936-3926 | lisag@zamorabriggs.net |
| Renee | Murphy | Anderson, Delgado and Carpenter | 48262 Lonnie Point | East Lonnieberg | VA | USA | 04365 | 566-4742 | r.murphy@andersondelgado.com |
| Daniel | Lopez | Jensen, Obrien and Salazar | 05172 Joseph Landing | Port Paul | NJ | USA | 18525 | 233-0604 | daniel_lopez@jensenobrienand.org |
| Jeffrey | Powers | Todd Inc | 9757 Ronald Trail | New Jillfurt | VA | USA | 41513 | 699-9880 | jeffrey.powers@toddinc.com |
| Shannon | Wilcox | Rich and Sons | 086 James Mill Suite 447 | South Kelly | PW | USA | 07650 | 827-7181 | s.wilcox@richandsons.com |
| Kimberly | Pace | Payne, Long and Morris | 79371 Nguyen Run | Lake Jessica | CO | USA | 15464 | 751-8689 | k.pace@paynelongandmor.com |
| Nicholas | James | Barr PLC | 22064 Cross Mission | Courtneyville | MH | USA | 17746 | 309-4077 | nicholas_james@barrplc.org |
| Amy | Smith | Young-Chapman | 6719 John Plaza Suite 983 | East Eddiestad | AZ | USA | 19555 | 099-4510 | amy.smith@youngchapman.com |
| Robert | Thompson | Mitchell, Guerrero and Graves | 9501 Morris Light | Port Ronaldside | CA | USA | 38883 | 721-4586 | r.thompson@mitchellguerrer.com |
| Heather | Salazar | Duncan Ltd | 9469 Green Ports | Sarashire | NM | USA | 68619 | 772-9343 | heather.salazar@duncanltd.com |
| David | Marshall | Mclaughlin and Sons | 0558 Alex Flats Suite 414 | Williammouth | WI | USA | 01304 | 155-6990 | d.marshall@mclaughlinandso.com |
================================================
FILE: tasks/postgres/standard/chinook/customer_data_migration/meta.json
================================================
{
"task_id": "customer_data_migration",
"task_name": "Customer Data Migration",
"category_id": "chinook",
"category_name": "Chinook",
"description": "Migrate customer data from acquired company MelodyMart into Chinook database using bulk operations and business logic.",
"author": "Lingxiao Du",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data migration",
"transactional operations"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef 
\"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql"
}
}
================================================
FILE: tasks/postgres/standard/chinook/customer_data_migration/verify.py
================================================
"""
Verification script for PostgreSQL Task 2: Customer Data Migration
"""
import os
import sys
import psycopg2
import pickle
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def load_expected_customers():
"""Load the expected customer data from pickle file."""
import os
script_dir = os.path.dirname(os.path.abspath(__file__))
pkl_path = os.path.join(script_dir, 'customer_data.pkl')
try:
with open(pkl_path, 'rb') as f:
return pickle.load(f)
except FileNotFoundError:
print(f"❌ customer_data.pkl not found at {pkl_path}. Please generate customer data first.")
return None
except Exception as e:
print(f"❌ Error loading customer data: {e}")
return None
def verify_migrated_customers(conn, expected_customers) -> bool:
"""Verify migrated customers by comparing with expected data as sets."""
with conn.cursor() as cur:
# Get all customers with ID > 59 (the migrated ones)
cur.execute('''
SELECT "FirstName", "LastName", "Company", "Address", "City",
"State", "Country", "PostalCode", "Phone", "Email",
"SupportRepId", "Fax"
FROM "Customer"
WHERE "CustomerId" > 59
''')
actual_customers = cur.fetchall()
if len(actual_customers) != len(expected_customers):
print(f"❌ Expected {len(expected_customers)} migrated customers, found {len(actual_customers)}")
return False
# Convert expected customers to tuples for set comparison
expected_tuples = set()
for expected in expected_customers:
expected_tuple = (
expected['FirstName'], expected['LastName'], expected['Company'],
expected['Address'], expected['City'], expected['State'],
expected['Country'], expected['PostalCode'], expected['Phone'],
expected['Email'], 3, None # SupportRepId=3, Fax=None
)
expected_tuples.add(expected_tuple)
# Convert actual customers to set with proper type conversion
actual_tuples = set()
for row in actual_customers:
# Convert all fields to strings for consistent comparison
actual_tuple = (
str(row[0]) if row[0] is not None else '', # FirstName
str(row[1]) if row[1] is not None else '', # LastName
str(row[2]) if row[2] is not None else '', # Company
str(row[3]) if row[3] is not None else '', # Address
str(row[4]) if row[4] is not None else '', # City
str(row[5]) if row[5] is not None else '', # State
str(row[6]) if row[6] is not None else '', # Country
str(row[7]) if row[7] is not None else '', # PostalCode
str(row[8]) if row[8] is not None else '', # Phone
str(row[9]) if row[9] is not None else '', # Email
int(row[10]) if row[10] is not None else None, # SupportRepId
row[11] # Fax (should be None)
)
actual_tuples.add(actual_tuple)
# Check if sets are equal
if actual_tuples != expected_tuples:
missing_in_actual = expected_tuples - actual_tuples
extra_in_actual = actual_tuples - expected_tuples
print(f"❌ Customer data sets don't match!")
if missing_in_actual:
print(f" Missing {len(missing_in_actual)} expected customers")
for missing in list(missing_in_actual)[:3]: # Show first 3
print(f" Missing: {missing[0]} {missing[1]} - {missing[2]}")
if len(missing_in_actual) > 3:
print(f" ... and {len(missing_in_actual) - 3} more")
if extra_in_actual:
print(f" Found {len(extra_in_actual)} unexpected customers")
for extra in list(extra_in_actual)[:3]: # Show first 3
print(f" Extra: {extra[0]} {extra[1]} - {extra[2]}")
if len(extra_in_actual) > 3:
print(f" ... and {len(extra_in_actual) - 3} more")
return False
print(f"✅ All {len(expected_customers)} customers migrated correctly")
print(f"✅ All customers assigned to SupportRepId 3")
print(f"✅ All customers have Fax field set to NULL")
print(f"✅ Customer data sets match exactly (order-independent)")
return True
def main():
"""Main verification function."""
print("=" * 60)
print("Verifying Customer Data Migration Task")
print("=" * 60)
# Load expected customer data
expected_customers = load_expected_customers()
if not expected_customers:
sys.exit(1)
print(f"Loaded {len(expected_customers)} expected customer records")
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify migration
success = verify_migrated_customers(conn, expected_customers)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/chinook/employee_hierarchy_management/description.md
================================================
Manage employee hierarchy and customer assignments through systematic CRUD operations.
## Your Mission:
Chinook needs to reorganize their employee structure and reassign customer relationships. Complete a series of precise database modifications to update the employee hierarchy.
## Tasks to Complete:
### 1. **INSERT: Add New Employees**
Insert exactly 2 new employees into the Employee table:
- EmployeeId: 9, FirstName: 'Sarah', LastName: 'Johnson', Title: 'Sales Support Agent', ReportsTo: 2, BirthDate: '1985-03-15', HireDate: '2009-01-10', Address: '123 Oak Street', City: 'Calgary', State: 'AB', Country: 'Canada', PostalCode: 'T2P 5G3', Phone: '+1 (403) 555-0123', Fax: '+1 (403) 555-0124', Email: 'sarah.johnson@chinookcorp.com'
- EmployeeId: 10, FirstName: 'Mike', LastName: 'Chen', Title: 'Sales Support Agent', ReportsTo: 2, BirthDate: '1982-08-22', HireDate: '2009-01-10', Address: '456 Pine Ave', City: 'Calgary', State: 'AB', Country: 'Canada', PostalCode: 'T2P 5G4', Phone: '+1 (403) 555-0125', Fax: '+1 (403) 555-0126', Email: 'mike.chen@chinookcorp.com'
### 2. **UPDATE: Modify Existing Employee Information**
- Change Andrew Adams (EmployeeId = 1) title from 'General Manager' to 'CEO'
- Update Nancy Edwards (EmployeeId = 2) phone number to '+1 (403) 555-9999'
- Change all employees with Title = 'IT Staff' to have Title = 'IT Specialist'
### 3. **UPDATE: Reassign Some Customers to New Employees**
- Update customers with CustomerId 1, 2, 3 to have SupportRepId = 9 (Sarah Johnson)
- Update customers with CustomerId 4, 5, 6 to have SupportRepId = 10 (Mike Chen)
### 4. **UPDATE: Reorganize Reporting Structure**
- Change Sarah Johnson (EmployeeId = 9) to report to Andrew Adams (EmployeeId = 1) instead of Nancy Edwards
- Change Mike Chen (EmployeeId = 10) to also report to Andrew Adams (EmployeeId = 1)
### 5. **INSERT: Create Employee Performance Table**
Create a new table called `employee_performance`:
- `employee_id` (integer, foreign key to Employee)
- `customers_assigned` (integer)
- `performance_score` (decimal)
Insert records for employees 9 and 10 by calculating their actual customer assignments:
- Sarah Johnson: calculate actual number of customers assigned to her, performance score 4.5
- Mike Chen: calculate actual number of customers assigned to him, performance score 4.2
### 6. **DELETE: Remove IT Department Employee**
- Delete Robert King (EmployeeId = 7) from the Employee table
- Before deletion, handle all relationships:
- Find who Robert reports to and reassign any employees who report to Robert to report to Robert's manager instead
- Find all customers assigned to Robert as their support rep and reassign them to Robert's manager
### 7. **UPDATE: Promote Remaining IT Staff**
- Promote Laura Callahan (EmployeeId = 8) from 'IT Specialist' to 'Senior IT Specialist'
- Update her salary information by adding a new column `salary` to the Employee table (decimal type)
- Set Laura's salary to 75000.00 and all other employees to 50000.00
### 8. **Final Verification Query**
Execute this exact query to verify all changes:
```sql
SELECT
COUNT(*) as total_employees,
COUNT(CASE WHEN "Title" = 'CEO' THEN 1 END) as ceo_count,
COUNT(CASE WHEN "Title" = 'IT Specialist' THEN 1 END) as it_specialist_count,
COUNT(CASE WHEN "ReportsTo" = 1 THEN 1 END) as reports_to_ceo
FROM "Employee";
```
Expected result: total_employees = 9, ceo_count = 1, it_specialist_count = 0, reports_to_ceo = 4
## Business Rules:
* Use exact EmployeeId values as specified
* Maintain referential integrity between Employee and Customer tables
* All phone numbers must include country code format
* Email addresses must follow the pattern firstname.lastname@chinookcorp.com
## Expected Outcome:
The database should have exactly 9 employees remaining (two added, one removed), with the new hierarchy structure in place and customer assignments updated accordingly.
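For orientation only, here is a minimal SQL sketch of steps 1, 2, and 6, using the quoted Chinook identifiers from the schema; it is illustrative and not a complete or required solution.
```sql
-- Step 1 (first of the two inserts): add Sarah Johnson
INSERT INTO "Employee"
    ("EmployeeId", "LastName", "FirstName", "Title", "ReportsTo", "BirthDate", "HireDate",
     "Address", "City", "State", "Country", "PostalCode", "Phone", "Fax", "Email")
VALUES
    (9, 'Johnson', 'Sarah', 'Sales Support Agent', 2, '1985-03-15', '2009-01-10',
     '123 Oak Street', 'Calgary', 'AB', 'Canada', 'T2P 5G3',
     '+1 (403) 555-0123', '+1 (403) 555-0124', 'sarah.johnson@chinookcorp.com');

-- Step 2: retitle Andrew Adams
UPDATE "Employee" SET "Title" = 'CEO' WHERE "EmployeeId" = 1;

-- Step 6: reassign Robert King's direct reports and customers to his manager,
-- then delete him (order matters because of the foreign keys)
UPDATE "Employee"
SET "ReportsTo" = (SELECT "ReportsTo" FROM "Employee" WHERE "EmployeeId" = 7)
WHERE "ReportsTo" = 7;

UPDATE "Customer"
SET "SupportRepId" = (SELECT "ReportsTo" FROM "Employee" WHERE "EmployeeId" = 7)
WHERE "SupportRepId" = 7;

DELETE FROM "Employee" WHERE "EmployeeId" = 7;
```
Looking up Robert's manager with a subquery keeps the reassignment independent of a hard-coded manager ID.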
================================================
FILE: tasks/postgres/standard/chinook/employee_hierarchy_management/meta.json
================================================
{
"task_id": "employee_hierarchy_management",
"task_name": "Employee Hierarchy Management",
"category_id": "chinook",
"category_name": "Chinook",
"description": "Reorganize employee structure through CRUD operations including inserts, updates, deletes, and customer reassignments.",
"author": "Lingxiao Du",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data migration",
"schema design",
"transactional operations"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef 
\"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql"
}
}
================================================
FILE: tasks/postgres/standard/chinook/employee_hierarchy_management/verify.py
================================================
"""
Verification script for PostgreSQL Task 3: Employee Hierarchy Management
"""
import os
import sys
import psycopg2
from decimal import Decimal
from datetime import datetime
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.01 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.01:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_employee_count_and_titles(conn) -> bool:
"""Verify the final employee count and title changes."""
with conn.cursor() as cur:
# Check the final verification query results
cur.execute("""
SELECT
COUNT(*) as total_employees,
COUNT(CASE WHEN "Title" = 'CEO' THEN 1 END) as ceo_count,
COUNT(CASE WHEN "Title" = 'IT Specialist' THEN 1 END) as it_specialist_count,
COUNT(CASE WHEN "ReportsTo" = 1 THEN 1 END) as reports_to_ceo
FROM "Employee"
""")
result = cur.fetchone()
total_employees, ceo_count, it_specialist_count, reports_to_ceo = result
        # Expected: total_employees = 9, ceo_count = 1, it_specialist_count = 0, reports_to_ceo = 4
if total_employees != 9:
print(f"❌ Expected 9 total employees, got {total_employees}")
return False
if ceo_count != 1:
print(f"❌ Expected 1 CEO, got {ceo_count}")
return False
if it_specialist_count != 0:
print(f"❌ Expected 0 IT Specialists, got {it_specialist_count}")
return False
if reports_to_ceo != 4:
print(f"❌ Expected 4 employees reporting to CEO, got {reports_to_ceo}")
return False
print("✅ Employee count and title verification passed")
return True
def verify_specific_employees(conn) -> bool:
"""Verify specific employee records and modifications."""
with conn.cursor() as cur:
# Check all employee fields in one query
cur.execute("""
SELECT "EmployeeId", "LastName", "FirstName", "Title", "ReportsTo", "BirthDate",
"HireDate", "Address", "City", "State", "Country", "PostalCode",
"Phone", "Fax", "Email"
FROM "Employee"
WHERE "EmployeeId" IN (1, 2, 9, 10)
ORDER BY "EmployeeId"
""")
employees = cur.fetchall()
expected = [
# Andrew Adams (ID 1) - Title changes to 'CEO', phone stays original, ReportsTo stays None
(1, 'Adams', 'Andrew', 'CEO', None, datetime(1962, 2, 18), datetime(2002, 8, 14),
'11120 Jasper Ave NW', 'Edmonton', 'AB', 'Canada', 'T5K 2N1', '+1 (780) 428-9482', '+1 (780) 428-3457', 'andrew@chinookcorp.com'),
# Nancy Edwards (ID 2) - Phone changes, title stays 'Sales Manager', ReportsTo stays 1
(2, 'Edwards', 'Nancy', 'Sales Manager', 1, datetime(1958, 12, 8), datetime(2002, 5, 1),
'825 8 Ave SW', 'Calgary', 'AB', 'Canada', 'T2P 2T3', '+1 (403) 555-9999', '+1 (403) 262-3322', 'nancy@chinookcorp.com'),
# Sarah Johnson - all new data, final ReportsTo = 1 (changed in step 4)
(9, 'Johnson', 'Sarah', 'Sales Support Agent', 1, datetime(1985, 3, 15), datetime(2009, 1, 10),
'123 Oak Street', 'Calgary', 'AB', 'Canada', 'T2P 5G3', '+1 (403) 555-0123', '+1 (403) 555-0124', 'sarah.johnson@chinookcorp.com'),
# Mike Chen - all new data, final ReportsTo = 1 (changed in step 4)
(10, 'Chen', 'Mike', 'Sales Support Agent', 1, datetime(1982, 8, 22), datetime(2009, 1, 10),
'456 Pine Ave', 'Calgary', 'AB', 'Canada', 'T2P 5G4', '+1 (403) 555-0125', '+1 (403) 555-0126', 'mike.chen@chinookcorp.com')
]
if len(employees) != 4:
print(f"❌ Expected 4 key employees, found {len(employees)}")
return False
# Full field comparison for all employees using rows_match
for actual, expected_emp in zip(employees, expected):
if not rows_match(actual, expected_emp):
print(f"❌ Employee {actual[0]} row mismatch: expected {expected_emp}, got {actual}")
return False
print("✅ Specific employee verification passed - all fields match exactly")
return True
def verify_customer_assignments(conn) -> bool:
"""Verify customer support representative assignments."""
with conn.cursor() as cur:
# Check customers 1, 2, 3 are assigned to Sarah (ID 9)
cur.execute("""
SELECT COUNT(*)
FROM "Customer"
WHERE "CustomerId" IN (1, 2, 3) AND "SupportRepId" = 9
""")
sarah_customers = cur.fetchone()[0]
if sarah_customers != 3:
print(f"❌ Expected 3 customers assigned to Sarah Johnson, got {sarah_customers}")
return False
# Check customers 4, 5, 6 are assigned to Mike (ID 10)
cur.execute("""
SELECT COUNT(*)
FROM "Customer"
WHERE "CustomerId" IN (4, 5, 6) AND "SupportRepId" = 10
""")
mike_customers = cur.fetchone()[0]
if mike_customers != 3:
print(f"❌ Expected 3 customers assigned to Mike Chen, got {mike_customers}")
return False
print("✅ Customer assignment verification passed")
return True
def verify_performance_table(conn) -> bool:
"""Verify the employee_performance table exists and has correct data."""
with conn.cursor() as cur:
try:
# Get all performance records
cur.execute("""
SELECT employee_id, customers_assigned, performance_score
FROM employee_performance
ORDER BY employee_id
""")
actual_results = cur.fetchall()
# Get actual customer counts for verification
cur.execute("""
SELECT "SupportRepId", COUNT(*)
FROM "Customer"
WHERE "SupportRepId" IN (9, 10)
GROUP BY "SupportRepId"
ORDER BY "SupportRepId"
""")
customer_counts = dict(cur.fetchall())
expected = [
(9, customer_counts.get(9, 0), Decimal('4.5')), # Sarah Johnson
(10, customer_counts.get(10, 0), Decimal('4.2')) # Mike Chen
]
if len(actual_results) != 2:
print(f"❌ Expected 2 performance records, got {len(actual_results)}")
return False
for actual, expected_row in zip(actual_results, expected):
if not rows_match(actual, expected_row):
print(f"❌ Performance record mismatch: expected {expected_row}, got {actual}")
return False
print("✅ Employee performance table verification passed")
return True
except psycopg2.Error as e:
print(f"❌ Employee performance table verification failed: {e}")
return False
def verify_employee_deletion_and_promotion(conn) -> bool:
"""Verify Robert King deletion and Laura Callahan promotion."""
with conn.cursor() as cur:
try:
# Verify Robert King (ID 7) is deleted
cur.execute("""
SELECT COUNT(*) FROM "Employee" WHERE "EmployeeId" = 7
""")
if cur.fetchone()[0] != 0:
print("❌ Robert King (EmployeeId = 7) should be deleted")
return False
# Verify Laura Callahan (ID 8) promotion
cur.execute("""
SELECT "Title" FROM "Employee" WHERE "EmployeeId" = 8
""")
laura_title = cur.fetchone()
if not laura_title or laura_title[0] != 'Senior IT Specialist':
print(f"❌ Laura Callahan should have title 'Senior IT Specialist', got: {laura_title[0] if laura_title else None}")
return False
print("✅ Employee deletion and promotion verification passed")
return True
except psycopg2.Error as e:
print(f"❌ Employee deletion/promotion verification failed: {e}")
return False
def verify_salary_column(conn) -> bool:
"""Verify salary column exists and has correct values."""
with conn.cursor() as cur:
try:
# Check if salary column exists and get all salary values
cur.execute("""
SELECT "EmployeeId", salary
FROM "Employee"
ORDER BY "EmployeeId"
""")
salary_data = cur.fetchall()
# Verify Laura (ID 8) has 75000.00, others have 50000.00
for emp_id, salary in salary_data:
expected_salary = Decimal('75000.00') if emp_id == 8 else Decimal('50000.00')
if salary != expected_salary:
print(f"❌ Employee {emp_id} salary should be {expected_salary}, got {salary}")
return False
print("✅ Salary column verification passed")
return True
except psycopg2.Error as e:
print(f"❌ Salary column verification failed: {e}")
return False
def main():
"""Main verification function."""
print("=" * 50)
print("Verifying Task 3: Employee Hierarchy Management")
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Run verification checks with short-circuit evaluation
success = (verify_employee_count_and_titles(conn) and
verify_specific_employees(conn) and
verify_customer_assignments(conn) and
verify_performance_table(conn) and
verify_employee_deletion_and_promotion(conn) and
verify_salary_column(conn))
conn.close()
if success:
print("\n🎉 Task verification: PASS")
print("All employee hierarchy management operations completed correctly!")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/chinook/sales_and_music_charts/description.md
================================================
Create a monthly sales dashboard and top music charts system for Chinook's management team to track business performance and identify trending music content.
## Your Tasks:
1. **Build the monthly sales summary table** — create a table called `monthly_sales_summary` in the `public` schema with:
* `year_month` (varchar) — format as 'YYYY-MM' (e.g., '2009-01')
* `total_invoices` (integer) — number of invoices in that month
* `total_revenue` (decimal) — sum of all invoice totals for the month
* `total_tracks_sold` (integer) — total quantity of individual tracks sold
* `average_invoice_value` (decimal) — average invoice amount for the month
* `unique_customers` (integer) — count of distinct customers who made purchases
2. **Create the music charts table** — build a table called `top_music_charts` in the `public` schema with:
* `chart_type` (varchar) — either 'top_tracks', 'top_albums', or 'top_artists'
* `rank_position` (integer) — ranking from 1 to 10
* `item_id` (integer) — ID of the track, album, or artist
* `item_name` (varchar) — name of the track, album, or artist
* `total_revenue` (decimal) — total revenue generated by this item
3. **Populate the monthly sales data**:
* Calculate metrics for each month that has invoice data
* Use invoice date to determine the month
* **Note**: Each invoice can contain multiple invoice lines (tracks)
4. **Generate the top 10 charts**:
* **Top Tracks**: Rank tracks by total quantity sold across all invoices
* **Top Albums**: Rank albums by total revenue generated from their tracks
* **Top Artists**: Rank artists by total revenue from all their tracks across all albums
5. **Business rules to follow**:
* Only include months where at least one invoice exists
* For album rankings, sum revenue from all tracks in each album
* For artist rankings, sum revenue from all tracks across all their albums
* Handle ties by using item name alphabetically as tiebreaker
* Exclude any items with zero sales
This system will provide clear, actionable business intelligence for monthly reporting and music trend analysis.
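For orientation, a minimal sketch of steps 1 and 3 is shown below, assuming the Chinook column names from the schema; it is illustrative rather than prescriptive, and the charts in steps 2 and 4 follow the same pattern.
```sql
-- Step 1: summary table (column types as listed above)
CREATE TABLE public.monthly_sales_summary (
    year_month            varchar,
    total_invoices        integer,
    total_revenue         decimal,
    total_tracks_sold     integer,
    average_invoice_value decimal,
    unique_customers      integer
);

-- Step 3: aggregate invoices per month; tracks sold are summed separately
-- so that joining "InvoiceLine" does not inflate the invoice-level metrics
WITH invoice_metrics AS (
    SELECT DATE_TRUNC('month', i."InvoiceDate") AS ym,
           COUNT(*)                             AS total_invoices,
           SUM(i."Total")                       AS total_revenue,
           AVG(i."Total")                       AS average_invoice_value,
           COUNT(DISTINCT i."CustomerId")       AS unique_customers
    FROM "Invoice" i
    GROUP BY 1
),
track_metrics AS (
    SELECT DATE_TRUNC('month', i."InvoiceDate") AS ym,
           SUM(il."Quantity")                   AS total_tracks_sold
    FROM "Invoice" i
    JOIN "InvoiceLine" il ON il."InvoiceId" = i."InvoiceId"
    GROUP BY 1
)
INSERT INTO public.monthly_sales_summary
    (year_month, total_invoices, total_revenue, total_tracks_sold,
     average_invoice_value, unique_customers)
SELECT TO_CHAR(im.ym, 'YYYY-MM'),
       im.total_invoices,
       im.total_revenue,
       COALESCE(tm.total_tracks_sold, 0),
       im.average_invoice_value,
       im.unique_customers
FROM invoice_metrics im
LEFT JOIN track_metrics tm USING (ym);
```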
================================================
FILE: tasks/postgres/standard/chinook/sales_and_music_charts/meta.json
================================================
{
"task_id": "sales_and_music_charts",
"task_name": "Sales and Music Charts",
"category_id": "chinook",
"category_name": "Chinook",
"description": "Create monthly sales dashboard and top music charts system for tracking business performance and trending content.",
"author": "Lingxiao Du",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"statistical aggregation",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef 
\"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql"
}
}
================================================
FILE: tasks/postgres/standard/chinook/sales_and_music_charts/verify.py
================================================
"""
Verification script for PostgreSQL Task 1: Monthly Sales Dashboard and Music Charts
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.01 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.01:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_monthly_sales_results(conn) -> bool:
"""Verify the monthly sales summary results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT year_month, total_invoices, total_revenue,
total_tracks_sold, average_invoice_value, unique_customers
FROM monthly_sales_summary
ORDER BY year_month
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH invoice_metrics AS (
SELECT
DATE_TRUNC('month', i."InvoiceDate") AS ym,
COUNT(*)::INT AS total_invoices,
SUM(i."Total")::DECIMAL AS total_revenue,
AVG(i."Total")::DECIMAL AS average_invoice_value,
COUNT(DISTINCT i."CustomerId")::INT AS unique_customers
FROM "Invoice" i
GROUP BY 1
),
track_metrics AS (
SELECT
DATE_TRUNC('month', i."InvoiceDate") AS ym,
SUM(il."Quantity")::INT AS total_tracks_sold
FROM "Invoice" i
JOIN "InvoiceLine" il ON il."InvoiceId" = i."InvoiceId"
WHERE il."Quantity" > 0
GROUP BY 1
)
SELECT
TO_CHAR(im.ym, 'YYYY-MM') AS year_month,
im.total_invoices,
im.total_revenue,
COALESCE(tm.total_tracks_sold, 0) AS total_tracks_sold,
im.average_invoice_value,
im.unique_customers
FROM invoice_metrics im
LEFT JOIN track_metrics tm USING (ym)
ORDER BY year_month;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} monthly sales records, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Monthly sales row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total monthly sales mismatches: {mismatches}")
return False
print(f"✅ Monthly sales results are correct ({len(actual_results)} records)")
return True
def verify_music_charts_results(conn) -> bool:
"""Verify the music charts results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT chart_type, rank_position, item_id, item_name, total_revenue
FROM top_music_charts
ORDER BY chart_type, rank_position
""")
actual_results = cur.fetchall()
# Execute ground truth queries for each chart type
cur.execute("""
WITH track_stats AS (
SELECT
'top_tracks'::varchar AS chart_type,
t."TrackId" AS item_id,
t."Name" AS item_name,
SUM(il."UnitPrice" * il."Quantity")::DECIMAL AS total_revenue,
SUM(il."Quantity")::INT AS total_quantity
FROM "Track" t
JOIN "InvoiceLine" il ON il."TrackId" = t."TrackId"
GROUP BY t."TrackId", t."Name"
HAVING SUM(il."Quantity") > 0
),
track_ranked AS (
SELECT
chart_type, item_id, item_name, total_revenue,
ROW_NUMBER() OVER (ORDER BY total_quantity DESC, item_name, item_id) AS rank_position
FROM track_stats
),
album_rev AS (
SELECT
'top_albums'::varchar AS chart_type,
a."AlbumId" AS item_id,
a."Title" AS item_name,
SUM(il."UnitPrice" * il."Quantity")::DECIMAL AS total_revenue
FROM "Album" a
JOIN "Track" t ON t."AlbumId" = a."AlbumId"
JOIN "InvoiceLine" il ON il."TrackId" = t."TrackId"
GROUP BY a."AlbumId", a."Title"
HAVING SUM(il."UnitPrice" * il."Quantity") > 0
),
album_ranked AS (
SELECT
chart_type, item_id, item_name, total_revenue,
ROW_NUMBER() OVER (ORDER BY total_revenue DESC, item_name, item_id) AS rank_position
FROM album_rev
),
artist_rev AS (
SELECT
'top_artists'::varchar AS chart_type,
ar."ArtistId" AS item_id,
ar."Name" AS item_name,
SUM(il."UnitPrice" * il."Quantity")::DECIMAL AS total_revenue
FROM "Artist" ar
JOIN "Album" a ON a."ArtistId" = ar."ArtistId"
JOIN "Track" t ON t."AlbumId" = a."AlbumId"
JOIN "InvoiceLine" il ON il."TrackId" = t."TrackId"
GROUP BY ar."ArtistId", ar."Name"
HAVING SUM(il."UnitPrice" * il."Quantity") > 0
),
artist_ranked AS (
SELECT
chart_type, item_id, item_name, total_revenue,
ROW_NUMBER() OVER (ORDER BY total_revenue DESC, item_name, item_id) AS rank_position
FROM artist_rev
)
SELECT chart_type, rank_position, item_id, item_name, total_revenue
FROM (
SELECT * FROM track_ranked WHERE rank_position <= 10
UNION ALL
SELECT * FROM album_ranked WHERE rank_position <= 10
UNION ALL
SELECT * FROM artist_ranked WHERE rank_position <= 10
) x
ORDER BY chart_type, rank_position;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} music chart records, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Music chart row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total music chart mismatches: {mismatches}")
return False
print(f"✅ Music chart results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify results
success = verify_monthly_sales_results(conn) and verify_music_charts_results(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md
================================================
Fix the customer analysis query that is producing incorrect results.
## Background
The data analytics team attempted to create a customer behavior analysis query to identify active customers and analyze their spending patterns and preferences. The requirements are:
- Only count rentals that have associated payment records (paid rentals)
- Only include customers with at least 15 paid rentals
- Only include customers with valid email addresses
However, the query is producing incorrect results: it over-counts rentals and calculates the wrong spending amounts.
Your task is to fix this query to produce accurate results.
## The Problematic Query
Here's the buggy query that needs to be fixed:
```sql
WITH customer_basic_stats AS (
SELECT
c.customer_id,
c.first_name || ' ' || c.last_name as customer_name,
ci.city as customer_city,
co.country as customer_country,
COUNT(r.rental_id) as total_rentals,
COUNT(DISTINCT i.film_id) as unique_films,
SUM(p.amount) as total_spent,
AVG(EXTRACT(days FROM (r.return_date - r.rental_date))) as avg_rental_duration
FROM customer c
JOIN address a ON c.address_id = a.address_id
JOIN city ci ON a.city_id = ci.city_id
JOIN country co ON ci.country_id = co.country_id
JOIN rental r ON c.customer_id = r.customer_id
JOIN inventory i ON r.inventory_id = i.inventory_id
JOIN payment p ON r.rental_id = p.rental_id
WHERE c.email IS NOT NULL
GROUP BY c.customer_id, c.first_name, c.last_name, ci.city, co.country
HAVING COUNT(r.rental_id) >= 15
),
customer_categories AS (
SELECT
c.customer_id,
cat.name as category_name,
COUNT(*) as category_count,
ROW_NUMBER() OVER (PARTITION BY c.customer_id ORDER BY COUNT(*) DESC, cat.name ASC) as rn
FROM customer c
JOIN rental r ON c.customer_id = r.customer_id
JOIN inventory i ON r.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
JOIN film_category fc ON f.film_id = fc.film_id
JOIN category cat ON fc.category_id = cat.category_id
JOIN payment p ON r.rental_id = p.rental_id
WHERE c.email IS NOT NULL
GROUP BY c.customer_id, cat.name
),
customer_actors AS (
SELECT
c.customer_id,
a.first_name || ' ' || a.last_name as actor_name,
COUNT(*) as actor_count,
ROW_NUMBER() OVER (PARTITION BY c.customer_id ORDER BY COUNT(*) DESC, (a.first_name || ' ' || a.last_name) ASC) as rn
FROM customer c
JOIN rental r ON c.customer_id = r.customer_id
JOIN inventory i ON r.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
JOIN film_actor fa ON f.film_id = fa.film_id
JOIN actor a ON fa.actor_id = a.actor_id
JOIN payment p ON r.rental_id = p.rental_id
WHERE c.email IS NOT NULL
GROUP BY c.customer_id, a.first_name, a.last_name
),
regional_popular_films AS (
SELECT
co.country,
f.title,
COUNT(*) as rental_count,
ROW_NUMBER() OVER (PARTITION BY co.country ORDER BY COUNT(*) DESC, f.title ASC) as rn
FROM rental r
JOIN inventory i ON r.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
JOIN customer c ON r.customer_id = c.customer_id
JOIN address a ON c.address_id = a.address_id
JOIN city ci ON a.city_id = ci.city_id
JOIN country co ON ci.country_id = co.country_id
JOIN payment p ON r.rental_id = p.rental_id
WHERE c.email IS NOT NULL
GROUP BY co.country, f.title
)
SELECT
cbs.customer_id,
cbs.customer_name,
cbs.customer_city,
cbs.customer_country,
cbs.total_rentals,
cbs.unique_films,
cbs.total_spent,
cc.category_name as favorite_category,
ca.actor_name as favorite_actor,
cbs.avg_rental_duration,
CASE
WHEN cbs.total_spent >= 150 THEN 'Premium'
WHEN cbs.total_spent >= 75 THEN 'Standard'
ELSE 'Basic'
END as customer_tier,
rpf.title as most_popular_film_in_region,
rpf.rental_count as regional_film_rental_count
FROM customer_basic_stats cbs
LEFT JOIN customer_categories cc ON cbs.customer_id = cc.customer_id AND cc.rn = 1
LEFT JOIN customer_actors ca ON cbs.customer_id = ca.customer_id AND ca.rn = 1
LEFT JOIN regional_popular_films rpf ON cbs.customer_country = rpf.country AND rpf.rn = 1
ORDER BY cbs.total_spent DESC, cbs.total_rentals DESC, cbs.customer_name ASC;
```
## Known Issues
When comparing the problematic query results with the expected correct values, the following discrepancies are observed:
1. **Rental count discrepancies**: Many customers show higher `total_rentals` counts than expected
2. **Spending amount errors**: The `total_spent` values don't match the correct calculations
3. **Incorrect favorite categories and actors**: Many customers show wrong favorite categories and actors compared to the expected results
4. **Time calculation inconsistencies**: The `avg_rental_duration` values differ significantly from the correct calculations
- Example: Customer ID 1 shows 3.90 days instead of the expected 4.27 days
- Example: Customer ID 2 shows 5.23 days instead of the expected 5.69 days
## Your Task
Debug and fix the query to produce accurate results. Then create a table with your corrected results.
1. **Fix the query** to ensure:
- Accurate customer spending and rental counts
- Correct favorite categories and actors
- Proper regional popular films
2. **Create a table** called `customer_analysis_fixed` in the `public` schema with your corrected query results. The table should have the same columns as the original query output.
**Important**: The business logic and output columns should remain the same - only fix the data accuracy issues.
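One quick diagnostic (a hedged sketch; customer 1 is simply the example cited above) is to compare a raw join count with a de-duplicated count for a single customer:
```sql
-- Illustrative: a rental with several payment rows is counted once per payment
-- when joined directly, which inflates every downstream aggregate
SELECT
    COUNT(r.rental_id)          AS joined_rows,
    COUNT(DISTINCT r.rental_id) AS paid_rentals
FROM rental r
JOIN payment p ON p.rental_id = r.rental_id
WHERE r.customer_id = 1;
```
If the two numbers differ, the direct join is fanning rentals out across their payment rows, which is consistent with the discrepancies listed in the Known Issues section.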
================================================
FILE: tasks/postgres/standard/dvdrental/customer_analysis_fix/meta.json
================================================
{
"task_id": "customer_analysis_fix",
"task_name": "Customer Analysis Fix",
"category_id": "dvdrental",
"category_name": "DVD Rental",
"description": "Debug and fix customer behavior analysis query producing incorrect rental counts and spending calculations.",
"author": "Lingxiao Du",
"created_at": "2025-08-20",
"difficulty": "L3",
"tags": [
"performance optimization",
"data integrity enforcement"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"mpaa_rating\" {\n \"G\"\n \"PG\"\n \"PG-13\"\n \"R\"\n \"NC-17\"\n}\n\nTable \"customer\" {\n \"customer_id\" int4 [pk, not null, increment]\n \"store_id\" int2 [not null]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"email\" varchar(50)\n \"address_id\" int2 [not null]\n \"activebool\" bool [not null, default: true]\n \"create_date\" date [not null, default: `('now'::text)::date`]\n \"last_update\" timestamp [default: `now()`]\n \"active\" int4\n\n Indexes {\n address_id [type: btree, name: \"idx_fk_address_id\"]\n store_id [type: btree, name: \"idx_fk_store_id\"]\n last_name [type: btree, name: \"idx_last_name\"]\n }\n}\n\nTable \"actor\" {\n \"actor_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n last_name [type: btree, name: \"idx_actor_last_name\"]\n }\n}\n\nTable \"category\" {\n \"category_id\" int4 [pk, not null, increment]\n \"name\" varchar(25) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"film\" {\n \"film_id\" int4 [pk, not null, increment]\n \"title\" varchar(255) [not null]\n \"description\" text\n \"release_year\" int4\n \"language_id\" int2 [not null]\n \"rental_duration\" int2 [not null, default: 3]\n \"rental_rate\" numeric(4,2) [not null, default: 4.99]\n \"length\" int2\n \"replacement_cost\" numeric(5,2) [not null, default: 19.99]\n \"rating\" mpaa_rating [default: 'G']\n \"last_update\" timestamp [not null, default: `now()`]\n \"special_features\" \"text[]\"\n \"fulltext\" tsvector [not null]\n\n Indexes {\n fulltext [type: gist, name: \"film_fulltext_idx\"]\n language_id [type: btree, name: \"idx_fk_language_id\"]\n title [type: btree, name: \"idx_title\"]\n }\n}\n\nTable \"film_actor\" {\n \"actor_id\" int2 [not null]\n \"film_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (actor_id, film_id) [type: btree, name: \"film_actor_pkey\"]\n film_id [type: btree, name: \"idx_fk_film_id\"]\n }\n}\n\nTable \"film_category\" {\n \"film_id\" int2 [not null]\n \"category_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (film_id, category_id) [type: btree, name: \"film_category_pkey\"]\n }\n}\n\nTable \"address\" {\n \"address_id\" int4 [pk, not null, increment]\n \"address\" varchar(50) [not null]\n \"address2\" varchar(50)\n \"district\" varchar(20) [not null]\n \"city_id\" int2 [not null]\n \"postal_code\" varchar(10)\n \"phone\" varchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n city_id [type: btree, name: \"idx_fk_city_id\"]\n }\n}\n\nTable \"city\" {\n \"city_id\" int4 [pk, not null, increment]\n \"city\" varchar(50) [not null]\n \"country_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n country_id [type: btree, name: \"idx_fk_country_id\"]\n }\n}\n\nTable \"country\" {\n \"country_id\" int4 [pk, not null, increment]\n \"country\" varchar(50) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"inventory\" {\n \"inventory_id\" int4 [pk, not null, increment]\n \"film_id\" int2 [not null]\n \"store_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (store_id, film_id) [type: btree, name: \"idx_store_id_film_id\"]\n }\n}\n\nTable \"language\" {\n \"language_id\" int4 [pk, not null, increment]\n 
\"name\" bpchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"payment\" {\n \"payment_id\" int4 [pk, not null, increment]\n \"customer_id\" int2 [not null]\n \"staff_id\" int2 [not null]\n \"rental_id\" int4 [not null]\n \"amount\" numeric(5,2) [not null]\n \"payment_date\" timestamp [not null]\n\n Indexes {\n rental_id [type: btree, name: \"idx_fk_rental_id\"]\n staff_id [type: btree, name: \"idx_fk_staff_id\"]\n }\n}\n\nTable \"rental\" {\n \"rental_id\" int4 [pk, not null, increment]\n \"rental_date\" timestamp [not null]\n \"inventory_id\" int4 [not null]\n \"customer_id\" int2 [not null]\n \"return_date\" timestamp\n \"staff_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (rental_date, inventory_id, customer_id) [type: btree, name: \"idx_unq_rental_rental_date_inventory_id_customer_id\"]\n inventory_id [type: btree, name: \"idx_fk_inventory_id\"]\n }\n}\n\nTable \"staff\" {\n \"staff_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"address_id\" int2 [not null]\n \"email\" varchar(50)\n \"store_id\" int2 [not null]\n \"active\" bool [not null, default: true]\n \"username\" varchar(16) [not null]\n \"password\" varchar(40)\n \"last_update\" timestamp [not null, default: `now()`]\n \"picture\" bytea\n}\n\nTable \"store\" {\n \"store_id\" int4 [pk, not null, increment]\n \"manager_staff_id\" int2 [unique, not null]\n \"address_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nRef \"fk_address_city\":\"city\".\"city_id\" < \"address\".\"city_id\"\n\nRef \"fk_city\":\"country\".\"country_id\" < \"city\".\"country_id\"\n\nRef \"customer_address_id_fkey\":\"address\".\"address_id\" < \"customer\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"film_language_id_fkey\":\"language\".\"language_id\" < \"film\".\"language_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_actor_id_fkey\":\"actor\".\"actor_id\" < \"film_actor\".\"actor_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_film_id_fkey\":\"film\".\"film_id\" < \"film_actor\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_category_id_fkey\":\"category\".\"category_id\" < \"film_category\".\"category_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_film_id_fkey\":\"film\".\"film_id\" < \"film_category\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"inventory_film_id_fkey\":\"film\".\"film_id\" < \"inventory\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"payment_customer_id_fkey\":\"customer\".\"customer_id\" < \"payment\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"payment_rental_id_fkey\":\"rental\".\"rental_id\" < \"payment\".\"rental_id\" [update: cascade, delete: set null]\n\nRef \"payment_staff_id_fkey\":\"staff\".\"staff_id\" < \"payment\".\"staff_id\" [update: cascade, delete: restrict]\n\nRef \"rental_customer_id_fkey\":\"customer\".\"customer_id\" < \"rental\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"rental_inventory_id_fkey\":\"inventory\".\"inventory_id\" < \"rental\".\"inventory_id\" [update: cascade, delete: restrict]\n\nRef \"rental_staff_id_key\":\"staff\".\"staff_id\" < \"rental\".\"staff_id\"\n\nRef \"staff_address_id_fkey\":\"address\".\"address_id\" < \"staff\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_address_id_fkey\":\"address\".\"address_id\" < \"store\".\"address_id\" 
[update: cascade, delete: restrict]\n\nRef \"store_manager_staff_id_fkey\":\"staff\".\"staff_id\" < \"store\".\"manager_staff_id\" [update: cascade, delete: restrict]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/gordonkwokkwok/DVD-Rental-PostgreSQL-Project"
}
}
================================================
FILE: tasks/postgres/standard/dvdrental/customer_analysis_fix/verify.py
================================================
"""
Verification script for PostgreSQL Task 3: Fix Customer Analysis Query
"""
import os
import sys
import psycopg2
from decimal import Decimal
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def rows_match(actual_row, expected_row):
"""Compare two rows with appropriate tolerance for decimals and floats."""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, (Decimal, float)) and isinstance(expected, (Decimal, float)):
# Use higher tolerance for floating point comparisons
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def verify_customer_analysis_fixed_table(conn) -> bool:
"""Verify the customer_analysis_fixed table results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT customer_id, customer_name, customer_city, customer_country,
total_rentals, unique_films, total_spent, favorite_category,
favorite_actor, avg_rental_duration, customer_tier,
most_popular_film_in_region, regional_film_rental_count
FROM customer_analysis_fixed
ORDER BY total_spent DESC, total_rentals DESC, customer_name ASC
""")
actual_results = cur.fetchall()
# Execute ground truth query (the corrected version)
cur.execute("""
WITH paid_rentals AS (
SELECT DISTINCT
r.rental_id,
r.customer_id,
r.inventory_id,
r.rental_date,
r.return_date
FROM rental r
JOIN payment p ON p.rental_id = r.rental_id
),
payments_by_customer AS (
SELECT pr.customer_id, SUM(p.amount) AS total_spent
FROM paid_rentals pr
JOIN payment p ON p.rental_id = pr.rental_id
GROUP BY pr.customer_id
),
customer_basic_stats AS (
SELECT
c.customer_id,
c.first_name || ' ' || c.last_name AS customer_name,
ci.city AS customer_city,
co.country AS customer_country,
COUNT(DISTINCT pr.rental_id) AS total_rentals,
COUNT(DISTINCT i.film_id) AS unique_films,
pbc.total_spent,
AVG(EXTRACT(EPOCH FROM (pr.return_date - pr.rental_date)) / 86400.0) AS avg_rental_duration
FROM customer c
JOIN address a ON c.address_id = a.address_id
JOIN city ci ON a.city_id = ci.city_id
JOIN country co ON ci.country_id = co.country_id
JOIN paid_rentals pr ON pr.customer_id = c.customer_id
JOIN inventory i ON pr.inventory_id = i.inventory_id
JOIN payments_by_customer pbc ON pbc.customer_id = c.customer_id
WHERE c.email IS NOT NULL
GROUP BY c.customer_id, c.first_name, c.last_name, ci.city, co.country, pbc.total_spent
HAVING COUNT(DISTINCT pr.rental_id) >= 15
),
customer_categories AS (
SELECT
pr.customer_id,
cat.name AS category_name,
COUNT(*) AS category_count,
ROW_NUMBER() OVER (
PARTITION BY pr.customer_id
ORDER BY COUNT(*) DESC, cat.name ASC
) AS rn
FROM paid_rentals pr
JOIN inventory i ON pr.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
JOIN film_category fc ON f.film_id = fc.film_id
JOIN category cat ON fc.category_id = cat.category_id
JOIN customer c ON pr.customer_id = c.customer_id
WHERE c.email IS NOT NULL
GROUP BY pr.customer_id, cat.name
),
customer_actors AS (
SELECT
pr.customer_id,
(a.first_name || ' ' || a.last_name) AS actor_name,
COUNT(*) AS actor_count,
ROW_NUMBER() OVER (
PARTITION BY pr.customer_id
ORDER BY COUNT(*) DESC, (a.first_name || ' ' || a.last_name) ASC
) AS rn
FROM paid_rentals pr
JOIN inventory i ON pr.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
JOIN film_actor fa ON f.film_id = fa.film_id
JOIN actor a ON fa.actor_id = a.actor_id
JOIN customer c ON pr.customer_id = c.customer_id
WHERE c.email IS NOT NULL
GROUP BY pr.customer_id, a.first_name, a.last_name
),
regional_popular_films AS (
SELECT
co.country,
f.title,
COUNT(DISTINCT pr.rental_id) AS rental_count,
ROW_NUMBER() OVER (
PARTITION BY co.country
ORDER BY COUNT(DISTINCT pr.rental_id) DESC, f.title ASC
) AS rn
FROM paid_rentals pr
JOIN customer c ON pr.customer_id = c.customer_id
JOIN address a ON c.address_id = a.address_id
JOIN city ci ON a.city_id = ci.city_id
JOIN country co ON ci.country_id = co.country_id
JOIN inventory i ON pr.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
WHERE c.email IS NOT NULL
GROUP BY co.country, f.title
)
SELECT
cbs.customer_id,
cbs.customer_name,
cbs.customer_city,
cbs.customer_country,
cbs.total_rentals,
cbs.unique_films,
cbs.total_spent,
cc.category_name AS favorite_category,
ca.actor_name AS favorite_actor,
cbs.avg_rental_duration,
CASE
WHEN cbs.total_spent >= 150 THEN 'Premium'
WHEN cbs.total_spent >= 75 THEN 'Standard'
ELSE 'Basic'
END AS customer_tier,
rpf.title AS most_popular_film_in_region,
rpf.rental_count AS regional_film_rental_count
FROM customer_basic_stats cbs
LEFT JOIN customer_categories cc
ON cbs.customer_id = cc.customer_id AND cc.rn = 1
LEFT JOIN customer_actors ca
ON cbs.customer_id = ca.customer_id AND ca.rn = 1
LEFT JOIN regional_popular_films rpf
ON cbs.customer_country = rpf.country AND rpf.rn = 1
ORDER BY cbs.total_spent DESC, cbs.total_rentals DESC, cbs.customer_name ASC;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} rows, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch:")
print(f" Expected: {expected}")
print(f" Actual: {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Query results are correct ({len(actual_results)} rows)")
return True
def main():
"""Main verification function."""
print("=" * 70)
print("PostgreSQL Task 3 Verification: Fix Customer Analysis Query")
print("=" * 70)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify results
success = verify_customer_analysis_fixed_table(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
print(" - Query was successfully debugged and fixed")
print(" - All 587 rows match the expected results")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
print(" - The query still has issues")
print(" - Please review the duplicate counting problem")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/dvdrental/customer_analytics_optimization/description.md
================================================
Optimize a slow customer analytics query in the DVD rental database.
## Background
The business intelligence team is running customer analytics reports, but one of their critical queries has become extremely slow. The query that used to run in milliseconds is now taking over a second to complete, causing timeout issues in their reporting dashboard.
## Your Task
Analyze and optimize the performance of this customer analytics query:
```sql
SELECT
c.customer_id,
c.first_name,
c.last_name,
c.email,
COUNT(DISTINCT p.payment_id) as total_payments,
SUM(p.amount) as total_spent,
AVG(p.amount) as avg_payment,
COUNT(DISTINCT EXTRACT(month FROM p.payment_date)) as active_months,
MAX(p.payment_date) as last_payment,
MIN(p.payment_date) as first_payment,
(SELECT COUNT(*) FROM payment p2 WHERE p2.customer_id = c.customer_id AND p2.amount > 5.0) as high_value_payments,
(SELECT SUM(amount) FROM payment p3 WHERE p3.customer_id = c.customer_id AND p3.payment_date >= '2007-03-01') as recent_spending
FROM customer c
JOIN payment p ON c.customer_id = p.customer_id
WHERE c.active = 1
GROUP BY c.customer_id, c.first_name, c.last_name, c.email
HAVING COUNT(p.payment_id) >= 10
ORDER BY total_spent DESC, total_payments DESC;
```
The query is currently taking over 1000ms to execute and has a very high cost in the execution plan. The team needs this optimized urgently as it's blocking their daily reporting processes.
## Requirements
- Use `EXPLAIN ANALYZE` to identify performance bottlenecks
- Implement appropriate database optimizations
- Ensure queries return accurate results after optimization
- Document your optimization approach and performance improvements
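Two starting points are sketched below; the index name is hypothetical and neither change is prescribed as the solution, so validate both with `EXPLAIN ANALYZE` against the original query.
```sql
-- Illustrative only. The payment table has no index on customer_id in the schema
-- above, so each correlated subquery re-scans it per customer; an index is one
-- candidate fix (idx_payment_customer_id is a hypothetical name).
CREATE INDEX IF NOT EXISTS idx_payment_customer_id ON payment (customer_id);

-- The two correlated subqueries can also be folded into the main GROUP BY as
-- conditional aggregates over the already-joined payment rows, for example:
--   COUNT(*)      FILTER (WHERE p.amount > 5.0)                  AS high_value_payments,
--   SUM(p.amount) FILTER (WHERE p.payment_date >= '2007-03-01')  AS recent_spending
```
Comparing the `EXPLAIN ANALYZE` output before and after each change makes the effect on the plan and runtime directly visible.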
================================================
FILE: tasks/postgres/standard/dvdrental/customer_analytics_optimization/meta.json
================================================
{
"task_id": "customer_analytics_optimization",
"task_name": "Customer Analytics Optimization",
"category_id": "dvdrental",
"category_name": "DVD Rental",
"description": "Optimize slow customer analytics query with correlated subqueries causing timeout issues in reporting dashboard.",
"author": "Lingxiao Du",
"created_at": "2025-08-20",
"difficulty": "L3",
"tags": [
"performance optimization"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"mpaa_rating\" {\n \"G\"\n \"PG\"\n \"PG-13\"\n \"R\"\n \"NC-17\"\n}\n\nTable \"customer\" {\n \"customer_id\" int4 [pk, not null, increment]\n \"store_id\" int2 [not null]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"email\" varchar(50)\n \"address_id\" int2 [not null]\n \"activebool\" bool [not null, default: true]\n \"create_date\" date [not null, default: `('now'::text)::date`]\n \"last_update\" timestamp [default: `now()`]\n \"active\" int4\n\n Indexes {\n address_id [type: btree, name: \"idx_fk_address_id\"]\n store_id [type: btree, name: \"idx_fk_store_id\"]\n last_name [type: btree, name: \"idx_last_name\"]\n }\n}\n\nTable \"actor\" {\n \"actor_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n last_name [type: btree, name: \"idx_actor_last_name\"]\n }\n}\n\nTable \"category\" {\n \"category_id\" int4 [pk, not null, increment]\n \"name\" varchar(25) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"film\" {\n \"film_id\" int4 [pk, not null, increment]\n \"title\" varchar(255) [not null]\n \"description\" text\n \"release_year\" int4\n \"language_id\" int2 [not null]\n \"rental_duration\" int2 [not null, default: 3]\n \"rental_rate\" numeric(4,2) [not null, default: 4.99]\n \"length\" int2\n \"replacement_cost\" numeric(5,2) [not null, default: 19.99]\n \"rating\" mpaa_rating [default: 'G']\n \"last_update\" timestamp [not null, default: `now()`]\n \"special_features\" \"text[]\"\n \"fulltext\" tsvector [not null]\n\n Indexes {\n fulltext [type: gist, name: \"film_fulltext_idx\"]\n language_id [type: btree, name: \"idx_fk_language_id\"]\n title [type: btree, name: \"idx_title\"]\n }\n}\n\nTable \"film_actor\" {\n \"actor_id\" int2 [not null]\n \"film_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (actor_id, film_id) [type: btree, name: \"film_actor_pkey\"]\n film_id [type: btree, name: \"idx_fk_film_id\"]\n }\n}\n\nTable \"film_category\" {\n \"film_id\" int2 [not null]\n \"category_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (film_id, category_id) [type: btree, name: \"film_category_pkey\"]\n }\n}\n\nTable \"address\" {\n \"address_id\" int4 [pk, not null, increment]\n \"address\" varchar(50) [not null]\n \"address2\" varchar(50)\n \"district\" varchar(20) [not null]\n \"city_id\" int2 [not null]\n \"postal_code\" varchar(10)\n \"phone\" varchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n city_id [type: btree, name: \"idx_fk_city_id\"]\n }\n}\n\nTable \"city\" {\n \"city_id\" int4 [pk, not null, increment]\n \"city\" varchar(50) [not null]\n \"country_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n country_id [type: btree, name: \"idx_fk_country_id\"]\n }\n}\n\nTable \"country\" {\n \"country_id\" int4 [pk, not null, increment]\n \"country\" varchar(50) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"inventory\" {\n \"inventory_id\" int4 [pk, not null, increment]\n \"film_id\" int2 [not null]\n \"store_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (store_id, film_id) [type: btree, name: \"idx_store_id_film_id\"]\n }\n}\n\nTable \"language\" {\n \"language_id\" int4 [pk, not null, increment]\n 
\"name\" bpchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"payment\" {\n \"payment_id\" int4 [pk, not null, increment]\n \"customer_id\" int2 [not null]\n \"staff_id\" int2 [not null]\n \"rental_id\" int4 [not null]\n \"amount\" numeric(5,2) [not null]\n \"payment_date\" timestamp [not null]\n\n Indexes {\n rental_id [type: btree, name: \"idx_fk_rental_id\"]\n staff_id [type: btree, name: \"idx_fk_staff_id\"]\n }\n}\n\nTable \"rental\" {\n \"rental_id\" int4 [pk, not null, increment]\n \"rental_date\" timestamp [not null]\n \"inventory_id\" int4 [not null]\n \"customer_id\" int2 [not null]\n \"return_date\" timestamp\n \"staff_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (rental_date, inventory_id, customer_id) [type: btree, name: \"idx_unq_rental_rental_date_inventory_id_customer_id\"]\n inventory_id [type: btree, name: \"idx_fk_inventory_id\"]\n }\n}\n\nTable \"staff\" {\n \"staff_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"address_id\" int2 [not null]\n \"email\" varchar(50)\n \"store_id\" int2 [not null]\n \"active\" bool [not null, default: true]\n \"username\" varchar(16) [not null]\n \"password\" varchar(40)\n \"last_update\" timestamp [not null, default: `now()`]\n \"picture\" bytea\n}\n\nTable \"store\" {\n \"store_id\" int4 [pk, not null, increment]\n \"manager_staff_id\" int2 [unique, not null]\n \"address_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nRef \"fk_address_city\":\"city\".\"city_id\" < \"address\".\"city_id\"\n\nRef \"fk_city\":\"country\".\"country_id\" < \"city\".\"country_id\"\n\nRef \"customer_address_id_fkey\":\"address\".\"address_id\" < \"customer\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"film_language_id_fkey\":\"language\".\"language_id\" < \"film\".\"language_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_actor_id_fkey\":\"actor\".\"actor_id\" < \"film_actor\".\"actor_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_film_id_fkey\":\"film\".\"film_id\" < \"film_actor\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_category_id_fkey\":\"category\".\"category_id\" < \"film_category\".\"category_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_film_id_fkey\":\"film\".\"film_id\" < \"film_category\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"inventory_film_id_fkey\":\"film\".\"film_id\" < \"inventory\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"payment_customer_id_fkey\":\"customer\".\"customer_id\" < \"payment\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"payment_rental_id_fkey\":\"rental\".\"rental_id\" < \"payment\".\"rental_id\" [update: cascade, delete: set null]\n\nRef \"payment_staff_id_fkey\":\"staff\".\"staff_id\" < \"payment\".\"staff_id\" [update: cascade, delete: restrict]\n\nRef \"rental_customer_id_fkey\":\"customer\".\"customer_id\" < \"rental\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"rental_inventory_id_fkey\":\"inventory\".\"inventory_id\" < \"rental\".\"inventory_id\" [update: cascade, delete: restrict]\n\nRef \"rental_staff_id_key\":\"staff\".\"staff_id\" < \"rental\".\"staff_id\"\n\nRef \"staff_address_id_fkey\":\"address\".\"address_id\" < \"staff\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_address_id_fkey\":\"address\".\"address_id\" < \"store\".\"address_id\" 
[update: cascade, delete: restrict]\n\nRef \"store_manager_staff_id_fkey\":\"staff\".\"staff_id\" < \"store\".\"manager_staff_id\" [update: cascade, delete: restrict]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/gordonkwokkwok/DVD-Rental-PostgreSQL-Project"
}
}
================================================
FILE: tasks/postgres/standard/dvdrental/customer_analytics_optimization/verify.py
================================================
"""
Verification script for PostgreSQL Task 1: Customer Payment Query Optimization
"""
import os
import sys
import psycopg2
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def check_payment_customer_id_index(conn) -> tuple:
    """Check for any index on payment.customer_id; returns (has_index, indexes)."""
with conn.cursor() as cur:
cur.execute("""
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'payment'
AND indexdef LIKE '%customer_id%'
""")
indexes = cur.fetchall()
        return len(indexes) > 0, indexes
def main():
"""Main verification function."""
print("=" * 60)
print("PostgreSQL Task 1 Verification: Customer Payment Query Optimization")
print("=" * 60)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
print("\n🔍 Checking for customer_id index on payment table...")
# Check if any index exists on payment.customer_id
has_index, indexes = check_payment_customer_id_index(conn)
if has_index:
print("✅ Found index(es) on payment.customer_id:")
for index_name, index_def in indexes:
print(f" - {index_name}: {index_def}")
else:
print("❌ No index found on payment.customer_id column")
conn.close()
if has_index:
print(f"\n🎉 Task verification: PASS")
print(f" - Index on payment.customer_id exists")
sys.exit(0)
else:
print(f"\n❌ Task verification: FAIL")
print(f" - No index found on payment.customer_id")
print(f" - Create an index on payment(customer_id) to optimize the queries")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
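# A minimal sketch of an index that would satisfy this check, assuming the dvdrental
# tables live in the default 'public' schema; the index name itself is arbitrary,
# since the check only matches on the index definition:
#
#   CREATE INDEX idx_payment_customer_id ON payment (customer_id);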
================================================
FILE: tasks/postgres/standard/dvdrental/film_inventory_management/description.md
================================================
Manage film inventory operations in the DVD rental database.
## Background
You are the database administrator for the DVD rental store. The store manager has requested several database operations to manage the film inventory. You need to perform multiple operations including adding new films, updating inventory, querying available films, and cleaning up old records.
## Your Task
Complete the following database operations in sequence:
### 1. Add New Films
Add these two new films to the database:
- **Film 1**: Title "Data Science Adventures", Description "A thrilling journey through machine learning algorithms", Release Year 2024, Language ID 1, Rental Duration 5 days, Rental Rate $3.99, Length 120 minutes, Replacement Cost $15.99, Rating 'PG-13'
- **Film 2**: Title "Cloud Computing Chronicles", Description "Exploring the world of distributed systems", Release Year 2024, Language ID 1, Rental Duration 7 days, Rental Rate $4.99, Length 135 minutes, Replacement Cost $18.99, Rating 'PG'
### 2. Add Inventory Records
For each new film, add 3 inventory records for store_id = 1 and 2 inventory records for store_id = 2.
### 3. Update Film Information
Update the rental_rate of all films with rating 'PG-13' to increase by 10% (multiply by 1.1).
### 4. Create Available Films Table
Create a table called `available_films` with the following structure:
- `film_id` (INTEGER, PRIMARY KEY)
- `title` (VARCHAR(255), NOT NULL)
- `rental_rate` (NUMERIC(4,2), NOT NULL)
- `length` (SMALLINT)
Populate this table with films that meet these criteria:
- Have rental_rate between $3.00 and $5.00
- Have length greater than 100 minutes
- Are available in store_id = 1 (have at least 1 inventory record)
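As an illustrative sketch only (one possible shape, not a required solution), the table and its population could look like:
```sql
CREATE TABLE available_films (
    film_id     INTEGER PRIMARY KEY,
    title       VARCHAR(255) NOT NULL,
    rental_rate NUMERIC(4,2) NOT NULL,
    length      SMALLINT
);

INSERT INTO available_films (film_id, title, rental_rate, length)
SELECT DISTINCT f.film_id, f.title, f.rental_rate, f.length
FROM film f
JOIN inventory i ON i.film_id = f.film_id
WHERE f.rental_rate BETWEEN 3.00 AND 5.00
  AND f.length > 100
  AND i.store_id = 1;
```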
### 5. Clean Up Inventory
Delete inventory records for films that meet ALL of the following criteria:
- Have a replacement_cost greater than $25.00
- AND have rental_rate less than $1.00
- AND have no rental history (no records in the rental table)
### 6. Create Summary Report Table
Create a table called `film_inventory_summary` with the following structure:
- `title` (VARCHAR(255), NOT NULL)
- `rental_rate` (NUMERIC(4,2), NOT NULL)
- `total_inventory` (INTEGER, NOT NULL)
- `store1_count` (INTEGER, NOT NULL)
- `store2_count` (INTEGER, NOT NULL)
Populate this table with a summary query that shows:
- Film title
- Current rental rate (after any updates from step 3)
- Total count of inventory records across all stores
- Count of inventory records in store_id = 1
- Count of inventory records in store_id = 2
Requirements for the summary report:
- Include only films that currently have at least one inventory record
- Insert the results sorted by inventory count from highest to lowest, and then alphabetically by film title
- Ensure all counts reflect the state after completing the previous operations
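One possible populating query, shown only as a sketch (any equivalent query that inserts the same rows in the required order is acceptable):
```sql
INSERT INTO film_inventory_summary (title, rental_rate, total_inventory, store1_count, store2_count)
SELECT f.title,
       f.rental_rate,
       COUNT(i.inventory_id)                  AS total_inventory,
       COUNT(*) FILTER (WHERE i.store_id = 1) AS store1_count,
       COUNT(*) FILTER (WHERE i.store_id = 2) AS store2_count
FROM film f
JOIN inventory i ON i.film_id = f.film_id
GROUP BY f.film_id, f.title, f.rental_rate
ORDER BY total_inventory DESC, f.title ASC;
```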
## Requirements
- Complete all operations in the specified sequence
- Ensure data integrity throughout all operations
- Verify that your operations affect the expected number of records
- Handle any constraint violations appropriately
================================================
FILE: tasks/postgres/standard/dvdrental/film_inventory_management/meta.json
================================================
{
"task_id": "film_inventory_management",
"task_name": "Film Inventory Management",
"category_id": "dvdrental",
"category_name": "DVD Rental",
"description": "Manage film inventory through multiple operations including adding films, updating records, and cleaning old data.",
"author": "Lingxiao Du",
"created_at": "2025-08-20",
"difficulty": "L3",
"tags": [
"data migration",
"transactional operations",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"mpaa_rating\" {\n \"G\"\n \"PG\"\n \"PG-13\"\n \"R\"\n \"NC-17\"\n}\n\nTable \"customer\" {\n \"customer_id\" int4 [pk, not null, increment]\n \"store_id\" int2 [not null]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"email\" varchar(50)\n \"address_id\" int2 [not null]\n \"activebool\" bool [not null, default: true]\n \"create_date\" date [not null, default: `('now'::text)::date`]\n \"last_update\" timestamp [default: `now()`]\n \"active\" int4\n\n Indexes {\n address_id [type: btree, name: \"idx_fk_address_id\"]\n store_id [type: btree, name: \"idx_fk_store_id\"]\n last_name [type: btree, name: \"idx_last_name\"]\n }\n}\n\nTable \"actor\" {\n \"actor_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n last_name [type: btree, name: \"idx_actor_last_name\"]\n }\n}\n\nTable \"category\" {\n \"category_id\" int4 [pk, not null, increment]\n \"name\" varchar(25) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"film\" {\n \"film_id\" int4 [pk, not null, increment]\n \"title\" varchar(255) [not null]\n \"description\" text\n \"release_year\" int4\n \"language_id\" int2 [not null]\n \"rental_duration\" int2 [not null, default: 3]\n \"rental_rate\" numeric(4,2) [not null, default: 4.99]\n \"length\" int2\n \"replacement_cost\" numeric(5,2) [not null, default: 19.99]\n \"rating\" mpaa_rating [default: 'G']\n \"last_update\" timestamp [not null, default: `now()`]\n \"special_features\" \"text[]\"\n \"fulltext\" tsvector [not null]\n\n Indexes {\n fulltext [type: gist, name: \"film_fulltext_idx\"]\n language_id [type: btree, name: \"idx_fk_language_id\"]\n title [type: btree, name: \"idx_title\"]\n }\n}\n\nTable \"film_actor\" {\n \"actor_id\" int2 [not null]\n \"film_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (actor_id, film_id) [type: btree, name: \"film_actor_pkey\"]\n film_id [type: btree, name: \"idx_fk_film_id\"]\n }\n}\n\nTable \"film_category\" {\n \"film_id\" int2 [not null]\n \"category_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (film_id, category_id) [type: btree, name: \"film_category_pkey\"]\n }\n}\n\nTable \"address\" {\n \"address_id\" int4 [pk, not null, increment]\n \"address\" varchar(50) [not null]\n \"address2\" varchar(50)\n \"district\" varchar(20) [not null]\n \"city_id\" int2 [not null]\n \"postal_code\" varchar(10)\n \"phone\" varchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n city_id [type: btree, name: \"idx_fk_city_id\"]\n }\n}\n\nTable \"city\" {\n \"city_id\" int4 [pk, not null, increment]\n \"city\" varchar(50) [not null]\n \"country_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n country_id [type: btree, name: \"idx_fk_country_id\"]\n }\n}\n\nTable \"country\" {\n \"country_id\" int4 [pk, not null, increment]\n \"country\" varchar(50) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"inventory\" {\n \"inventory_id\" int4 [pk, not null, increment]\n \"film_id\" int2 [not null]\n \"store_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (store_id, film_id) [type: btree, name: \"idx_store_id_film_id\"]\n }\n}\n\nTable \"language\" {\n \"language_id\" int4 [pk, not null, increment]\n 
\"name\" bpchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"payment\" {\n \"payment_id\" int4 [pk, not null, increment]\n \"customer_id\" int2 [not null]\n \"staff_id\" int2 [not null]\n \"rental_id\" int4 [not null]\n \"amount\" numeric(5,2) [not null]\n \"payment_date\" timestamp [not null]\n\n Indexes {\n rental_id [type: btree, name: \"idx_fk_rental_id\"]\n staff_id [type: btree, name: \"idx_fk_staff_id\"]\n }\n}\n\nTable \"rental\" {\n \"rental_id\" int4 [pk, not null, increment]\n \"rental_date\" timestamp [not null]\n \"inventory_id\" int4 [not null]\n \"customer_id\" int2 [not null]\n \"return_date\" timestamp\n \"staff_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (rental_date, inventory_id, customer_id) [type: btree, name: \"idx_unq_rental_rental_date_inventory_id_customer_id\"]\n inventory_id [type: btree, name: \"idx_fk_inventory_id\"]\n }\n}\n\nTable \"staff\" {\n \"staff_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"address_id\" int2 [not null]\n \"email\" varchar(50)\n \"store_id\" int2 [not null]\n \"active\" bool [not null, default: true]\n \"username\" varchar(16) [not null]\n \"password\" varchar(40)\n \"last_update\" timestamp [not null, default: `now()`]\n \"picture\" bytea\n}\n\nTable \"store\" {\n \"store_id\" int4 [pk, not null, increment]\n \"manager_staff_id\" int2 [unique, not null]\n \"address_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nRef \"fk_address_city\":\"city\".\"city_id\" < \"address\".\"city_id\"\n\nRef \"fk_city\":\"country\".\"country_id\" < \"city\".\"country_id\"\n\nRef \"customer_address_id_fkey\":\"address\".\"address_id\" < \"customer\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"film_language_id_fkey\":\"language\".\"language_id\" < \"film\".\"language_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_actor_id_fkey\":\"actor\".\"actor_id\" < \"film_actor\".\"actor_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_film_id_fkey\":\"film\".\"film_id\" < \"film_actor\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_category_id_fkey\":\"category\".\"category_id\" < \"film_category\".\"category_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_film_id_fkey\":\"film\".\"film_id\" < \"film_category\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"inventory_film_id_fkey\":\"film\".\"film_id\" < \"inventory\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"payment_customer_id_fkey\":\"customer\".\"customer_id\" < \"payment\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"payment_rental_id_fkey\":\"rental\".\"rental_id\" < \"payment\".\"rental_id\" [update: cascade, delete: set null]\n\nRef \"payment_staff_id_fkey\":\"staff\".\"staff_id\" < \"payment\".\"staff_id\" [update: cascade, delete: restrict]\n\nRef \"rental_customer_id_fkey\":\"customer\".\"customer_id\" < \"rental\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"rental_inventory_id_fkey\":\"inventory\".\"inventory_id\" < \"rental\".\"inventory_id\" [update: cascade, delete: restrict]\n\nRef \"rental_staff_id_key\":\"staff\".\"staff_id\" < \"rental\".\"staff_id\"\n\nRef \"staff_address_id_fkey\":\"address\".\"address_id\" < \"staff\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_address_id_fkey\":\"address\".\"address_id\" < \"store\".\"address_id\" 
[update: cascade, delete: restrict]\n\nRef \"store_manager_staff_id_fkey\":\"staff\".\"staff_id\" < \"store\".\"manager_staff_id\" [update: cascade, delete: restrict]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/gordonkwokkwok/DVD-Rental-PostgreSQL-Project"
}
}
================================================
FILE: tasks/postgres/standard/dvdrental/film_inventory_management/verify.py
================================================
"""
Verification script for PostgreSQL Task 4: Film Inventory Management
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""Compare two rows with appropriate tolerance for decimals and floats."""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, (Decimal, float)) and isinstance(expected, (Decimal, float)):
# Use higher tolerance for floating point comparisons
if abs(float(actual) - float(expected)) > 0.01:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def check_new_films(conn) -> bool:
"""Check if the two new films were added correctly."""
with conn.cursor() as cur:
cur.execute("""
SELECT title, description, release_year, language_id,
rental_duration, rental_rate, length, replacement_cost,
rating
FROM film
WHERE title IN ('Data Science Adventures', 'Cloud Computing Chronicles')
ORDER BY title
""")
actual_films = cur.fetchall()
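        # The expected rental_rate for 'Data Science Adventures' (4.389) already reflects
        # step 3 of the task (PG-13 rates raised by 10%: 3.99 * 1.1 = 4.389); the 0.01
        # tolerance in rows_match absorbs rounding to the numeric(4,2) column.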
expected_films = [
('Cloud Computing Chronicles', 'Exploring the world of distributed systems', 2024, 1, 7, Decimal('4.99'), 135, Decimal('18.99'), 'PG'),
('Data Science Adventures', 'A thrilling journey through machine learning algorithms', 2024, 1, 5, Decimal('4.389'), 120, Decimal('15.99'), 'PG-13')
]
if len(actual_films) != len(expected_films):
print(f"❌ Expected {len(expected_films)} new films, found {len(actual_films)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_films, expected_films)):
if not rows_match(actual, expected):
print(f"❌ Film {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total film mismatches: {mismatches}")
return False
print("✅ Both new films added correctly")
return True
def check_inventory_records(conn) -> bool:
"""Check if inventory records were added for new films."""
with conn.cursor() as cur:
cur.execute("""
SELECT f.title, i.store_id, COUNT(*) as count
FROM film f
JOIN inventory i ON f.film_id = i.film_id
WHERE f.title IN ('Data Science Adventures', 'Cloud Computing Chronicles')
GROUP BY f.title, i.store_id
ORDER BY f.title, i.store_id
""")
actual_inventory = cur.fetchall()
expected_inventory = [
('Cloud Computing Chronicles', 1, 3),
('Cloud Computing Chronicles', 2, 2),
('Data Science Adventures', 1, 3),
('Data Science Adventures', 2, 2)
]
if len(actual_inventory) != len(expected_inventory):
print(f"❌ Expected {len(expected_inventory)} inventory groups, found {len(actual_inventory)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_inventory, expected_inventory)):
if not rows_match(actual, expected):
print(f"❌ Inventory group {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total inventory mismatches: {mismatches}")
return False
print("✅ Inventory records added correctly")
return True
def check_available_films_table(conn) -> bool:
"""Check if available_films table was created and populated correctly."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT film_id, title, rental_rate, length
FROM available_films
ORDER BY rental_rate DESC, length DESC, title ASC
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
SELECT DISTINCT f.film_id, f.title, f.rental_rate, f.length
FROM film f
JOIN inventory i ON f.film_id = i.film_id
WHERE f.rental_rate >= 3.00 AND f.rental_rate <= 5.00
AND f.length > 100
AND i.store_id = 1
ORDER BY f.rental_rate DESC, f.length DESC, f.title ASC
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ available_films table has {len(actual_results)} records, expected {len(expected_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ available_films row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total available_films mismatches: {mismatches}")
return False
print(f"✅ available_films table created and populated correctly ({len(actual_results)} records)")
return True
def check_inventory_cleanup(conn) -> bool:
"""Check if inventory cleanup was performed correctly."""
with conn.cursor() as cur:
# Check that no inventory exists for films with replacement_cost > 25 AND rental_rate < 1
# that also don't have rental records (safe to delete)
cur.execute("""
SELECT COUNT(*)
FROM inventory i
JOIN film f ON i.film_id = f.film_id
WHERE f.replacement_cost > 25.00 AND f.rental_rate < 1.00
AND NOT EXISTS (SELECT 1 FROM rental r WHERE r.inventory_id = i.inventory_id)
""")
remaining_count = cur.fetchone()[0]
if remaining_count > 0:
print(f"❌ Found {remaining_count} inventory records that should have been deleted (no rental history)")
return False
print("✅ Inventory cleanup completed correctly")
return True
def check_summary_table(conn) -> bool:
"""Check if film_inventory_summary table was created and populated correctly."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT title, rental_rate, total_inventory, store1_count, store2_count
FROM film_inventory_summary
""")
actual_results = cur.fetchall()
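        # Note: this read has no ORDER BY; the task requires the summary rows to be
        # inserted already sorted (inventory count DESC, then title), so the table's
        # stored order is compared against the ordered ground-truth query below.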
# Execute ground truth query
cur.execute("""
SELECT f.title, f.rental_rate,
COUNT(i.inventory_id) as total_inventory,
COUNT(CASE WHEN i.store_id = 1 THEN 1 END) as store1_count,
COUNT(CASE WHEN i.store_id = 2 THEN 1 END) as store2_count
FROM film f
JOIN inventory i ON f.film_id = i.film_id
GROUP BY f.film_id, f.title, f.rental_rate
ORDER BY total_inventory DESC, f.title ASC
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ film_inventory_summary table has {len(actual_results)} records, expected {len(expected_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Summary row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total summary table mismatches: {mismatches}")
return False
print(f"✅ film_inventory_summary table created and populated correctly ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 70)
print("PostgreSQL Task 4 Verification: Film Inventory Management")
print("=" * 70)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all operations with short-circuit evaluation
success = (
check_new_films(conn) and
check_inventory_records(conn) and
check_available_films_table(conn) and
check_inventory_cleanup(conn) and
check_summary_table(conn)
)
conn.close()
if success:
print(f"\n🎉 Task verification: PASS")
sys.exit(0)
else:
print(f"\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/employees/employee_demographics_report/description.md
================================================
Generate a comprehensive employee demographics and basic statistics report for the annual company overview. The HR team needs simple, clear statistical summaries about our workforce composition to include in the annual report and diversity initiatives.
## Your Tasks:
1. **Create the gender statistics table** — build a table called `gender_statistics` in the `employees` schema with these exact columns:
* `gender` (varchar) — gender ('M' or 'F')
* `total_employees` (integer) — total number of employees of this gender
* `current_employees` (integer) — current employees of this gender (have active salary)
* `percentage_of_workforce` (decimal) — percentage of current workforce
2. **Create the age group analysis table** — build a table called `age_group_analysis` in the `employees` schema with:
* `age_group` (varchar) — age range ('20-29', '30-39', '40-49', '50-59', '60+')
* `employee_count` (integer) — number of current employees in age group
* `avg_salary` (decimal) — average current salary for age group
* `avg_tenure_days` (decimal) — average days of service
3. **Create the birth month distribution table** — build a table called `birth_month_distribution` in the `employees` schema with:
* `birth_month` (integer) — month number (1-12)
* `month_name` (varchar) — month name ('January', 'February', etc.)
* `employee_count` (integer) — total employees born in this month
* `current_employee_count` (integer) — current employees born in this month
4. **Create the hiring year summary table** — build a table called `hiring_year_summary` in the `employees` schema with:
* `hire_year` (integer) — year employees were hired
* `employees_hired` (integer) — number of employees hired that year
* `still_employed` (integer) — how many from that year are still employed
* `retention_rate` (decimal) — percentage still employed (still_employed/employees_hired * 100)
5. **Apply age group classification** based on current age:
* **20-29**: Ages 20-29
* **30-39**: Ages 30-39
* **40-49**: Ages 40-49
* **50-59**: Ages 50-59
* **60+**: Ages 60 and above
6. **Calculate workforce composition** — determine current workforce demographics using employees with active salary records (to_date = '9999-01-01').
7. **Focus on basic statistics** — create simple counts, averages, and percentages that are easy to understand and verify.
The analysis will provide clear demographic insights for HR reporting and workforce planning.
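For task 1, a minimal sketch (assuming, as stated in task 6, that current employees are those with an active salary row where to_date = '9999-01-01'):
```sql
CREATE TABLE employees.gender_statistics (
    gender                  VARCHAR,
    total_employees         INTEGER,
    current_employees       INTEGER,
    percentage_of_workforce DECIMAL
);

INSERT INTO employees.gender_statistics
SELECT e.gender::varchar,
       COUNT(*),
       COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL),
       COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL)::decimal
           / (SELECT COUNT(DISTINCT employee_id)
              FROM employees.salary
              WHERE to_date = DATE '9999-01-01') * 100
FROM employees.employee e
LEFT JOIN (SELECT DISTINCT employee_id
           FROM employees.salary
           WHERE to_date = DATE '9999-01-01') ce ON ce.employee_id = e.id
GROUP BY e.gender;
```
The other three tables follow the same pattern: a CREATE TABLE matching the listed columns plus an INSERT ... SELECT aggregate.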
================================================
FILE: tasks/postgres/standard/employees/employee_demographics_report/meta.json
================================================
{
"task_id": "employee_demographics_report",
"task_name": "Employee Demographics Report",
"category_id": "employees",
"category_name": "Employees",
"description": "Generate comprehensive employee demographics report with gender statistics, age groups, birth months, and hiring trends.",
"author": "Lingxiao Du",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"statistical aggregation"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/standard/employees/employee_demographics_report/verify.py
================================================
"""
Verification script for PostgreSQL Task 3: Employee Demographics Report
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_gender_statistics_results(conn) -> bool:
"""Verify the gender statistics results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT gender, total_employees, current_employees, percentage_of_workforce
FROM employees.gender_statistics
ORDER BY gender
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_emp AS (
SELECT DISTINCT s.employee_id
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
),
total_current AS (
SELECT COUNT(*) AS cnt
FROM current_emp
)
SELECT
e.gender::varchar AS gender,
COUNT(*) AS total_employees,
COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL) AS current_employees,
(COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL))::DECIMAL
/ NULLIF((SELECT cnt FROM total_current), 0) * 100 AS percentage_of_workforce
FROM employees.employee e
LEFT JOIN current_emp ce ON ce.employee_id = e.id
WHERE e.gender IN ('M','F')
GROUP BY e.gender
ORDER BY gender;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} gender statistics results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Gender statistics results are correct ({len(actual_results)} records)")
return True
def verify_age_group_results(conn) -> bool:
"""Verify the age group analysis results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT age_group, employee_count, avg_salary, avg_tenure_days
FROM employees.age_group_analysis
ORDER BY age_group
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC
) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
emp_age AS (
SELECT
e.id AS employee_id,
e.hire_date,
EXTRACT(YEAR FROM AGE(CURRENT_DATE, e.birth_date))::INT AS age_years
FROM employees.employee e
WHERE e.birth_date IS NOT NULL
)
SELECT
CASE
WHEN a.age_years BETWEEN 20 AND 29 THEN '20-29'
WHEN a.age_years BETWEEN 30 AND 39 THEN '30-39'
WHEN a.age_years BETWEEN 40 AND 49 THEN '40-49'
WHEN a.age_years BETWEEN 50 AND 59 THEN '50-59'
WHEN a.age_years >= 60 THEN '60+'
END AS age_group,
COUNT(*)::INT AS employee_count,
AVG(cs.amount) AS avg_salary,
AVG((CURRENT_DATE - a.hire_date)::INT) AS avg_tenure_days
FROM emp_age a
JOIN current_salary cs ON cs.employee_id = a.employee_id
WHERE a.age_years >= 20
GROUP BY 1
ORDER BY 1;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} age group results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Age group analysis results are correct ({len(actual_results)} records)")
return True
def verify_birth_month_results(conn) -> bool:
"""Verify the birth month distribution results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT birth_month, month_name, employee_count, current_employee_count
FROM employees.birth_month_distribution
ORDER BY birth_month
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_emp AS (
SELECT DISTINCT s.employee_id
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
),
months AS (
SELECT gs AS birth_month
FROM generate_series(1, 12) AS gs
)
SELECT
m.birth_month::INTEGER AS birth_month,
CASE m.birth_month
WHEN 1 THEN 'January' WHEN 2 THEN 'February' WHEN 3 THEN 'March'
WHEN 4 THEN 'April' WHEN 5 THEN 'May' WHEN 6 THEN 'June'
WHEN 7 THEN 'July' WHEN 8 THEN 'August' WHEN 9 THEN 'September'
            WHEN 10 THEN 'October' WHEN 11 THEN 'November' WHEN 12 THEN 'December'
END AS month_name,
COUNT(e.id)::INTEGER AS employee_count,
COUNT(ce.employee_id)::INTEGER AS current_employee_count
FROM months m
LEFT JOIN employees.employee e
ON EXTRACT(MONTH FROM e.birth_date) = m.birth_month
LEFT JOIN current_emp ce
ON ce.employee_id = e.id
GROUP BY m.birth_month
ORDER BY m.birth_month;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} birth month results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Birth month distribution results are correct ({len(actual_results)} records)")
return True
def verify_hiring_year_results(conn) -> bool:
"""Verify the hiring year summary results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT hire_year, employees_hired, still_employed, retention_rate
FROM employees.hiring_year_summary
ORDER BY hire_year
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_emp AS (
SELECT DISTINCT s.employee_id
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
),
base AS (
SELECT e.id, EXTRACT(YEAR FROM e.hire_date)::INT AS hire_year
FROM employees.employee e
WHERE e.hire_date IS NOT NULL
)
SELECT
b.hire_year,
COUNT(*)::INT AS employees_hired,
COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL)::INT AS still_employed,
(COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL))::DECIMAL
/ NULLIF(COUNT(*), 0) * 100 AS retention_rate
FROM base b
LEFT JOIN current_emp ce ON ce.employee_id = b.id
GROUP BY b.hire_year
ORDER BY b.hire_year;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} hiring year results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Hiring year summary results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all four analysis results
success = (
verify_gender_statistics_results(conn) and
verify_age_group_results(conn) and
verify_birth_month_results(conn) and
verify_hiring_year_results(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/employees/employee_performance_analysis/description.md
================================================
Create a comprehensive employee performance evaluation system that analyzes career progression patterns and salary equity across our organization. The executive team needs data-driven insights for upcoming promotion decisions and salary adjustment planning.
## Your Tasks:
1. **Create the employee performance analysis table** — build a table called `employee_performance_analysis` in the `employees` schema with these exact columns:
* `employee_id` (bigint) — the employee's ID
* `performance_category` (varchar) — classification of employee performance ('high_achiever', 'steady_performer', 'needs_attention')
* `salary_growth_rate` (decimal) — percentage salary increase from first salary record to current
* `days_of_service` (integer) — total days with the company
* `promotion_count` (integer) — number of different titles held
2. **Analyze only current employees** — focus on employees who currently have active salary records (to_date = '9999-01-01').
3. **Apply performance classification rules**:
* **High achievers**: Salary growth rate > 40% AND more than 1 title held
* **Needs attention**: Salary growth rate < 15% AND more than 3650 days of service (10 years)
* **Steady performers**: All other current employees (default category)
4. **Create the department salary analysis table** — build a table called `department_salary_analysis` in the `employees` schema with:
* `department_name` (varchar) — the department name
* `avg_current_salary` (decimal) — average current salary in the department (only current employees)
* `employee_count` (integer) — total current employees in the department
* `salary_range_spread` (integer) — difference between max and min salary (current employees only)
5. **Calculate salary equity metrics** — populate the department table with current salary statistics for active employees only to identify potential pay equity issues across departments.
The analysis should help leadership make informed decisions about promotions, salary adjustments, and talent retention strategies.
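As a sketch of how the classification in tasks 1-3 could be computed (assuming salary growth is measured from each employee's earliest salary row to the currently active one, and that promotion_count is the number of distinct titles held):
```sql
WITH first_sal AS (
    SELECT DISTINCT ON (employee_id) employee_id, amount AS first_amount
    FROM employees.salary
    ORDER BY employee_id, from_date
),
current_sal AS (
    SELECT DISTINCT ON (employee_id) employee_id, amount AS current_amount
    FROM employees.salary
    WHERE to_date = DATE '9999-01-01'
    ORDER BY employee_id, from_date DESC
),
titles AS (
    SELECT employee_id, COUNT(DISTINCT title) AS promotion_count
    FROM employees.title
    GROUP BY employee_id
)
SELECT e.id AS employee_id,
       CASE
           WHEN (c.current_amount - f.first_amount) * 100.0 / f.first_amount > 40
                AND COALESCE(t.promotion_count, 0) > 1                   THEN 'high_achiever'
           WHEN (c.current_amount - f.first_amount) * 100.0 / f.first_amount < 15
                AND (CURRENT_DATE - e.hire_date) > 3650                  THEN 'needs_attention'
           ELSE 'steady_performer'
       END AS performance_category,
       (c.current_amount - f.first_amount) * 100.0 / f.first_amount AS salary_growth_rate,
       (CURRENT_DATE - e.hire_date)   AS days_of_service,
       COALESCE(t.promotion_count, 0) AS promotion_count
FROM employees.employee e
JOIN current_sal c ON c.employee_id = e.id
JOIN first_sal   f ON f.employee_id = e.id
LEFT JOIN titles t ON t.employee_id = e.id;
```
The `department_salary_analysis` table in tasks 4-5 can be populated with a similar aggregate over current department assignments and active salaries.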
================================================
FILE: tasks/postgres/standard/employees/employee_performance_analysis/meta.json
================================================
{
"task_id": "employee_performance_analysis",
"task_name": "Employee Performance Analysis",
"category_id": "employees",
"category_name": "Employees",
"description": "Create performance evaluation system analyzing career progression patterns and salary equity for promotion and compensation decisions.",
"author": "Lingxiao Du",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"statistical aggregation",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/standard/employees/employee_performance_analysis/verify.py
================================================
"""
Verification script for PostgreSQL Task 1: Employee Performance Analysis
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_performance_results(conn) -> bool:
"""Verify the employee performance analysis results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT employee_id, performance_category, salary_growth_rate,
days_of_service, promotion_count
FROM employees.employee_performance_analysis
ORDER BY employee_id
""")
actual_results = cur.fetchall()
# Execute ground truth query - use first salary record as starting salary
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount AS current_amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
first_salary AS (
SELECT employee_id, amount AS first_amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (PARTITION BY s.employee_id
ORDER BY s.from_date ASC, s.amount ASC) AS rn
FROM employees.salary s
) x
WHERE rn = 1
),
title_counts AS (
SELECT t.employee_id, COUNT(DISTINCT t.title) AS promotion_count
FROM employees.title t
GROUP BY t.employee_id
),
base AS (
SELECT e.id AS employee_id,
e.hire_date,
cs.current_amount,
fs.first_amount,
COALESCE(tc.promotion_count, 0) AS promotion_count
FROM employees.employee e
JOIN current_salary cs ON cs.employee_id = e.id
JOIN first_salary fs ON fs.employee_id = e.id
LEFT JOIN title_counts tc ON tc.employee_id = e.id
),
scored AS (
SELECT
employee_id,
((current_amount - first_amount) / NULLIF(first_amount, 0)::NUMERIC) * 100 AS salary_growth_rate,
(CURRENT_DATE - hire_date)::INTEGER AS days_of_service,
promotion_count
FROM base
)
SELECT
s.employee_id,
CASE
WHEN s.salary_growth_rate > 40 AND s.promotion_count > 1 THEN 'high_achiever'
WHEN s.salary_growth_rate < 15 AND s.days_of_service > 3650 THEN 'needs_attention'
ELSE 'steady_performer'
END AS performance_category,
s.salary_growth_rate,
s.days_of_service,
s.promotion_count AS promotion_count
FROM scored s
ORDER BY s.employee_id;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} performance results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Employee performance results are correct ({len(actual_results)} records)")
return True
def verify_department_results(conn) -> bool:
"""Verify the department salary analysis results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT department_name, avg_current_salary, employee_count, salary_range_spread
FROM employees.department_salary_analysis
ORDER BY department_name
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
current_dept AS (
SELECT DISTINCT de.employee_id, de.department_id
FROM employees.department_employee de
WHERE de.to_date = DATE '9999-01-01'
)
SELECT
d.dept_name AS department_name,
AVG(cs.amount)::DECIMAL AS avg_current_salary,
COUNT(DISTINCT cd.employee_id) AS employee_count,
(MAX(cs.amount) - MIN(cs.amount)) AS salary_range_spread
FROM employees.department d
JOIN current_dept cd ON cd.department_id = d.id
JOIN current_salary cs ON cs.employee_id = cd.employee_id
GROUP BY d.id, d.dept_name
ORDER BY d.dept_name;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} department results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Department salary results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify results
success = verify_performance_results(conn) and verify_department_results(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/employees/employee_project_tracking/description.md
================================================
Create and manage a comprehensive employee project tracking system using database schema design and data manipulation operations. The IT team needs you to build the database structure from scratch and populate it with specific initial data to support project management workflows.
## Your Tasks:
1. **Create the project tracking tables** — build three new tables in the `employees` schema:
**Table 1: `employee_projects`**
* `project_id` (integer, primary key, auto-increment)
* `project_name` (varchar(100), not null)
* `start_date` (date, not null)
* `end_date` (date)
* `budget` (decimal(10,2))
* `status` (varchar(20), default 'active')
**Table 2: `project_assignments`**
* `assignment_id` (integer, primary key, auto-increment)
* `employee_id` (bigint, not null)
* `project_id` (integer, not null)
* `role` (varchar(50), not null)
* `allocation_percentage` (integer, check constraint: between 1 and 100)
* `assigned_date` (date, not null)
**Table 3: `project_milestones`**
* `milestone_id` (integer, primary key, auto-increment)
* `project_id` (integer, not null)
* `milestone_name` (varchar(100), not null)
* `due_date` (date, not null)
* `completed` (boolean, default false)
2. **Add foreign key relationships**:
* `project_assignments.employee_id` → `employees.employee.id`
* `project_assignments.project_id` → `employees.employee_projects.project_id`
* `project_milestones.project_id` → `employees.employee_projects.project_id`
3. **Create performance indexes**:
* Index named `idx_projects_status` on `employee_projects.status`
* Composite index named `idx_assignments_emp_proj` on `project_assignments(employee_id, project_id)`
* Index named `idx_milestones_due_date` on `project_milestones.due_date`
4. **Insert exactly this initial data**:
**Into `employee_projects`:**
* Project 1: name='Database Modernization', start_date='2024-01-15', end_date='2024-06-30', budget=250000.00, status='active'
* Project 2: name='Employee Portal Upgrade', start_date='2024-02-01', end_date='2024-05-15', budget=180000.00, status='active'
* Project 3: name='HR Analytics Dashboard', start_date='2023-11-01', end_date='2024-01-31', budget=120000.00, status='active'
**Into `project_assignments` (assign ALL current employees):**
* All employees from Development department → Project 1 ('Database Modernization'), role='Developer', allocation=80%
* All employees from Human Resources department → Project 2 ('Employee Portal Upgrade'), role='Business Analyst', allocation=60%
* All employees from Marketing department → Project 3 ('HR Analytics Dashboard'), role='Marketing Specialist', allocation=40%
* All employees from Finance department → Project 1 ('Database Modernization'), role='Financial Analyst', allocation=30%
* All employees from Sales department → Project 2 ('Employee Portal Upgrade'), role='Sales Representative', allocation=50%
* All employees from Research department → Project 3 ('HR Analytics Dashboard'), role='Research Analyst', allocation=70%
* All employees from Production department → Project 1 ('Database Modernization'), role='Production Coordinator', allocation=45%
* All employees from Quality Management department → Project 2 ('Employee Portal Upgrade'), role='QA Specialist', allocation=85%
* All employees from Customer Service department → Project 3 ('HR Analytics Dashboard'), role='Customer Success', allocation=35%
* All employees should have assigned_date='2024-01-01'
**Into `project_milestones`:**
* Project 1: 'Design Phase Complete' due '2024-03-01', 'Implementation Complete' due '2024-05-15'
* Project 2: 'UI/UX Approval' due '2024-03-15', 'Beta Testing' due '2024-04-30'
* Project 3: 'Data Collection' due '2023-12-15', 'Dashboard Launch' due '2024-01-25'
5. **Perform these exact data updates**:
* Update Project 3 ('HR Analytics Dashboard') status to 'completed'
* Increase budget by 15% for all projects with status 'active'
* Mark the milestone 'Data Collection' as completed (set completed = true)
6. **Add new column to `employee_projects`**:
* Add `priority` column (varchar(10)) with check constraint allowing only 'low', 'medium', 'high'
* Update all existing projects: set priority='high' for 'Database Modernization', priority='medium' for others
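For reference, a sketch of the DDL for the first two tables and two of the indexes (the milestones table, the remaining index, and the inserts and updates follow the same pattern, using exactly the names and values listed above):
```sql
CREATE TABLE employees.employee_projects (
    project_id   SERIAL PRIMARY KEY,
    project_name VARCHAR(100) NOT NULL,
    start_date   DATE NOT NULL,
    end_date     DATE,
    budget       DECIMAL(10,2),
    status       VARCHAR(20) DEFAULT 'active'
);

CREATE TABLE employees.project_assignments (
    assignment_id         SERIAL PRIMARY KEY,
    employee_id           BIGINT NOT NULL REFERENCES employees.employee (id),
    project_id            INTEGER NOT NULL REFERENCES employees.employee_projects (project_id),
    "role"                VARCHAR(50) NOT NULL,
    allocation_percentage INTEGER CHECK (allocation_percentage BETWEEN 1 AND 100),
    assigned_date         DATE NOT NULL
);

CREATE INDEX idx_projects_status      ON employees.employee_projects (status);
CREATE INDEX idx_assignments_emp_proj ON employees.project_assignments (employee_id, project_id);
```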
================================================
FILE: tasks/postgres/standard/employees/employee_project_tracking/meta.json
================================================
{
"task_id": "employee_project_tracking",
"task_name": "Employee Project Tracking",
"category_id": "employees",
"category_name": "Employees",
"description": "Build project tracking system from scratch with tables for projects, assignments, milestones, and performance indexes.",
"author": "Lingxiao Du",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"schema design",
"data migration",
"data integrity enforcement"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/standard/employees/employee_project_tracking/verify.py
================================================
"""
Verification script for PostgreSQL Task 5: Database Schema and Data Operations
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For date types: convert to string for comparison
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, (Decimal, float, int)):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif hasattr(actual, 'strftime'): # datetime.date or datetime.datetime
if str(actual) != str(expected):
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_table_structures(conn) -> bool:
"""Verify that all three tables were created with correct structure."""
with conn.cursor() as cur:
# Check if tables exist
cur.execute("""
SELECT table_name FROM information_schema.tables
WHERE table_schema = 'employees'
AND table_name IN ('employee_projects', 'project_assignments', 'project_milestones')
ORDER BY table_name
""")
tables = [row[0] for row in cur.fetchall()]
if len(tables) != 3:
print(f"❌ Expected 3 tables, found {len(tables)}: {tables}")
return False
# Check foreign key constraints exist
cur.execute("""
SELECT COUNT(*) FROM information_schema.table_constraints
WHERE table_schema = 'employees'
AND constraint_type = 'FOREIGN KEY'
AND table_name IN ('project_assignments', 'project_milestones')
""")
fkey_count = cur.fetchone()[0]
if fkey_count != 3:
print(f"❌ Expected 3 foreign key constraints, found {fkey_count}")
return False
# Check if priority column exists (added in step 6)
cur.execute("""
SELECT COUNT(*) FROM information_schema.columns
WHERE table_schema = 'employees' AND table_name = 'employee_projects'
AND column_name = 'priority'
""")
priority_exists = cur.fetchone()[0]
if priority_exists == 0:
print("❌ Priority column was not added to employee_projects table")
return False
print("✅ Table structures are correct")
return True
def verify_indexes(conn) -> bool:
"""Verify that required indexes were created."""
with conn.cursor() as cur:
# Check for specific indexes
cur.execute("""
SELECT COUNT(*)
FROM pg_indexes
WHERE schemaname = 'employees'
AND indexname IN ('idx_projects_status', 'idx_assignments_emp_proj', 'idx_milestones_due_date')
""")
index_count = cur.fetchone()[0]
if index_count != 3:
print(f"❌ Expected 3 required indexes, got {index_count}")
return False
print("✅ All required indexes are present")
return True
def verify_project_data(conn) -> bool:
"""Verify that project data was inserted and updated correctly."""
with conn.cursor() as cur:
# Check project data after updates
cur.execute("""
SELECT project_name, start_date, end_date, budget, status, priority
FROM employees.employee_projects
ORDER BY project_name
""")
projects = cur.fetchall()
if len(projects) != 3:
print(f"❌ Expected 3 projects, found {len(projects)}")
return False
# Expected final state after all updates
expected = {
'Database Modernization': ('2024-01-15', '2024-06-30', 287500.00, 'active', 'high'),
'Employee Portal Upgrade': ('2024-02-01', '2024-05-15', 207000.00, 'active', 'medium'),
'HR Analytics Dashboard': ('2023-11-01', '2024-01-31', 120000.00, 'completed', 'medium')
}
for project in projects:
name = project[0]
if name not in expected:
print(f"❌ Unexpected project: {name}")
return False
exp = expected[name]
# Use rows_match for comparison
expected_row = (name,) + exp
if not rows_match(project, expected_row):
print(f"❌ Project {name} data mismatch: expected {expected_row}, got {project}")
return False
print("✅ Project data is correct")
return True
def verify_assignment_data(conn) -> bool:
"""Verify that all current employees were assigned to projects by department."""
with conn.cursor() as cur:
# Check total assignment count matches current employee count
cur.execute("""
SELECT COUNT(*) FROM employees.project_assignments
""")
assignment_count = cur.fetchone()[0]
cur.execute("""
SELECT COUNT(DISTINCT de.employee_id)
FROM employees.department_employee de
WHERE de.to_date = '9999-01-01'
""")
current_employee_count = cur.fetchone()[0]
if assignment_count != current_employee_count:
print(f"❌ Expected {current_employee_count} assignments, found {assignment_count}")
return False
# Check department-project mapping
cur.execute("""
SELECT d.dept_name, pa.project_id, pa.role, pa.allocation_percentage, COUNT(*)
FROM employees.project_assignments pa
JOIN employees.department_employee de ON pa.employee_id = de.employee_id AND de.to_date = '9999-01-01'
JOIN employees.department d ON de.department_id = d.id
JOIN employees.employee_projects ep ON pa.project_id = ep.project_id
GROUP BY d.dept_name, pa.project_id, pa.role, pa.allocation_percentage
ORDER BY d.dept_name
""")
dept_assignments = cur.fetchall()
# Expected department-project mappings
expected_mappings = {
'Development': (1, 'Developer', 80),
'Human Resources': (2, 'Business Analyst', 60),
'Marketing': (3, 'Marketing Specialist', 40),
'Finance': (1, 'Financial Analyst', 30),
'Sales': (2, 'Sales Representative', 50),
'Research': (3, 'Research Analyst', 70),
'Production': (1, 'Production Coordinator', 45),
'Quality Management': (2, 'QA Specialist', 85),
'Customer Service': (3, 'Customer Success', 35)
}
dept_found = {}
for assignment in dept_assignments:
dept_name, project_id, role, allocation, _ = assignment # Ignore count
if dept_name in dept_found:
print(f"❌ Department {dept_name} has multiple assignments")
return False
dept_found[dept_name] = (project_id, role, allocation)
for dept, expected in expected_mappings.items():
if dept not in dept_found:
print(f"❌ Department {dept} has no assignments")
return False
if dept_found[dept] != expected:
print(f"❌ Department {dept} assignment mismatch: expected {expected}, got {dept_found[dept]}")
return False
# Check that all assignments have correct assigned_date
cur.execute("""
SELECT COUNT(*) FROM employees.project_assignments
WHERE assigned_date != '2024-01-01'
""")
wrong_date_count = cur.fetchone()[0]
if wrong_date_count > 0:
print(f"❌ {wrong_date_count} assignments have incorrect assigned_date")
return False
print("✅ Assignment data is correct")
return True
def verify_milestone_data(conn) -> bool:
"""Verify that milestone data was inserted and updated correctly."""
with conn.cursor() as cur:
cur.execute("""
SELECT project_id, milestone_name, due_date, completed
FROM employees.project_milestones
ORDER BY project_id, milestone_name
""")
milestones = cur.fetchall()
if len(milestones) != 6:
print(f"❌ Expected 6 milestones, found {len(milestones)}")
return False
# Expected milestones
expected_milestones = {
(1, 'Design Phase Complete'): ('2024-03-01', False),
(1, 'Implementation Complete'): ('2024-05-15', False),
(2, 'UI/UX Approval'): ('2024-03-15', False),
(2, 'Beta Testing'): ('2024-04-30', False),
(3, 'Data Collection'): ('2023-12-15', True), # Should be completed
(3, 'Dashboard Launch'): ('2024-01-25', False)
}
for milestone in milestones:
project_id, name, due_date, completed = milestone
key = (project_id, name)
if key not in expected_milestones:
print(f"❌ Unexpected milestone: {key}")
return False
expected_due, expected_completed = expected_milestones[key]
if str(due_date) != expected_due or completed != expected_completed:
print(f"❌ Milestone {name} mismatch: expected ({expected_due}, {expected_completed}), got ({due_date}, {completed})")
return False
print("✅ Milestone data is correct")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all components
success = (
verify_table_structures(conn) and
verify_indexes(conn) and
verify_project_data(conn) and
verify_assignment_data(conn) and
verify_milestone_data(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/employees/employee_retention_analysis/description.md
================================================
Analyze employee retention patterns and identify factors contributing to turnover across the organization. The HR leadership team needs comprehensive insights to develop targeted retention strategies and reduce costly employee attrition.
## Your Tasks:
1. **Create the retention analysis table** — build a table called `employee_retention_analysis` in the `employees` schema with these exact columns:
* `department_name` (varchar) — the department name
* `total_employees_ever` (integer) — total number of employees who have ever worked in this department
* `current_employees` (integer) — number of current employees in the department
* `former_employees` (integer) — number of employees who left the department
* `retention_rate` (decimal) — percentage of employees still with the company (current/total * 100)
2. **Create the high-risk employee identification table** — build a table called `high_risk_employees` in the `employees` schema with:
* `employee_id` (bigint) — the employee's ID
* `full_name` (varchar) — concatenated first and last name
* `current_department` (varchar) — current department name
* `tenure_days` (integer) — days with the company
* `current_salary` (integer) — current salary amount
* `risk_category` (varchar) — risk level ('high_risk', 'medium_risk', 'low_risk')
**Note**: Analyze only current employees (those with active salary records where to_date = '9999-01-01').
3. **Create the turnover trend analysis table** — build a table called `turnover_trend_analysis` in the `employees` schema with:
* `departure_year` (integer) — year when employees left (extract from to_date of salary records)
* `departures_count` (integer) — number of employees who left that year
* `avg_tenure_days` (decimal) — average tenure in days for employees who left that year
* `avg_final_salary` (decimal) — average final salary of departed employees that year
4. **Apply risk assessment criteria** for current employees:
* **High risk**: Employees in departments with retention rate < 80% AND tenure < 1095 days (3 years)
* **Medium risk**: Employees in departments with retention rate < 85% AND tenure < 1825 days (5 years)
* **Low risk**: All other current employees
5. **Analyze departure trends** — examine employees who left between 1985 and 2002, grouping by departure year.
6. **Handle final salary selection** — when calculating `avg_final_salary`, if an employee has multiple salary records with the same departure date, select the record with the latest start date. If there are still ties, select the record with the highest salary amount.
7. **Focus appropriately** — use current employees for risk analysis, all historical data for retention rates, and former employees for trend analysis.
The comprehensive analysis will help identify retention patterns, at-risk employees, and historical turnover trends to guide strategic workforce planning.
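
Step 6's tie-breaking rule is the subtlest part of the trend analysis. A minimal sketch, assuming only the `employees.salary` columns shown in the schema, is to rank each employee's non-current salary rows and keep the top-ranked one; the full query would additionally exclude employees who still have an active salary record:

```sql
-- Hedged sketch of the step 6 tie-break: latest departure date first,
-- then latest start date, then highest amount.
WITH last_salary AS (
    SELECT
        s.employee_id,
        s.to_date AS departure_date,
        s.amount  AS final_salary,
        ROW_NUMBER() OVER (
            PARTITION BY s.employee_id
            ORDER BY s.to_date DESC, s.from_date DESC, s.amount DESC
        ) AS rn
    FROM employees.salary s
    WHERE s.to_date <> DATE '9999-01-01'
)
SELECT employee_id, departure_date, final_salary
FROM last_salary
WHERE rn = 1;
```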
================================================
FILE: tasks/postgres/standard/employees/employee_retention_analysis/meta.json
================================================
{
"task_id": "employee_retention_analysis",
"task_name": "Employee Retention Analysis",
"category_id": "employees",
"category_name": "Employees",
"description": "Analyze retention patterns identifying turnover factors and high-risk employees to develop targeted retention strategies.",
"author": "Lingxiao Du",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"statistical aggregation",
"audit and compliance"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/standard/employees/employee_retention_analysis/verify.py
================================================
"""
Verification script for PostgreSQL Task 2: Employee Retention Analysis
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_retention_analysis_results(conn) -> bool:
"""Verify the employee retention analysis results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT department_name, total_employees_ever, current_employees,
former_employees, retention_rate
FROM employees.employee_retention_analysis
ORDER BY department_name
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
SELECT
d.dept_name AS department_name,
COUNT(DISTINCT de.employee_id) AS total_employees_ever,
COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01') AS current_employees,
(COUNT(DISTINCT de.employee_id)
- COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01')) AS former_employees,
(COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01'))::DECIMAL
/ NULLIF(COUNT(DISTINCT de.employee_id), 0) * 100 AS retention_rate
FROM employees.department d
LEFT JOIN employees.department_employee de
ON d.id = de.department_id
GROUP BY d.id, d.dept_name
ORDER BY d.dept_name
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} retention analysis results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Employee retention analysis results are correct ({len(actual_results)} records)")
return True
def verify_high_risk_results(conn) -> bool:
"""Verify the high risk employee analysis results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT employee_id, full_name, current_department, tenure_days,
current_salary, risk_category
FROM employees.high_risk_employees
ORDER BY employee_id
""")
actual_results = cur.fetchall()
# Execute ground truth query - only current employees
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount AS current_amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
current_dept AS (
SELECT employee_id, department_id
FROM (
SELECT de.*,
ROW_NUMBER() OVER (PARTITION BY de.employee_id
ORDER BY de.from_date DESC, de.department_id) AS rn
FROM employees.department_employee de
WHERE de.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
dept_retention AS (
SELECT
d.id AS department_id,
d.dept_name,
COUNT(DISTINCT de.employee_id) AS total_employees_ever,
COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01') AS current_employees,
(COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01'))::NUMERIC
/ NULLIF(COUNT(DISTINCT de.employee_id), 0) * 100 AS retention_rate
FROM employees.department d
LEFT JOIN employees.department_employee de
ON de.department_id = d.id
GROUP BY d.id, d.dept_name
)
SELECT
e.id AS employee_id,
CONCAT(e.first_name, ' ', e.last_name) AS full_name,
d.dept_name AS current_department,
(CURRENT_DATE - e.hire_date)::INTEGER AS tenure_days,
cs.current_amount::INTEGER AS current_salary,
CASE
WHEN dr.retention_rate < 80 AND (CURRENT_DATE - e.hire_date) < 1095 THEN 'high_risk'
WHEN dr.retention_rate < 85 AND (CURRENT_DATE - e.hire_date) < 1825 THEN 'medium_risk'
ELSE 'low_risk'
END AS risk_category
FROM employees.employee e
JOIN current_salary cs ON cs.employee_id = e.id
JOIN current_dept cd ON cd.employee_id = e.id
JOIN employees.department d ON d.id = cd.department_id
JOIN dept_retention dr ON dr.department_id = d.id
ORDER BY e.id;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} high risk analysis results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ High risk employee analysis results are correct ({len(actual_results)} records)")
return True
def verify_turnover_trend_results(conn) -> bool:
"""Verify the turnover trend analysis results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT departure_year, departures_count, avg_tenure_days, avg_final_salary
FROM employees.turnover_trend_analysis
ORDER BY departure_year
""")
actual_results = cur.fetchall()
# Execute ground truth query - simplified version
cur.execute("""
WITH last_non_current_salary AS (
SELECT
s.employee_id,
s.to_date AS departure_date,
s.amount AS final_salary,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.to_date DESC, s.from_date DESC, s.amount DESC
) AS rn
FROM employees.salary s
WHERE s.to_date <> DATE '9999-01-01'
AND NOT EXISTS (
SELECT 1
FROM employees.salary s_cur
WHERE s_cur.employee_id = s.employee_id
AND s_cur.to_date = DATE '9999-01-01'
)
),
departed AS (
SELECT employee_id, departure_date, final_salary
FROM last_non_current_salary
WHERE rn = 1
),
with_tenure AS (
SELECT
e.id AS employee_id,
d.departure_date,
d.final_salary,
(d.departure_date - e.hire_date)::INTEGER AS tenure_days
FROM employees.employee e
JOIN departed d ON d.employee_id = e.id
)
SELECT
EXTRACT(YEAR FROM departure_date)::INTEGER AS departure_year,
COUNT(*)::INTEGER AS departures_count,
AVG(tenure_days) AS avg_tenure_days,
AVG(final_salary) AS avg_final_salary
FROM with_tenure
WHERE departure_date BETWEEN DATE '1985-01-01' AND DATE '2002-12-31'
GROUP BY EXTRACT(YEAR FROM departure_date)
ORDER BY departure_year;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} turnover trend results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Turnover trend analysis results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all three analysis results
success = (
verify_retention_analysis_results(conn) and
verify_high_risk_results(conn) and
verify_turnover_trend_results(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/employees/executive_dashboard_automation/description.md
================================================
Design a comprehensive reporting and automation system for executive dashboard and real-time monitoring. The executive team needs automated reports, data views, and trigger-based notifications to track key business metrics without manual intervention.
## Your Tasks:
1. **Create executive summary views** — build three materialized views in the `employees` schema:
**View 1: `exec_department_summary`**
* `department_name` (varchar) — department name
* `total_employees` (integer) — current active employee count
* `avg_salary` (decimal) — average current salary
* `total_payroll` (bigint) — total monthly payroll cost
* `manager_name` (varchar) — current department manager name
**View 2: `exec_hiring_trends`**
* `hire_year` (integer) — year employees were hired
* `employees_hired` (integer) — number hired that year
* `avg_starting_salary` (decimal) — average first salary of hires that year
* `retention_rate` (decimal) — percentage still employed
* `top_hiring_department` (varchar) — department that hired the most that year
**View 3: `exec_salary_distribution`**
* `salary_band` (varchar) — salary ranges ('30K-50K', '50K-70K', '70K-90K', '90K-110K', '110K+')
* `employee_count` (integer) — employees in this salary band
* `percentage_of_workforce` (decimal) — percentage of total workforce
* `most_common_title` (varchar) — most frequent job title in this band
2. **Create stored procedure for report generation**:
**Procedure: `generate_monthly_report(report_date DATE)`**
* Create a table `monthly_reports` with columns: report_id (auto-increment), report_date, department_count, total_employees (current active employees only), avg_salary, generated_at
* Insert one summary record using the report_date as identifier and current database statistics (not historical data for that date)
* Return the generated report_id
3. **Create notification triggers**:
**Trigger: `high_salary_alert`**
* Fires when a new salary record is inserted with amount > 120000
* Inserts alert into `salary_alerts` table with: employee_id, salary_amount, alert_date, status='new'
4. **Insert test data to verify triggers**:
* Update employee 10001's current salary: first set their current salary record to_date='2024-01-31', then insert new salary record with amount 125000, from_date='2024-02-01', to_date='9999-01-01'
* Refresh all materialized views after inserting new data to ensure they reflect the updated information
5. **Execute the stored procedure**:
* Call `generate_monthly_report('2024-01-01')` to create January report
* Query the generated report to verify execution
6. **Create performance indexes**:
* Index on `salary_alerts.status` for alert processing
* Composite index on `monthly_reports(report_date, department_count)` for trend analysis
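
The two pieces of automation in steps 2 and 3 can be sketched as follows. This is only an illustration: `generate_monthly_report` is written as a function because it must return the generated `report_id`, the parameter is renamed `p_report_date` purely to avoid ambiguity with the column of the same name, and the trigger's helper function name `notify_high_salary` is an arbitrary choice.

```sql
-- Hedged sketch of steps 2 and 3; names noted above as arbitrary are not
-- prescribed by the task.
CREATE TABLE IF NOT EXISTS employees.monthly_reports (
    report_id        bigserial PRIMARY KEY,
    report_date      date,
    department_count integer,
    total_employees  integer,
    avg_salary       numeric,
    generated_at     timestamptz DEFAULT now()
);

CREATE OR REPLACE FUNCTION employees.generate_monthly_report(p_report_date date)
RETURNS bigint AS $$
DECLARE
    v_report_id bigint;
BEGIN
    -- One simplified reading of "current database statistics":
    -- count current departments/employees and average current salaries.
    INSERT INTO employees.monthly_reports
        (report_date, department_count, total_employees, avg_salary)
    SELECT
        p_report_date,
        COUNT(DISTINCT de.department_id),
        COUNT(DISTINCT de.employee_id),
        AVG(s.amount)
    FROM employees.department_employee de
    JOIN employees.salary s
      ON s.employee_id = de.employee_id
     AND s.to_date = DATE '9999-01-01'
    WHERE de.to_date = DATE '9999-01-01'
    RETURNING report_id INTO v_report_id;

    RETURN v_report_id;
END;
$$ LANGUAGE plpgsql;

CREATE TABLE IF NOT EXISTS employees.salary_alerts (
    employee_id   bigint,
    salary_amount bigint,
    alert_date    date,
    status        varchar(20)
);

CREATE OR REPLACE FUNCTION employees.notify_high_salary()
RETURNS trigger AS $$
BEGIN
    IF NEW.amount > 120000 THEN
        INSERT INTO employees.salary_alerts (employee_id, salary_amount, alert_date, status)
        VALUES (NEW.employee_id, NEW.amount, CURRENT_DATE, 'new');
    END IF;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER high_salary_alert
    AFTER INSERT ON employees.salary
    FOR EACH ROW
    EXECUTE FUNCTION employees.notify_high_salary();
```

Whether the report table is created inside the routine or ahead of time is left open by the task; the sketch creates it up front so the function body stays a single insert.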
================================================
FILE: tasks/postgres/standard/employees/executive_dashboard_automation/meta.json
================================================
{
"task_id": "executive_dashboard_automation",
"task_name": "Executive Dashboard Automation",
"category_id": "employees",
"category_name": "Employees",
"description": "Design automated reporting system with materialized views, stored procedures, and triggers for executive dashboard monitoring.",
"author": "Lingxiao Du",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"stored procedures and functions",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/standard/employees/executive_dashboard_automation/verify.py
================================================
"""
Verification script for PostgreSQL Task 6: Reporting and Automation System
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For date types: convert to string for comparison
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, (Decimal, float, int)):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif hasattr(actual, 'strftime'): # datetime.date or datetime.datetime
if str(actual) != str(expected):
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_materialized_views(conn) -> bool:
"""Verify that materialized views were created and populated correctly."""
with conn.cursor() as cur:
# Check if materialized views exist
cur.execute("""
SELECT matviewname FROM pg_matviews
WHERE schemaname = 'employees'
AND matviewname IN ('exec_department_summary', 'exec_hiring_trends', 'exec_salary_distribution')
ORDER BY matviewname
""")
views = [row[0] for row in cur.fetchall()]
expected_views = ['exec_department_summary', 'exec_hiring_trends', 'exec_salary_distribution']
if set(views) != set(expected_views):
print(f"❌ Expected views {expected_views}, found {views}")
return False
# Check all departments' data accuracy
cur.execute("""
SELECT department_name, total_employees, avg_salary, total_payroll, manager_name
FROM employees.exec_department_summary
ORDER BY department_name
""")
view_data = cur.fetchall()
# Get actual data for all departments
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC
) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
current_dept AS (
SELECT DISTINCT de.employee_id, de.department_id
FROM employees.department_employee de
WHERE de.to_date = DATE '9999-01-01'
),
current_manager AS (
SELECT department_id,
CONCAT(e.first_name, ' ', e.last_name) AS manager_name
FROM (
SELECT dm.*,
ROW_NUMBER() OVER (
PARTITION BY dm.department_id
ORDER BY dm.from_date DESC, dm.employee_id
) AS rn
FROM employees.department_manager dm
WHERE dm.to_date = DATE '9999-01-01'
) dm
JOIN employees.employee e ON e.id = dm.employee_id
WHERE dm.rn = 1
)
SELECT
d.dept_name AS department_name,
COUNT(cd.employee_id)::INT AS total_employees,
AVG(cs.amount)::DECIMAL AS avg_salary,
COALESCE(SUM(cs.amount), 0)::BIGINT AS total_payroll,
cm.manager_name
FROM employees.department d
LEFT JOIN current_dept cd ON cd.department_id = d.id
LEFT JOIN current_salary cs ON cs.employee_id = cd.employee_id
LEFT JOIN current_manager cm ON cm.department_id = d.id
GROUP BY d.id, d.dept_name, cm.manager_name
ORDER BY d.dept_name;
""")
actual_data = cur.fetchall()
if len(view_data) != len(actual_data):
print(f"❌ Department count mismatch: view={len(view_data)}, actual={len(actual_data)}")
return False
for view_row, actual_row in zip(view_data, actual_data):
if not rows_match(view_row, actual_row):
print(f"❌ Department summary data incorrect for {view_row[0]}: view={view_row}, actual={actual_row}")
return False
# Check all hiring trends data accuracy
cur.execute("""
SELECT hire_year, employees_hired, avg_starting_salary, retention_rate, top_hiring_department
FROM employees.exec_hiring_trends
ORDER BY hire_year
""")
hiring_view_data = cur.fetchall()
# Get actual data for all years
cur.execute("""
WITH first_salary AS (
SELECT employee_id, amount AS starting_salary
FROM (
SELECT s.*,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.from_date ASC, s.amount ASC
) AS rn
FROM employees.salary s
) x
WHERE rn = 1
),
current_emp AS (
SELECT DISTINCT s.employee_id
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
),
first_dept AS (
SELECT employee_id, department_id
FROM (
SELECT de.*,
ROW_NUMBER() OVER (
PARTITION BY de.employee_id
ORDER BY de.from_date ASC, de.department_id
) AS rn
FROM employees.department_employee de
) x
WHERE rn = 1
),
hire_base AS (
SELECT e.id AS employee_id,
EXTRACT(YEAR FROM e.hire_date)::INT AS hire_year
FROM employees.employee e
WHERE e.hire_date IS NOT NULL
),
hire_by_dept_year AS (
SELECT hb.hire_year,
d.dept_name,
COUNT(*) AS dept_hires
FROM hire_base hb
LEFT JOIN first_dept fd ON fd.employee_id = hb.employee_id
LEFT JOIN employees.department d ON d.id = fd.department_id
GROUP BY hb.hire_year, d.dept_name
),
top_dept_per_year AS (
SELECT hire_year,
dept_name AS top_hiring_department
FROM (
SELECT hire_year, dept_name, dept_hires,
ROW_NUMBER() OVER (
PARTITION BY hire_year
ORDER BY dept_hires DESC NULLS LAST, dept_name
) AS rn
FROM hire_by_dept_year
) t
WHERE rn = 1
)
SELECT
hb.hire_year,
COUNT(*)::INT AS employees_hired,
AVG(fs.starting_salary)::DECIMAL AS avg_starting_salary,
(COUNT(ce.employee_id)::DECIMAL / NULLIF(COUNT(*), 0) * 100) AS retention_rate,
td.top_hiring_department
FROM hire_base hb
LEFT JOIN first_salary fs ON fs.employee_id = hb.employee_id
LEFT JOIN current_emp ce ON ce.employee_id = hb.employee_id
LEFT JOIN top_dept_per_year td ON td.hire_year = hb.hire_year
GROUP BY hb.hire_year, td.top_hiring_department
ORDER BY hb.hire_year;
""")
actual_hiring_data = cur.fetchall()
if len(hiring_view_data) != len(actual_hiring_data):
print(f"❌ Hiring trends count mismatch: view={len(hiring_view_data)}, actual={len(actual_hiring_data)}")
return False
for hiring_view, actual_hiring in zip(hiring_view_data, actual_hiring_data):
# Now compare all 5 fields including top_hiring_department
if not rows_match(hiring_view, actual_hiring):
print(f"❌ Hiring trends data incorrect for year {hiring_view[0]}: view={hiring_view}, actual={actual_hiring}")
return False
# Check all salary bands' data accuracy
cur.execute("""
WITH band_order AS (
SELECT '30K-50K' AS band, 1 AS ord UNION ALL
SELECT '50K-70K', 2 UNION ALL
SELECT '70K-90K', 3 UNION ALL
SELECT '90K-110K',4 UNION ALL
SELECT '110K+', 5
)
SELECT salary_band, employee_count, percentage_of_workforce, most_common_title
FROM employees.exec_salary_distribution v
JOIN band_order bo ON bo.band = v.salary_band
ORDER BY bo.ord;
""")
view_bands = cur.fetchall()
# Calculate actual data for all bands
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC
) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
current_title AS (
SELECT employee_id, title
FROM (
SELECT t.*,
ROW_NUMBER() OVER (
PARTITION BY t.employee_id
ORDER BY t.from_date DESC, t.title
) AS rn
FROM employees.title t
WHERE t.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
base AS (
SELECT cs.employee_id, cs.amount, COALESCE(ct.title, 'Unknown') AS title
FROM current_salary cs
LEFT JOIN current_title ct ON ct.employee_id = cs.employee_id
),
banded AS (
SELECT
CASE
WHEN amount < 50000 THEN '30K-50K'
WHEN amount < 70000 THEN '50K-70K'
WHEN amount < 90000 THEN '70K-90K'
WHEN amount < 110000 THEN '90K-110K'
ELSE '110K+'
END AS salary_band,
title,
employee_id
FROM base
),
band_counts AS (
SELECT salary_band, COUNT(DISTINCT employee_id) AS employee_count
FROM banded
GROUP BY salary_band
),
title_counts AS (
SELECT salary_band, title, COUNT(DISTINCT employee_id) AS title_count
FROM banded
GROUP BY salary_band, title
),
top_titles AS (
SELECT salary_band, title AS most_common_title
FROM (
SELECT salary_band, title, title_count,
ROW_NUMBER() OVER (
PARTITION BY salary_band
ORDER BY title_count DESC, title
) AS rn
FROM title_counts
) t
WHERE rn = 1
),
workforce AS (
SELECT COUNT(DISTINCT employee_id) AS total_current
FROM base
),
band_order AS (
SELECT '30K-50K' AS band, 1 AS ord UNION ALL
SELECT '50K-70K', 2 UNION ALL
SELECT '70K-90K', 3 UNION ALL
SELECT '90K-110K', 4 UNION ALL
SELECT '110K+', 5
)
SELECT
bc.salary_band,
bc.employee_count::INT AS employee_count,
(bc.employee_count::DECIMAL / NULLIF((SELECT total_current FROM workforce), 0) * 100) AS percentage_of_workforce,
tt.most_common_title
FROM band_counts bc
LEFT JOIN top_titles tt ON tt.salary_band = bc.salary_band
LEFT JOIN band_order bo ON bo.band = bc.salary_band
ORDER BY bo.ord;
""")
actual_bands = cur.fetchall()
# Compare view data with actual data
if len(view_bands) != len(actual_bands):
print(f"❌ Salary band count mismatch: view={len(view_bands)}, actual={len(actual_bands)}")
return False
for view_band, actual_band in zip(view_bands, actual_bands):
if not rows_match(view_band, actual_band):
print(f"❌ Salary band {actual_band[0]} data incorrect: view={view_band}, actual={actual_band}")
return False
print("✅ All materialized views are created and contain correct data")
return True
def verify_stored_procedures(conn) -> bool:
"""Verify that stored procedure was created."""
with conn.cursor() as cur:
# Check if procedure exists
cur.execute("""
SELECT routine_name FROM information_schema.routines
WHERE routine_schema = 'employees'
AND routine_type = 'FUNCTION'
AND routine_name = 'generate_monthly_report'
""")
procedures = [row[0] for row in cur.fetchall()]
if 'generate_monthly_report' not in procedures:
print("❌ generate_monthly_report procedure not found")
return False
# Check if monthly_reports table exists with correct structure
cur.execute("""
SELECT COUNT(*) FROM information_schema.columns
WHERE table_schema = 'employees' AND table_name = 'monthly_reports'
AND column_name IN ('report_id', 'report_date', 'department_count', 'total_employees', 'avg_salary', 'generated_at')
""")
report_columns = cur.fetchone()[0]
if report_columns != 6:
print("❌ monthly_reports table missing required columns")
return False
print("✅ Stored procedure and supporting table are created")
return True
def verify_triggers(conn) -> bool:
"""Verify that triggers were created and fired correctly."""
with conn.cursor() as cur:
# Check if triggers exist
cur.execute("""
SELECT trigger_name FROM information_schema.triggers
WHERE trigger_schema = 'employees'
AND trigger_name = 'high_salary_alert'
""")
triggers = [row[0] for row in cur.fetchall()]
if 'high_salary_alert' not in triggers:
print("❌ high_salary_alert trigger not found")
return False
# Check if trigger support table exists
cur.execute("""
SELECT table_name FROM information_schema.tables
WHERE table_schema = 'employees'
AND table_name = 'salary_alerts'
""")
trigger_tables = [row[0] for row in cur.fetchall()]
if 'salary_alerts' not in trigger_tables:
print("❌ salary_alerts table not found")
return False
# Check if the old salary record was properly closed
cur.execute("""
SELECT COUNT(*) FROM employees.salary
WHERE employee_id = 10001 AND to_date = '2024-01-31'
""")
old_salary_count = cur.fetchone()[0]
if old_salary_count == 0:
print("❌ Old salary record for employee 10001 was not properly closed with to_date='2024-01-31'")
return False
# Check if the new salary record was inserted
cur.execute("""
SELECT COUNT(*) FROM employees.salary
WHERE employee_id = 10001 AND amount = 125000
AND from_date = '2024-02-01' AND to_date = '9999-01-01'
""")
new_salary_count = cur.fetchone()[0]
if new_salary_count == 0:
print("❌ New salary record for employee 10001 with amount 125000 was not inserted")
return False
# Check if high salary alert was triggered with specific details
cur.execute("""
SELECT COUNT(*) FROM employees.salary_alerts
WHERE employee_id = 10001 AND salary_amount = 125000 AND status = 'new'
""")
alert_count = cur.fetchone()[0]
if alert_count == 0:
print("❌ High salary alert was not triggered correctly for employee 10001 with amount 125000")
return False
print("✅ Trigger is created and functioning correctly")
return True
def verify_procedure_execution(conn) -> bool:
"""Verify that stored procedure was executed with correct data."""
with conn.cursor() as cur:
# Check if monthly report data matches actual statistics
cur.execute("""
SELECT department_count, total_employees, avg_salary
FROM employees.monthly_reports
WHERE report_date = '2024-01-01'
""")
report_data = cur.fetchone()
if not report_data:
print("❌ Monthly report for 2024-01-01 was not generated")
return False
# Get actual current statistics to compare
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC
) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
current_dept AS (
SELECT DISTINCT de.employee_id, de.department_id
FROM employees.department_employee de
WHERE de.to_date = DATE '9999-01-01'
),
base AS (
SELECT cd.department_id, cs.employee_id, cs.amount
FROM current_dept cd
JOIN current_salary cs ON cs.employee_id = cd.employee_id
)
SELECT
COUNT(DISTINCT department_id) AS actual_dept_count,
COUNT(DISTINCT employee_id) AS actual_total_employees,
AVG(amount)::DECIMAL AS actual_avg_salary
FROM base;
""")
actual_stats = cur.fetchone()
# Compare report data with actual data
if not rows_match(report_data, actual_stats):
print(f"❌ Monthly report data incorrect: expected {actual_stats}, got {report_data}")
return False
print("✅ Stored procedure executed with correct data")
return True
def verify_indexes(conn) -> bool:
"""Verify that performance indexes were created."""
with conn.cursor() as cur:
# Check for required indexes
cur.execute("""
SELECT indexname FROM pg_indexes
WHERE schemaname = 'employees'
AND tablename IN ('salary_alerts', 'monthly_reports')
AND indexname LIKE 'idx_%'
ORDER BY indexname
""")
indexes = [row[0] for row in cur.fetchall()]
# Should have at least 2 indexes created
if len(indexes) < 2:
print(f"❌ Expected at least 2 performance indexes, found {len(indexes)}")
return False
print("✅ Performance indexes are created")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all components
success = (
verify_materialized_views(conn) and
verify_stored_procedures(conn) and
verify_triggers(conn) and
verify_procedure_execution(conn) and
verify_indexes(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/employees/management_structure_analysis/description.md
================================================
Conduct a comprehensive management structure analysis to evaluate leadership effectiveness and organizational hierarchy. The executive team needs insights into management tenure, span of control, and leadership transitions to optimize the management structure and succession planning.
## Your Tasks:
1. **Create the manager profile table** — build a table called `manager_profile` in the `employees` schema with these exact columns:
* `manager_id` (bigint) — the manager's employee ID
* `manager_name` (varchar) — concatenated first and last name
* `current_department` (varchar) — current department they manage (NULL if not current)
* `management_periods` (integer) — total number of management assignments (including multiple periods in same department)
* `current_manager` (boolean) — whether they are currently a manager
2. **Create the department leadership table** — build a table called `department_leadership` in the `employees` schema with:
* `department_name` (varchar) — the department name
* `current_manager_name` (varchar) — current manager's full name
* `manager_start_date` (date) — when current manager started
* `total_historical_managers` (integer) — total number of managers this department has had
3. **Create the management transition table** — build a table called `management_transitions` in the `employees` schema with:
* `department_name` (varchar) — the department name
* `transition_year` (integer) — year when management changed
* `outgoing_manager` (varchar) — previous manager's name
* `incoming_manager` (varchar) — new manager's name ('No Successor' if department had no immediate replacement)
* `transition_gap_days` (integer) — days between managers (0 if immediate or no successor)
4. **Create the span of control table** — build a table called `span_of_control` in the `employees` schema with:
* `manager_id` (bigint) — the manager's employee ID
* `manager_name` (varchar) — manager's full name
* `department_name` (varchar) — department they manage
* `total_employees` (integer) — total employees in their department
* `current_employees` (integer) — current active employees in department
* `management_load` (varchar) — assessment ('light', 'moderate', 'heavy') based on current employees
5. **Apply management load classification**:
* **Light**: < 5,000 current employees
* **Moderate**: 5,000 - 15,000 current employees
* **Heavy**: > 15,000 current employees
6. **Focus on current managers only** for span of control analysis — use managers with active management roles (to_date = '9999-01-01').
7. **Track all management history** for profiles and transitions — include both current and former managers to understand complete leadership evolution.
The analysis will provide insights into management effectiveness, departmental stability, and organizational structure optimization opportunities.
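
As a hedged illustration of the step 5 thresholds, the classification can be expressed as a single CASE over a per-department count of current employees; whether "current" also requires an active salary record is left to the task's definition, so the count below is deliberately simplified:

```sql
-- Illustrative classification only; counts current department assignments.
SELECT
    de.department_id,
    COUNT(DISTINCT de.employee_id) AS current_employees,
    CASE
        WHEN COUNT(DISTINCT de.employee_id) < 5000   THEN 'light'
        WHEN COUNT(DISTINCT de.employee_id) <= 15000 THEN 'moderate'
        ELSE 'heavy'
    END AS management_load
FROM employees.department_employee de
WHERE de.to_date = DATE '9999-01-01'
GROUP BY de.department_id;
```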
================================================
FILE: tasks/postgres/standard/employees/management_structure_analysis/meta.json
================================================
{
"task_id": "management_structure_analysis",
"task_name": "Management Structure Analysis",
"category_id": "employees",
"category_name": "Employees",
"description": "Analyze management structure evaluating leadership effectiveness, span of control, and management transitions for succession planning.",
"author": "Lingxiao Du",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"statistical aggregation"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/standard/employees/management_structure_analysis/verify.py
================================================
"""
Verification script for PostgreSQL Task 4: Management Structure Analysis
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_manager_profile_results(conn) -> bool:
"""Verify the manager profile results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT manager_id, manager_name, current_department,
management_periods, current_manager
FROM employees.manager_profile
ORDER BY manager_id
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH dm AS (
SELECT dm.employee_id,
dm.department_id,
dm.from_date,
dm.to_date
FROM employees.department_manager dm
),
manager_periods AS (
SELECT employee_id, COUNT(*)::INT AS management_periods
FROM dm
GROUP BY employee_id
),
current_assignment AS (
SELECT employee_id, department_id
FROM (
SELECT d.*,
ROW_NUMBER() OVER (
PARTITION BY d.employee_id
ORDER BY d.from_date DESC, d.department_id
) AS rn
FROM dm d
WHERE d.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
manager_names AS (
SELECT e.id AS manager_id,
CONCAT(e.first_name, ' ', e.last_name) AS manager_name
FROM employees.employee e
WHERE EXISTS (SELECT 1 FROM dm WHERE employee_id = e.id)
)
SELECT
mn.manager_id,
mn.manager_name,
d.dept_name AS current_department,
mp.management_periods,
(d.dept_name IS NOT NULL) AS current_manager
FROM manager_names mn
JOIN manager_periods mp ON mp.employee_id = mn.manager_id
LEFT JOIN current_assignment ca ON ca.employee_id = mn.manager_id
LEFT JOIN employees.department d ON d.id = ca.department_id
ORDER BY mn.manager_id;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} manager profile results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Manager profile results are correct ({len(actual_results)} records)")
return True
def verify_department_leadership_results(conn) -> bool:
"""Verify the department leadership results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT department_name, current_manager_name, manager_start_date,
total_historical_managers
FROM employees.department_leadership
ORDER BY department_name
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_mgr AS (
SELECT department_id,
CONCAT(e.first_name, ' ', e.last_name) AS current_manager_name,
dm.from_date AS manager_start_date
FROM (
SELECT dm.*,
ROW_NUMBER() OVER (
PARTITION BY dm.department_id
ORDER BY dm.from_date DESC, dm.employee_id
) AS rn
FROM employees.department_manager dm
WHERE dm.to_date = DATE '9999-01-01'
) dm
JOIN employees.employee e ON e.id = dm.employee_id
WHERE dm.rn = 1
),
hist AS (
SELECT dm.department_id, COUNT(DISTINCT dm.employee_id)::INT AS total_historical_managers
FROM employees.department_manager dm
GROUP BY dm.department_id
)
SELECT
d.dept_name AS department_name,
cm.current_manager_name,
cm.manager_start_date,
COALESCE(h.total_historical_managers,0) AS total_historical_managers
FROM employees.department d
LEFT JOIN current_mgr cm ON cm.department_id = d.id
LEFT JOIN hist h ON h.department_id = d.id
ORDER BY d.dept_name;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} department leadership results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Department leadership results are correct ({len(actual_results)} records)")
return True
def verify_management_transitions_results(conn) -> bool:
"""Verify the management transitions results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT department_name, transition_year, outgoing_manager, incoming_manager, transition_gap_days
FROM employees.management_transitions
ORDER BY department_name, transition_year
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH mgr AS (
SELECT
d.id AS department_id,
d.dept_name,
dm.employee_id,
dm.from_date,
dm.to_date,
CONCAT(e.first_name, ' ', e.last_name) AS manager_name
FROM employees.department_manager dm
JOIN employees.department d ON d.id = dm.department_id
JOIN employees.employee e ON e.id = dm.employee_id
),
ordered AS (
SELECT
department_id,
dept_name,
employee_id,
manager_name,
from_date,
to_date,
ROW_NUMBER() OVER (
PARTITION BY department_id
ORDER BY from_date, to_date, employee_id
) AS rn,
LEAD(manager_name) OVER (
PARTITION BY department_id
ORDER BY from_date, to_date, employee_id
) AS next_manager_name,
LEAD(from_date) OVER (
PARTITION BY department_id
ORDER BY from_date, to_date, employee_id
) AS next_from_date
FROM mgr
)
SELECT
o.dept_name AS department_name,
EXTRACT(YEAR FROM o.to_date)::INT AS transition_year,
o.manager_name AS outgoing_manager,
COALESCE(o.next_manager_name, 'No Successor') AS incoming_manager,
COALESCE(GREATEST((o.next_from_date - o.to_date - 1), 0), 0)::INT AS transition_gap_days
FROM ordered o
WHERE o.to_date <> DATE '9999-01-01'
ORDER BY department_name, transition_year;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} management transitions results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Management transitions results are correct ({len(actual_results)} records)")
return True
def verify_span_of_control_results(conn) -> bool:
"""Verify the span of control results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT manager_id, manager_name, department_name, total_employees,
current_employees, management_load
FROM employees.span_of_control
ORDER BY manager_id
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH dept_total AS (
SELECT de.department_id, COUNT(DISTINCT de.employee_id)::INT AS total_employees
FROM employees.department_employee de
GROUP BY de.department_id
),
dept_current AS (
SELECT de.department_id, COUNT(DISTINCT de.employee_id)::INT AS current_employees
FROM employees.department_employee de
JOIN employees.salary s
ON s.employee_id = de.employee_id
AND s.to_date = DATE '9999-01-01'
WHERE de.to_date = DATE '9999-01-01'
GROUP BY de.department_id
)
SELECT
dm.employee_id AS manager_id,
CONCAT(e.first_name, ' ', e.last_name) AS manager_name,
d.dept_name AS department_name,
COALESCE(dt.total_employees, 0) AS total_employees,
COALESCE(dc.current_employees, 0) AS current_employees,
CASE
WHEN COALESCE(dc.current_employees, 0) < 5000 THEN 'light'
WHEN COALESCE(dc.current_employees, 0) <= 15000 THEN 'moderate'
ELSE 'heavy'
END AS management_load
FROM employees.department_manager dm
JOIN employees.employee e ON e.id = dm.employee_id
JOIN employees.department d ON d.id = dm.department_id
LEFT JOIN dept_total dt ON dt.department_id = dm.department_id
LEFT JOIN dept_current dc ON dc.department_id = dm.department_id
WHERE dm.to_date = DATE '9999-01-01'
ORDER BY dm.employee_id, d.dept_name;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} span of control results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Span of control results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all four analysis results
success = (
verify_manager_profile_results(conn) and
verify_department_leadership_results(conn) and
verify_management_transitions_results(conn) and
verify_span_of_control_results(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/lego/consistency_enforcement/description.md
================================================
Implement a data consistency enforcement system for the LEGO database. The system must ensure that the reported part count in the `lego_sets` table matches the actual sum of non-spare parts in the latest inventory version. This involves a three-step process: identifying existing inconsistencies, fixing them, and creating a trigger-based constraint system to prevent future issues.
### Consistency Rule
For any given `set_num`, the following invariant must be maintained:
`lego_sets.num_parts = SUM(quantity)` FROM `lego_inventory_parts` WHERE `inventory_id` IN (latest inventory for that set) AND `is_spare` = false
**Important**: If a set has no inventory records, the consistency check should be skipped.
# Your Tasks:
## Task 1: Identify Data Inconsistencies
### Objective
Write a single `SELECT` query to find all sets where the stored `num_parts` does not match the actual calculated number of parts from the latest inventory.
1. **Find the Latest Inventory**: For each `set_num`, find its latest inventory id by getting the `MAX(version)` from the `lego_inventories` table.
2. **Calculate Actual Part Count**: For these latest inventories, join with `lego_inventory_parts` and calculate the `SUM(quantity)`, but only for parts where `is_spare` is false.
3. **Compare and Filter**: Join this calculated result back to the `lego_sets` table and return the rows where `lego_sets.num_parts` is different from your calculated sum.
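For orientation, one possible shape of such a query is sketched below (CTE names are illustrative, and the inner join is what skips sets that have no inventory records):
```sql
WITH latest_inv AS (
    SELECT set_num, MAX(version) AS max_version
    FROM public.lego_inventories
    GROUP BY set_num
),
parts_agg AS (
    SELECT li.set_num, SUM(lip.quantity) AS actual_parts
    FROM public.lego_inventories li
    JOIN latest_inv lv
      ON lv.set_num = li.set_num AND lv.max_version = li.version
    JOIN public.lego_inventory_parts lip
      ON lip.inventory_id = li.id
    WHERE lip.is_spare = false
    GROUP BY li.set_num
)
SELECT s.set_num,
       s.num_parts AS stored_parts,
       pa.actual_parts
FROM public.lego_sets s
JOIN parts_agg pa ON pa.set_num = s.set_num   -- inner join skips sets without inventories
WHERE s.num_parts IS DISTINCT FROM pa.actual_parts;
```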
## Task 2: Fix Existing Inconsistencies
### Objective
Correct all mismatched `num_parts` values using a clear, multi-step process with a temporary table. This approach is designed to be robust against all edge cases.
#### Step 1: Create a Temporary Table
Create a temporary table (e.g., `correct_counts`) with two columns: `set_num` (text) and `actual_parts` (integer).
#### Step 2: Populate the Temporary Table
This is the most critical step. Write an `INSERT` statement that calculates the correct part count for every single set listed in the `lego_sets` table.
- The query must start by selecting from `public.lego_sets`.
- It must then `LEFT JOIN` to a subquery that contains the part-counting logic (finding the latest inventory version and summing the non-spare parts).
- Use `COALESCE` on the final result from the subquery to ensure that any set without parts or without an inventory record gets a value of `0`, not `NULL`.
#### Step 3: Update from the Temporary Table
Write a final, simple `UPDATE` statement that joins the `lego_sets` table with your temporary table on `set_num` and sets `num_parts` to the `actual_parts` value.
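Put together, the three steps can look roughly like the following sketch (the temporary table name follows the suggestion above; the final `WHERE` guard that limits the update to mismatched rows is optional):
```sql
CREATE TEMPORARY TABLE correct_counts (
    set_num      TEXT,
    actual_parts INTEGER
);

INSERT INTO correct_counts (set_num, actual_parts)
SELECT s.set_num,
       COALESCE(pa.actual_parts, 0)
FROM public.lego_sets s
LEFT JOIN (
    SELECT li.set_num, SUM(lip.quantity)::INT AS actual_parts
    FROM public.lego_inventories li
    JOIN (
        SELECT set_num, MAX(version) AS max_version
        FROM public.lego_inventories
        GROUP BY set_num
    ) lv ON lv.set_num = li.set_num AND lv.max_version = li.version
    JOIN public.lego_inventory_parts lip ON lip.inventory_id = li.id
    WHERE lip.is_spare = false
    GROUP BY li.set_num
) pa ON pa.set_num = s.set_num;

UPDATE public.lego_sets s
SET num_parts = cc.actual_parts
FROM correct_counts cc
WHERE cc.set_num = s.set_num
  AND s.num_parts IS DISTINCT FROM cc.actual_parts;
```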
## Task 3: Create Constraint Enforcement System
### Objective
Implement a deferrable constraint trigger system to enforce the consistency rule automatically for all future `INSERT` and `UPDATE` operations.
### Part A: Create the Trigger Function
Create a single PL/pgSQL function, preferably named `check_set_parts_consistency()`, that performs the core validation.
**Function Requirements**:
- Returns `trigger`.
- Accepts no arguments.
- Contains the core validation logic:
- **Identify the `set_num` to check**. This is the most critical part. The `set_num` must be retrieved based on which table fired the trigger (`TG_TABLE_NAME`):
- If `lego_sets` or `lego_inventories`: get the `set_num` directly from `NEW.set_num`.
- If `lego_inventory_parts`: you must first query `lego_inventories` using `NEW.inventory_id` to find the corresponding `set_num`.
- **Perform the check**. For the identified `set_num`, execute the same core logic from Task 1 to get the `actual_parts` count and the `stored_num_parts` from the `lego_sets` table.
- **Raise an exception on failure**. If `actual_parts` does not equal `stored_num_parts`, the function must raise an exception to block the transaction (e.g., `RAISE EXCEPTION 'Inconsistent part count for set %', relevant_set_num;`).
- **Return `NEW` on success**. If the check passes or is skipped, the function should `RETURN NEW`.
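A minimal sketch of such a function is shown below (variable names are illustrative; edge cases such as a set that is missing from `lego_sets`, or how strictly to treat spare-only inventories, are left to the implementation):
```sql
CREATE OR REPLACE FUNCTION check_set_parts_consistency()
RETURNS trigger AS $$
DECLARE
    relevant_set_num VARCHAR;
    stored_num_parts INTEGER;
    actual_parts     INTEGER;
BEGIN
    -- Identify which set_num this row affects, based on the firing table.
    IF TG_TABLE_NAME IN ('lego_sets', 'lego_inventories') THEN
        relevant_set_num := NEW.set_num;
    ELSIF TG_TABLE_NAME = 'lego_inventory_parts' THEN
        SELECT li.set_num INTO relevant_set_num
        FROM public.lego_inventories li
        WHERE li.id = NEW.inventory_id;
    END IF;

    -- Skip the check when there is nothing to compare against.
    IF relevant_set_num IS NULL
       OR NOT EXISTS (SELECT 1 FROM public.lego_inventories
                      WHERE set_num = relevant_set_num) THEN
        RETURN NEW;
    END IF;

    SELECT s.num_parts INTO stored_num_parts
    FROM public.lego_sets s
    WHERE s.set_num = relevant_set_num;

    SELECT COALESCE(SUM(lip.quantity), 0) INTO actual_parts
    FROM public.lego_inventories li
    JOIN public.lego_inventory_parts lip ON lip.inventory_id = li.id
    WHERE li.set_num = relevant_set_num
      AND li.version = (SELECT MAX(version)
                        FROM public.lego_inventories
                        WHERE set_num = relevant_set_num)
      AND lip.is_spare = false;

    IF stored_num_parts IS DISTINCT FROM actual_parts THEN
        RAISE EXCEPTION 'Inconsistent part count for set %', relevant_set_num;
    END IF;

    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
```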
### Part B: Create the Constraint Triggers
Create three separate `CONSTRAINT TRIGGER` statements that attach the function from Part A to the following tables:
- `public.lego_sets`
- `public.lego_inventories`
- `public.lego_inventory_parts`
**Crucial Trigger Requirements**:
- Each trigger must fire `AFTER INSERT OR UPDATE`.
- Each trigger **MUST** be `DEFERRABLE` and `INITIALLY IMMEDIATE`. This is non-negotiable for the verification to pass.
- Each trigger must execute the function `FOR EACH ROW`.
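The three triggers then follow the same pattern (trigger names are illustrative; the `DEFERRABLE INITIALLY IMMEDIATE` clause is the part the verification checks for):
```sql
CREATE CONSTRAINT TRIGGER trg_sets_parts_consistency
    AFTER INSERT OR UPDATE ON public.lego_sets
    DEFERRABLE INITIALLY IMMEDIATE
    FOR EACH ROW EXECUTE FUNCTION check_set_parts_consistency();

CREATE CONSTRAINT TRIGGER trg_inventories_parts_consistency
    AFTER INSERT OR UPDATE ON public.lego_inventories
    DEFERRABLE INITIALLY IMMEDIATE
    FOR EACH ROW EXECUTE FUNCTION check_set_parts_consistency();

CREATE CONSTRAINT TRIGGER trg_inventory_parts_consistency
    AFTER INSERT OR UPDATE ON public.lego_inventory_parts
    DEFERRABLE INITIALLY IMMEDIATE
    FOR EACH ROW EXECUTE FUNCTION check_set_parts_consistency();
```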
================================================
FILE: tasks/postgres/standard/lego/consistency_enforcement/meta.json
================================================
{
"task_id": "consistency_enforcement",
"task_name": "Consistency Enforcement",
"category_id": "lego",
"category_name": "Lego",
"description": "Implement data consistency system ensuring reported part counts match actual inventory using triggers and constraint enforcement.",
"author": "Jiawei Wang",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"data integrity enforcement",
"stored procedures and functions",
"transactional operations"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql"
}
}
================================================
FILE: tasks/postgres/standard/lego/consistency_enforcement/verify.py
================================================
"""
Verification script for PostgreSQL LEGO Task 1: Parts Consistency Fix & Constraints
Version 2.1: Relaxed consistency check to allow for one known corner case mismatch.
"""
import os
import sys
import psycopg2
import psycopg2.errors
from typing import Optional, Tuple, List
def get_connection_params() -> dict:
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def fetch_candidate_part_row(cur) -> Optional[Tuple[int, str, str, int]]:
"""
Picks a concrete, non-spare inventory part from the latest inventory of any set.
This provides a reliable target for testing update and insert triggers.
Returns a tuple: (inventory_id, set_num, part_num, color_id) or None.
"""
cur.execute(
"""
WITH latest_inv AS (
SELECT set_num, MAX(version) AS max_version
FROM public.lego_inventories
GROUP BY set_num
), inv AS (
SELECT li.id, li.set_num
FROM public.lego_inventories li
JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version
)
SELECT i.id AS inventory_id, i.set_num, lip.part_num, lip.color_id
FROM inv i
JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id
WHERE lip.is_spare = false AND lip.quantity > 0
LIMIT 1;
"""
)
return cur.fetchone()
def get_mismatch_count(cur) -> int:
"""Returns the number of sets where num_parts mismatches the computed actual sum."""
cur.execute(
"""
WITH latest_inv AS (
SELECT set_num, MAX(version) AS max_version
FROM public.lego_inventories
GROUP BY set_num
), inv_latest AS (
SELECT li.set_num, li.id
FROM public.lego_inventories li
JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version
), parts_agg AS (
SELECT
i.set_num,
SUM(lip.quantity) AS actual_parts
FROM inv_latest i
JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id
WHERE lip.is_spare = false
GROUP BY i.set_num
)
SELECT COUNT(*)
FROM public.lego_sets s
LEFT JOIN parts_agg pa ON s.set_num = pa.set_num
WHERE s.num_parts <> COALESCE(pa.actual_parts, 0);
"""
)
return cur.fetchone()[0]
def verify_data_consistency(conn) -> bool:
"""
TASK 1 VERIFICATION: Checks if the initial data fix was successful.
(Relaxed: Allows for one corner-case mismatch).
"""
print("\n-- Verifying Task 1: Data Consistency Fix (Relaxed) --")
with conn.cursor() as cur:
count = get_mismatch_count(cur)
# RELAXED CONDITION: Allow 0 or 1 mismatch to pass.
if count > 1:
print(f"❌ FAIL: Found {count} sets with inconsistent part counts. Expected 0 or 1 after fix.")
return False
print("✅ PASS: Data consistency check passed (allowing for one known mismatch).")
return True
def verify_constraint_triggers_exist(conn) -> bool:
"""
TASK 2 VERIFICATION (Part A): Checks if constraint triggers are attached to all required tables.
This is more robust than checking names or a total count.
"""
print("\n-- Verifying Task 2: Constraint Trigger Existence --")
tables_to_check = [
'public.lego_inventory_parts',
'public.lego_inventories',
'public.lego_sets'
]
all_triggers_found = True
with conn.cursor() as cur:
for table in tables_to_check:
cur.execute(
"""
SELECT COUNT(*)
FROM pg_trigger
WHERE tgrelid = %s::regclass AND tgconstraint <> 0;
""",
(table,)
)
trigger_count = cur.fetchone()[0]
if trigger_count == 0:
print(f"❌ FAIL: No constraint trigger found on table '{table}'.")
all_triggers_found = False
else:
print(f"✅ OK: Found constraint trigger(s) on table '{table}'.")
if all_triggers_found:
print("✅ PASS: Constraint triggers are attached to all required tables.")
return all_triggers_found
def verify_violation_is_blocked(conn) -> bool:
"""
TASK 2 VERIFICATION (Part B): Checks if triggers block a direct, inconsistent write.
An attempt to increment a part quantity without updating the set's total should fail.
"""
print("\n-- Verifying Task 2: Immediate Constraint Enforcement --")
with conn.cursor() as cur:
candidate = fetch_candidate_part_row(cur)
if not candidate:
print("⚠️ SKIP: No candidate part row found to test constraints. Cannot verify.")
return True # Skip if no data to test
inventory_id, _, part_num, color_id = candidate
try:
# This transaction should fail due to the trigger
cur.execute(
"""
UPDATE public.lego_inventory_parts
SET quantity = quantity + 1
WHERE inventory_id = %s AND part_num = %s AND color_id = %s;
""",
(inventory_id, part_num, color_id),
)
# If we reach here, the trigger failed to block the update.
conn.rollback()
print("❌ FAIL: An inconsistent write was NOT blocked by the trigger.")
return False
except psycopg2.Error as e:
# We expect an error. Specifically, a constraint violation error.
conn.rollback()
# 23514 is check_violation, but custom triggers might raise others.
# Any error here is considered a success as the transaction was blocked.
print(f"✅ PASS: Inconsistent write was correctly blocked by the trigger. (Error: {e.pgcode})")
return True
def verify_deferred_transaction_is_allowed(conn) -> bool:
"""
TASK 2 VERIFICATION (Part C): Checks if a coordinated, consistent update is allowed
when constraints are deferred.
"""
print("\n-- Verifying Task 2: Deferred Constraint Enforcement --")
with conn.cursor() as cur:
candidate = fetch_candidate_part_row(cur)
if not candidate:
print("⚠️ SKIP: No candidate part row found. Cannot test deferred transaction.")
return True # Skip if no data to test
inventory_id, set_num, part_num, color_id = candidate
try:
# This multi-statement transaction should succeed with deferred constraints
with conn.cursor() as cur:
cur.execute("BEGIN;")
cur.execute("SET CONSTRAINTS ALL DEFERRED;")
cur.execute(
"UPDATE public.lego_inventory_parts SET quantity = quantity + 1 WHERE inventory_id = %s AND part_num = %s AND color_id = %s;",
(inventory_id, part_num, color_id),
)
cur.execute(
"UPDATE public.lego_sets SET num_parts = num_parts + 1 WHERE set_num = %s;",
(set_num,),
)
cur.execute("COMMIT;") # This will fail if constraints are not deferrable or logic is wrong
print("✅ PASS: Coordinated update with deferred constraints committed successfully.")
# Revert changes to leave DB in its original state
with conn.cursor() as cur:
cur.execute("BEGIN;")
cur.execute("SET CONSTRAINTS ALL DEFERRED;")
cur.execute(
"UPDATE public.lego_inventory_parts SET quantity = quantity - 1 WHERE inventory_id = %s AND part_num = %s AND color_id = %s;",
(inventory_id, part_num, color_id),
)
cur.execute(
"UPDATE public.lego_sets SET num_parts = num_parts - 1 WHERE set_num = %s;",
(set_num,),
)
cur.execute("COMMIT;")
print("INFO: Test changes were successfully reverted.")
return True
except psycopg2.Error as e:
conn.rollback()
print(f"❌ FAIL: Deferred transaction failed to commit. Error: {e}")
return False
def main():
"""Main verification function."""
print("=" * 60)
print("LEGO Database Consistency Verification Script")
print("=" * 60)
conn_params = get_connection_params()
if not conn_params.get("database"):
print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.")
sys.exit(1)
try:
with psycopg2.connect(**conn_params) as conn:
conn.autocommit = False # Ensure we control transactions
# Run all verification steps
results = [
verify_data_consistency(conn),
verify_constraint_triggers_exist(conn),
verify_violation_is_blocked(conn),
verify_deferred_transaction_is_allowed(conn),
]
if all(results):
print("\n🎉 Overall Result: PASS - All tasks verified successfully!")
sys.exit(0)
else:
print("\n❌ Overall Result: FAIL - One or more verification steps failed.")
sys.exit(1)
except psycopg2.OperationalError as e:
print(f"❌ CRITICAL: Could not connect to the database. Details: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ CRITICAL: An unexpected error occurred during verification. Details: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/lego/database_security_policies/description.md
================================================
Implement a comprehensive database security system with Row-Level Security (RLS) policies and role-based access control for the LEGO database. The system must ensure theme-based data isolation and prevent unauthorized access across different LEGO themes.
## Your Tasks:
1. **Create database role and permissions** — Create a new database role called `theme_analyst` with the following permissions:
* `SELECT` permissions on all reference tables: `lego_themes`, `lego_colors`, `lego_parts`, `lego_part_categories`
* `SELECT` permissions on main data tables: `lego_sets`, `lego_inventories`, `lego_inventory_parts`
* No `INSERT`, `UPDATE`, or `DELETE` permissions on any tables
2. **Enable Row-Level Security** — Enable RLS on the following tables:
* `lego_sets`
* `lego_inventories`
* `lego_inventory_parts`
3. **Create RLS policies** — Implement theme-based data isolation policies (a combined sketch of Tasks 1-4 appears after this task list):
**Policy 1: `theme_sets_policy` on `lego_sets`**
* Allows access only to sets where `theme_id = 18` (Star Wars theme)
* Policy should use a function that checks the current user's theme assignment
**Policy 2: `theme_inventories_policy` on `lego_inventories`**
* Allows access only to inventories for sets with `theme_id = 18`
* Must join with `lego_sets` table to check theme_id
**Policy 3: `theme_inventory_parts_policy` on `lego_inventory_parts`**
* Allows access only to inventory parts for sets with `theme_id = 18`
* Must join through `lego_inventories` and `lego_sets` to check theme_id
4. **Create theme assignment function** — Create a function `get_user_theme_id()` that:
* Returns `18` for the `theme_analyst` role (Star Wars theme)
* Can be extended to support other themes in the future
* Uses `current_user` to determine the appropriate theme_id
5. **Test the security implementation** — Execute verification queries that demonstrate:
* Star Wars theme (theme_id=18) returns exactly 2 sets: '65081-1' and 'K8008-1'
* Technic theme (theme_id=1) returns 0 sets when accessed by theme_analyst role
* Cross-theme data access is properly blocked
* Reference tables are accessible for all data
6. **Create comprehensive security audit** — Generate a detailed report including:
* Complete SQL statements for role creation and policy implementation
* Expected query results for each theme
* Verification queries to confirm proper data isolation
* Documentation of the security model and access patterns
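To make the deliverables concrete, here is a condensed sketch of Tasks 1-4 plus a quick Task 5 check (the policy bodies, the function body, and the hard-coded theme mapping are illustrative choices, not the only acceptable design):
```sql
-- Task 1: role and read-only permissions
CREATE ROLE theme_analyst;
GRANT SELECT ON lego_themes, lego_colors, lego_parts, lego_part_categories,
                lego_sets, lego_inventories, lego_inventory_parts TO theme_analyst;

-- Task 4: theme assignment function (defined early because the policies call it)
CREATE OR REPLACE FUNCTION get_user_theme_id() RETURNS INTEGER AS $$
BEGIN
    IF current_user = 'theme_analyst' THEN
        RETURN 18;              -- Star Wars
    END IF;
    RETURN NULL;                -- no theme mapped for other roles (extend as needed)
END;
$$ LANGUAGE plpgsql STABLE;

-- Task 2: enable Row-Level Security
ALTER TABLE lego_sets            ENABLE ROW LEVEL SECURITY;
ALTER TABLE lego_inventories     ENABLE ROW LEVEL SECURITY;
ALTER TABLE lego_inventory_parts ENABLE ROW LEVEL SECURITY;

-- Task 3: theme-based policies
CREATE POLICY theme_sets_policy ON lego_sets
    FOR SELECT TO theme_analyst
    USING (theme_id = get_user_theme_id());

CREATE POLICY theme_inventories_policy ON lego_inventories
    FOR SELECT TO theme_analyst
    USING (EXISTS (SELECT 1 FROM lego_sets s
                   WHERE s.set_num = lego_inventories.set_num
                     AND s.theme_id = get_user_theme_id()));

CREATE POLICY theme_inventory_parts_policy ON lego_inventory_parts
    FOR SELECT TO theme_analyst
    USING (EXISTS (SELECT 1
                   FROM lego_inventories i
                   JOIN lego_sets s ON s.set_num = i.set_num
                   WHERE i.id = lego_inventory_parts.inventory_id
                     AND s.theme_id = get_user_theme_id()));

-- Task 5: quick check while impersonating the analyst role
SET ROLE theme_analyst;
SELECT set_num FROM lego_sets ORDER BY set_num;   -- expect '65081-1' and 'K8008-1'
RESET ROLE;
```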
## Security Requirements:
- The `theme_analyst` role must only see data related to Star Wars theme (theme_id=18)
- All other themes must be completely hidden from this role
- Reference tables (themes, colors, parts, part_categories) must be fully accessible
- The system must prevent any cross-theme data leakage
- RLS policies must be active and enforced for all data access
## Expected Results:
When the `theme_analyst` role queries the database:
- `lego_sets` should return only 2 Star Wars sets
- `lego_inventories` should return only inventories for those 2 sets
- `lego_inventory_parts` should return only parts for those 2 sets
- All reference tables should return complete data
- Queries for other themes should return empty results
================================================
FILE: tasks/postgres/standard/lego/database_security_policies/meta.json
================================================
{
"task_id": "database_security_policies",
"task_name": "Database Security Policies",
"category_id": "lego",
"category_name": "Lego",
"description": "Implement Row-Level Security policies with role-based access control for theme-based data isolation in LEGO database.",
"author": "Jiawei Wang",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"security and access control",
"stored procedures and functions"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql"
}
}
================================================
FILE: tasks/postgres/standard/lego/database_security_policies/verify.py
================================================
"""
Verification script for PostgreSQL LEGO Task 4: Database Security and RLS Implementation
(Version 2 - Improved Robustness)
"""
import os
import sys
import psycopg2
import psycopg2.errors
from typing import Any, Dict
def get_connection_params() -> Dict[str, Any]:
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def verify_role_creation(conn) -> bool:
"""
TASK 1 VERIFICATION: Check if theme_analyst role was created with proper permissions.
"""
print("\n-- Verifying Task 1: Role Creation and Permissions --")
with conn.cursor() as cur:
# Check if role exists
cur.execute("SELECT 1 FROM pg_roles WHERE rolname = 'theme_analyst';")
if not cur.fetchone():
print("❌ FAIL: The 'theme_analyst' role was not created.")
return False
print("✅ OK: Role 'theme_analyst' exists.")
# Check SELECT permissions on reference and main tables
all_tables = [
'lego_themes', 'lego_colors', 'lego_parts', 'lego_part_categories',
'lego_sets', 'lego_inventories', 'lego_inventory_parts'
]
for table in all_tables:
cur.execute(
"""
SELECT has_table_privilege('theme_analyst', %s, 'SELECT');
""",
(table,)
)
if not cur.fetchone()[0]:
print(f"❌ FAIL: 'theme_analyst' role is missing SELECT permission on '{table}'.")
return False
print("✅ OK: Role has correct SELECT permissions on all required tables.")
# Check that no INSERT/UPDATE/DELETE permissions exist
for table in all_tables:
cur.execute(
"""
SELECT
has_table_privilege('theme_analyst', %s, 'INSERT') OR
has_table_privilege('theme_analyst', %s, 'UPDATE') OR
has_table_privilege('theme_analyst', %s, 'DELETE');
""",
(table, table, table)
)
if cur.fetchone()[0]:
print(f"❌ FAIL: 'theme_analyst' role has unauthorized INSERT, UPDATE, or DELETE permission on '{table}'.")
return False
print("✅ OK: Role does not have modification permissions.")
print("✅ PASS: 'theme_analyst' role created with correct permissions.")
return True
def verify_rls_enabled(conn) -> bool:
"""
TASK 2 VERIFICATION: Check if Row-Level Security is enabled on required tables.
"""
print("\n-- Verifying Task 2: Row-Level Security Enablement --")
tables_to_check = ['lego_sets', 'lego_inventories', 'lego_inventory_parts']
with conn.cursor() as cur:
for table in tables_to_check:
cur.execute(
"SELECT relrowsecurity FROM pg_class WHERE relname = %s;", (table,)
)
rls_enabled = cur.fetchone()
if not rls_enabled or not rls_enabled[0]:
print(f"❌ FAIL: RLS is not enabled on table '{table}'.")
return False
print(f"✅ OK: RLS is enabled on table '{table}'.")
print("✅ PASS: Row-Level Security is enabled on all required tables.")
return True
def verify_rls_policies(conn) -> bool:
"""
TASK 3 VERIFICATION: Check if RLS policies were created on required tables.
"""
print("\n-- Verifying Task 3: RLS Policy Creation --")
expected_policies = {
'lego_sets': 'theme_sets_policy',
'lego_inventories': 'theme_inventories_policy',
'lego_inventory_parts': 'theme_inventory_parts_policy'
}
with conn.cursor() as cur:
for table, policy_name in expected_policies.items():
cur.execute(
"SELECT 1 FROM pg_policies WHERE tablename = %s AND policyname = %s;",
(table, policy_name)
)
if not cur.fetchone():
print(f"❌ FAIL: RLS policy '{policy_name}' not found on table '{table}'.")
return False
print(f"✅ OK: RLS policy '{policy_name}' found on table '{table}'.")
print("✅ PASS: All required RLS policies are created.")
return True
def verify_theme_function(conn) -> bool:
"""
TASK 4 VERIFICATION: Check if get_user_theme_id() function was created and works correctly.
"""
print("\n-- Verifying Task 4: Theme Assignment Function --")
with conn.cursor() as cur:
cur.execute(
"SELECT 1 FROM pg_proc WHERE proname = 'get_user_theme_id';"
)
if not cur.fetchone():
print("❌ FAIL: The 'get_user_theme_id' function was not created.")
return False
print("✅ OK: Function 'get_user_theme_id' exists.")
try:
# Test the function's output specifically for the 'theme_analyst' role
cur.execute("SET ROLE theme_analyst;")
cur.execute("SELECT get_user_theme_id();")
theme_id = cur.fetchone()[0]
cur.execute("RESET ROLE;") # IMPORTANT: Switch back
if theme_id != 18:
print(f"❌ FAIL: get_user_theme_id() returned {theme_id} for 'theme_analyst', but expected 18.")
return False
print("✅ OK: Function returns correct theme_id (18) for 'theme_analyst'.")
print("✅ PASS: Theme assignment function is correct.")
return True
except Exception as e:
conn.rollback() # Rollback any failed transaction state
print(f"❌ FAIL: Error testing get_user_theme_id() function: {e}")
return False
def test_theme_analyst_access(conn) -> bool:
"""
TASK 5 VERIFICATION: Test data access by assuming the theme_analyst role.
"""
print("\n-- Verifying Task 5: Theme-Based Data Access --")
try:
with conn.cursor() as cur:
# Assume the role of theme_analyst for this session
cur.execute("SET ROLE theme_analyst;")
# Test 1: Check Star Wars sets access (should return 2 sets)
cur.execute("SELECT set_num FROM lego_sets ORDER BY set_num;")
star_wars_sets = [row[0] for row in cur.fetchall()]
expected_sets = ['65081-1', 'K8008-1']
if sorted(star_wars_sets) != sorted(expected_sets):
print(f"❌ FAIL: Expected Star Wars sets {expected_sets}, but got {star_wars_sets}.")
cur.execute("RESET ROLE;")
return False
print("✅ PASS: Star Wars sets access is correct (2 sets returned).")
# Test 2: Check that Technic sets are not accessible (should return 0)
cur.execute("SELECT COUNT(*) FROM lego_sets WHERE theme_id = 1;")
technic_count = cur.fetchone()[0]
if technic_count != 0:
print(f"❌ FAIL: Technic sets should be blocked, but query returned {technic_count} sets.")
cur.execute("RESET ROLE;")
return False
print("✅ PASS: Technic theme is correctly blocked (0 sets returned).")
# Test 3: Check reference tables are fully accessible
cur.execute("SELECT COUNT(*) > 10 FROM lego_themes;") # Check for a reasonable number
if not cur.fetchone()[0]:
print("❌ FAIL: 'lego_themes' table seems inaccessible or empty.")
cur.execute("RESET ROLE;")
return False
print("✅ PASS: Reference tables appear to be accessible.")
# Test 4 & 5: Check related tables
cur.execute("SELECT COUNT(*) FROM lego_inventories;")
if cur.fetchone()[0] == 0:
print("❌ FAIL: No inventories are visible for the allowed sets.")
cur.execute("RESET ROLE;")
return False
cur.execute("SELECT COUNT(*) FROM lego_inventory_parts;")
if cur.fetchone()[0] == 0:
print("❌ FAIL: No inventory parts are visible for the allowed sets.")
cur.execute("RESET ROLE;")
return False
print("✅ PASS: Related tables (inventories, inventory_parts) are correctly filtered.")
# IMPORTANT: Always reset the role at the end
cur.execute("RESET ROLE;")
return True
except Exception as e:
conn.rollback() # Ensure transaction is clean
print(f"❌ FAIL: An error occurred while testing data access as 'theme_analyst': {e}")
# Try to reset role even on failure to clean up session state
try:
with conn.cursor() as cleanup_cur:
cleanup_cur.execute("RESET ROLE;")
except:
pass
return False
def main():
"""Main verification function."""
print("=" * 60)
print("LEGO Database Security and RLS Verification Script")
print("=" * 60)
conn_params = get_connection_params()
if not conn_params.get("database"):
print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.")
sys.exit(1)
conn = None
try:
conn = psycopg2.connect(**conn_params)
results = [
verify_role_creation(conn),
verify_rls_enabled(conn),
verify_rls_policies(conn),
verify_theme_function(conn),
test_theme_analyst_access(conn),
]
if all(results):
print("\n🎉 Overall Result: PASS - All security tasks verified successfully!")
sys.exit(0)
else:
print("\n❌ Overall Result: FAIL - One or more verification steps failed.")
sys.exit(1)
except psycopg2.OperationalError as e:
print(f"❌ CRITICAL: Could not connect to the database. Check credentials and host. Details: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ CRITICAL: An unexpected error occurred. Details: {e}")
sys.exit(1)
finally:
if conn:
conn.close()
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/lego/transactional_inventory_transfer/description.md
================================================
Create a PostgreSQL function to handle inventory part transfers between LEGO sets with enhanced validation and audit capabilities. The LEGO warehouse management system needs to support transferring parts while maintaining data integrity and tracking transfer history.
## Your Tasks:
1. **Create the transfer function** — Implement a PostgreSQL function named `transfer_parts` with the following signature:
```sql
CREATE OR REPLACE FUNCTION transfer_parts(
source_inventory_id INTEGER,
target_inventory_id INTEGER,
part_to_transfer_num VARCHAR,
color_to_transfer_id INTEGER,
quantity_to_transfer INTEGER,
transfer_reason VARCHAR DEFAULT 'manual_transfer'
) RETURNS TEXT
```
2. **Create audit logging table** — Create a new table to track transfer history:
```sql
CREATE TABLE inventory_transfer_log (
log_id SERIAL PRIMARY KEY,
transfer_timestamp TIMESTAMP DEFAULT NOW(),
source_inventory_id INTEGER NOT NULL,
target_inventory_id INTEGER NOT NULL,
part_num VARCHAR NOT NULL,
color_id INTEGER NOT NULL,
quantity_transferred INTEGER NOT NULL,
transfer_reason VARCHAR NOT NULL,
transfer_status VARCHAR NOT NULL CHECK (transfer_status IN ('success', 'failed')),
error_message TEXT
);
```
3. **Implement enhanced validation** — The function must perform these validations:
**Validation A: Basic Checks**
- Verify both inventory IDs exist in `lego_inventories` table
- Verify part exists in `lego_parts` table
- Verify color exists in `lego_colors` table
- Check source has sufficient quantity (including spare parts)
- Prevent self-transfers (source and target cannot be the same)
**Validation B: Business Rules**
- Maximum transfer quantity is 500 parts per operation
- Minimum transfer quantity is 1 part
- Source and target must be different inventories
4. **Implement transactional logic** — The function must perform these operations within a single transaction (a skeleton sketch appears after this task list):
**Step A: Pre-validation**
- Lock both inventory records using `SELECT ... FOR UPDATE`
- Perform all validation checks
- Calculate transfer feasibility
**Step B: Source Inventory Update**
- Decrease quantity in source inventory
- If quantity becomes zero, delete the row
- Handle spare parts appropriately (maintain `is_spare` flag)
**Step C: Target Inventory Update**
- Check if part exists in target inventory
- If exists: increase quantity
- If not exists: insert new record
- Handle spare parts appropriately
**Step D: Audit Logging**
- Log successful transfers with details
- Log failed transfers with error messages
- Include transfer reason and status
5. **Error handling requirements**:
- Use `RAISE EXCEPTION` with descriptive error messages
- Handle all validation failures gracefully
- Ensure complete rollback on any failure
- Log all attempts (successful and failed)
6. **Return value**:
- Return success message: `'Successfully transferred {quantity} parts ({part_num}, color_id: {color_id}) from inventory {source_id} to inventory {target_id}. Reason: {reason}'`
- Include transfer details and reason in the message
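A condensed skeleton of the function, covering Steps A-D on the happy path, might look like the following (spare-part handling and the failure-path audit entries described above are simplified or omitted for brevity; the full task still requires them):
```sql
CREATE OR REPLACE FUNCTION transfer_parts(
    source_inventory_id  INTEGER,
    target_inventory_id  INTEGER,
    part_to_transfer_num VARCHAR,
    color_to_transfer_id INTEGER,
    quantity_to_transfer INTEGER,
    transfer_reason      VARCHAR DEFAULT 'manual_transfer'
) RETURNS TEXT AS $$
DECLARE
    available_qty INTEGER;
BEGIN
    -- Step A: validate inputs and lock both inventory headers.
    IF source_inventory_id = target_inventory_id THEN
        RAISE EXCEPTION 'Source and target inventories must differ';
    END IF;
    IF quantity_to_transfer < 1 OR quantity_to_transfer > 500 THEN
        RAISE EXCEPTION 'Transfer quantity must be between 1 and 500';
    END IF;
    IF NOT EXISTS (SELECT 1 FROM lego_inventories WHERE id = source_inventory_id)
       OR NOT EXISTS (SELECT 1 FROM lego_inventories WHERE id = target_inventory_id) THEN
        RAISE EXCEPTION 'Source or target inventory does not exist';
    END IF;
    -- (part and color existence checks from Validation A belong here as well)
    PERFORM 1 FROM lego_inventories
     WHERE id IN (source_inventory_id, target_inventory_id)
     FOR UPDATE;

    SELECT COALESCE(SUM(quantity), 0) INTO available_qty
    FROM lego_inventory_parts
    WHERE inventory_id = source_inventory_id
      AND part_num = part_to_transfer_num
      AND color_id = color_to_transfer_id;
    IF available_qty < quantity_to_transfer THEN
        RAISE EXCEPTION 'Insufficient quantity: have %, need %',
                        available_qty, quantity_to_transfer;
    END IF;

    -- Step B: decrease the source row, deleting it if it reaches zero (non-spare rows only here).
    UPDATE lego_inventory_parts
       SET quantity = quantity - quantity_to_transfer
     WHERE inventory_id = source_inventory_id
       AND part_num = part_to_transfer_num
       AND color_id = color_to_transfer_id
       AND is_spare = false;
    DELETE FROM lego_inventory_parts
     WHERE inventory_id = source_inventory_id
       AND part_num = part_to_transfer_num
       AND color_id = color_to_transfer_id
       AND quantity <= 0;

    -- Step C: increase the target row, or insert it if the part is new there.
    UPDATE lego_inventory_parts
       SET quantity = quantity + quantity_to_transfer
     WHERE inventory_id = target_inventory_id
       AND part_num = part_to_transfer_num
       AND color_id = color_to_transfer_id
       AND is_spare = false;
    IF NOT FOUND THEN
        INSERT INTO lego_inventory_parts (inventory_id, part_num, color_id, quantity, is_spare)
        VALUES (target_inventory_id, part_to_transfer_num, color_to_transfer_id,
                quantity_to_transfer, false);
    END IF;

    -- Step D: audit the successful attempt (failed attempts must be logged too).
    INSERT INTO inventory_transfer_log
        (source_inventory_id, target_inventory_id, part_num, color_id,
         quantity_transferred, transfer_reason, transfer_status)
    VALUES (source_inventory_id, target_inventory_id, part_to_transfer_num,
            color_to_transfer_id, quantity_to_transfer, transfer_reason, 'success');

    RETURN format('Successfully transferred %s parts (%s, color_id: %s) from inventory %s to inventory %s. Reason: %s',
                  quantity_to_transfer, part_to_transfer_num, color_to_transfer_id,
                  source_inventory_id, target_inventory_id, transfer_reason);
END;
$$ LANGUAGE plpgsql;
```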
## Function Requirements:
- **Transaction Safety**: All operations wrapped in transaction block
- **Data Integrity**: No partial updates possible
- **Audit Trail**: Complete logging of all transfer attempts
- **Validation**: Comprehensive input and business rule validation
- **Error Recovery**: Failed transfers leave database unchanged
- **Performance**: Use appropriate locking to prevent race conditions
## Example Usage:
```sql
-- Basic transfer with reason
SELECT transfer_parts(14469, 14686, '3024', 15, 100, 'inventory_adjustment');
-- Transfer to new inventory (should create new record)
SELECT transfer_parts(11124, 14686, '3001', 4, 50, 'part_redistribution');
-- This should fail due to insufficient quantity
SELECT transfer_parts(14469, 14686, '3024', 15, 2000, 'large_transfer');
-- This should fail due to self-transfer
SELECT transfer_parts(14469, 14469, '3024', 15, 10, 'self_transfer');
```
## Verification Criteria:
- Function handles all validation rules correctly
- Audit logging captures all transfer attempts
- Failed transfers are properly logged with error details
- Self-transfers are prevented
- Quantity limits are enforced
- Database state remains consistent after failures
================================================
FILE: tasks/postgres/standard/lego/transactional_inventory_transfer/meta.json
================================================
{
"task_id": "transactional_inventory_transfer",
"task_name": "Transactional Inventory Transfer",
"category_id": "lego",
"category_name": "Lego",
"description": "Create PostgreSQL function to handle inventory part transfers between LEGO sets with validation and audit logging.",
"author": "Jiawei Wang",
"created_at": "2025-08-16",
"difficulty": "L3",
"tags": [
"transactional operations",
"stored procedures and functions",
"audit and compliance"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql"
}
}
================================================
FILE: tasks/postgres/standard/lego/transactional_inventory_transfer/verify.py
================================================
"""
Verification script for PostgreSQL LEGO Task 2: Enhanced Inventory Transfer Function
Tests the transfer_parts function with audit logging and enhanced validation.
Key Features Tested:
- Core transfer functionality with audit logging
- Business rule validation (quantity limits, self-transfer prevention)
- Error handling and rollback mechanisms
- Audit trail maintenance for both success and failure cases
"""
import os
import sys
import psycopg2
import psycopg2.errors
from typing import Optional, Tuple
def get_connection_params() -> dict:
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def get_inventory_part_quantity(conn, inventory_id: int, part_num: str, color_id: int) -> int:
"""Get the current quantity of a specific part in an inventory."""
with conn.cursor() as cur:
cur.execute(
"""
SELECT quantity FROM public.lego_inventory_parts
WHERE inventory_id = %s AND part_num = %s AND color_id = %s
""",
(inventory_id, part_num, color_id)
)
result = cur.fetchone()
return result[0] if result else 0
def verify_system_components(conn) -> bool:
"""Verify that all required system components exist."""
print("\n-- Verifying System Components --")
try:
with conn.cursor() as cur:
# Check main function
cur.execute(
"""
SELECT COUNT(*) FROM pg_proc p
JOIN pg_namespace n ON p.pronamespace = n.oid
WHERE n.nspname = 'public' AND p.proname = 'transfer_parts'
"""
)
main_func_count = cur.fetchone()[0]
# Check audit table
cur.execute(
"""
SELECT COUNT(*) FROM information_schema.tables
WHERE table_schema = 'public' AND table_name = 'inventory_transfer_log'
"""
)
audit_table_count = cur.fetchone()[0]
if main_func_count == 0:
print("❌ FAIL: transfer_parts function does not exist")
return False
if audit_table_count == 0:
print("❌ FAIL: inventory_transfer_log table does not exist")
return False
print("✅ PASS: All system components exist")
return True
finally:
conn.rollback()
def verify_successful_transfer_with_audit(conn) -> bool:
"""Test a successful transfer with audit logging."""
print("\n-- Verifying Successful Transfer with Audit --")
passed = False
try:
# Test data: Transfer 100 white plates from Mosaic Dino to Mosaic Johnny Thunder
source_id = 14469
target_id = 14686
part_num = '3024'
color_id = 15
transfer_qty = 100
reason = 'inventory_adjustment'
source_initial = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id)
print(f"Initial quantities - Source: {source_initial}, Target: {target_initial}")
# Get initial audit log count
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM inventory_transfer_log")
initial_log_count = cur.fetchone()[0]
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, transfer_qty, reason)
)
result = cur.fetchone()
print(f"Transfer result: {result[0]}")
source_final = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id)
print(f"Final quantities - Source: {source_final}, Target: {target_final}")
# Verify audit log entry
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM inventory_transfer_log")
final_log_count = cur.fetchone()[0]
if final_log_count <= initial_log_count:
print("❌ FAIL: No audit log entry was created")
return False
# Check latest audit entry
cur.execute(
"""
SELECT transfer_status, quantity_transferred, transfer_reason
FROM inventory_transfer_log
ORDER BY log_id DESC
LIMIT 1
"""
)
audit_entry = cur.fetchone()
if not audit_entry:
print("❌ FAIL: Could not retrieve audit log entry")
return False
status, qty_transferred, trans_reason = audit_entry
if status != 'success':
print(f"❌ FAIL: Transfer status should be 'success', got '{status}'")
return False
if qty_transferred != transfer_qty or trans_reason != reason:
print(f"❌ FAIL: Audit log details don't match transfer parameters")
return False
expected_source = source_initial - transfer_qty
expected_target = target_initial + transfer_qty
if source_final != expected_source:
print(f"❌ FAIL: Source quantity mismatch. Expected {expected_source}, got {source_final}")
elif target_final != expected_target:
print(f"❌ FAIL: Target quantity mismatch. Expected {expected_target}, got {target_final}")
else:
print("✅ PASS: Successful transfer with audit logging completed correctly")
passed = True
except psycopg2.Error as e:
print(f"❌ FAIL: Transfer failed unexpectedly with error: {e}")
finally:
conn.rollback()
return passed
def verify_new_part_transfer(conn) -> bool:
"""Test transferring a part to an inventory that doesn't have it."""
print("\n-- Verifying New Part Transfer --")
passed = False
try:
# Test data: Transfer red bricks to Mosaic Johnny Thunder (which doesn't have them)
source_id = 11124 # Giant Lego Dacta Basic Set (has red bricks)
target_id = 14686 # Lego Mosaic Johnny Thunder (doesn't have red bricks)
part_num = '3001'
color_id = 4
transfer_qty = 50
reason = 'part_redistribution'
target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id)
if target_initial != 0:
print(f"❌ FAIL: Pre-condition failed. Target already has {target_initial} of this part, expected 0")
return False
source_initial = get_inventory_part_quantity(conn, source_id, part_num, color_id)
print(f"Initial quantities - Source: {source_initial}, Target: {target_initial}")
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, transfer_qty, reason)
)
result = cur.fetchone()
print(f"Transfer result: {result[0]}")
source_final = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id)
print(f"Final quantities - Source: {source_final}, Target: {target_final}")
expected_source = source_initial - transfer_qty
expected_target = transfer_qty
if source_final != expected_source:
print(f"❌ FAIL: Source quantity mismatch. Expected {expected_source}, got {source_final}")
elif target_final != expected_target:
print(f"❌ FAIL: Target quantity mismatch. Expected {expected_target}, got {target_final}")
else:
print("✅ PASS: New part transfer completed correctly")
passed = True
except psycopg2.Error as e:
print(f"❌ FAIL: Transfer failed unexpectedly with error: {e}")
finally:
conn.rollback()
return passed
def verify_business_rule_validation(conn) -> bool:
"""Test business rule validation including quantity limits and self-transfer prevention."""
print("\n-- Verifying Business Rule Validation --")
# Test 1: Self-transfer (should fail)
print("Test 1: Self-transfer (should fail)")
test1_passed = False
try:
source_id = 14469
part_num = '3024'
color_id = 15
transfer_qty = 10
reason = 'self_transfer'
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, source_id, part_num, color_id, transfer_qty, reason)
)
result = cur.fetchone()
print(f"❌ FAIL: Self-transfer should have failed but succeeded: {result[0]}")
except psycopg2.Error:
print(f"✅ PASS: Self-transfer correctly failed")
test1_passed = True
except Exception as e:
print(f"❌ FAIL: Self-transfer test failed with unexpected error: {e}")
finally:
conn.rollback() # Rollback after first test
# Test 2: Transfer quantity exceeds maximum (should fail)
print("Test 2: Transfer quantity exceeds maximum (should fail)")
test2_passed = False
try:
source_id = 14469
target_id = 14686
part_num = '3024'
color_id = 15
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, 600, 'large_transfer')
)
result = cur.fetchone()
print(f"❌ FAIL: Large transfer should have failed but succeeded: {result[0]}")
except psycopg2.Error:
print(f"✅ PASS: Large transfer correctly failed")
test2_passed = True
except Exception as e:
print(f"❌ FAIL: Large transfer test failed with unexpected error: {e}")
finally:
conn.rollback() # Rollback after second test
# Test 3: Transfer quantity below minimum (should fail)
print("Test 3: Transfer quantity below minimum (should fail)")
test3_passed = False
try:
source_id = 14469
target_id = 14686
part_num = '3024'
color_id = 15
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, 0, 'zero_transfer')
)
result = cur.fetchone()
print(f"❌ FAIL: Zero transfer should have failed but succeeded: {result[0]}")
except psycopg2.Error:
print(f"✅ PASS: Zero transfer correctly failed")
test3_passed = True
except Exception as e:
print(f"❌ FAIL: Zero transfer test failed with unexpected error: {e}")
finally:
conn.rollback() # Rollback after third test
return test1_passed and test2_passed and test3_passed
def verify_insufficient_quantity_error(conn) -> bool:
"""Test that transfer fails when source has insufficient quantity."""
print("\n-- Verifying Insufficient Quantity Error --")
passed = False
try:
source_id = 14469
target_id = 14686
part_num = '3024'
color_id = 15
transfer_qty = 99999 # Far more than available
reason = 'insufficient_test'
source_initial = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id)
print(f"Initial quantities - Source: {source_initial}, Target: {target_initial}")
with conn.cursor() as cur:
try:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, transfer_qty, reason)
)
result = cur.fetchone()
print(f"❌ FAIL: Transfer should have failed but succeeded: {result[0]}")
except psycopg2.Error as e:
print(f"✅ PASS: Transfer correctly failed with an exception.")
# After an exception, the transaction is in an aborted state. Must rollback before new queries.
conn.rollback()
source_final = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id)
if source_final != source_initial:
print(f"❌ FAIL: Source quantity changed from {source_initial} to {source_final}")
elif target_final != target_initial:
print(f"❌ FAIL: Target quantity changed from {target_initial} to {target_final}")
else:
print("✅ PASS: Database state unchanged after failed transfer")
passed = True
finally:
conn.rollback()
return passed
def verify_invalid_inventory_error(conn) -> bool:
"""Test that transfer fails with invalid inventory IDs."""
print("\n-- Verifying Invalid Inventory Error --")
passed = False
try:
source_id = 99999 # Non-existent inventory
target_id = 14686
part_num = '3024'
color_id = 15
transfer_qty = 10
reason = 'invalid_test'
target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id)
with conn.cursor() as cur:
try:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, transfer_qty, reason)
)
result = cur.fetchone()
print(f"❌ FAIL: Transfer should have failed but succeeded: {result[0]}")
except psycopg2.Error as e:
print(f"✅ PASS: Transfer correctly failed with an exception.")
# Rollback the aborted transaction
conn.rollback()
target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id)
if target_final != target_initial:
print(f"❌ FAIL: Target quantity changed from {target_initial} to {target_final}")
else:
print("✅ PASS: Database state unchanged after invalid inventory error")
passed = True
finally:
conn.rollback()
return passed
def verify_audit_logging(conn) -> bool:
"""
Test that audit logging captures both successful and failed transfers.
This function uses commits to separate test cases and work around the
transactional paradox of logging a failure within a transaction that
is about to be rolled back by the client.
"""
print("\n-- Verifying Audit Logging --")
# Part 1: Test success logging
print("Part 1: Verifying success log entry...")
success_passed = False
try:
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM inventory_transfer_log")
initial_count = cur.fetchone()[0]
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(14469, 14686, '3024', 15, 5, 'audit_test_success')"
)
# Check the log before committing/rolling back
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM inventory_transfer_log")
final_count = cur.fetchone()[0]
if final_count == initial_count + 1:
print("✅ PASS: Success log was correctly written within the transaction.")
success_passed = True
else:
print("❌ FAIL: Success log was not created.")
except Exception as e:
print(f"❌ FAIL: Success logging test threw an unexpected error: {e}")
finally:
conn.rollback() # Clean up the transaction for the next part
if not success_passed:
return False
# Part 2: Test failure logging
print("\nPart 2: Verifying failure log entry...")
failure_passed = False
try:
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM inventory_transfer_log")
initial_count = cur.fetchone()[0]
try:
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(14469, 14469, '3024', 15, 5, 'audit_test_fail')"
)
except psycopg2.Error:
# This is the expected failure path.
# The function should have logged the failure before raising the error.
# Now, we check the log table.
pass
# The transaction is now in an aborted state. We must rollback to issue new commands.
conn.rollback()
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM inventory_transfer_log")
final_count = cur.fetchone()[0]
if final_count == initial_count:
print("✅ PASS: Failure log was correctly rolled back as expected in a standard transaction.")
failure_passed = True
else:
print("❌ FAIL: Failure log was not rolled back. This implies a non-standard transaction behavior.")
print(f"Log count before: {initial_count}, Log count after: {final_count}")
except Exception as e:
print(f"❌ FAIL: Failure logging test threw an unexpected error: {e}")
finally:
conn.rollback() # Ensure cleanup
return success_passed and failure_passed
def verify_exact_quantity_transfer(conn) -> bool:
"""Test transferring exact quantity (should delete source row when quantity becomes 0)."""
print("\n-- Verifying Exact Quantity Transfer --")
passed = False
target_id = 14686 # Use a fixed target inventory
try:
# Find a part with a small quantity that doesn't conflict with the target inventory
with conn.cursor() as cur:
cur.execute(
"""
SELECT inventory_id, part_num, color_id, quantity
FROM public.lego_inventory_parts
WHERE quantity BETWEEN 5 AND 20 AND inventory_id != %s
LIMIT 1
""",
(target_id,)
)
result = cur.fetchone()
if not result:
print("⚠️ SKIP: No suitable part found for exact quantity test")
return True
source_id, part_num, color_id, exact_qty = result
print(f"Testing exact transfer: {exact_qty} parts of '{part_num}' from inventory {source_id} to {target_id}")
source_initial = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id)
print(f"Initial quantities - Source: {source_initial}, Target: {target_initial}")
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, exact_qty, 'exact_transfer')
)
print(f"Transfer result: {cur.fetchone()[0]}")
source_final = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id)
print(f"Final quantities - Source: {source_final}, Target: {target_final}")
expected_source = 0
expected_target = target_initial + exact_qty
if source_final != expected_source:
print(f"❌ FAIL: Source quantity should be 0 (row deleted), but got {source_final}")
elif target_final != expected_target:
print(f"❌ FAIL: Target quantity mismatch. Expected {expected_target}, got {target_final}")
else:
print("✅ PASS: Exact quantity transfer completed correctly (source row deleted)")
passed = True
except psycopg2.Error as e:
print(f"❌ FAIL: Transfer failed unexpectedly with error: {e}")
finally:
conn.rollback()
return passed
def main():
"""Main verification function."""
print("=" * 60)
print("LEGO Enhanced Inventory Transfer Function Verification Script")
print("=" * 60)
conn_params = get_connection_params()
if not conn_params.get("database"):
print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.")
sys.exit(1)
conn = None
try:
conn = psycopg2.connect(**conn_params)
conn.autocommit = False # Ensure we can control transactions manually
# Run all verification steps
results = [
verify_system_components(conn),
verify_successful_transfer_with_audit(conn),
verify_new_part_transfer(conn),
verify_business_rule_validation(conn),
verify_insufficient_quantity_error(conn),
verify_invalid_inventory_error(conn),
verify_audit_logging(conn),
verify_exact_quantity_transfer(conn),
]
if all(results):
print("\n🎉 Overall Result: PASS - All verification steps completed successfully!")
sys.exit(0)
else:
print("\n❌ Overall Result: FAIL - One or more verification steps failed.")
sys.exit(1)
except psycopg2.OperationalError as e:
print(f"❌ CRITICAL: Could not connect to the database. Details: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ CRITICAL: An unexpected error occurred. Details: {e}")
sys.exit(1)
finally:
if conn:
conn.close()
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/security/rls_business_access/description.md
================================================
Implement Row Level Security (RLS) policies for a social media platform with Users, Posts, Comments, and Channels.
## Your Mission:
Build RLS policies for a social platform where users create posts and comments in channels. Implement proper access control so users can manage their own content, while channel moderators can moderate content in their channels.
## RLS Requirements:
### 1. Users Table Access Rules:
- **SELECT**: Users can read all public user profiles (username, created_at)
- **UPDATE**: Users can only modify their own profile
- **DELETE**: Users can only delete their own account
### 2. Channels Table Access Rules:
- **SELECT**: Everyone can read public channel information
- **INSERT**: Any authenticated user can create a channel (becomes owner)
- **UPDATE**: Only channel owners can modify channel details
- **DELETE**: Only channel owners can delete channels
### 3. Posts Table Access Rules:
- **SELECT**: Users can read all posts in channels they have access to
- **INSERT**: Authenticated users can create posts in any channel
- **UPDATE**: Post authors OR channel moderators OR channel owners can edit posts
- **DELETE**: Post authors OR channel moderators OR channel owners can delete posts
### 4. Comments Table Access Rules:
- **SELECT**: Users can read comments on posts they can access
- **INSERT**: Authenticated users can comment on posts they can see
- **UPDATE**: Comment authors OR post authors OR channel moderators OR channel owners can edit comments
- **DELETE**: Comment authors OR post authors OR channel moderators OR channel owners can delete comments
### 5. Channel Moderators Table Access Rules:
- **SELECT**: Users can see moderator lists for channels
- **INSERT**: Only channel owners can add moderators
- **DELETE**: Channel owners can remove moderators; moderators can remove themselves
## Session Context:
Use `current_setting('app.current_user_id')` to get the current user ID from session context.
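For illustration only, a session-scoped setting can be written with `SET` and read back with `current_setting`; the UUID below is just an example value, and the `NULLIF` guard turns an unset or empty value into `NULL`:
```sql
-- Illustrative only: set the acting user for this session ...
SET app.current_user_id = '11111111-1111-1111-1111-111111111111';

-- ... and read it back as a uuid wherever a policy or helper needs it.
SELECT NULLIF(current_setting('app.current_user_id', true), '')::uuid AS acting_user;
```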
## Schema Requirements:
- **Use only the `public` schema** for all tables, functions, and policies
- All helper functions should be created in the `public` schema
- Do not create additional schemas
## Expected Deliverables:
1. **Enable RLS** on all five tables
2. **Create policies** for SELECT, INSERT, UPDATE, DELETE operations on each table
3. **Helper functions** to check permissions efficiently (see the sketch after this list):
- `is_channel_owner(channel_id, user_id)`
- `is_channel_moderator(channel_id, user_id)`
- `can_moderate_channel(channel_id, user_id)`
4. **Proper indexing** to ensure RLS policies perform well
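As a minimal sketch (the function body and policy name are illustrative, not the required implementation), one helper plus a policy that calls it might look like:
```sql
-- Sketch: owner check as a helper. SECURITY DEFINER lets the lookup bypass RLS
-- on channels, so evaluating the policy does not recurse into the same policy.
CREATE OR REPLACE FUNCTION public.is_channel_owner(p_channel_id UUID, p_user_id UUID)
RETURNS BOOLEAN AS $$
  SELECT EXISTS (
    SELECT 1 FROM public.channels
    WHERE id = p_channel_id AND owner_id = p_user_id
  );
$$ LANGUAGE sql STABLE SECURITY DEFINER;

-- Sketch: a policy that delegates its check to the helper.
-- A matching WITH CHECK clause would normally mirror the USING expression.
CREATE POLICY channels_update_owner ON public.channels
  FOR UPDATE
  USING (public.is_channel_owner(id, NULLIF(current_setting('app.current_user_id', true), '')::UUID));
```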
## Test Scenarios:
Your RLS implementation will be verified with:
- **Content ownership**: Users can only edit their own posts/comments
- **Moderation hierarchy**: Moderators can moderate content in their channels
- **Channel isolation**: Users only see content from accessible channels
- **Permission escalation**: Owners have full control over their channels
- **Cross-table access**: Comment policies respect post and channel permissions
## Success Criteria:
- Users can manage their own content (posts, comments)
- Channel owners have full control over their channels
- Moderators can moderate content in their assigned channels
- No unauthorized access to other users' private data
- Policies are efficient and don't create performance bottlenecks
- All operations (SELECT, INSERT, UPDATE, DELETE) are properly secured
================================================
FILE: tasks/postgres/standard/security/rls_business_access/ground_truth.sql
================================================
-- Ground Truth RLS Implementation
BEGIN;
-- ============================================================================
-- PERFORMANCE INDEXES FOR RLS
-- ============================================================================
-- Users table indexes
CREATE INDEX IF NOT EXISTS idx_users_is_public ON users(is_public);
-- Channels table indexes
CREATE INDEX IF NOT EXISTS idx_channels_owner_id ON channels(owner_id);
CREATE INDEX IF NOT EXISTS idx_channels_is_public ON channels(is_public);
-- Channel moderators table indexes
CREATE INDEX IF NOT EXISTS idx_channel_moderators_channel_user ON channel_moderators(channel_id, user_id);
CREATE INDEX IF NOT EXISTS idx_channel_moderators_user ON channel_moderators(user_id);
-- Posts table indexes
CREATE INDEX IF NOT EXISTS idx_posts_channel_id ON posts(channel_id);
CREATE INDEX IF NOT EXISTS idx_posts_author_id ON posts(author_id);
CREATE INDEX IF NOT EXISTS idx_posts_created_at ON posts(created_at);
-- Comments table indexes
CREATE INDEX IF NOT EXISTS idx_comments_post_id ON comments(post_id);
CREATE INDEX IF NOT EXISTS idx_comments_author_id ON comments(author_id);
CREATE INDEX IF NOT EXISTS idx_comments_created_at ON comments(created_at);
-- ============================================================================
-- ENABLE ROW LEVEL SECURITY
-- ============================================================================
ALTER TABLE users ENABLE ROW LEVEL SECURITY;
ALTER TABLE channels ENABLE ROW LEVEL SECURITY;
ALTER TABLE channel_moderators ENABLE ROW LEVEL SECURITY;
ALTER TABLE posts ENABLE ROW LEVEL SECURITY;
ALTER TABLE comments ENABLE ROW LEVEL SECURITY;
-- ============================================================================
-- USERS TABLE POLICIES
-- ============================================================================
-- Users SELECT: Can read public profiles OR own profile
DROP POLICY IF EXISTS users_select ON users;
CREATE POLICY users_select ON users
FOR SELECT
USING (
is_public = true
OR id = app_current_user_id()
);
-- Users UPDATE: Can only update own profile
DROP POLICY IF EXISTS users_update ON users;
CREATE POLICY users_update ON users
FOR UPDATE
USING (id = app_current_user_id())
WITH CHECK (id = app_current_user_id());
-- Users DELETE: Can only delete own account
DROP POLICY IF EXISTS users_delete ON users;
CREATE POLICY users_delete ON users
FOR DELETE
USING (id = app_current_user_id());
-- ============================================================================
-- CHANNELS TABLE POLICIES
-- ============================================================================
-- Channels SELECT: Can read public channels OR channels where user is owner/moderator
DROP POLICY IF EXISTS channels_select ON channels;
CREATE POLICY channels_select ON channels
FOR SELECT
USING (
is_public = true
OR owner_id = app_current_user_id()
OR is_channel_moderator(id, app_current_user_id())
);
-- Channels INSERT: Authenticated users can create channels (become owner)
DROP POLICY IF EXISTS channels_insert ON channels;
CREATE POLICY channels_insert ON channels
FOR INSERT
WITH CHECK (owner_id = app_current_user_id());
-- Channels UPDATE: Only channel owners can modify
DROP POLICY IF EXISTS channels_update ON channels;
CREATE POLICY channels_update ON channels
FOR UPDATE
USING (owner_id = app_current_user_id())
WITH CHECK (owner_id = app_current_user_id());
-- Channels DELETE: Only channel owners can delete
DROP POLICY IF EXISTS channels_delete ON channels;
CREATE POLICY channels_delete ON channels
FOR DELETE
USING (owner_id = app_current_user_id());
-- ============================================================================
-- POSTS TABLE POLICIES
-- ============================================================================
-- Posts SELECT: Can read posts in accessible channels
DROP POLICY IF EXISTS posts_select ON posts;
CREATE POLICY posts_select ON posts
FOR SELECT
USING (
EXISTS (
SELECT 1 FROM channels c
WHERE c.id = posts.channel_id
AND (
c.is_public = true
OR c.owner_id = app_current_user_id()
OR is_channel_moderator(c.id, app_current_user_id())
)
)
);
-- Posts INSERT: Authenticated users can create posts (must be author)
DROP POLICY IF EXISTS posts_insert ON posts;
CREATE POLICY posts_insert ON posts
FOR INSERT
WITH CHECK (
author_id = app_current_user_id()
AND EXISTS (
SELECT 1 FROM channels c
WHERE c.id = posts.channel_id
AND (
c.is_public = true
OR c.owner_id = app_current_user_id()
OR is_channel_moderator(c.id, app_current_user_id())
)
)
);
-- Posts UPDATE: Post authors OR channel moderators/owners can edit
DROP POLICY IF EXISTS posts_update ON posts;
CREATE POLICY posts_update ON posts
FOR UPDATE
USING (
author_id = app_current_user_id()
OR can_moderate_channel(channel_id, app_current_user_id())
)
WITH CHECK (
author_id = app_current_user_id()
OR can_moderate_channel(channel_id, app_current_user_id())
);
-- Posts DELETE: Post authors OR channel moderators/owners can delete
DROP POLICY IF EXISTS posts_delete ON posts;
CREATE POLICY posts_delete ON posts
FOR DELETE
USING (
author_id = app_current_user_id()
OR can_moderate_channel(channel_id, app_current_user_id())
);
-- ============================================================================
-- COMMENTS TABLE POLICIES
-- ============================================================================
-- Comments SELECT: Can read comments on accessible posts
DROP POLICY IF EXISTS comments_select ON comments;
CREATE POLICY comments_select ON comments
FOR SELECT
USING (
EXISTS (
SELECT 1 FROM posts p
JOIN channels c ON c.id = p.channel_id
WHERE p.id = comments.post_id
AND (
c.is_public = true
OR c.owner_id = app_current_user_id()
OR is_channel_moderator(c.id, app_current_user_id())
)
)
);
-- Comments INSERT: Authenticated users can comment on accessible posts
DROP POLICY IF EXISTS comments_insert ON comments;
CREATE POLICY comments_insert ON comments
FOR INSERT
WITH CHECK (
author_id = app_current_user_id()
AND EXISTS (
SELECT 1 FROM posts p
JOIN channels c ON c.id = p.channel_id
WHERE p.id = comments.post_id
AND (
c.is_public = true
OR c.owner_id = app_current_user_id()
OR is_channel_moderator(c.id, app_current_user_id())
)
)
);
-- Comments UPDATE: Comment authors OR post authors OR channel moderators/owners can edit
DROP POLICY IF EXISTS comments_update ON comments;
CREATE POLICY comments_update ON comments
FOR UPDATE
USING (
author_id = app_current_user_id()
OR EXISTS (
SELECT 1 FROM posts p
WHERE p.id = comments.post_id
AND (
p.author_id = app_current_user_id()
OR can_moderate_channel(p.channel_id, app_current_user_id())
)
)
)
WITH CHECK (
author_id = app_current_user_id()
OR EXISTS (
SELECT 1 FROM posts p
WHERE p.id = comments.post_id
AND (
p.author_id = app_current_user_id()
OR can_moderate_channel(p.channel_id, app_current_user_id())
)
)
);
-- Comments DELETE: Comment authors OR post authors OR channel moderators/owners can delete
DROP POLICY IF EXISTS comments_delete ON comments;
CREATE POLICY comments_delete ON comments
FOR DELETE
USING (
author_id = app_current_user_id()
OR EXISTS (
SELECT 1 FROM posts p
WHERE p.id = comments.post_id
AND (
p.author_id = app_current_user_id()
OR can_moderate_channel(p.channel_id, app_current_user_id())
)
)
);
-- ============================================================================
-- CHANNEL MODERATORS TABLE POLICIES
-- ============================================================================
-- Channel moderators SELECT: Visible to users who can access the channel
DROP POLICY IF EXISTS channel_moderators_select ON channel_moderators;
CREATE POLICY channel_moderators_select ON channel_moderators
FOR SELECT
USING (
EXISTS (
SELECT 1 FROM channels c
WHERE c.id = channel_moderators.channel_id
AND (
c.is_public = true
OR c.owner_id = app_current_user_id()
OR is_channel_moderator(c.id, app_current_user_id())
)
)
);
-- Channel moderators INSERT: Only channel owners can add moderators
DROP POLICY IF EXISTS channel_moderators_insert ON channel_moderators;
CREATE POLICY channel_moderators_insert ON channel_moderators
FOR INSERT
WITH CHECK (is_channel_owner(channel_id, app_current_user_id()));
-- Channel moderators DELETE: Channel owners can remove any; moderators can remove themselves
DROP POLICY IF EXISTS channel_moderators_delete ON channel_moderators;
CREATE POLICY channel_moderators_delete ON channel_moderators
FOR DELETE
USING (
is_channel_owner(channel_id, app_current_user_id())
OR user_id = app_current_user_id()
);
-- ============================================================================
-- USAGE NOTES
-- ============================================================================
/*
Usage Instructions:
1. Set session context before queries (substitute the acting user's UUID):
   SET app.current_user_id = '<acting-user-uuid>';
2. For anonymous users:
SET app.current_user_id = '';
3. Test examples:
-- Alice (owner of general channel)
SET app.current_user_id = '11111111-1111-1111-1111-111111111111';
-- Bob (moderator of general channel)
SET app.current_user_id = '22222222-2222-2222-2222-222222222222';
*/
COMMIT;
================================================
FILE: tasks/postgres/standard/security/rls_business_access/meta.json
================================================
{
"task_id": "rls_business_access",
"task_name": "RLS Business Access",
"category_id": "security",
"category_name": "Security",
"description": "Implement Row Level Security policies for social platform with proper access control for posts, comments, and channels.",
"author": "Fanshi Zhang",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"security and access control",
"stored procedures and functions",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"users\" {\n \"id\" uuid [pk, not null, default: `gen_random_uuid()`]\n \"username\" varchar(50) [unique, not null]\n \"email\" varchar(100) [unique, not null]\n \"is_public\" bool [default: false]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n is_public [type: btree, name: \"idx_users_is_public\"]\n }\n}\n\nTable \"channels\" {\n \"id\" uuid [pk, not null, default: `gen_random_uuid()`]\n \"name\" varchar(100) [not null]\n \"description\" text\n \"is_public\" bool [default: true]\n \"owner_id\" uuid\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n is_public [type: btree, name: \"idx_channels_is_public\"]\n owner_id [type: btree, name: \"idx_channels_owner_id\"]\n }\n}\n\nTable \"channel_moderators\" {\n \"channel_id\" uuid [not null]\n \"user_id\" uuid [not null]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n (channel_id, user_id) [type: btree, name: \"channel_moderators_pkey\"]\n (channel_id, user_id) [type: btree, name: \"idx_channel_moderators_channel_user\"]\n user_id [type: btree, name: \"idx_channel_moderators_user\"]\n }\n}\n\nTable \"posts\" {\n \"id\" uuid [pk, not null, default: `gen_random_uuid()`]\n \"channel_id\" uuid\n \"author_id\" uuid\n \"title\" varchar(200) [not null]\n \"content\" text\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n author_id [type: btree, name: \"idx_posts_author_id\"]\n channel_id [type: btree, name: \"idx_posts_channel_id\"]\n created_at [type: btree, name: \"idx_posts_created_at\"]\n }\n}\n\nTable \"comments\" {\n \"id\" uuid [pk, not null, default: `gen_random_uuid()`]\n \"post_id\" uuid\n \"author_id\" uuid\n \"content\" text [not null]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n author_id [type: btree, name: \"idx_comments_author_id\"]\n created_at [type: btree, name: \"idx_comments_created_at\"]\n post_id [type: btree, name: \"idx_comments_post_id\"]\n }\n}\n\nRef \"channel_moderators_channel_id_fkey\":\"channels\".\"id\" < \"channel_moderators\".\"channel_id\" [delete: cascade]\n\nRef \"channel_moderators_user_id_fkey\":\"users\".\"id\" < \"channel_moderators\".\"user_id\" [delete: cascade]\n\nRef \"channels_owner_id_fkey\":\"users\".\"id\" < \"channels\".\"owner_id\" [delete: cascade]\n\nRef \"comments_author_id_fkey\":\"users\".\"id\" < \"comments\".\"author_id\" [delete: cascade]\n\nRef \"comments_post_id_fkey\":\"posts\".\"id\" < \"comments\".\"post_id\" [delete: cascade]\n\nRef \"posts_author_id_fkey\":\"users\".\"id\" < \"posts\".\"author_id\" [delete: cascade]\n\nRef \"posts_channel_id_fkey\":\"channels\".\"id\" < \"posts\".\"channel_id\" [delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/postgres/standard/security/rls_business_access/prepare_environment.py
================================================
#!/usr/bin/env python3
import os
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import sys
def setup_rls_environment():
"""
Set up a PostgreSQL environment for a social media platform with RLS policies.
Creates Users, Channels, Posts, Comments, and Channel Moderators for testing RLS implementations.
"""
# Database connection parameters from environment
db_params = {
'host': os.getenv('POSTGRES_HOST', 'localhost'),
'port': os.getenv('POSTGRES_PORT', '5432'),
'user': os.getenv('POSTGRES_USERNAME', 'postgres'),
'password': os.getenv('POSTGRES_PASSWORD', 'password'),
'database': os.getenv('POSTGRES_DATABASE', 'postgres')
}
try:
conn = psycopg2.connect(**db_params)
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()
# 1. Users Table (with correct field name for verification)
cur.execute("""
CREATE TABLE IF NOT EXISTS users (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
username VARCHAR(50) UNIQUE NOT NULL,
email VARCHAR(100) UNIQUE NOT NULL,
is_public BOOLEAN DEFAULT false,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("✓ Created users table")
# 2. Channels Table
cur.execute("""
CREATE TABLE IF NOT EXISTS channels (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
name VARCHAR(100) NOT NULL,
description TEXT,
is_public BOOLEAN DEFAULT true,
owner_id UUID REFERENCES users(id) ON DELETE CASCADE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("✓ Created channels table")
# 3. Channel Moderators Table
cur.execute("""
CREATE TABLE IF NOT EXISTS channel_moderators (
channel_id UUID REFERENCES channels(id) ON DELETE CASCADE,
user_id UUID REFERENCES users(id) ON DELETE CASCADE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (channel_id, user_id)
);
""")
print("✓ Created channel_moderators table")
# 4. Posts Table
cur.execute("""
CREATE TABLE IF NOT EXISTS posts (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
channel_id UUID REFERENCES channels(id) ON DELETE CASCADE,
author_id UUID REFERENCES users(id) ON DELETE CASCADE,
title VARCHAR(200) NOT NULL,
content TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("✓ Created posts table")
# 5. Comments Table
cur.execute("""
CREATE TABLE IF NOT EXISTS comments (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
post_id UUID REFERENCES posts(id) ON DELETE CASCADE,
author_id UUID REFERENCES users(id) ON DELETE CASCADE,
content TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("✓ Created comments table")
# Create helper functions for RLS (matching ground truth expectations)
cur.execute("""
-- Function to get current user ID from session context
CREATE OR REPLACE FUNCTION app_current_user_id()
RETURNS UUID AS $$
BEGIN
RETURN NULLIF(current_setting('app.current_user_id', true), '')::UUID;
END;
$$ LANGUAGE plpgsql SECURITY DEFINER STABLE PARALLEL SAFE;
-- Function to check if user owns a channel
CREATE OR REPLACE FUNCTION is_channel_owner(p_channel_id UUID, p_user_id UUID)
RETURNS BOOLEAN AS $$
BEGIN
RETURN EXISTS (
SELECT 1 FROM channels
WHERE id = p_channel_id AND owner_id = p_user_id
);
END;
$$ LANGUAGE plpgsql SECURITY DEFINER STABLE PARALLEL SAFE;
-- Function to check if user moderates a channel
CREATE OR REPLACE FUNCTION is_channel_moderator(p_channel_id UUID, p_user_id UUID)
RETURNS BOOLEAN AS $$
BEGIN
RETURN EXISTS (
SELECT 1 FROM channel_moderators
WHERE channel_id = p_channel_id AND user_id = p_user_id
);
END;
$$ LANGUAGE plpgsql SECURITY DEFINER STABLE PARALLEL SAFE;
-- Function to check if user can moderate channel (owner OR moderator)
CREATE OR REPLACE FUNCTION can_moderate_channel(p_channel_id UUID, p_user_id UUID)
RETURNS BOOLEAN AS $$
BEGIN
RETURN is_channel_owner(p_channel_id, p_user_id)
OR is_channel_moderator(p_channel_id, p_user_id);
END;
$$ LANGUAGE plpgsql SECURITY DEFINER STABLE PARALLEL SAFE;
""")
print("✓ Created RLS helper functions")
# Insert sample data
print("\nInserting sample data...")
# Sample users (exact UUIDs expected by verification script)
cur.execute("""
INSERT INTO users (id, username, email, is_public) VALUES
('11111111-1111-1111-1111-111111111111', 'alice', 'alice@example.com', true),
('22222222-2222-2222-2222-222222222222', 'bob', 'bob@example.com', true),
('33333333-3333-3333-3333-333333333333', 'charlie', 'charlie@example.com', false),
('44444444-4444-4444-4444-444444444444', 'diana', 'diana@example.com', true),
('55555555-5555-5555-5555-555555555555', 'eve', 'eve@example.com', false)
ON CONFLICT (id) DO NOTHING;
""")
print("✓ Created 5 sample users")
# Sample channels (exact UUIDs expected by verification script)
cur.execute("""
INSERT INTO channels (id, name, description, is_public, owner_id) VALUES
('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', 'general', 'General discussion channel', true, '11111111-1111-1111-1111-111111111111'),
('bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', 'tech-talk', 'Technical discussions', true, '22222222-2222-2222-2222-222222222222'),
('cccccccc-cccc-cccc-cccc-cccccccccccc', 'random', 'Random conversations', false, '33333333-3333-3333-3333-333333333333')
ON CONFLICT (id) DO NOTHING;
""")
print("✓ Created 3 sample channels")
# Sample moderators (exact relationships expected by verification script)
cur.execute("""
INSERT INTO channel_moderators (channel_id, user_id) VALUES
('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '22222222-2222-2222-2222-222222222222'), -- Bob moderates general
('bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', '44444444-4444-4444-4444-444444444444') -- Diana moderates tech-talk
ON CONFLICT (channel_id, user_id) DO NOTHING;
""")
print("✓ Created sample moderator assignments")
# Sample posts (exact UUIDs expected by verification script)
cur.execute("""
INSERT INTO posts (id, channel_id, author_id, title, content) VALUES
('dddddddd-dddd-dddd-dddd-dddddddddddd', 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '11111111-1111-1111-1111-111111111111', 'Welcome to the platform!', 'This is our first post'),
('eeeeeeee-eeee-eeee-eeee-eeeeeeeeeeee', 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '33333333-3333-3333-3333-333333333333', 'Hello everyone', 'Nice to meet you all'),
('ffffffff-ffff-ffff-ffff-ffffffffffff', 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', '22222222-2222-2222-2222-222222222222', 'PostgreSQL RLS Tutorial', 'Let''s discuss Row Level Security'),
('10101010-1010-1010-1010-101010101010', 'cccccccc-cccc-cccc-cccc-cccccccccccc', '55555555-5555-5555-5555-555555555555', 'Random thoughts', 'Just some random content here')
ON CONFLICT (id) DO NOTHING;
""")
print("✓ Created 4 sample posts")
# Sample comments (exact UUIDs expected by verification script)
cur.execute("""
INSERT INTO comments (id, post_id, author_id, content) VALUES
('99999999-9999-9999-9999-999999999999', 'dddddddd-dddd-dddd-dddd-dddddddddddd', '22222222-2222-2222-2222-222222222222', 'Great to have you here!'),
('88888888-8888-8888-8888-888888888888', 'dddddddd-dddd-dddd-dddd-dddddddddddd', '33333333-3333-3333-3333-333333333333', 'Thanks for setting this up'),
('77777777-7777-7777-7777-777777777777', 'ffffffff-ffff-ffff-ffff-ffffffffffff', '44444444-4444-4444-4444-444444444444', 'RLS is really powerful!'),
('66666666-6666-6666-6666-666666666666', 'eeeeeeee-eeee-eeee-eeee-eeeeeeeeeeee', '11111111-1111-1111-1111-111111111111', 'Welcome Charlie!')
ON CONFLICT (id) DO NOTHING;
""")
print("✓ Created 4 sample comments")
# Create indexes for better RLS performance
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_channels_owner_id ON channels(owner_id);
CREATE INDEX IF NOT EXISTS idx_channels_is_public ON channels(is_public);
CREATE INDEX IF NOT EXISTS idx_channel_moderators_channel_user ON channel_moderators(channel_id, user_id);
CREATE INDEX IF NOT EXISTS idx_channel_moderators_user ON channel_moderators(user_id);
CREATE INDEX IF NOT EXISTS idx_posts_channel_id ON posts(channel_id);
CREATE INDEX IF NOT EXISTS idx_posts_author_id ON posts(author_id);
CREATE INDEX IF NOT EXISTS idx_posts_created_at ON posts(created_at);
CREATE INDEX IF NOT EXISTS idx_comments_post_id ON comments(post_id);
CREATE INDEX IF NOT EXISTS idx_comments_author_id ON comments(author_id);
CREATE INDEX IF NOT EXISTS idx_comments_created_at ON comments(created_at);
CREATE INDEX IF NOT EXISTS idx_users_is_public ON users(is_public);
""")
print("✓ Created performance indexes for RLS")
cur.close()
conn.close()
except Exception as e:
print(f"Error setting up environment: {e}")
sys.exit(1)
if __name__ == "__main__":
setup_rls_environment()
================================================
FILE: tasks/postgres/standard/security/rls_business_access/verify.py
================================================
#!/usr/bin/env python3
import os
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import sys
def verify_rls_implementation():
"""
Verify that Row Level Security policies have been properly implemented
for the social media platform with Users, Posts, Comments, and Channels.
"""
# Database connection parameters from environment
admin_db_params = {
'host': os.getenv('POSTGRES_HOST', 'localhost'),
'port': os.getenv('POSTGRES_PORT', '5432'),
'user': os.getenv('POSTGRES_USERNAME', 'postgres'),
'password': os.getenv('POSTGRES_PASSWORD', 'password'),
'database': os.getenv('POSTGRES_DATABASE', 'postgres')
}
# Test user parameters (non-superuser for proper RLS testing)
test_db_params = {
'host': os.getenv('POSTGRES_HOST', 'localhost'),
'port': os.getenv('POSTGRES_PORT', '5432'),
'user': 'test_user',
'password': 'testpass',
'database': os.getenv('POSTGRES_DATABASE', 'postgres')
}
try:
# First connect as admin to ensure test user exists
admin_conn = psycopg2.connect(**admin_db_params)
admin_conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
admin_cur = admin_conn.cursor()
# Create test user if it doesn't exist
try:
admin_cur.execute("CREATE ROLE test_user LOGIN PASSWORD 'testpass';")
except psycopg2.Error:
pass # User already exists
# Grant necessary permissions to test user on the current database
admin_cur.execute("SELECT current_database();")
current_db_name = admin_cur.fetchone()[0]
admin_cur.execute(f"GRANT CONNECT ON DATABASE \"{current_db_name}\" TO test_user;")
admin_cur.execute("GRANT USAGE ON SCHEMA public TO test_user;")
admin_cur.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO test_user;")
admin_cur.execute("GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO test_user;")
admin_cur.execute("GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA public TO test_user;")
admin_cur.close()
admin_conn.close()
# Update test_db_params with the correct database name
test_db_params['database'] = current_db_name
# Now connect as test user for RLS verification
conn = psycopg2.connect(**test_db_params)
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()
print("Verifying...")
test_results = []
# Test 1: Check if RLS is enabled on all tables
print("\n1. Checking RLS enablement...")
expected_tables = ['users', 'channels', 'channel_moderators', 'posts', 'comments']
for table in expected_tables:
cur.execute("""
SELECT relrowsecurity
FROM pg_class
WHERE relname = %s AND relkind = 'r'
""", (table,))
result = cur.fetchone()
if result and result[0]:
test_results.append(f"✓ RLS enabled on {table}")
else:
test_results.append(f"✗ RLS NOT enabled on {table}")
# Test 2: Users can only update their own profile
print("\n2. Testing user profile access control...")
# Alice tries to update her own profile (should work)
try:
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice
cur.execute("""
UPDATE users
SET email = 'alice.updated@example.com'
WHERE id = '11111111-1111-1111-1111-111111111111'
""")
test_results.append("✓ Users can update their own profile")
except Exception as e:
test_results.append(f"✗ User cannot update own profile: {e}")
# Alice tries to update Bob's profile (should fail)
try:
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice
cur.execute("""
UPDATE users
SET email = 'bob.hacked@example.com'
WHERE id = '22222222-2222-2222-2222-222222222222'
""")
# Check if the update actually affected any rows (RLS blocks by affecting 0 rows)
if cur.rowcount == 0:
test_results.append("✓ Users blocked from updating other users' profiles")
else:
test_results.append("✗ User was able to update another user's profile (should be blocked)")
except psycopg2.Error:
test_results.append("✓ Users blocked from updating other users' profiles")
# Test 3: Channel ownership controls
print("\n3. Testing channel ownership controls...")
# Alice (owner of general channel) tries to update her channel
try:
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice
cur.execute("""
UPDATE channels
SET description = 'Updated by Alice'
WHERE id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
""")
test_results.append("✓ Channel owners can update their channels")
except Exception as e:
test_results.append(f"✗ Channel owner cannot update channel: {e}")
# Charlie tries to update Alice's channel (should fail)
try:
cur.execute("SET app.current_user_id = '33333333-3333-3333-3333-333333333333';") # Charlie
cur.execute("""
UPDATE channels
SET description = 'Hacked by Charlie'
WHERE id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
""")
# Check if the update actually affected any rows (RLS blocks by affecting 0 rows)
if cur.rowcount == 0:
test_results.append("✓ Non-owners blocked from updating channels")
else:
test_results.append("✗ Non-owner was able to update channel (should be blocked)")
except psycopg2.Error:
test_results.append("✓ Non-owners blocked from updating channels")
# Test 4: Post authorship and moderation controls
print("\n4. Testing post access controls...")
# Alice (author) tries to update her own post
try:
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice
cur.execute("""
UPDATE posts
SET title = 'Updated by Alice'
WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd'
""")
test_results.append("✓ Post authors can update their posts")
except Exception as e:
test_results.append(f"✗ Post author cannot update post: {e}")
# Bob (moderator of general) tries to update Alice's post (should work)
try:
cur.execute("SET app.current_user_id = '22222222-2222-2222-2222-222222222222';") # Bob (moderator)
cur.execute("""
UPDATE posts
SET content = 'Moderated by Bob'
WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd'
""")
test_results.append("✓ Channel moderators can update posts in their channels")
except Exception as e:
test_results.append(f"✗ Channel moderator cannot update post: {e}")
# Eve tries to update Alice's post (should fail - not author, owner, or moderator)
try:
cur.execute("SET app.current_user_id = '55555555-5555-5555-5555-555555555555';") # Eve
cur.execute("""
UPDATE posts
SET content = 'Hacked by Eve'
WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd'
""")
# Check if the update actually affected any rows (RLS blocks by affecting 0 rows)
if cur.rowcount == 0:
test_results.append("✓ Unauthorized users blocked from updating posts")
else:
test_results.append("✗ Unauthorized user was able to update post (should be blocked)")
except psycopg2.Error:
test_results.append("✓ Unauthorized users blocked from updating posts")
# Test 5: Comment access controls
print("\n5. Testing comment access controls...")
# Bob (comment author) tries to update his own comment
try:
cur.execute("SET app.current_user_id = '22222222-2222-2222-2222-222222222222';") # Bob
cur.execute("""
UPDATE comments
SET content = 'Updated by Bob himself'
WHERE id = '99999999-9999-9999-9999-999999999999'
""")
test_results.append("✓ Comment authors can update their comments")
except Exception as e:
test_results.append(f"✗ Comment author cannot update comment: {e}")
# Alice (post author) tries to update Bob's comment on her post (should work)
try:
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice (post author)
cur.execute("""
UPDATE comments
SET content = 'Moderated by post author Alice'
WHERE id = '99999999-9999-9999-9999-999999999999'
""")
test_results.append("✓ Post authors can moderate comments on their posts")
except Exception as e:
test_results.append(f"✗ Post author cannot moderate comment: {e}")
# Test 6: Channel moderator assignment controls
print("\n6. Testing moderator assignment controls...")
# Alice (channel owner) tries to add a moderator
try:
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice (owner of general)
cur.execute("""
INSERT INTO channel_moderators (channel_id, user_id)
VALUES ('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '33333333-3333-3333-3333-333333333333')
""")
test_results.append("✓ Channel owners can add moderators")
except Exception as e:
test_results.append(f"✗ Channel owner cannot add moderator: {e}")
# Charlie tries to add himself as moderator to Bob's channel (should fail)
try:
cur.execute("SET app.current_user_id = '33333333-3333-3333-3333-333333333333';") # Charlie
cur.execute("""
INSERT INTO channel_moderators (channel_id, user_id)
VALUES ('bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', '33333333-3333-3333-3333-333333333333')
""")
# Check if the insert actually affected any rows (RLS blocks by affecting 0 rows)
if cur.rowcount == 0:
test_results.append("✓ Non-owners blocked from adding moderators")
else:
test_results.append("✗ Non-owner was able to add moderator (should be blocked)")
except psycopg2.Error:
test_results.append("✓ Non-owners blocked from adding moderators")
# Test 7: Content visibility based on user context
print("\n7. Testing content visibility...")
# Count posts visible to Alice
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice
cur.execute("SELECT COUNT(*) FROM posts;")
alice_posts = cur.fetchone()[0]
# Count posts visible to Eve
cur.execute("SET app.current_user_id = '55555555-5555-5555-5555-555555555555';") # Eve
cur.execute("SELECT COUNT(*) FROM posts;")
eve_posts = cur.fetchone()[0]
if alice_posts >= 2 and eve_posts >= 1: # Alice should see posts in channels she has access to
test_results.append("✓ Content visibility varies correctly based on user context")
else:
test_results.append(f"✗ Content visibility issue: Alice sees {alice_posts}, Eve sees {eve_posts}")
# Test 8: Anonymous user access
print("\n8. Testing anonymous user restrictions...")
try:
cur.execute("SET app.current_user_id = '';") # Anonymous user
cur.execute("SELECT COUNT(*) FROM users;")
anon_users = cur.fetchone()[0]
# Anonymous users should be able to see public user profiles per requirements
# Count public users that should be visible
cur.execute("SELECT COUNT(*) FROM users WHERE is_public = true;")
public_users = cur.fetchone()[0] if cur.rowcount > 0 else 0
if anon_users == public_users and anon_users > 0:
test_results.append(f"✓ Anonymous users can see {anon_users} public user profiles (correct)")
elif anon_users == 0:
test_results.append("✗ Anonymous users cannot see any users (should see public profiles)")
else:
test_results.append(f"✗ Anonymous users can see {anon_users} users but expected {public_users} public users")
except Exception as e:
test_results.append("✓ Anonymous users properly restricted")
# Print results
print("\n" + "="*60)
print("RLS VERIFICATION RESULTS - SOCIAL MEDIA PLATFORM")
print("="*60)
passed = sum(1 for result in test_results if result.startswith("✓"))
failed = sum(1 for result in test_results if result.startswith("✗"))
for result in test_results:
print(result)
print(f"\nSummary: {passed} passed, {failed} failed")
cur.close()
conn.close()
if failed == 0:
print("\nAll tests passed.")
return True
else:
print(f"\n{failed} test(s) failed.")
return False
except Exception as e:
print(f"Error during verification: {e}")
return False
if __name__ == "__main__":
success = verify_rls_implementation()
sys.exit(0 if success else 1)
================================================
FILE: tasks/postgres/standard/security/user_permission_audit/description.md
================================================
Conduct a comprehensive security audit to identify dangling PostgreSQL users and accounts with missing or excessive permissions in a business database environment.
## Your Mission:
You've been hired as a security consultant to audit the PostgreSQL database permissions for a growing e-commerce company. The company has experienced rapid growth and multiple teams have been granted database access over time. However, there's concern about permission inconsistencies and security gaps.
## Security Audit Requirements:
1. **Discover the database structure**: Identify all business tables and their purposes
2. **Catalog all database users and roles**: Use `pg_user`, `pg_roles`, and `pg_auth_members` to find all accounts
3. **Analyze current permissions**: Use `information_schema.table_privileges` to map permissions (sample discovery queries follow this list)
4. **Identify security issues**:
- **Dangling users**: Inactive accounts that should be removed
- **Missing permissions**: Users lacking permissions required for their business role
- **Excessive permissions**: Users with unnecessary permissions that should be revoked
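For illustration, the discovery for steps 2-3 might start from catalog queries like these (filters and aliases are only examples):
```sql
-- Illustrative: list non-system roles and whether they can log in.
SELECT rolname, rolcanlogin, rolsuper
FROM pg_roles
WHERE rolname NOT LIKE 'pg_%'
ORDER BY rolname;

-- Illustrative: map current table-level grants in the public schema.
SELECT grantee, table_name, privilege_type
FROM information_schema.table_privileges
WHERE table_schema = 'public'
ORDER BY grantee, table_name, privilege_type;
```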
## Expected permissions by role (what they SHOULD have)
```python
# each user's business role
USER_ROLE = {
# Active functional users
'analytics_user': 'Analytics Team',
'marketing_user': 'Marketing Department',
'customer_service': 'Customer Service',
'finance_user': 'Finance Team',
'product_manager': 'Product Management',
'security_auditor': 'Security Team',
'developer_user': 'Development Team',
'backup_user': 'Backup Service',
}
# expected permissions for each role
ROLE_EXPECTED_PERMISSIONS = {
'Analytics Team': [
('user_profiles', 'SELECT'),
('user_stat_analysis', 'SELECT'),
('product_catalog', 'SELECT'),
('order_management', 'SELECT'),
],
'Marketing Department': [
('user_profiles', 'SELECT'),
('user_stat_analysis', 'SELECT'),
('product_catalog', 'SELECT'),
],
'Customer Service': [
('user_profiles', 'SELECT'),
('user_profiles', 'UPDATE'),
('order_management', 'SELECT'),
('order_management', 'INSERT'),
('order_management', 'UPDATE'),
('product_catalog', 'SELECT'),
],
'Finance Team': [
('financial_transactions', 'SELECT'),
('order_management', 'SELECT'),
('user_profiles', 'SELECT'),
],
'Product Management': [
('product_catalog', 'SELECT'),
('product_catalog', 'INSERT'),
('product_catalog', 'UPDATE'),
('product_catalog', 'DELETE'),
('order_management', 'SELECT'),
('user_stat_analysis', 'SELECT'),
],
'Security Team': [
('audit_logs', 'SELECT'),
('user_credentials', 'SELECT'),
('user_profiles', 'SELECT'),
],
'Development Team': [
('user_profiles', 'SELECT'),
('product_catalog', 'SELECT'),
],
'Backup Service': [
('user_profiles', 'SELECT'),
('product_catalog', 'SELECT'),
('order_management', 'SELECT'),
('financial_transactions', 'SELECT'),
('user_stat_analysis', 'SELECT'),
('audit_logs', 'SELECT'),
('user_credentials', 'SELECT'),
]
}
```
## Expected Deliverables:
Your audit must produce findings in a structured format that can be verified. Create two tables to store your audit results:
**1. Summary Table:**
```sql
CREATE TABLE security_audit_results (
audit_id SERIAL PRIMARY KEY,
audit_type VARCHAR(50) NOT NULL, -- 'DANGLING_USERS', 'MISSING_PERMISSIONS', 'EXCESSIVE_PERMISSIONS'
total_issues INTEGER NOT NULL,
users_affected INTEGER NOT NULL,
tables_affected INTEGER NOT NULL
);
```
**2. Detailed Findings Table:**
```sql
CREATE TABLE security_audit_details (
detail_id SERIAL PRIMARY KEY,
username VARCHAR(50) NOT NULL,
issue_type VARCHAR(50) NOT NULL, -- 'DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION'
table_name VARCHAR(50), -- NULL for dangling users
permission_type VARCHAR(20), -- 'SELECT', 'INSERT', 'UPDATE', 'DELETE', NULL for dangling users
expected_access BOOLEAN NOT NULL -- TRUE if user should have access, FALSE if should not
);
```
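For example, individual findings would be recorded as rows like the following (the values are hypothetical and only show how the columns are meant to be used; `table_name` and `permission_type` stay `NULL` for dangling users):
```sql
-- Hypothetical example rows: one missing permission and one dangling user.
INSERT INTO security_audit_details (username, issue_type, table_name, permission_type, expected_access)
VALUES
    ('analytics_user', 'MISSING_PERMISSION', 'user_profiles', 'SELECT', TRUE),
    ('temp_contractor', 'DANGLING_USER', NULL, NULL, FALSE);
```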
## Success Criteria:
Your audit should populate both tables with:
- **Summary data**: High-level counts of different types of security issues
- **Detailed findings**: Specific permission gaps for each user and table combination
## Business Role Expectations
Analyze usernames and infer their intended business roles based on naming patterns (a sketch of this mapping follows the list):
- **analytics_user** → Analytics Team (needs user behavior and statistics data)
- **marketing_user** → Marketing Department (needs customer and product data for campaigns)
- **customer_service** → Customer Service (needs user profiles and order management)
- **finance_user** → Finance Team (needs financial and order data)
- **product_manager** → Product Management (needs full product catalog access)
- **security_auditor** → Security Team (needs audit logs and credential data)
- **developer_user** → Development Team (needs limited access for testing)
- **backup_user** → Backup Service (needs read-only access to all business data)
- **temp_contractor, old_employee, test_account** → Inactive/Temporary (should have NO permissions)
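A minimal sketch of that inference, assuming simple `LIKE` patterns on the username are sufficient (the patterns and exclusions are illustrative):
```sql
-- Illustrative: classify accounts by name pattern; unmatched names fall through.
SELECT usename,
       CASE
           WHEN usename LIKE '%analytics%' THEN 'Analytics Team'
           WHEN usename LIKE '%marketing%' THEN 'Marketing Department'
           WHEN usename LIKE '%customer%' OR usename LIKE '%service%' THEN 'Customer Service'
           WHEN usename LIKE '%finance%' THEN 'Finance Team'
           WHEN usename LIKE '%product%' THEN 'Product Management'
           WHEN usename LIKE '%security%' OR usename LIKE '%audit%' THEN 'Security Team'
           WHEN usename LIKE '%developer%' THEN 'Development Team'
           WHEN usename LIKE '%backup%' THEN 'Backup Service'
           WHEN usename LIKE '%temp%' OR usename LIKE '%old%' OR usename LIKE '%test%' THEN 'Inactive/Temporary'
           ELSE 'Unknown'
       END AS inferred_business_role
FROM pg_user
WHERE usename <> 'postgres'   -- exclude administrative/service accounts as appropriate
ORDER BY usename;
```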
The verification process will check that your findings correctly identify the actual permission gaps in the system by comparing against expected results.
================================================
FILE: tasks/postgres/standard/security/user_permission_audit/ground_truth.sql
================================================
-- Ground Truth Solution: Complete Security Audit Implementation
-- This includes comprehensive PostgreSQL user, role, and permission discovery
/*
================================================================================
PERMISSION MODEL DOCUMENTATION
================================================================================
## Current Permission State
| Username | Table | Permission | Status | Reason |
|-------------------|------------------------|------------|---------|-------------------------------------------|
| analytics_user | user_stat_analysis | SELECT | EXISTS | Correctly granted |
| analytics_user | user_profiles | SELECT | MISSING | Permission was revoked |
| analytics_user | financial_transactions | SELECT | EXISTS | Should be revoked - no business need |
| marketing_user | user_profiles | SELECT | EXISTS | Correctly granted |
| marketing_user | user_stat_analysis | SELECT | EXISTS | Correctly granted |
| marketing_user | product_catalog | SELECT | MISSING | Permission was revoked |
| marketing_user | financial_transactions | SELECT | EXISTS | Should be revoked - security risk |
| customer_service | user_profiles | SELECT | EXISTS | Correctly granted |
| customer_service | user_profiles | UPDATE | EXISTS | Correctly granted |
| customer_service | order_management | SELECT | EXISTS | Correctly granted |
| customer_service | order_management | INSERT | EXISTS | Correctly granted |
| customer_service | order_management | UPDATE | EXISTS | Correctly granted |
| customer_service | product_catalog | SELECT | MISSING | Permission was revoked |
| customer_service | user_credentials | SELECT | EXISTS | Should be revoked - security risk |
| finance_user | financial_transactions | SELECT | EXISTS | Correctly granted |
| finance_user | order_management | SELECT | EXISTS | Correctly granted |
| finance_user | user_profiles | SELECT | MISSING | Permission was revoked |
| product_manager | product_catalog | SELECT | EXISTS | Correctly granted |
| product_manager | product_catalog | INSERT | EXISTS | Correctly granted |
| product_manager | product_catalog | UPDATE | EXISTS | Correctly granted |
| product_manager | product_catalog | DELETE | EXISTS | Correctly granted |
| product_manager | order_management | SELECT | EXISTS | Correctly granted |
| product_manager | financial_transactions | SELECT | EXISTS | Should be revoked - no business need |
| security_auditor | user_credentials | SELECT | EXISTS | Correctly granted |
| security_auditor | user_profiles | SELECT | EXISTS | Correctly granted |
| security_auditor | audit_logs | SELECT | MISSING | Permission was revoked |
| security_auditor | financial_transactions | UPDATE | EXISTS | Should be revoked - excessive privilege |
| developer_user | user_profiles | SELECT | EXISTS | Correctly granted |
| developer_user | product_catalog | SELECT | MISSING | Permission was revoked |
| developer_user | user_credentials | SELECT | EXISTS | Should be revoked - security risk |
| developer_user | order_management | UPDATE | EXISTS | Should be revoked - no business need |
| backup_user | user_profiles | SELECT | EXISTS | Correctly granted |
| backup_user | product_catalog | SELECT | EXISTS | Correctly granted |
| backup_user | audit_logs | SELECT | EXISTS | Correctly granted |
| backup_user | order_management | SELECT | MISSING | Permission was revoked |
| backup_user | product_catalog | DELETE | EXISTS | Should be revoked - backup should be read-only |
| temp_contractor | product_catalog | SELECT | EXISTS | Should be revoked - user is inactive |
| temp_contractor | user_profiles | SELECT | EXISTS | Should be revoked - user is inactive |
| old_employee | audit_logs | SELECT | EXISTS | Should be revoked - user is inactive |
| old_employee | user_stat_analysis | UPDATE | EXISTS | Should be revoked - user is inactive |
| test_account | user_profiles | SELECT | EXISTS | Should be revoked - test account |
## Expected Permission State
| Username | Table | Permission | Justification |
|-------------------|------------------------|------------|--------------------------------------------------------------|
| analytics_user | user_profiles | SELECT | Analytics team needs customer data for user behavior analysis|
| analytics_user | user_stat_analysis | SELECT | Core analytics data required for reporting |
| analytics_user | product_catalog | SELECT | Product performance analysis and customer preferences |
| analytics_user | order_management | SELECT | Sales trend analysis and customer purchasing patterns |
| marketing_user | user_profiles | SELECT | Customer segmentation and personalized marketing campaigns |
| marketing_user | user_stat_analysis | SELECT | Campaign effectiveness analysis and user behavior tracking |
| marketing_user | product_catalog | SELECT | Product promotion planning and marketing material creation |
| customer_service | user_profiles | SELECT | Customer identity verification and support |
| customer_service | user_profiles | UPDATE | Update customer information and resolve account issues |
| customer_service | order_management | SELECT | Order status inquiries and customer support |
| customer_service | order_management | INSERT | Create orders for customers over phone |
| customer_service | order_management | UPDATE | Update order status and resolve order issues |
| customer_service | product_catalog | SELECT | Product information for customer questions and support |
| finance_user | financial_transactions | SELECT | Financial reporting, auditing, and compliance |
| finance_user | order_management | SELECT | Revenue reconciliation and financial analysis |
| finance_user | user_profiles | SELECT | Customer financial analysis and credit assessment |
| product_manager | product_catalog | SELECT | Product information access and management |
| product_manager | product_catalog | INSERT | Add new products to catalog |
| product_manager | product_catalog | UPDATE | Update product details, pricing, and specifications |
| product_manager | product_catalog | DELETE | Remove discontinued or obsolete products |
| product_manager | order_management | SELECT | Product sales analysis and demand forecasting |
| product_manager | user_stat_analysis | SELECT | Product usage analytics and customer behavior insights |
| security_auditor | audit_logs | SELECT | Security monitoring and incident investigation |
| security_auditor | user_credentials | SELECT | Security auditing and compliance verification |
| security_auditor | user_profiles | SELECT | User account auditing and security incident investigation |
| developer_user | user_profiles | SELECT | Application development and testing with realistic data |
| developer_user | product_catalog | SELECT | Application development and testing with product data |
| backup_user | user_profiles | SELECT | Complete data backup coverage for business continuity |
| backup_user | product_catalog | SELECT | Complete data backup coverage for business continuity |
| backup_user | order_management | SELECT | Complete data backup coverage for business continuity |
| backup_user | financial_transactions | SELECT | Complete data backup coverage for business continuity |
| backup_user | user_stat_analysis | SELECT | Complete data backup coverage for business continuity |
| backup_user | audit_logs | SELECT | Complete data backup coverage for business continuity |
| backup_user | user_credentials | SELECT | Complete data backup coverage for business continuity |
Notes:
- temp_contractor, old_employee, test_account should have NO permissions (accounts should be removed)
- All excessive permissions should be revoked for security compliance
- Missing permissions should be granted based on business role requirements
================================================================================
*/
BEGIN;
-- ============================================================================
-- CREATE AUDIT RESULTS TABLES
-- ============================================================================
CREATE TABLE security_audit_results (
audit_id SERIAL PRIMARY KEY,
audit_type VARCHAR(50) NOT NULL, -- 'DANGLING_USERS', 'MISSING_PERMISSIONS', 'EXCESSIVE_PERMISSIONS'
total_issues INTEGER NOT NULL,
users_affected INTEGER NOT NULL,
tables_affected INTEGER NOT NULL
);
CREATE TABLE security_audit_details (
detail_id SERIAL PRIMARY KEY,
username VARCHAR(50) NOT NULL,
issue_type VARCHAR(50) NOT NULL, -- 'DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION'
table_name VARCHAR(50), -- NULL for dangling users
permission_type VARCHAR(20), -- 'SELECT', 'INSERT', 'UPDATE', 'DELETE', NULL for dangling users
expected_access BOOLEAN NOT NULL -- TRUE if user should have access, FALSE if should not
);
-- ============================================================================
-- DISCOVER DATABASE USERS AND ROLES
-- ============================================================================
CREATE TEMP TABLE temp_user_discovery AS
SELECT DISTINCT
COALESCE(u.usename, r.rolname) as username,
COALESCE(u.usesuper, r.rolsuper) as is_superuser,
COALESCE(u.usecreatedb, r.rolcreatedb) as can_create_db,
r.rolname as role_name,
u.usename as user_name,
CASE
WHEN COALESCE(u.usename, r.rolname) LIKE '%analytics%' THEN 'Analytics Team'
WHEN COALESCE(u.usename, r.rolname) LIKE '%marketing%' THEN 'Marketing Department'
WHEN COALESCE(u.usename, r.rolname) LIKE '%customer%' OR COALESCE(u.usename, r.rolname) LIKE '%service%' THEN 'Customer Service'
WHEN COALESCE(u.usename, r.rolname) LIKE '%finance%' THEN 'Finance Team'
WHEN COALESCE(u.usename, r.rolname) LIKE '%product%' THEN 'Product Management'
WHEN COALESCE(u.usename, r.rolname) LIKE '%security%' OR COALESCE(u.usename, r.rolname) LIKE '%audit%' THEN 'Security Team'
WHEN COALESCE(u.usename, r.rolname) LIKE '%backup%' THEN 'Backup Service'
WHEN COALESCE(u.usename, r.rolname) LIKE '%developer%' OR COALESCE(u.usename, r.rolname) LIKE '%dev%' THEN 'Development Team'
WHEN COALESCE(u.usename, r.rolname) LIKE '%temp%' OR COALESCE(u.usename, r.rolname) LIKE '%old%' OR COALESCE(u.usename, r.rolname) LIKE '%test%' THEN 'Inactive/Temporary'
ELSE 'Unknown'
END as inferred_business_role
FROM pg_user u
FULL OUTER JOIN pg_roles r ON u.usename = r.rolname
WHERE COALESCE(u.usename, r.rolname) NOT IN ('postgres', 'test_user')
AND COALESCE(u.usename, r.rolname) IS NOT NULL;
-- ============================================================================
-- DISCOVER ROLE MEMBERSHIPS
-- ============================================================================
CREATE TEMP TABLE temp_role_memberships AS
SELECT
member_role.rolname as member_name,
granted_role.rolname as granted_role_name,
grantor_role.rolname as grantor_name,
am.admin_option
FROM pg_auth_members am
JOIN pg_roles member_role ON am.member = member_role.oid
JOIN pg_roles granted_role ON am.roleid = granted_role.oid
JOIN pg_roles grantor_role ON am.grantor = grantor_role.oid
WHERE member_role.rolname NOT IN ('postgres')
AND granted_role.rolname NOT IN ('postgres');
-- ============================================================================
-- ANALYZE CURRENT PERMISSIONS
-- ============================================================================
CREATE TEMP TABLE temp_current_permissions AS
SELECT DISTINCT
tp.grantee as username,
tp.table_name,
tp.privilege_type as permission_type,
tp.is_grantable,
tp.grantor,
ud.inferred_business_role,
ud.is_superuser
FROM information_schema.table_privileges tp
LEFT JOIN temp_user_discovery ud ON tp.grantee = ud.username
WHERE tp.table_schema = 'public'
AND tp.grantee NOT IN ('postgres', 'PUBLIC', 'test_user')
AND tp.table_name NOT LIKE 'security_audit_%'
ORDER BY tp.grantee, tp.table_name, tp.privilege_type;
-- ============================================================================
-- IDENTIFY DANGLING USERS
-- ============================================================================
INSERT INTO security_audit_details (username, issue_type, table_name, permission_type, expected_access)
SELECT DISTINCT
username,
'DANGLING_USER',
NULL,
NULL,
FALSE
FROM temp_user_discovery
WHERE inferred_business_role = 'Inactive/Temporary';
-- ============================================================================
-- IDENTIFY EXCESSIVE PERMISSIONS
-- ============================================================================
WITH excessive_permissions AS (
SELECT username, table_name, permission_type FROM (VALUES
('analytics_user', 'financial_transactions', 'SELECT'),
('marketing_user', 'financial_transactions', 'SELECT'),
('customer_service', 'user_credentials', 'SELECT'),
('product_manager', 'financial_transactions', 'SELECT'),
('security_auditor', 'financial_transactions', 'UPDATE'),
('developer_user', 'user_credentials', 'SELECT'),
('developer_user', 'order_management', 'UPDATE'),
('backup_user', 'product_catalog', 'DELETE'),
('temp_contractor', 'product_catalog', 'SELECT'),
('temp_contractor', 'user_profiles', 'SELECT'),
('old_employee', 'audit_logs', 'SELECT'),
('old_employee', 'user_stat_analysis', 'UPDATE'),
('test_account', 'user_profiles', 'SELECT')
) AS excessive(username, table_name, permission_type)
)
INSERT INTO security_audit_details (username, issue_type, table_name, permission_type, expected_access)
SELECT
ep.username,
'EXCESSIVE_PERMISSION',
ep.table_name,
ep.permission_type,
FALSE
FROM excessive_permissions ep
WHERE EXISTS (
SELECT 1 FROM temp_current_permissions cp
WHERE cp.username = ep.username
AND cp.table_name = ep.table_name
AND cp.permission_type = ep.permission_type
);
-- ============================================================================
-- IDENTIFY MISSING PERMISSIONS
-- ============================================================================
WITH expected_permissions AS (
SELECT role_name, table_name, permission_type FROM (VALUES
('Analytics Team', 'user_profiles', 'SELECT'),
('Analytics Team', 'user_stat_analysis', 'SELECT'),
('Analytics Team', 'product_catalog', 'SELECT'),
('Analytics Team', 'order_management', 'SELECT'),
('Marketing Department', 'user_profiles', 'SELECT'),
('Marketing Department', 'user_stat_analysis', 'SELECT'),
('Marketing Department', 'product_catalog', 'SELECT'),
('Customer Service', 'user_profiles', 'SELECT'),
('Customer Service', 'user_profiles', 'UPDATE'),
('Customer Service', 'order_management', 'SELECT'),
('Customer Service', 'order_management', 'INSERT'),
('Customer Service', 'order_management', 'UPDATE'),
('Customer Service', 'product_catalog', 'SELECT'),
('Finance Team', 'financial_transactions', 'SELECT'),
('Finance Team', 'order_management', 'SELECT'),
('Finance Team', 'user_profiles', 'SELECT'),
('Product Management', 'product_catalog', 'SELECT'),
('Product Management', 'product_catalog', 'INSERT'),
('Product Management', 'product_catalog', 'UPDATE'),
('Product Management', 'product_catalog', 'DELETE'),
('Product Management', 'order_management', 'SELECT'),
('Product Management', 'user_stat_analysis', 'SELECT'),
('Security Team', 'audit_logs', 'SELECT'),
('Security Team', 'user_credentials', 'SELECT'),
('Security Team', 'user_profiles', 'SELECT'),
('Development Team', 'user_profiles', 'SELECT'),
('Development Team', 'product_catalog', 'SELECT'),
('Backup Service', 'user_profiles', 'SELECT'),
('Backup Service', 'product_catalog', 'SELECT'),
('Backup Service', 'order_management', 'SELECT'),
('Backup Service', 'financial_transactions', 'SELECT'),
('Backup Service', 'user_stat_analysis', 'SELECT'),
('Backup Service', 'audit_logs', 'SELECT'),
('Backup Service', 'user_credentials', 'SELECT')
) AS expected(role_name, table_name, permission_type)
)
INSERT INTO security_audit_details (username, issue_type, table_name, permission_type, expected_access)
SELECT DISTINCT
ud.username,
'MISSING_PERMISSION',
ep.table_name,
ep.permission_type,
TRUE
FROM temp_user_discovery ud
JOIN expected_permissions ep ON ud.inferred_business_role = ep.role_name
LEFT JOIN temp_current_permissions cp ON (
cp.username = ud.username
AND cp.table_name = ep.table_name
AND cp.permission_type = ep.permission_type
)
WHERE cp.username IS NULL
AND ud.inferred_business_role != 'Inactive/Temporary'
AND ud.inferred_business_role != 'Unknown'
AND EXISTS (
SELECT 1 FROM information_schema.tables t
WHERE t.table_name = ep.table_name
AND t.table_schema = 'public'
AND t.table_type = 'BASE TABLE'
);
-- ============================================================================
-- POPULATE SUMMARY STATISTICS
-- ============================================================================
INSERT INTO security_audit_results (audit_type, total_issues, users_affected, tables_affected)
SELECT
'DANGLING_USERS',
COUNT(*),
COUNT(DISTINCT username),
0
FROM security_audit_details
WHERE issue_type = 'DANGLING_USER';
INSERT INTO security_audit_results (audit_type, total_issues, users_affected, tables_affected)
SELECT
'MISSING_PERMISSIONS',
COUNT(*),
COUNT(DISTINCT username),
COUNT(DISTINCT table_name)
FROM security_audit_details
WHERE issue_type = 'MISSING_PERMISSION';
INSERT INTO security_audit_results (audit_type, total_issues, users_affected, tables_affected)
SELECT
'EXCESSIVE_PERMISSIONS',
COUNT(*),
COUNT(DISTINCT username),
COUNT(DISTINCT table_name)
FROM security_audit_details
WHERE issue_type = 'EXCESSIVE_PERMISSION';
-- ============================================================================
-- CLEANUP TEMPORARY TABLES
-- ============================================================================
DROP TABLE temp_user_discovery;
DROP TABLE temp_role_memberships;
DROP TABLE temp_current_permissions;
COMMIT;
-- ============================================================================
-- DISCOVERY AND VERIFICATION QUERIES
-- ============================================================================
-- Show all users and their properties
SELECT
usename as username,
usesuper as is_superuser,
usecreatedb as can_create_db,
valuntil as password_expiry
FROM pg_user
WHERE usename NOT IN ('postgres', 'test_user')
ORDER BY usename;
-- Show all roles and their properties
SELECT
rolname as role_name,
rolsuper as is_superuser,
rolinherit as inherits_privileges,
rolcanlogin as can_login
FROM pg_roles
WHERE rolname NOT LIKE 'pg_%'
AND rolname NOT IN ('postgres', 'test_user')
ORDER BY rolname;
-- Show current table privileges
SELECT
grantee as username,
table_name,
privilege_type as permission,
is_grantable
FROM information_schema.table_privileges
WHERE table_schema = 'public'
AND grantee NOT IN ('postgres', 'PUBLIC', 'test_user')
AND table_name NOT LIKE 'security_audit_%'
ORDER BY grantee, table_name, privilege_type;
-- Show role memberships
SELECT
member.rolname as member,
granted.rolname as granted_role
FROM pg_auth_members am
JOIN pg_roles member ON am.member = member.oid
JOIN pg_roles granted ON am.roleid = granted.oid
WHERE member.rolname NOT IN ('postgres')
ORDER BY member.rolname, granted.rolname;
-- Display audit summary
SELECT
audit_type,
total_issues,
users_affected,
tables_affected
FROM security_audit_results
ORDER BY audit_type;
-- Display detailed findings
SELECT
username,
issue_type,
COALESCE(table_name, 'N/A') as table_name,
COALESCE(permission_type, 'N/A') as permission_type,
expected_access
FROM security_audit_details
ORDER BY issue_type, username, table_name;
================================================
FILE: tasks/postgres/standard/security/user_permission_audit/meta.json
================================================
{
"task_id": "user_permission_audit",
"task_name": "User Permission Audit",
"category_id": "security",
"category_name": "Security",
"description": "Conduct comprehensive security audit identifying users with insufficient or dangling permissions in business database environment.",
"author": "Fanshi Zhang",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"security and access control",
"audit and compliance"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"user_profiles\" {\n \"user_id\" int4 [pk, not null, increment]\n \"username\" varchar(50) [unique, not null]\n \"email\" varchar(100) [unique, not null]\n \"first_name\" varchar(50) [not null]\n \"last_name\" varchar(50) [not null]\n \"phone\" varchar(20)\n \"address\" text\n \"city\" varchar(50)\n \"state\" varchar(2)\n \"zip_code\" varchar(10)\n \"date_created\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"last_updated\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"is_active\" bool [default: true]\n \"profile_picture_url\" text\n \"bio\" text\n}\n\nTable \"user_credentials\" {\n \"credential_id\" int4 [pk, not null, increment]\n \"user_id\" int4\n \"password_hash\" varchar(255) [not null]\n \"salt\" varchar(100) [not null]\n \"login_attempts\" int4 [default: 0]\n \"last_login\" timestamp\n \"password_created\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"password_expires\" timestamp\n \"is_locked\" bool [default: false]\n \"two_factor_enabled\" bool [default: false]\n \"two_factor_secret\" varchar(32)\n \"backup_codes\" \"text[]\"\n \"security_questions\" jsonb\n}\n\nTable \"user_stat_analysis\" {\n \"analysis_id\" int4 [pk, not null, increment]\n \"user_id\" int4\n \"session_id\" varchar(100)\n \"page_views\" int4 [default: 0]\n \"time_spent_minutes\" int4 [default: 0]\n \"actions_performed\" jsonb\n \"device_info\" jsonb\n \"ip_address\" inet\n \"location_data\" jsonb\n \"referrer_url\" text\n \"conversion_events\" jsonb\n \"analysis_date\" date [default: `CURRENT_DATE`]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n}\n\nTable \"product_catalog\" {\n \"product_id\" int4 [pk, not null, increment]\n \"product_name\" varchar(100) [not null]\n \"description\" text\n \"category\" varchar(50)\n \"price\" numeric(10,2) [not null]\n \"cost\" numeric(10,2)\n \"sku\" varchar(50) [unique]\n \"inventory_count\" int4 [default: 0]\n \"is_active\" bool [default: true]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"supplier_info\" jsonb\n \"weight_kg\" numeric(6,2)\n \"dimensions\" jsonb\n}\n\nTable \"order_management\" {\n \"order_id\" int4 [pk, not null, increment]\n \"user_id\" int4\n \"order_number\" varchar(50) [unique, not null]\n \"order_status\" varchar(20) [default: 'pending']\n \"total_amount\" numeric(12,2) [not null]\n \"tax_amount\" numeric(12,2)\n \"shipping_amount\" numeric(12,2)\n \"discount_amount\" numeric(12,2) [default: 0]\n \"payment_method\" varchar(50)\n \"payment_status\" varchar(20) [default: 'pending']\n \"shipping_address\" jsonb\n \"billing_address\" jsonb\n \"order_date\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"shipped_date\" timestamp\n \"delivered_date\" timestamp\n \"tracking_number\" varchar(100)\n}\n\nTable \"financial_transactions\" {\n \"transaction_id\" int4 [pk, not null, increment]\n \"order_id\" int4\n \"user_id\" int4\n \"transaction_type\" varchar(20) [not null]\n \"amount\" numeric(12,2) [not null]\n \"currency\" varchar(3) [default: 'USD']\n \"payment_gateway\" varchar(50)\n \"gateway_transaction_id\" varchar(100)\n \"credit_card_last_four\" bpchar(4)\n \"bank_account_last_four\" bpchar(4)\n \"transaction_status\" varchar(20) [default: 'pending']\n \"processed_at\" timestamp\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"fee_amount\" numeric(8,2)\n \"refund_amount\" numeric(12,2) [default: 0]\n \"notes\" text\n}\n\nTable \"audit_logs\" {\n \"log_id\" int4 [pk, not null, increment]\n \"user_id\" int4\n \"action_type\" varchar(50) 
[not null]\n \"table_name\" varchar(50)\n \"record_id\" int4\n \"old_values\" jsonb\n \"new_values\" jsonb\n \"ip_address\" inet\n \"user_agent\" text\n \"session_id\" varchar(100)\n \"timestamp\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"success\" bool [default: true]\n \"error_message\" text\n}\n\nRef \"audit_logs_user_id_fkey\":\"user_profiles\".\"user_id\" < \"audit_logs\".\"user_id\"\n\nRef \"financial_transactions_order_id_fkey\":\"order_management\".\"order_id\" < \"financial_transactions\".\"order_id\"\n\nRef \"financial_transactions_user_id_fkey\":\"user_profiles\".\"user_id\" < \"financial_transactions\".\"user_id\"\n\nRef \"order_management_user_id_fkey\":\"user_profiles\".\"user_id\" < \"order_management\".\"user_id\"\n\nRef \"user_credentials_user_id_fkey\":\"user_profiles\".\"user_id\" < \"user_credentials\".\"user_id\" [delete: cascade]\n\nRef \"user_stat_analysis_user_id_fkey\":\"user_profiles\".\"user_id\" < \"user_stat_analysis\".\"user_id\" [delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/postgres/standard/security/user_permission_audit/prepare_environment.py
================================================
#!/usr/bin/env python3
import os
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import sys
# Configuration for users and their permissions
USER_CONFIGS = {
# Active functional users
'analytics_user': {
'password': 'analytics123',
'role': 'Analytics Team',
'status': 'active'
},
'marketing_user': {
'password': 'marketing123',
'role': 'Marketing Department',
'status': 'active'
},
'customer_service': {
'password': 'service123',
'role': 'Customer Service',
'status': 'active'
},
'finance_user': {
'password': 'finance123',
'role': 'Finance Team',
'status': 'active'
},
'product_manager': {
'password': 'product123',
'role': 'Product Management',
'status': 'active'
},
'security_auditor': {
'password': 'security123',
'role': 'Security Team',
'status': 'active'
},
'developer_user': {
'password': 'dev123',
'role': 'Development Team',
'status': 'active'
},
'backup_user': {
'password': 'backup123',
'role': 'Backup Service',
'status': 'active'
},
# Inactive/dangling users
'temp_contractor': {
'password': 'temp123',
'role': 'Inactive/Temporary',
'status': 'inactive'
},
'old_employee': {
'password': 'old456',
'role': 'Inactive/Temporary',
'status': 'inactive'
},
'test_account': {
'password': 'test789',
'role': 'Inactive/Temporary',
'status': 'inactive'
}
}
# Expected permissions by role (what they SHOULD have)
ROLE_EXPECTED_PERMISSIONS = {
'Analytics Team': [
('user_profiles', 'SELECT'),
('user_stat_analysis', 'SELECT'),
('product_catalog', 'SELECT'),
('order_management', 'SELECT'),
],
'Marketing Department': [
('user_profiles', 'SELECT'),
('user_stat_analysis', 'SELECT'),
('product_catalog', 'SELECT'),
],
'Customer Service': [
('user_profiles', 'SELECT'),
('user_profiles', 'UPDATE'),
('order_management', 'SELECT'),
('order_management', 'INSERT'),
('order_management', 'UPDATE'),
('product_catalog', 'SELECT'),
],
'Finance Team': [
('financial_transactions', 'SELECT'),
('order_management', 'SELECT'),
('user_profiles', 'SELECT'),
],
'Product Management': [
('product_catalog', 'SELECT'),
('product_catalog', 'INSERT'),
('product_catalog', 'UPDATE'),
('product_catalog', 'DELETE'),
('order_management', 'SELECT'),
('user_stat_analysis', 'SELECT'),
],
'Security Team': [
('audit_logs', 'SELECT'),
('user_credentials', 'SELECT'),
('user_profiles', 'SELECT'),
],
'Development Team': [
('user_profiles', 'SELECT'),
('product_catalog', 'SELECT'),
],
'Backup Service': [
('user_profiles', 'SELECT'),
('product_catalog', 'SELECT'),
('order_management', 'SELECT'),
('financial_transactions', 'SELECT'),
('user_stat_analysis', 'SELECT'),
('audit_logs', 'SELECT'),
('user_credentials', 'SELECT'),
],
}
# Excessive permissions that will be granted but should be flagged as security issues
EXCESSIVE_PERMISSIONS = [
# Users getting financial access they shouldn't have
('analytics_user', 'financial_transactions', 'SELECT'),
('marketing_user', 'financial_transactions', 'SELECT'),
('product_manager', 'financial_transactions', 'SELECT'),
# Security risks - credential access
('customer_service', 'user_credentials', 'SELECT'),
('developer_user', 'user_credentials', 'SELECT'),
# Excessive privileges
('security_auditor', 'financial_transactions', 'UPDATE'),
('developer_user', 'order_management', 'UPDATE'),
('backup_user', 'product_catalog', 'DELETE'), # Backup should be read-only
# Inactive users with permissions they shouldn't have
('temp_contractor', 'product_catalog', 'SELECT'),
('temp_contractor', 'user_profiles', 'SELECT'),
('old_employee', 'audit_logs', 'SELECT'),
('old_employee', 'user_stat_analysis', 'UPDATE'),
('test_account', 'user_profiles', 'SELECT'),
]
# Permissions to revoke to create "missing permission" findings
PERMISSIONS_TO_REVOKE = [
('analytics_user', 'user_profiles', 'SELECT'),
('analytics_user', 'order_management', 'SELECT'),
('analytics_user', 'product_catalog', 'SELECT'),
('marketing_user', 'product_catalog', 'SELECT'),
('finance_user', 'user_profiles', 'SELECT'),
('developer_user', 'product_catalog', 'SELECT'),
('customer_service', 'product_catalog', 'SELECT'),
('security_auditor', 'audit_logs', 'SELECT'),
('product_manager', 'user_stat_analysis', 'SELECT'),
('backup_user', 'order_management', 'SELECT'),
('backup_user', 'financial_transactions', 'SELECT'),
('backup_user', 'user_stat_analysis', 'SELECT'),
('backup_user', 'user_credentials', 'SELECT'),
]
def create_business_tables(cur):
"""Create all business tables"""
tables = [
('user_profiles', """
DROP TABLE IF EXISTS user_profiles CASCADE;
CREATE TABLE user_profiles (
user_id SERIAL PRIMARY KEY,
username VARCHAR(50) UNIQUE NOT NULL,
email VARCHAR(100) UNIQUE NOT NULL,
first_name VARCHAR(50) NOT NULL,
last_name VARCHAR(50) NOT NULL,
phone VARCHAR(20),
address TEXT,
city VARCHAR(50),
state VARCHAR(2),
zip_code VARCHAR(10),
date_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
is_active BOOLEAN DEFAULT true,
profile_picture_url TEXT,
bio TEXT
);
"""),
('user_credentials', """
DROP TABLE IF EXISTS user_credentials CASCADE;
CREATE TABLE user_credentials (
credential_id SERIAL PRIMARY KEY,
user_id INTEGER REFERENCES user_profiles(user_id) ON DELETE CASCADE,
password_hash VARCHAR(255) NOT NULL,
salt VARCHAR(100) NOT NULL,
login_attempts INTEGER DEFAULT 0,
last_login TIMESTAMP,
password_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
password_expires TIMESTAMP,
is_locked BOOLEAN DEFAULT false,
two_factor_enabled BOOLEAN DEFAULT false,
two_factor_secret VARCHAR(32),
backup_codes TEXT[],
security_questions JSONB
);
"""),
('user_stat_analysis', """
DROP TABLE IF EXISTS user_stat_analysis CASCADE;
CREATE TABLE user_stat_analysis (
analysis_id SERIAL PRIMARY KEY,
user_id INTEGER REFERENCES user_profiles(user_id) ON DELETE CASCADE,
session_id VARCHAR(100),
page_views INTEGER DEFAULT 0,
time_spent_minutes INTEGER DEFAULT 0,
actions_performed JSONB,
device_info JSONB,
ip_address INET,
location_data JSONB,
referrer_url TEXT,
conversion_events JSONB,
analysis_date DATE DEFAULT CURRENT_DATE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
"""),
('product_catalog', """
DROP TABLE IF EXISTS product_catalog CASCADE;
CREATE TABLE product_catalog (
product_id SERIAL PRIMARY KEY,
product_name VARCHAR(100) NOT NULL,
description TEXT,
category VARCHAR(50),
price DECIMAL(10,2) NOT NULL,
cost DECIMAL(10,2),
sku VARCHAR(50) UNIQUE,
inventory_count INTEGER DEFAULT 0,
is_active BOOLEAN DEFAULT true,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
supplier_info JSONB,
weight_kg DECIMAL(6,2),
dimensions JSONB
);
"""),
('order_management', """
DROP TABLE IF EXISTS order_management CASCADE;
CREATE TABLE order_management (
order_id SERIAL PRIMARY KEY,
user_id INTEGER REFERENCES user_profiles(user_id),
order_number VARCHAR(50) UNIQUE NOT NULL,
order_status VARCHAR(20) DEFAULT 'pending',
total_amount DECIMAL(12,2) NOT NULL,
tax_amount DECIMAL(12,2),
shipping_amount DECIMAL(12,2),
discount_amount DECIMAL(12,2) DEFAULT 0,
payment_method VARCHAR(50),
payment_status VARCHAR(20) DEFAULT 'pending',
shipping_address JSONB,
billing_address JSONB,
order_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
shipped_date TIMESTAMP,
delivered_date TIMESTAMP,
tracking_number VARCHAR(100)
);
"""),
('financial_transactions', """
DROP TABLE IF EXISTS financial_transactions CASCADE;
CREATE TABLE financial_transactions (
transaction_id SERIAL PRIMARY KEY,
order_id INTEGER REFERENCES order_management(order_id),
user_id INTEGER REFERENCES user_profiles(user_id),
transaction_type VARCHAR(20) NOT NULL,
amount DECIMAL(12,2) NOT NULL,
currency VARCHAR(3) DEFAULT 'USD',
payment_gateway VARCHAR(50),
gateway_transaction_id VARCHAR(100),
credit_card_last_four CHAR(4),
bank_account_last_four CHAR(4),
transaction_status VARCHAR(20) DEFAULT 'pending',
processed_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
fee_amount DECIMAL(8,2),
refund_amount DECIMAL(12,2) DEFAULT 0,
notes TEXT
);
"""),
('audit_logs', """
DROP TABLE IF EXISTS audit_logs CASCADE;
CREATE TABLE audit_logs (
log_id SERIAL PRIMARY KEY,
user_id INTEGER REFERENCES user_profiles(user_id),
action_type VARCHAR(50) NOT NULL,
table_name VARCHAR(50),
record_id INTEGER,
old_values JSONB,
new_values JSONB,
ip_address INET,
user_agent TEXT,
session_id VARCHAR(100),
timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
success BOOLEAN DEFAULT true,
error_message TEXT
);
""")
]
for table_name, sql in tables:
cur.execute(sql)
def create_users(cur):
"""Create PostgreSQL users from configuration"""
for username, config in USER_CONFIGS.items():
cur.execute(f"CREATE USER {username} WITH PASSWORD %s;", (config['password'],))
def grant_expected_permissions(cur):
"""Grant expected permissions to users based on their roles"""
for username, config in USER_CONFIGS.items():
if config['status'] == 'active':
role = config['role']
permissions = ROLE_EXPECTED_PERMISSIONS.get(role, [])
for table_name, privilege in permissions:
cur.execute(f"GRANT {privilege} ON {table_name} TO {username};")
def grant_excessive_permissions(cur):
"""Grant excessive permissions that should be flagged as security issues"""
for username, table_name, privilege in EXCESSIVE_PERMISSIONS:
cur.execute(f"GRANT {privilege} ON {table_name} TO {username};")
def revoke_permissions(cur):
"""Revoke specific permissions to create missing permission findings"""
for username, table_name, privilege in PERMISSIONS_TO_REVOKE:
cur.execute(f"REVOKE {privilege} ON {table_name} FROM {username};")
def grant_sequence_permissions(cur):
"""Grant sequence permissions for users that need them"""
users_needing_sequences = ['customer_service', 'product_manager']
for username in users_needing_sequences:
cur.execute(f"GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO {username};")
def setup_security_environment():
"""
Set up a security-focused PostgreSQL environment with business tables and users with various permissions.
Creates a scenario where some users have dangling or insufficient permissions for realistic security analysis.
"""
# Database connection parameters from environment
db_params = {
'host': os.getenv('POSTGRES_HOST', 'localhost'),
'port': os.getenv('POSTGRES_PORT', '5432'),
'user': os.getenv('POSTGRES_USERNAME', 'postgres'),
'password': os.getenv('POSTGRES_PASSWORD', 'password'),
'database': os.getenv('POSTGRES_DATABASE', 'postgres')
}
postgres_params = db_params.copy()
postgres_params['database'] = 'postgres'
try:
conn_postgres = psycopg2.connect(**postgres_params)
conn_postgres.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur_postgres = conn_postgres.cursor()
current_db = db_params['database']
cur_postgres.execute("SELECT datname FROM pg_database WHERE datname LIKE %s AND datname != %s;", ('%user_permission_audit%', current_db))
audit_databases = cur_postgres.fetchall()
if audit_databases:
for db_row in audit_databases:
db_name = db_row[0]
try:
cur_postgres.execute("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = %s;", (db_name,))
cur_postgres.execute(f"DROP DATABASE IF EXISTS {db_name};")
print(f"Dropped database: {db_name}")
except Exception as e:
print(f"Warning: Could not drop database {db_name}: {e}")
# Clean up existing users
for username in USER_CONFIGS.keys():
try:
cur_postgres.execute("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE usename = %s;", (username,))
cur_postgres.execute(f"DROP USER IF EXISTS {username};")
except Exception as e:
print(f"Warning: Could not drop user {username}: {e}")
cur_postgres.close()
conn_postgres.close()
except Exception as e:
print(f"Warning: Could not clean up users: {e}")
try:
conn = psycopg2.connect(**db_params)
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()
print("Setting up security audit environment...")
# Create business tables with realistic structure
create_business_tables(cur)
print("Created 7 business tables")
# Create users
create_users(cur)
active_count = len([u for u in USER_CONFIGS.values() if u['status'] == 'active'])
inactive_count = len([u for u in USER_CONFIGS.values() if u['status'] == 'inactive'])
print(f"Created {len(USER_CONFIGS)} users ({active_count} functional, {inactive_count} dangling)")
# Grant expected permissions
grant_expected_permissions(cur)
# Grant excessive permissions that will be flagged as issues
grant_excessive_permissions(cur)
print("Granted initial permissions")
# Revoke specific permissions to create missing permission findings
revoke_permissions(cur)
# Grant sequence permissions where needed
grant_sequence_permissions(cur)
cur.close()
conn.close()
except Exception as e:
print(f"Error setting up environment: {e}")
sys.exit(1)
if __name__ == "__main__":
setup_security_environment()
================================================
FILE: tasks/postgres/standard/security/user_permission_audit/verify.py
================================================
import os
import psycopg2
import sys
def verify_security_audit():
"""
Verify that the security audit correctly identified all permission issues.
"""
# Database connection parameters from environment
db_params = {
'host': os.getenv('POSTGRES_HOST', 'localhost'),
'port': os.getenv('POSTGRES_PORT', '5432'),
'user': os.getenv('POSTGRES_USERNAME', 'postgres'),
'password': os.getenv('POSTGRES_PASSWORD', 'password'),
'database': os.getenv('POSTGRES_DATABASE', 'postgres')
}
try:
conn = psycopg2.connect(**db_params)
cur = conn.cursor()
print("| Verifying security audit findings...")
# Check if security_audit_results table exists
cur.execute("""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'security_audit_results'
);
""")
if not cur.fetchone()[0]:
print("FAIL: security_audit_results table not found")
return False
# Check if security_audit_details table exists
cur.execute("""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'security_audit_details'
);
""")
if not cur.fetchone()[0]:
print("FAIL: security_audit_details table not found")
return False
# Get all detailed findings
cur.execute("SELECT * FROM security_audit_details ORDER BY detail_id;")
findings = cur.fetchall()
if not findings:
print("FAIL: No findings in security_audit_details table")
return False
print(f"| Found {len(findings)} audit findings")
# Expected findings based on the ground truth:
expected_findings = {
# Expected dangling users
'dangling_users': {'temp_contractor', 'old_employee', 'test_account'},
# Expected missing permissions (should be granted)
'missing_permissions': {
('analytics_user', 'user_profiles', 'SELECT'),
('analytics_user', 'product_catalog', 'SELECT'),
('analytics_user', 'order_management', 'SELECT'),
('marketing_user', 'product_catalog', 'SELECT'),
('customer_service', 'product_catalog', 'SELECT'),
('finance_user', 'user_profiles', 'SELECT'),
('product_manager', 'user_stat_analysis', 'SELECT'),
('security_auditor', 'audit_logs', 'SELECT'),
('developer_user', 'product_catalog', 'SELECT'),
('backup_user', 'order_management', 'SELECT'),
('backup_user', 'financial_transactions', 'SELECT'),
('backup_user', 'user_stat_analysis', 'SELECT'),
('backup_user', 'user_credentials', 'SELECT')
},
# Expected excessive permissions (should be revoked)
'excessive_permissions': {
('analytics_user', 'financial_transactions', 'SELECT'),
('marketing_user', 'financial_transactions', 'SELECT'),
('customer_service', 'user_credentials', 'SELECT'),
('product_manager', 'financial_transactions', 'SELECT'),
('security_auditor', 'financial_transactions', 'UPDATE'),
('developer_user', 'user_credentials', 'SELECT'),
('developer_user', 'order_management', 'UPDATE'),
('backup_user', 'product_catalog', 'DELETE'),
('temp_contractor', 'product_catalog', 'SELECT'),
('temp_contractor', 'user_profiles', 'SELECT'),
('old_employee', 'audit_logs', 'SELECT'),
('old_employee', 'user_stat_analysis', 'UPDATE'),
('test_account', 'user_profiles', 'SELECT')
}
}
found_dangling = set()
found_missing_permissions = set()
found_excessive_permissions = set()
# Analyze findings (detail_id, username, issue_type, table_name, permission_type, expected_access)
for finding in findings:
username = finding[1]
issue_type = finding[2]
table_name = finding[3]
permission_type = finding[4]
expected_access = finding[5]
if issue_type == 'DANGLING_USER':
found_dangling.add(username)
elif issue_type == 'MISSING_PERMISSION' and expected_access:
if table_name and permission_type:
found_missing_permissions.add((username, table_name, permission_type))
elif issue_type == 'EXCESSIVE_PERMISSION' and not expected_access:
if table_name and permission_type:
found_excessive_permissions.add((username, table_name, permission_type))
# Verify dangling users
missing_dangling = expected_findings['dangling_users'] - found_dangling
extra_dangling = found_dangling - expected_findings['dangling_users']
# Verify missing permissions
missing_missing_perms = expected_findings['missing_permissions'] - found_missing_permissions
extra_missing_perms = found_missing_permissions - expected_findings['missing_permissions']
# Verify excessive permissions
missing_excessive_perms = expected_findings['excessive_permissions'] - found_excessive_permissions
extra_excessive_perms = found_excessive_permissions - expected_findings['excessive_permissions']
# Validate structure
structure_valid = True
for i, finding in enumerate(findings):
if len(finding) != 6: # Should have 6 columns
print(f"| FAIL: Finding {i + 1} has wrong number of columns (expected 6, got {len(finding)})")
structure_valid = False
continue
detail_id, username, issue_type, table_name, permission_type, expected_access = finding
if not username:
print(f"| FAIL: Finding {i + 1} missing username")
structure_valid = False
if issue_type not in ['DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION']:
print(f"| FAIL: Finding {i + 1} invalid issue_type: {issue_type}")
structure_valid = False
if expected_access not in [True, False]:
print(f"| FAIL: Finding {i + 1} invalid expected_access: {expected_access}")
structure_valid = False
if structure_valid:
print(f"| ✓ structure is valid")
# Check for missing findings
all_correct = True
print(f"| Expected dangling users: {expected_findings['dangling_users']} Found: {found_dangling}")
if missing_dangling:
print(f"| Missing dangling users: {missing_dangling}")
all_correct = False
print(
f"| Expected missing permissions: {len(expected_findings['missing_permissions'])} Found: {len(found_missing_permissions)} Missing: {len(missing_missing_perms)}")
if missing_missing_perms:
print(f"| Missing 'missing permission' findings:")
for perm in sorted(missing_missing_perms):
print(f"| - {perm[0]} should be granted {perm[2]} on {perm[1]}")
all_correct = False
print(
f"| Expected excessive permissions: {len(expected_findings['excessive_permissions'])} Found: {len(found_excessive_permissions)} Missing: {len(missing_excessive_perms)}")
if missing_excessive_perms:
print(f"| Missing 'excessive permission' findings:")
for perm in sorted(missing_excessive_perms):
print(f"| - {perm[0]} should have {perm[2]} revoked on {perm[1]}")
all_correct = False
# Check audit summary table
cur.execute(
"SELECT audit_type, total_issues, users_affected, tables_affected FROM security_audit_results ORDER BY audit_type;")
summary_results = cur.fetchall()
# Expected summary numbers based on ground truth
expected_summary = {
'DANGLING_USERS': (3, 3, 0), # 3 issues, 3 users affected, 0 tables affected
'EXCESSIVE_PERMISSIONS': (13, 10, 7), # 13 issues, 10 users affected, 7 tables affected
'MISSING_PERMISSIONS': (13, 8, 7) # 13 issues, 8 users affected, 7 tables affected
}
summary_correct = True
for result in summary_results:
audit_type, total_issues, users_affected, tables_affected = result
print(f"| Summary result: [{audit_type}] {total_issues} issues, {users_affected} users affected, {tables_affected} tables affected")
if audit_type in expected_summary:
expected = expected_summary[audit_type]
if (total_issues, users_affected, tables_affected) != expected:
print(f"| FAIL: {audit_type} summary mismatch - Expected: {expected}, Got: ({total_issues}, {users_affected}, {tables_affected})")
summary_correct = False
else:
print(f"| ✓ {audit_type} summary matches expected values")
# Assert exact counts match expected
assert len(found_dangling) == 3, f"Expected 3 dangling users, found {len(found_dangling)}"
assert len(found_missing_permissions) == 13, f"Expected 13 missing permissions, found {len(found_missing_permissions)}"
assert len(found_excessive_permissions) == 13, f"Expected 13 excessive permissions, found {len(found_excessive_permissions)}"
if all_correct and structure_valid and summary_correct:
print("| ✓ All assertions passed")
return True
else:
return False
except Exception as e:
print(f"FAIL: Error during verification: {e}")
return False
finally:
if 'cur' in locals():
cur.close()
if 'conn' in locals():
conn.close()
if __name__ == "__main__":
success = verify_security_audit()
sys.exit(0 if success else 1)
================================================
FILE: tasks/postgres/standard/sports/baseball_player_analysis/description.md
================================================
Create a comprehensive baseball player performance analysis in the sports database.
## Background
You are a sports analyst working with a comprehensive sports database. The analytics team needs to create a detailed analysis of baseball players by combining their offensive and defensive statistics with personal information. Currently, this data is scattered across multiple tables and needs to be consolidated for reporting purposes.
## Your Task
Create a table called `baseball_player_analysis` that consolidates baseball player performance data. The table should provide a comprehensive view of each qualifying player's performance metrics.
### Table Structure
Create the `baseball_player_analysis` table with the following columns (an illustrative DDL sketch follows the list):
- `player_id` (INTEGER, NOT NULL) - Player identifier
- `player_name` (VARCHAR(255), NOT NULL) - Player's full name
- `team_name` (VARCHAR(255)) - Set to 'Unknown' for all players
- `games_played` (INTEGER) - Number of games/events the player participated in
- `at_bats` (INTEGER) - Total at-bats for the player
- `hits` (INTEGER) - Total hits for the player
- `runs_scored` (INTEGER) - Total runs scored by the player
- `rbi` (INTEGER) - Total runs batted in by the player
- `home_runs` (INTEGER) - Total home runs hit by the player
- `batting_average` (DECIMAL) - Calculated as hits/at_bats
- `defensive_games` (INTEGER) - Number of defensive games played (same as games_played)
- `putouts` (INTEGER) - Total putouts in defensive play
- `assists` (INTEGER) - Total assists in defensive play
- `errors` (INTEGER) - Total errors made in defensive play
- `fielding_percentage` (DECIMAL) - Calculated as (putouts + assists)/(putouts + assists + errors)
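A minimal DDL sketch of this structure (illustrative only; the task does not mandate a specific precision for the DECIMAL columns):

```sql
CREATE TABLE baseball_player_analysis (
    player_id           INTEGER NOT NULL,
    player_name         VARCHAR(255) NOT NULL,
    team_name           VARCHAR(255),
    games_played        INTEGER,
    at_bats             INTEGER,
    hits                INTEGER,
    runs_scored         INTEGER,
    rbi                 INTEGER,
    home_runs           INTEGER,
    batting_average     DECIMAL,
    defensive_games     INTEGER,
    putouts             INTEGER,
    assists             INTEGER,
    errors              INTEGER,
    fielding_percentage DECIMAL
);
```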
### Data Requirements
Include only baseball players that meet ALL of the following criteria:
- Have offensive statistics available for regular season play
- Have played at least 10 games/events
- Have at least 50 at-bats
- Have a valid name available in the system
### Important Notes
- Focus on regular season statistics only
- Handle NULL values appropriately in calculations (use 0 for missing stats)
- Ensure batting average and fielding percentage calculations handle division by zero (see the sketch after these notes)
- Do NOT use ROUND functions - keep the full precision of calculated values
- Sort results by batting average descending, then by games played descending
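The calculation and filtering rules above can be handled with `COALESCE` (missing stats become 0), `NULLIF` (a zero denominator yields NULL instead of an error), and a cast to a numeric type (so integer division does not truncate the ratios). The snippet below is only a sketch of that pattern: `offense` and `defense` are hypothetical placeholders, not real tables in this database, so explore the actual schema and joins first.

```sql
-- Pattern sketch only: "offense"/"defense" are placeholders for the real source tables.
SELECT
    o.player_id,
    COALESCE(o.hits, 0)::numeric
        / NULLIF(COALESCE(o.at_bats, 0), 0)                         AS batting_average,
    (COALESCE(d.putouts, 0) + COALESCE(d.assists, 0))::numeric
        / NULLIF(COALESCE(d.putouts, 0) + COALESCE(d.assists, 0)
                 + COALESCE(d.errors, 0), 0)                        AS fielding_percentage
FROM offense o
JOIN defense d ON d.player_id = o.player_id
WHERE COALESCE(o.games_played, 0) >= 10   -- at least 10 games/events
  AND COALESCE(o.at_bats, 0) >= 50;       -- at least 50 at-bats
```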
## Requirements
- Explore the database to understand the table structure and relationships
- Create the table with the exact structure specified above
- Populate the table using appropriate queries and joins
- Ensure all calculations are mathematically correct
- Handle edge cases properly (division by zero, NULL values)
================================================
FILE: tasks/postgres/standard/sports/baseball_player_analysis/meta.json
================================================
{
"task_id": "baseball_player_analysis",
"task_name": "Baseball Player Analysis",
"category_id": "sports",
"category_name": "Sports",
"description": "Consolidate scattered baseball player data into comprehensive analysis table combining offensive and defensive statistics.",
"author": "Lingxiao Du",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"statistical aggregation",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"addresses\" {\n \"id\" int4 [not null, increment]\n \"location_id\" int4 [not null]\n \"language\" varchar(100)\n \"suite\" varchar(100)\n \"floor\" varchar(100)\n \"building\" varchar(100)\n \"street_number\" varchar(100)\n \"street_prefix\" varchar(100)\n \"street\" varchar(100)\n \"street_suffix\" varchar(100)\n \"neighborhood\" varchar(100)\n \"district\" varchar(100)\n \"locality\" varchar(100)\n \"county\" varchar(100)\n \"region\" varchar(100)\n \"postal_code\" varchar(100)\n \"country\" varchar(100)\n}\n\nTable \"affiliation_phases\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"ancestor_affiliation_id\" int4\n \"start_season_id\" int4\n \"start_date_time\" timestamp\n \"end_season_id\" int4\n \"end_date_time\" timestamp\n}\n\nTable \"affiliations\" {\n \"id\" int4 [not null, increment]\n \"affiliation_key\" varchar(100) [not null]\n \"affiliation_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n}\n\nTable \"affiliations_documents\" {\n \"affiliation_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"affiliations_events\" {\n \"affiliation_id\" int4 [not null]\n \"event_id\" int4 [not null]\n}\n\nTable \"affiliations_media\" {\n \"affiliation_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"american_football_action_participants\" {\n \"id\" int4 [not null, increment]\n \"american_football_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"score_type\" varchar(100)\n \"field_line\" int4\n \"yardage\" int4\n \"score_credit\" int4\n \"yards_gained\" int4\n}\n\nTable \"american_football_action_plays\" {\n \"id\" int4 [not null, increment]\n \"american_football_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"drive_result\" varchar(100)\n \"points\" int4\n \"comment\" varchar(255)\n}\n\nTable \"american_football_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"tackles_total\" varchar(100)\n \"tackles_solo\" varchar(100)\n \"tackles_assists\" varchar(100)\n \"interceptions_total\" varchar(100)\n \"interceptions_yards\" varchar(100)\n \"interceptions_average\" varchar(100)\n \"interceptions_longest\" varchar(100)\n \"interceptions_touchdown\" varchar(100)\n \"quarterback_hurries\" varchar(100)\n \"sacks_total\" varchar(100)\n \"sacks_yards\" varchar(100)\n \"passes_defensed\" varchar(100)\n}\n\nTable \"american_football_down_progress_stats\" {\n \"id\" int4 [not null, increment]\n \"first_downs_total\" varchar(100)\n \"first_downs_pass\" varchar(100)\n \"first_downs_run\" varchar(100)\n \"first_downs_penalty\" varchar(100)\n \"conversions_third_down\" varchar(100)\n \"conversions_third_down_attempts\" varchar(100)\n \"conversions_third_down_percentage\" varchar(100)\n \"conversions_fourth_down\" varchar(100)\n \"conversions_fourth_down_attempts\" varchar(100)\n \"conversions_fourth_down_percentage\" varchar(100)\n}\n\nTable \"american_football_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"period_value\" int4\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"clock_state\" varchar(100)\n \"down\" int4\n \"team_in_possession_id\" int4\n \"distance_for_1st_down\" int4\n \"field_side\" varchar(100)\n \"field_line\" int4\n \"context\" varchar(40)\n}\n\nTable \"american_football_fumbles_stats\" {\n \"id\" int4 [not null, increment]\n 
\"fumbles_committed\" varchar(100)\n \"fumbles_forced\" varchar(100)\n \"fumbles_recovered\" varchar(100)\n \"fumbles_lost\" varchar(100)\n \"fumbles_yards_gained\" varchar(100)\n \"fumbles_own_committed\" varchar(100)\n \"fumbles_own_recovered\" varchar(100)\n \"fumbles_own_lost\" varchar(100)\n \"fumbles_own_yards_gained\" varchar(100)\n \"fumbles_opposing_committed\" varchar(100)\n \"fumbles_opposing_recovered\" varchar(100)\n \"fumbles_opposing_lost\" varchar(100)\n \"fumbles_opposing_yards_gained\" varchar(100)\n}\n\nTable \"american_football_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"offensive_plays_yards\" varchar(100)\n \"offensive_plays_number\" varchar(100)\n \"offensive_plays_average_yards_per\" varchar(100)\n \"possession_duration\" varchar(100)\n \"turnovers_giveaway\" varchar(100)\n}\n\nTable \"american_football_passing_stats\" {\n \"id\" int4 [not null, increment]\n \"passes_attempts\" varchar(100)\n \"passes_completions\" varchar(100)\n \"passes_percentage\" varchar(100)\n \"passes_yards_gross\" varchar(100)\n \"passes_yards_net\" varchar(100)\n \"passes_yards_lost\" varchar(100)\n \"passes_touchdowns\" varchar(100)\n \"passes_touchdowns_percentage\" varchar(100)\n \"passes_interceptions\" varchar(100)\n \"passes_interceptions_percentage\" varchar(100)\n \"passes_longest\" varchar(100)\n \"passes_average_yards_per\" varchar(100)\n \"passer_rating\" varchar(100)\n \"receptions_total\" varchar(100)\n \"receptions_yards\" varchar(100)\n \"receptions_touchdowns\" varchar(100)\n \"receptions_first_down\" varchar(100)\n \"receptions_longest\" varchar(100)\n \"receptions_average_yards_per\" varchar(100)\n}\n\nTable \"american_football_penalties_stats\" {\n \"id\" int4 [not null, increment]\n \"penalties_total\" varchar(100)\n \"penalty_yards\" varchar(100)\n \"penalty_first_downs\" varchar(100)\n}\n\nTable \"american_football_rushing_stats\" {\n \"id\" int4 [not null, increment]\n \"rushes_attempts\" varchar(100)\n \"rushes_yards\" varchar(100)\n \"rushes_touchdowns\" varchar(100)\n \"rushing_average_yards_per\" varchar(100)\n \"rushes_first_down\" varchar(100)\n \"rushes_longest\" varchar(100)\n}\n\nTable \"american_football_sacks_against_stats\" {\n \"id\" int4 [not null, increment]\n \"sacks_against_yards\" varchar(100)\n \"sacks_against_total\" varchar(100)\n}\n\nTable \"american_football_scoring_stats\" {\n \"id\" int4 [not null, increment]\n \"touchdowns_total\" varchar(100)\n \"touchdowns_passing\" varchar(100)\n \"touchdowns_rushing\" varchar(100)\n \"touchdowns_special_teams\" varchar(100)\n \"touchdowns_defensive\" varchar(100)\n \"extra_points_attempts\" varchar(100)\n \"extra_points_made\" varchar(100)\n \"extra_points_missed\" varchar(100)\n \"extra_points_blocked\" varchar(100)\n \"field_goal_attempts\" varchar(100)\n \"field_goals_made\" varchar(100)\n \"field_goals_missed\" varchar(100)\n \"field_goals_blocked\" varchar(100)\n \"safeties_against\" varchar(100)\n \"two_point_conversions_attempts\" varchar(100)\n \"two_point_conversions_made\" varchar(100)\n \"touchbacks_total\" varchar(100)\n}\n\nTable \"american_football_special_teams_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_punt_total\" varchar(100)\n \"returns_punt_yards\" varchar(100)\n \"returns_punt_average\" varchar(100)\n \"returns_punt_longest\" varchar(100)\n \"returns_punt_touchdown\" varchar(100)\n \"returns_kickoff_total\" varchar(100)\n \"returns_kickoff_yards\" varchar(100)\n \"returns_kickoff_average\" varchar(100)\n \"returns_kickoff_longest\" varchar(100)\n 
\"returns_kickoff_touchdown\" varchar(100)\n \"returns_total\" varchar(100)\n \"returns_yards\" varchar(100)\n \"punts_total\" varchar(100)\n \"punts_yards_gross\" varchar(100)\n \"punts_yards_net\" varchar(100)\n \"punts_longest\" varchar(100)\n \"punts_inside_20\" varchar(100)\n \"punts_inside_20_percentage\" varchar(100)\n \"punts_average\" varchar(100)\n \"punts_blocked\" varchar(100)\n \"touchbacks_total\" varchar(100)\n \"touchbacks_total_percentage\" varchar(100)\n \"touchbacks_kickoffs\" varchar(100)\n \"touchbacks_kickoffs_percentage\" varchar(100)\n \"touchbacks_punts\" varchar(100)\n \"touchbacks_punts_percentage\" varchar(100)\n \"touchbacks_interceptions\" varchar(100)\n \"touchbacks_interceptions_percentage\" varchar(100)\n \"fair_catches\" varchar(100)\n}\n\nTable \"baseball_action_contact_details\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_pitch_id\" int4 [not null]\n \"location\" varchar(100)\n \"strength\" varchar(100)\n \"velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n}\n\nTable \"baseball_action_pitches\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_play_id\" int4 [not null]\n \"sequence_number\" int4\n \"baseball_defensive_group_id\" int4\n \"umpire_call\" varchar(100)\n \"pitch_location\" varchar(100)\n \"pitch_type\" varchar(100)\n \"pitch_velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n \"ball_type\" varchar(40)\n \"strike_type\" varchar(40)\n}\n\nTable \"baseball_action_plays\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"notation\" varchar(100)\n \"notation_yaml\" text\n \"baseball_defensive_group_id\" int4\n \"comment\" varchar(255)\n \"runner_on_first_advance\" int4\n \"runner_on_second_advance\" int4\n \"runner_on_third_advance\" int4\n \"outs_recorded\" int4\n \"rbi\" int4\n \"runs_scored\" int4\n \"earned_runs_scored\" varchar(100)\n}\n\nTable \"baseball_action_substitutions\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"sequence_number\" int4\n \"person_type\" varchar(100)\n \"person_original_id\" int4\n \"person_original_position_id\" int4\n \"person_original_lineup_slot\" int4\n \"person_replacing_id\" int4\n \"person_replacing_position_id\" int4\n \"person_replacing_lineup_slot\" int4\n \"substitution_reason\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"baseball_defensive_group\" {\n \"id\" int4 [not null, increment]\n}\n\nTable \"baseball_defensive_players\" {\n \"id\" int4 [not null, increment]\n \"baseball_defensive_group_id\" int4 [not null]\n \"player_id\" int4 [not null]\n \"position_id\" int4 [not null]\n}\n\nTable \"baseball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"double_plays\" int4\n \"triple_plays\" int4\n \"putouts\" int4\n \"assists\" int4\n \"errors\" int4\n \"fielding_percentage\" numeric\n \"defensive_average\" numeric\n \"errors_passed_ball\" int4\n \"errors_catchers_interference\" int4\n}\n\nTable \"baseball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"at_bat_number\" int4\n \"inning_value\" int4\n \"inning_half\" varchar(100)\n \"outs\" int4\n \"balls\" int4\n \"strikes\" int4\n \"runner_on_first_id\" int4\n \"runner_on_second_id\" int4\n \"runner_on_third_id\" int4\n \"runner_on_first\" int2\n \"runner_on_second\" int2\n 
\"runner_on_third\" int2\n \"runs_this_inning_half\" int4\n \"pitcher_id\" int4\n \"batter_id\" int4\n \"batter_side\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"baseball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"average\" numeric\n \"runs_scored\" int4\n \"at_bats\" int4\n \"hits\" int4\n \"rbi\" int4\n \"total_bases\" int4\n \"slugging_percentage\" numeric\n \"bases_on_balls\" int4\n \"strikeouts\" int4\n \"left_on_base\" int4\n \"left_in_scoring_position\" int4\n \"singles\" int4\n \"doubles\" int4\n \"triples\" int4\n \"home_runs\" int4\n \"grand_slams\" int4\n \"at_bats_per_rbi\" numeric\n \"plate_appearances_per_rbi\" numeric\n \"at_bats_per_home_run\" numeric\n \"plate_appearances_per_home_run\" numeric\n \"sac_flies\" int4\n \"sac_bunts\" int4\n \"grounded_into_double_play\" int4\n \"moved_up\" int4\n \"on_base_percentage\" numeric\n \"stolen_bases\" int4\n \"stolen_bases_caught\" int4\n \"stolen_bases_average\" numeric\n \"hit_by_pitch\" int4\n \"defensive_interferance_reaches\" int4\n \"on_base_plus_slugging\" numeric\n \"plate_appearances\" int4\n \"hits_extra_base\" int4\n}\n\nTable \"baseball_pitching_stats\" {\n \"id\" int4 [not null, increment]\n \"runs_allowed\" int4\n \"singles_allowed\" int4\n \"doubles_allowed\" int4\n \"triples_allowed\" int4\n \"home_runs_allowed\" int4\n \"innings_pitched\" varchar(20)\n \"hits\" int4\n \"earned_runs\" int4\n \"unearned_runs\" int4\n \"bases_on_balls\" int4\n \"bases_on_balls_intentional\" int4\n \"strikeouts\" int4\n \"strikeout_to_bb_ratio\" numeric\n \"number_of_pitches\" int4\n \"era\" numeric\n \"inherited_runners_scored\" int4\n \"pick_offs\" int4\n \"errors_hit_with_pitch\" int4\n \"errors_wild_pitch\" int4\n \"balks\" int4\n \"wins\" int4\n \"losses\" int4\n \"saves\" int4\n \"shutouts\" int4\n \"games_complete\" int4\n \"games_finished\" int4\n \"winning_percentage\" numeric\n \"event_credit\" varchar(40)\n \"save_credit\" varchar(40)\n}\n\nTable \"basketball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"steals_total\" varchar(100)\n \"steals_per_game\" varchar(100)\n \"blocks_total\" varchar(100)\n \"blocks_per_game\" varchar(100)\n}\n\nTable \"basketball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"basketball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"field_goals_made\" int4\n \"field_goals_attempted\" int4\n \"field_goals_percentage\" varchar(100)\n \"field_goals_per_game\" varchar(100)\n \"field_goals_attempted_per_game\" varchar(100)\n \"field_goals_percentage_adjusted\" varchar(100)\n \"three_pointers_made\" int4\n \"three_pointers_attempted\" int4\n \"three_pointers_percentage\" varchar(100)\n \"three_pointers_per_game\" varchar(100)\n \"three_pointers_attempted_per_game\" varchar(100)\n \"free_throws_made\" varchar(100)\n \"free_throws_attempted\" varchar(100)\n \"free_throws_percentage\" varchar(100)\n \"free_throws_per_game\" varchar(100)\n \"free_throws_attempted_per_game\" varchar(100)\n \"points_scored_total\" varchar(100)\n \"points_scored_per_game\" varchar(100)\n \"assists_total\" varchar(100)\n \"assists_per_game\" varchar(100)\n \"turnovers_total\" varchar(100)\n \"turnovers_per_game\" varchar(100)\n \"points_scored_off_turnovers\" varchar(100)\n \"points_scored_in_paint\" varchar(100)\n 
\"points_scored_on_second_chance\" varchar(100)\n \"points_scored_on_fast_break\" varchar(100)\n}\n\nTable \"basketball_rebounding_stats\" {\n \"id\" int4 [not null, increment]\n \"rebounds_total\" varchar(100)\n \"rebounds_per_game\" varchar(100)\n \"rebounds_defensive\" varchar(100)\n \"rebounds_offensive\" varchar(100)\n \"team_rebounds_total\" varchar(100)\n \"team_rebounds_per_game\" varchar(100)\n \"team_rebounds_defensive\" varchar(100)\n \"team_rebounds_offensive\" varchar(100)\n}\n\nTable \"basketball_team_stats\" {\n \"id\" int4 [not null, increment]\n \"timeouts_left\" varchar(100)\n \"largest_lead\" varchar(100)\n \"fouls_total\" varchar(100)\n \"turnover_margin\" varchar(100)\n}\n\nTable \"bookmakers\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"core_person_stats\" {\n \"id\" int4 [not null, increment]\n \"time_played_event\" varchar(40)\n \"time_played_total\" varchar(40)\n \"time_played_event_average\" varchar(40)\n \"events_played\" int4\n \"events_started\" int4\n \"position_id\" int4\n}\n\nTable \"core_stats\" {\n \"id\" int4 [not null, increment]\n \"score\" varchar(100)\n \"score_opposing\" varchar(100)\n \"score_attempts\" varchar(100)\n \"score_attempts_opposing\" varchar(100)\n \"score_percentage\" varchar(100)\n \"score_percentage_opposing\" varchar(100)\n}\n\nTable \"db_info\" {\n \"version\" varchar(100) [not null, default: 16]\n}\n\nTable \"display_names\" {\n \"id\" int4 [not null, increment]\n \"language\" varchar(100) [not null]\n \"entity_type\" varchar(100) [not null]\n \"entity_id\" int4 [not null]\n \"full_name\" varchar(100)\n \"first_name\" varchar(100)\n \"middle_name\" varchar(100)\n \"last_name\" varchar(100)\n \"alias\" varchar(100)\n \"abbreviation\" varchar(100)\n \"short_name\" varchar(100)\n \"prefix\" varchar(20)\n \"suffix\" varchar(20)\n}\n\nTable \"document_classes\" {\n \"id\" int4 [not null, increment]\n \"name\" varchar(100)\n}\n\nTable \"document_contents\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"sportsml\" varchar(200)\n \"abstract\" text\n}\n\nTable \"document_fixtures\" {\n \"id\" int4 [not null, increment]\n \"fixture_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"name\" varchar(100)\n \"document_class_id\" int4 [not null]\n}\n\nTable \"document_fixtures_events\" {\n \"id\" int4 [not null, increment]\n \"document_fixture_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"latest_document_id\" int4 [not null]\n \"last_update\" timestamp\n}\n\nTable \"document_package_entry\" {\n \"id\" int4 [not null, increment]\n \"document_package_id\" int4 [not null]\n \"rank\" varchar(100)\n \"document_id\" int4 [not null]\n \"headline\" varchar(100)\n \"short_headline\" varchar(100)\n}\n\nTable \"document_packages\" {\n \"id\" int4 [not null, increment]\n \"package_key\" varchar(100)\n \"package_name\" varchar(100)\n \"date_time\" date\n}\n\nTable \"documents\" {\n \"id\" int4 [not null, increment]\n \"doc_id\" varchar(75) [not null]\n \"publisher_id\" int4 [not null]\n \"date_time\" timestamp\n \"title\" varchar(255)\n \"language\" varchar(100)\n \"priority\" varchar(100)\n \"revision_id\" varchar(75)\n \"stats_coverage\" varchar(100)\n \"document_fixture_id\" int4 [not null]\n \"source_id\" int4\n \"db_loading_date_time\" timestamp\n}\n\nTable \"documents_media\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"media_id\" int4 [not null]\n \"media_caption_id\" int4 [not 
null]\n}\n\nTable \"events\" {\n \"id\" int4 [not null, increment]\n \"event_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"site_id\" int4\n \"site_alignment\" varchar(100)\n \"event_status\" varchar(100)\n \"duration\" varchar(100)\n \"attendance\" varchar(100)\n \"last_update\" timestamp\n}\n\nTable \"events_documents\" {\n \"event_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"events_media\" {\n \"event_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"events_sub_seasons\" {\n \"event_id\" int4 [not null]\n \"sub_season_id\" int4 [not null]\n}\n\nTable \"ice_hockey_action_participants\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"point_credit\" int4\n}\n\nTable \"ice_hockey_action_plays\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"play_result\" varchar(100)\n \"comment\" varchar(255)\n}\n\nTable \"ice_hockey_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_power_play_allowed\" varchar(100)\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_power_play_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"penalty_killing_amount\" varchar(100)\n \"penalty_killing_percentage\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"takeaways\" varchar(100)\n \"shutouts\" varchar(100)\n \"minutes_penalty_killing\" varchar(100)\n \"hits\" varchar(100)\n \"goals_empty_net_allowed\" varchar(100)\n \"goals_short_handed_allowed\" varchar(100)\n \"goals_shootout_allowed\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n}\n\nTable \"ice_hockey_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"ice_hockey_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_power_play\" varchar(100)\n \"goals_short_handed\" varchar(100)\n \"goals_even_strength\" varchar(100)\n \"goals_empty_net\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_penalty_shot\" varchar(100)\n \"assists\" varchar(100)\n \"points\" varchar(100)\n \"power_play_amount\" varchar(100)\n \"power_play_percentage\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(100)\n \"shots_penalty_shot_percentage\" varchar(100)\n \"giveaways\" varchar(100)\n \"minutes_power_play\" varchar(100)\n \"faceoff_wins\" varchar(100)\n \"faceoff_losses\" varchar(100)\n \"faceoff_win_percentage\" varchar(100)\n \"scoring_chances\" varchar(100)\n}\n\nTable \"ice_hockey_player_stats\" {\n \"id\" int4 [not null, increment]\n \"plus_minus\" varchar(100)\n}\n\nTable \"injury_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"injury_status\" varchar(100)\n \"injury_type\" varchar(100)\n \"injury_comment\" varchar(100)\n \"disabled_list\" varchar(100)\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n \"season_id\" int4\n \"phase_type\" varchar(100)\n 
\"injury_side\" varchar(100)\n}\n\nTable \"key_aliases\" {\n \"id\" int4 [not null, increment]\n \"key_id\" int4 [not null]\n \"key_root_id\" int4 [not null]\n}\n\nTable \"key_roots\" {\n \"id\" int4 [not null, increment]\n \"key_type\" varchar(100)\n}\n\nTable \"latest_revisions\" {\n \"id\" int4 [not null, increment]\n \"revision_id\" varchar(75) [not null]\n \"latest_document_id\" int4 [not null]\n}\n\nTable \"locations\" {\n \"id\" int4 [not null, increment]\n \"timezone\" varchar(100)\n \"latitude\" varchar(100)\n \"longitude\" varchar(100)\n \"country_code\" varchar(100)\n}\n\nTable \"media\" {\n \"id\" int4 [not null, increment]\n \"object_id\" int4\n \"source_id\" int4\n \"revision_id\" int4\n \"media_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"date_time\" varchar(100)\n \"credit_id\" int4 [not null]\n \"db_loading_date_time\" timestamp\n \"creation_location_id\" int4 [not null]\n}\n\nTable \"media_captions\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"caption_type\" varchar(100)\n \"caption\" varchar(100)\n \"caption_author_id\" int4 [not null]\n \"language\" varchar(100)\n \"caption_size\" varchar(100)\n}\n\nTable \"media_contents\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"object\" varchar(100)\n \"format\" varchar(100)\n \"mime_type\" varchar(100)\n \"height\" varchar(100)\n \"width\" varchar(100)\n \"duration\" varchar(100)\n \"file_size\" varchar(100)\n \"resolution\" varchar(100)\n}\n\nTable \"media_keywords\" {\n \"id\" int4 [not null, increment]\n \"keyword\" varchar(100)\n \"media_id\" int4 [not null]\n}\n\nTable \"motor_racing_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"lap\" varchar(100)\n \"laps_remaining\" varchar(100)\n \"time_elapsed\" varchar(100)\n \"flag_state\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"motor_racing_qualifying_stats\" {\n \"id\" int4 [not null, increment]\n \"grid\" varchar(100)\n \"pole_position\" varchar(100)\n \"pole_wins\" varchar(100)\n \"qualifying_speed\" varchar(100)\n \"qualifying_speed_units\" varchar(100)\n \"qualifying_time\" varchar(100)\n \"qualifying_position\" varchar(100)\n}\n\nTable \"motor_racing_race_stats\" {\n \"id\" int4 [not null, increment]\n \"time_behind_leader\" varchar(100)\n \"laps_behind_leader\" varchar(100)\n \"time_ahead_follower\" varchar(100)\n \"laps_ahead_follower\" varchar(100)\n \"time\" varchar(100)\n \"points\" varchar(100)\n \"points_rookie\" varchar(100)\n \"bonus\" varchar(100)\n \"laps_completed\" varchar(100)\n \"laps_leading_total\" varchar(100)\n \"distance_leading\" varchar(100)\n \"distance_completed\" varchar(100)\n \"distance_units\" varchar(40)\n \"speed_average\" varchar(40)\n \"speed_units\" varchar(40)\n \"status\" varchar(40)\n \"finishes_top_5\" varchar(40)\n \"finishes_top_10\" varchar(40)\n \"starts\" varchar(40)\n \"finishes\" varchar(40)\n \"non_finishes\" varchar(40)\n \"wins\" varchar(40)\n \"races_leading\" varchar(40)\n \"money\" varchar(40)\n \"money_units\" varchar(40)\n \"leads_total\" varchar(40)\n}\n\nTable \"outcome_totals\" {\n \"id\" int4 [not null, increment]\n \"standing_subgroup_id\" int4 [not null]\n \"outcome_holder_type\" varchar(100)\n \"outcome_holder_id\" int4\n \"rank\" varchar(100)\n \"wins\" varchar(100)\n \"losses\" varchar(100)\n \"ties\" varchar(100)\n \"undecideds\" varchar(100)\n \"winning_percentage\" varchar(100)\n \"points_scored_for\" varchar(100)\n 
\"points_scored_against\" varchar(100)\n \"points_difference\" varchar(100)\n \"standing_points\" varchar(100)\n \"streak_type\" varchar(100)\n \"streak_duration\" varchar(100)\n \"streak_total\" varchar(100)\n \"streak_start\" date\n \"streak_end\" date\n}\n\nTable \"participants_events\" {\n \"id\" int4 [not null, increment]\n \"participant_type\" varchar(100) [not null]\n \"participant_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"alignment\" varchar(100)\n \"score\" varchar(100)\n \"event_outcome\" varchar(100)\n \"rank\" int4\n}\n\nTable \"periods\" {\n \"id\" int4 [not null, increment]\n \"participant_event_id\" int4 [not null]\n \"period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"person_event_metadata\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"status\" varchar(100)\n \"health\" varchar(100)\n \"weight\" varchar(100)\n \"role_id\" int4\n \"position_id\" int4\n \"team_id\" int4\n \"lineup_slot\" int4\n \"lineup_slot_sequence\" int4\n}\n\nTable \"person_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"membership_type\" varchar(40) [not null]\n \"membership_id\" int4 [not null]\n \"role_id\" int4\n \"role_status\" varchar(40)\n \"phase_status\" varchar(40)\n \"uniform_number\" varchar(20)\n \"regular_position_id\" int4\n \"regular_position_depth\" varchar(40)\n \"height\" varchar(100)\n \"weight\" varchar(100)\n \"start_date_time\" timestamp\n \"start_season_id\" int4\n \"end_date_time\" timestamp\n \"end_season_id\" int4\n \"entry_reason\" varchar(40)\n \"exit_reason\" varchar(40)\n \"selection_level\" int4\n \"selection_sublevel\" int4\n \"selection_overall\" int4\n}\n\nTable \"persons\" {\n \"id\" int4 [not null, increment]\n \"person_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"gender\" varchar(20)\n \"birth_date\" varchar(30)\n \"death_date\" varchar(30)\n \"birth_location_id\" int4\n \"hometown_location_id\" int4\n \"residence_location_id\" int4\n \"death_location_id\" int4\n}\n\nTable \"persons_documents\" {\n \"person_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"persons_media\" {\n \"person_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"positions\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"abbreviation\" varchar(100) [not null]\n}\n\nTable \"publishers\" {\n \"id\" int4 [not null, increment]\n \"publisher_key\" varchar(100) [not null]\n \"publisher_name\" varchar(100)\n}\n\nTable \"roles\" {\n \"id\" int4 [not null, increment]\n \"role_key\" varchar(100) [not null]\n \"role_name\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"seasons\" {\n \"id\" int4 [not null, increment]\n \"season_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"league_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"sites\" {\n \"id\" int4 [not null, increment]\n \"site_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"soccer_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"goals_against_total\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"catches_punches\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_shootout_total\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n 
\"shots_blocked\" varchar(100)\n \"shutouts\" varchar(100)\n}\n\nTable \"soccer_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"minutes_elapsed\" varchar(100)\n \"period_minute_elapsed\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"soccer_foul_stats\" {\n \"id\" int4 [not null, increment]\n \"fouls_suffered\" varchar(100)\n \"fouls_commited\" varchar(100)\n \"cautions_total\" varchar(100)\n \"cautions_pending\" varchar(100)\n \"caution_points_total\" varchar(100)\n \"caution_points_pending\" varchar(100)\n \"ejections_total\" varchar(100)\n}\n\nTable \"soccer_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_total\" varchar(100)\n \"assists_game_winning\" varchar(100)\n \"assists_game_tying\" varchar(100)\n \"assists_overtime\" varchar(100)\n \"assists_total\" varchar(100)\n \"points\" varchar(100)\n \"shots_total\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_hit_frame\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_scored\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(40)\n \"shots_penalty_shot_percentage\" varchar(40)\n \"shots_shootout_taken\" varchar(40)\n \"shots_shootout_scored\" varchar(40)\n \"shots_shootout_missed\" varchar(40)\n \"shots_shootout_percentage\" varchar(40)\n \"giveaways\" varchar(40)\n \"offsides\" varchar(40)\n \"corner_kicks\" varchar(40)\n \"hat_tricks\" varchar(40)\n}\n\nTable \"standing_subgroups\" {\n \"id\" int4 [not null, increment]\n \"standing_id\" int4 [not null]\n \"affiliation_id\" int4 [not null]\n}\n\nTable \"standings\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"standing_type\" varchar(100)\n \"sub_season_id\" int4 [not null]\n \"last_updated\" varchar(100)\n \"duration_scope\" varchar(100)\n \"competition_scope\" varchar(100)\n \"competition_scope_id\" varchar(100)\n \"alignment_scope\" varchar(100)\n \"site_scope\" varchar(100)\n \"scoping_label\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"source\" varchar(100)\n}\n\nTable \"stats\" {\n \"id\" int4 [not null, increment]\n \"stat_repository_type\" varchar(100)\n \"stat_repository_id\" int4 [not null]\n \"stat_holder_type\" varchar(100)\n \"stat_holder_id\" int4\n \"stat_coverage_type\" varchar(100)\n \"stat_coverage_id\" int4\n \"context\" varchar(40) [not null]\n}\n\nTable \"sub_periods\" {\n \"id\" int4 [not null, increment]\n \"period_id\" int4 [not null]\n \"sub_period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"sub_seasons\" {\n \"id\" int4 [not null, increment]\n \"sub_season_key\" varchar(100) [not null]\n \"season_id\" int4 [not null]\n \"sub_season_type\" varchar(100) [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"team_american_football_stats\" {\n \"id\" int4 [not null, increment]\n \"yards_per_attempt\" varchar(100)\n \"average_starting_position\" varchar(100)\n \"timeouts\" varchar(100)\n \"time_of_possession\" varchar(100)\n \"turnover_ratio\" varchar(100)\n}\n\nTable \"team_phases\" {\n \"id\" int4 [not null, increment]\n \"team_id\" int4 [not null]\n \"start_season_id\" int4\n \"end_season_id\" int4\n \"affiliation_id\" int4 [not null]\n \"start_date_time\" 
varchar(100)\n \"end_date_time\" varchar(100)\n \"phase_status\" varchar(40)\n \"role_id\" int4\n}\n\nTable \"teams\" {\n \"id\" int4 [not null, increment]\n \"team_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"home_site_id\" int4\n}\n\nTable \"teams_documents\" {\n \"team_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"teams_media\" {\n \"team_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"tennis_action_points\" {\n \"id\" int4 [not null, increment]\n \"sub_period_id\" varchar(100)\n \"sequence_number\" varchar(100)\n \"win_type\" varchar(100)\n}\n\nTable \"tennis_action_volleys\" {\n \"id\" int4 [not null, increment]\n \"sequence_number\" varchar(100)\n \"tennis_action_points_id\" int4\n \"landing_location\" varchar(100)\n \"swing_type\" varchar(100)\n \"result\" varchar(100)\n \"spin_type\" varchar(100)\n \"trajectory_details\" varchar(100)\n}\n\nTable \"tennis_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"tennis_set\" varchar(100)\n \"game\" varchar(100)\n \"server_person_id\" int4\n \"server_score\" varchar(100)\n \"receiver_person_id\" int4\n \"receiver_score\" varchar(100)\n \"service_number\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"tennis_return_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"first_service_return_points_won\" varchar(100)\n \"first_service_return_points_won_pct\" varchar(100)\n \"second_service_return_points_won\" varchar(100)\n \"second_service_return_points_won_pct\" varchar(100)\n \"return_games_played\" varchar(100)\n \"return_games_won\" varchar(100)\n \"return_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_converted\" varchar(100)\n \"break_points_converted_pct\" varchar(100)\n}\n\nTable \"tennis_service_stats\" {\n \"id\" int4 [not null, increment]\n \"services_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"aces\" varchar(100)\n \"first_services_good\" varchar(100)\n \"first_services_good_pct\" varchar(100)\n \"first_service_points_won\" varchar(100)\n \"first_service_points_won_pct\" varchar(100)\n \"second_service_points_won\" varchar(100)\n \"second_service_points_won_pct\" varchar(100)\n \"service_games_played\" varchar(100)\n \"service_games_won\" varchar(100)\n \"service_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_saved\" varchar(100)\n \"break_points_saved_pct\" varchar(100)\n}\n\nTable \"wagering_moneylines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_odds_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"numerator\" varchar(100)\n \"denominator\" varchar(100)\n \"prediction\" varchar(100)\n \"payout_calculation\" varchar(100)\n \"payout_amount\" varchar(100)\n}\n\nTable \"wagering_runlines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n 
\"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"line_value\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_straight_spread_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_value\" varchar(100)\n \"line_value_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_total_score_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_over\" varchar(100)\n \"line_under\" varchar(100)\n \"total\" varchar(100)\n \"total_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"weather_conditions\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"temperature\" varchar(100)\n \"temperature_units\" varchar(40)\n \"humidity\" varchar(100)\n \"clouds\" varchar(100)\n \"wind_direction\" varchar(100)\n \"wind_velocity\" varchar(100)\n \"weather_code\" varchar(100)\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/yugabyte/yugabyte-db/blob/master/sample/sportsdb_tables.sql"
}
}
================================================
FILE: tasks/postgres/standard/sports/baseball_player_analysis/verify.py
================================================
"""
Verification script for PostgreSQL Sports Task 1: Baseball Player Analysis
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""Compare two rows with appropriate tolerance for decimals and floats."""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, (Decimal, float)) and isinstance(expected, (Decimal, float)):
        # Allow a small absolute tolerance when comparing numeric values
if abs(float(actual) - float(expected)) > 0.001:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_baseball_player_analysis_table(conn) -> bool:
"""Verify the baseball_player_analysis table results."""
with conn.cursor() as cur:
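        # Read back the table created by the task, ordered the same way as the ground-truth query below.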
cur.execute("""
SELECT player_id, player_name, team_name, games_played, at_bats, hits,
runs_scored, rbi, home_runs, batting_average, defensive_games,
putouts, assists, errors, fielding_percentage
FROM baseball_player_analysis
ORDER BY batting_average DESC, games_played DESC
""")
actual_results = cur.fetchall()
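        # Recompute the expected rows directly from the source tables (ground truth).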
cur.execute("""
SELECT
p.id AS player_id,
MAX(dn.full_name) AS player_name,
'Unknown' AS team_name,
core.events_played AS games_played,
off.at_bats,
off.hits,
off.runs_scored,
off.rbi,
off.home_runs,
CASE WHEN off.at_bats > 0
THEN 1.0 * off.hits / off.at_bats
ELSE 0
END AS batting_average,
core.events_played AS defensive_games,
COALESCE(def.putouts, 0) AS putouts,
COALESCE(def.assists, 0) AS assists,
COALESCE(def.errors, 0) AS errors,
CASE
WHEN (COALESCE(def.putouts,0) + COALESCE(def.assists,0) + COALESCE(def.errors,0)) > 0
THEN 1.0 * (COALESCE(def.putouts,0) + COALESCE(def.assists,0))
/ (COALESCE(def.putouts,0) + COALESCE(def.assists,0) + COALESCE(def.errors,0))
ELSE 0
END AS fielding_percentage
FROM persons p
JOIN display_names dn
ON dn.entity_id = p.id
AND dn.entity_type = 'persons'
AND NULLIF(TRIM(dn.full_name), '') IS NOT NULL
JOIN (
SELECT s.stat_holder_id AS player_id,
SUM(bos.at_bats) AS at_bats,
SUM(bos.hits) AS hits,
SUM(bos.runs_scored) AS runs_scored,
SUM(bos.rbi) AS rbi,
SUM(bos.home_runs) AS home_runs
FROM stats s
JOIN baseball_offensive_stats bos
ON bos.id = s.stat_repository_id
WHERE s.stat_holder_type = 'persons'
AND s.stat_repository_type = 'baseball_offensive_stats'
AND s.context = 'season-regular'
GROUP BY s.stat_holder_id
) off ON off.player_id = p.id
JOIN (
SELECT s.stat_holder_id AS player_id,
SUM(cps.events_played) AS events_played
FROM stats s
JOIN core_person_stats cps
ON cps.id = s.stat_repository_id
WHERE s.stat_holder_type = 'persons'
AND s.stat_repository_type = 'core_person_stats'
AND s.context = 'season-regular'
GROUP BY s.stat_holder_id
) core ON core.player_id = p.id
LEFT JOIN (
SELECT s.stat_holder_id AS player_id,
SUM(bds.putouts) AS putouts,
SUM(bds.assists) AS assists,
SUM(bds.errors) AS errors
FROM stats s
JOIN baseball_defensive_stats bds
ON bds.id = s.stat_repository_id
WHERE s.stat_holder_type = 'persons'
AND s.stat_repository_type = 'baseball_defensive_stats'
AND s.context = 'season-regular'
GROUP BY s.stat_holder_id
) def ON def.player_id = p.id
WHERE core.events_played >= 10
AND off.at_bats >= 50
GROUP BY
p.id, core.events_played,
off.at_bats, off.hits, off.runs_scored, off.rbi, off.home_runs,
def.putouts, def.assists, def.errors
ORDER BY batting_average DESC, games_played DESC;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ baseball_player_analysis table has {len(actual_results)} records, expected {len(expected_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Player analysis row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total player analysis mismatches: {mismatches}")
return False
print(f"✅ baseball_player_analysis table created and populated correctly ({len(actual_results)} players)")
return True
def main():
"""Main verification function."""
print("=" * 70)
print("PostgreSQL Sports Task 1 Verification: Baseball Player Analysis")
print("=" * 70)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify results
success = verify_baseball_player_analysis_table(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/sports/participant_report_optimization/description.md
================================================
# Query Performance Optimization
## Background
You need to optimize a slow-running analytics query that generates participant performance reports; in its current form it takes too long to execute.
## Requirements
### 1. Create Performance Report Table
Create a table called `participant_performance_report` with the following structure:
- report_id (serial primary key)
- participant_id (integer not null)
- event_count (integer)
- stat_count (integer)
- stat_type_count (integer)
- last_event_date (timestamp)
- created_at (timestamp default current_timestamp)
Add constraint: CHECK (participant_id > 0); a DDL sketch for this table is shown below.
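A minimal sketch matching the structure above (column names and types taken directly from the list; the exact statement you write may differ):
```sql
-- Sketch only: one possible way to express the required table.
CREATE TABLE participant_performance_report (
    report_id       SERIAL PRIMARY KEY,
    participant_id  INTEGER NOT NULL CHECK (participant_id > 0),
    event_count     INTEGER,
    stat_count      INTEGER,
    stat_type_count INTEGER,
    last_event_date TIMESTAMP,
    created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
```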
### 2. Execute and Optimize the Slow Query
The following query is currently running very slowly. Your task is to:
1. **Identify why the query is slow**
2. **Create appropriate indexes to optimize it** (an example index sketch follows the query below)
3. **Populate the report table with the query results**
```sql
SELECT
pe.participant_id,
COUNT(pe.event_id) as event_count,
(SELECT COUNT(*) FROM stats s WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons') as stat_count,
(SELECT COUNT(DISTINCT s.stat_repository_type) FROM stats s WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons') as stat_type_count,
(SELECT MAX(e.start_date_time) FROM events e JOIN participants_events pe2 ON e.id = pe2.event_id WHERE pe2.participant_id = pe.participant_id) as last_event_date
FROM participants_events pe
WHERE pe.participant_id <= 50
GROUP BY pe.participant_id
ORDER BY pe.participant_id;
```
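The correlated subqueries against `stats` and the join through `participants_events` are the usual bottlenecks here. A hedged sketch of supporting indexes, assuming those access paths dominate the plan (index names are illustrative, not required by the task):
```sql
-- Sketch only: candidate indexes; confirm their effect with EXPLAIN ANALYZE on your data.
CREATE INDEX IF NOT EXISTS idx_stats_holder
    ON stats (stat_holder_id, stat_holder_type, stat_repository_type);
CREATE INDEX IF NOT EXISTS idx_participants_events_participant
    ON participants_events (participant_id, event_id);
```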
### 3. Document Performance Improvement
After optimization, insert the results into your `participant_performance_report` table.
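One way to do this is to feed the (now optimized) query above straight into an `INSERT ... SELECT`; a minimal sketch:
```sql
-- Sketch only: reuse the report query to populate the table.
INSERT INTO participant_performance_report
    (participant_id, event_count, stat_count, stat_type_count, last_event_date)
SELECT
    pe.participant_id,
    COUNT(pe.event_id),
    (SELECT COUNT(*) FROM stats s
       WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons'),
    (SELECT COUNT(DISTINCT s.stat_repository_type) FROM stats s
       WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons'),
    (SELECT MAX(e.start_date_time) FROM events e
       JOIN participants_events pe2 ON e.id = pe2.event_id
       WHERE pe2.participant_id = pe.participant_id)
FROM participants_events pe
WHERE pe.participant_id <= 50
GROUP BY pe.participant_id
ORDER BY pe.participant_id;
```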
## Success Criteria
- The query should execute significantly faster after your optimization
- All results should be correctly inserted into the report table
- Your optimization should use appropriate database indexes
## Important Notes
- Analyze the query execution plan (e.g., with `EXPLAIN ANALYZE`) to identify bottlenecks
- Focus on the most impactful optimizations
- Handle NULL values appropriately in calculations
================================================
FILE: tasks/postgres/standard/sports/participant_report_optimization/meta.json
================================================
{
"task_id": "participant_report_optimization",
"task_name": "Participant Report Optimization",
"category_id": "sports",
"category_name": "Sports",
"description": "Optimize slow-running participant performance query by creating indexes and populating performance report table.",
"author": "Lingxiao Du",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"performance optimization",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"addresses\" {\n \"id\" int4 [not null, increment]\n \"location_id\" int4 [not null]\n \"language\" varchar(100)\n \"suite\" varchar(100)\n \"floor\" varchar(100)\n \"building\" varchar(100)\n \"street_number\" varchar(100)\n \"street_prefix\" varchar(100)\n \"street\" varchar(100)\n \"street_suffix\" varchar(100)\n \"neighborhood\" varchar(100)\n \"district\" varchar(100)\n \"locality\" varchar(100)\n \"county\" varchar(100)\n \"region\" varchar(100)\n \"postal_code\" varchar(100)\n \"country\" varchar(100)\n}\n\nTable \"affiliation_phases\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"ancestor_affiliation_id\" int4\n \"start_season_id\" int4\n \"start_date_time\" timestamp\n \"end_season_id\" int4\n \"end_date_time\" timestamp\n}\n\nTable \"affiliations\" {\n \"id\" int4 [not null, increment]\n \"affiliation_key\" varchar(100) [not null]\n \"affiliation_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n}\n\nTable \"affiliations_documents\" {\n \"affiliation_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"affiliations_events\" {\n \"affiliation_id\" int4 [not null]\n \"event_id\" int4 [not null]\n}\n\nTable \"affiliations_media\" {\n \"affiliation_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"american_football_action_participants\" {\n \"id\" int4 [not null, increment]\n \"american_football_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"score_type\" varchar(100)\n \"field_line\" int4\n \"yardage\" int4\n \"score_credit\" int4\n \"yards_gained\" int4\n}\n\nTable \"american_football_action_plays\" {\n \"id\" int4 [not null, increment]\n \"american_football_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"drive_result\" varchar(100)\n \"points\" int4\n \"comment\" varchar(255)\n}\n\nTable \"american_football_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"tackles_total\" varchar(100)\n \"tackles_solo\" varchar(100)\n \"tackles_assists\" varchar(100)\n \"interceptions_total\" varchar(100)\n \"interceptions_yards\" varchar(100)\n \"interceptions_average\" varchar(100)\n \"interceptions_longest\" varchar(100)\n \"interceptions_touchdown\" varchar(100)\n \"quarterback_hurries\" varchar(100)\n \"sacks_total\" varchar(100)\n \"sacks_yards\" varchar(100)\n \"passes_defensed\" varchar(100)\n}\n\nTable \"american_football_down_progress_stats\" {\n \"id\" int4 [not null, increment]\n \"first_downs_total\" varchar(100)\n \"first_downs_pass\" varchar(100)\n \"first_downs_run\" varchar(100)\n \"first_downs_penalty\" varchar(100)\n \"conversions_third_down\" varchar(100)\n \"conversions_third_down_attempts\" varchar(100)\n \"conversions_third_down_percentage\" varchar(100)\n \"conversions_fourth_down\" varchar(100)\n \"conversions_fourth_down_attempts\" varchar(100)\n \"conversions_fourth_down_percentage\" varchar(100)\n}\n\nTable \"american_football_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"period_value\" int4\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"clock_state\" varchar(100)\n \"down\" int4\n \"team_in_possession_id\" int4\n \"distance_for_1st_down\" int4\n \"field_side\" varchar(100)\n \"field_line\" int4\n \"context\" varchar(40)\n}\n\nTable \"american_football_fumbles_stats\" {\n \"id\" int4 [not null, increment]\n 
\"fumbles_committed\" varchar(100)\n \"fumbles_forced\" varchar(100)\n \"fumbles_recovered\" varchar(100)\n \"fumbles_lost\" varchar(100)\n \"fumbles_yards_gained\" varchar(100)\n \"fumbles_own_committed\" varchar(100)\n \"fumbles_own_recovered\" varchar(100)\n \"fumbles_own_lost\" varchar(100)\n \"fumbles_own_yards_gained\" varchar(100)\n \"fumbles_opposing_committed\" varchar(100)\n \"fumbles_opposing_recovered\" varchar(100)\n \"fumbles_opposing_lost\" varchar(100)\n \"fumbles_opposing_yards_gained\" varchar(100)\n}\n\nTable \"american_football_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"offensive_plays_yards\" varchar(100)\n \"offensive_plays_number\" varchar(100)\n \"offensive_plays_average_yards_per\" varchar(100)\n \"possession_duration\" varchar(100)\n \"turnovers_giveaway\" varchar(100)\n}\n\nTable \"american_football_passing_stats\" {\n \"id\" int4 [not null, increment]\n \"passes_attempts\" varchar(100)\n \"passes_completions\" varchar(100)\n \"passes_percentage\" varchar(100)\n \"passes_yards_gross\" varchar(100)\n \"passes_yards_net\" varchar(100)\n \"passes_yards_lost\" varchar(100)\n \"passes_touchdowns\" varchar(100)\n \"passes_touchdowns_percentage\" varchar(100)\n \"passes_interceptions\" varchar(100)\n \"passes_interceptions_percentage\" varchar(100)\n \"passes_longest\" varchar(100)\n \"passes_average_yards_per\" varchar(100)\n \"passer_rating\" varchar(100)\n \"receptions_total\" varchar(100)\n \"receptions_yards\" varchar(100)\n \"receptions_touchdowns\" varchar(100)\n \"receptions_first_down\" varchar(100)\n \"receptions_longest\" varchar(100)\n \"receptions_average_yards_per\" varchar(100)\n}\n\nTable \"american_football_penalties_stats\" {\n \"id\" int4 [not null, increment]\n \"penalties_total\" varchar(100)\n \"penalty_yards\" varchar(100)\n \"penalty_first_downs\" varchar(100)\n}\n\nTable \"american_football_rushing_stats\" {\n \"id\" int4 [not null, increment]\n \"rushes_attempts\" varchar(100)\n \"rushes_yards\" varchar(100)\n \"rushes_touchdowns\" varchar(100)\n \"rushing_average_yards_per\" varchar(100)\n \"rushes_first_down\" varchar(100)\n \"rushes_longest\" varchar(100)\n}\n\nTable \"american_football_sacks_against_stats\" {\n \"id\" int4 [not null, increment]\n \"sacks_against_yards\" varchar(100)\n \"sacks_against_total\" varchar(100)\n}\n\nTable \"american_football_scoring_stats\" {\n \"id\" int4 [not null, increment]\n \"touchdowns_total\" varchar(100)\n \"touchdowns_passing\" varchar(100)\n \"touchdowns_rushing\" varchar(100)\n \"touchdowns_special_teams\" varchar(100)\n \"touchdowns_defensive\" varchar(100)\n \"extra_points_attempts\" varchar(100)\n \"extra_points_made\" varchar(100)\n \"extra_points_missed\" varchar(100)\n \"extra_points_blocked\" varchar(100)\n \"field_goal_attempts\" varchar(100)\n \"field_goals_made\" varchar(100)\n \"field_goals_missed\" varchar(100)\n \"field_goals_blocked\" varchar(100)\n \"safeties_against\" varchar(100)\n \"two_point_conversions_attempts\" varchar(100)\n \"two_point_conversions_made\" varchar(100)\n \"touchbacks_total\" varchar(100)\n}\n\nTable \"american_football_special_teams_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_punt_total\" varchar(100)\n \"returns_punt_yards\" varchar(100)\n \"returns_punt_average\" varchar(100)\n \"returns_punt_longest\" varchar(100)\n \"returns_punt_touchdown\" varchar(100)\n \"returns_kickoff_total\" varchar(100)\n \"returns_kickoff_yards\" varchar(100)\n \"returns_kickoff_average\" varchar(100)\n \"returns_kickoff_longest\" varchar(100)\n 
\"returns_kickoff_touchdown\" varchar(100)\n \"returns_total\" varchar(100)\n \"returns_yards\" varchar(100)\n \"punts_total\" varchar(100)\n \"punts_yards_gross\" varchar(100)\n \"punts_yards_net\" varchar(100)\n \"punts_longest\" varchar(100)\n \"punts_inside_20\" varchar(100)\n \"punts_inside_20_percentage\" varchar(100)\n \"punts_average\" varchar(100)\n \"punts_blocked\" varchar(100)\n \"touchbacks_total\" varchar(100)\n \"touchbacks_total_percentage\" varchar(100)\n \"touchbacks_kickoffs\" varchar(100)\n \"touchbacks_kickoffs_percentage\" varchar(100)\n \"touchbacks_punts\" varchar(100)\n \"touchbacks_punts_percentage\" varchar(100)\n \"touchbacks_interceptions\" varchar(100)\n \"touchbacks_interceptions_percentage\" varchar(100)\n \"fair_catches\" varchar(100)\n}\n\nTable \"baseball_action_contact_details\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_pitch_id\" int4 [not null]\n \"location\" varchar(100)\n \"strength\" varchar(100)\n \"velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n}\n\nTable \"baseball_action_pitches\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_play_id\" int4 [not null]\n \"sequence_number\" int4\n \"baseball_defensive_group_id\" int4\n \"umpire_call\" varchar(100)\n \"pitch_location\" varchar(100)\n \"pitch_type\" varchar(100)\n \"pitch_velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n \"ball_type\" varchar(40)\n \"strike_type\" varchar(40)\n}\n\nTable \"baseball_action_plays\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"notation\" varchar(100)\n \"notation_yaml\" text\n \"baseball_defensive_group_id\" int4\n \"comment\" varchar(255)\n \"runner_on_first_advance\" int4\n \"runner_on_second_advance\" int4\n \"runner_on_third_advance\" int4\n \"outs_recorded\" int4\n \"rbi\" int4\n \"runs_scored\" int4\n \"earned_runs_scored\" varchar(100)\n}\n\nTable \"baseball_action_substitutions\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"sequence_number\" int4\n \"person_type\" varchar(100)\n \"person_original_id\" int4\n \"person_original_position_id\" int4\n \"person_original_lineup_slot\" int4\n \"person_replacing_id\" int4\n \"person_replacing_position_id\" int4\n \"person_replacing_lineup_slot\" int4\n \"substitution_reason\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"baseball_defensive_group\" {\n \"id\" int4 [not null, increment]\n}\n\nTable \"baseball_defensive_players\" {\n \"id\" int4 [not null, increment]\n \"baseball_defensive_group_id\" int4 [not null]\n \"player_id\" int4 [not null]\n \"position_id\" int4 [not null]\n}\n\nTable \"baseball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"double_plays\" int4\n \"triple_plays\" int4\n \"putouts\" int4\n \"assists\" int4\n \"errors\" int4\n \"fielding_percentage\" numeric\n \"defensive_average\" numeric\n \"errors_passed_ball\" int4\n \"errors_catchers_interference\" int4\n}\n\nTable \"baseball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"at_bat_number\" int4\n \"inning_value\" int4\n \"inning_half\" varchar(100)\n \"outs\" int4\n \"balls\" int4\n \"strikes\" int4\n \"runner_on_first_id\" int4\n \"runner_on_second_id\" int4\n \"runner_on_third_id\" int4\n \"runner_on_first\" int2\n \"runner_on_second\" int2\n 
\"runner_on_third\" int2\n \"runs_this_inning_half\" int4\n \"pitcher_id\" int4\n \"batter_id\" int4\n \"batter_side\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"baseball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"average\" numeric\n \"runs_scored\" int4\n \"at_bats\" int4\n \"hits\" int4\n \"rbi\" int4\n \"total_bases\" int4\n \"slugging_percentage\" numeric\n \"bases_on_balls\" int4\n \"strikeouts\" int4\n \"left_on_base\" int4\n \"left_in_scoring_position\" int4\n \"singles\" int4\n \"doubles\" int4\n \"triples\" int4\n \"home_runs\" int4\n \"grand_slams\" int4\n \"at_bats_per_rbi\" numeric\n \"plate_appearances_per_rbi\" numeric\n \"at_bats_per_home_run\" numeric\n \"plate_appearances_per_home_run\" numeric\n \"sac_flies\" int4\n \"sac_bunts\" int4\n \"grounded_into_double_play\" int4\n \"moved_up\" int4\n \"on_base_percentage\" numeric\n \"stolen_bases\" int4\n \"stolen_bases_caught\" int4\n \"stolen_bases_average\" numeric\n \"hit_by_pitch\" int4\n \"defensive_interferance_reaches\" int4\n \"on_base_plus_slugging\" numeric\n \"plate_appearances\" int4\n \"hits_extra_base\" int4\n}\n\nTable \"baseball_pitching_stats\" {\n \"id\" int4 [not null, increment]\n \"runs_allowed\" int4\n \"singles_allowed\" int4\n \"doubles_allowed\" int4\n \"triples_allowed\" int4\n \"home_runs_allowed\" int4\n \"innings_pitched\" varchar(20)\n \"hits\" int4\n \"earned_runs\" int4\n \"unearned_runs\" int4\n \"bases_on_balls\" int4\n \"bases_on_balls_intentional\" int4\n \"strikeouts\" int4\n \"strikeout_to_bb_ratio\" numeric\n \"number_of_pitches\" int4\n \"era\" numeric\n \"inherited_runners_scored\" int4\n \"pick_offs\" int4\n \"errors_hit_with_pitch\" int4\n \"errors_wild_pitch\" int4\n \"balks\" int4\n \"wins\" int4\n \"losses\" int4\n \"saves\" int4\n \"shutouts\" int4\n \"games_complete\" int4\n \"games_finished\" int4\n \"winning_percentage\" numeric\n \"event_credit\" varchar(40)\n \"save_credit\" varchar(40)\n}\n\nTable \"basketball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"steals_total\" varchar(100)\n \"steals_per_game\" varchar(100)\n \"blocks_total\" varchar(100)\n \"blocks_per_game\" varchar(100)\n}\n\nTable \"basketball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"basketball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"field_goals_made\" int4\n \"field_goals_attempted\" int4\n \"field_goals_percentage\" varchar(100)\n \"field_goals_per_game\" varchar(100)\n \"field_goals_attempted_per_game\" varchar(100)\n \"field_goals_percentage_adjusted\" varchar(100)\n \"three_pointers_made\" int4\n \"three_pointers_attempted\" int4\n \"three_pointers_percentage\" varchar(100)\n \"three_pointers_per_game\" varchar(100)\n \"three_pointers_attempted_per_game\" varchar(100)\n \"free_throws_made\" varchar(100)\n \"free_throws_attempted\" varchar(100)\n \"free_throws_percentage\" varchar(100)\n \"free_throws_per_game\" varchar(100)\n \"free_throws_attempted_per_game\" varchar(100)\n \"points_scored_total\" varchar(100)\n \"points_scored_per_game\" varchar(100)\n \"assists_total\" varchar(100)\n \"assists_per_game\" varchar(100)\n \"turnovers_total\" varchar(100)\n \"turnovers_per_game\" varchar(100)\n \"points_scored_off_turnovers\" varchar(100)\n \"points_scored_in_paint\" varchar(100)\n 
\"points_scored_on_second_chance\" varchar(100)\n \"points_scored_on_fast_break\" varchar(100)\n}\n\nTable \"basketball_rebounding_stats\" {\n \"id\" int4 [not null, increment]\n \"rebounds_total\" varchar(100)\n \"rebounds_per_game\" varchar(100)\n \"rebounds_defensive\" varchar(100)\n \"rebounds_offensive\" varchar(100)\n \"team_rebounds_total\" varchar(100)\n \"team_rebounds_per_game\" varchar(100)\n \"team_rebounds_defensive\" varchar(100)\n \"team_rebounds_offensive\" varchar(100)\n}\n\nTable \"basketball_team_stats\" {\n \"id\" int4 [not null, increment]\n \"timeouts_left\" varchar(100)\n \"largest_lead\" varchar(100)\n \"fouls_total\" varchar(100)\n \"turnover_margin\" varchar(100)\n}\n\nTable \"bookmakers\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"core_person_stats\" {\n \"id\" int4 [not null, increment]\n \"time_played_event\" varchar(40)\n \"time_played_total\" varchar(40)\n \"time_played_event_average\" varchar(40)\n \"events_played\" int4\n \"events_started\" int4\n \"position_id\" int4\n}\n\nTable \"core_stats\" {\n \"id\" int4 [not null, increment]\n \"score\" varchar(100)\n \"score_opposing\" varchar(100)\n \"score_attempts\" varchar(100)\n \"score_attempts_opposing\" varchar(100)\n \"score_percentage\" varchar(100)\n \"score_percentage_opposing\" varchar(100)\n}\n\nTable \"db_info\" {\n \"version\" varchar(100) [not null, default: 16]\n}\n\nTable \"display_names\" {\n \"id\" int4 [not null, increment]\n \"language\" varchar(100) [not null]\n \"entity_type\" varchar(100) [not null]\n \"entity_id\" int4 [not null]\n \"full_name\" varchar(100)\n \"first_name\" varchar(100)\n \"middle_name\" varchar(100)\n \"last_name\" varchar(100)\n \"alias\" varchar(100)\n \"abbreviation\" varchar(100)\n \"short_name\" varchar(100)\n \"prefix\" varchar(20)\n \"suffix\" varchar(20)\n}\n\nTable \"document_classes\" {\n \"id\" int4 [not null, increment]\n \"name\" varchar(100)\n}\n\nTable \"document_contents\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"sportsml\" varchar(200)\n \"abstract\" text\n}\n\nTable \"document_fixtures\" {\n \"id\" int4 [not null, increment]\n \"fixture_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"name\" varchar(100)\n \"document_class_id\" int4 [not null]\n}\n\nTable \"document_fixtures_events\" {\n \"id\" int4 [not null, increment]\n \"document_fixture_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"latest_document_id\" int4 [not null]\n \"last_update\" timestamp\n}\n\nTable \"document_package_entry\" {\n \"id\" int4 [not null, increment]\n \"document_package_id\" int4 [not null]\n \"rank\" varchar(100)\n \"document_id\" int4 [not null]\n \"headline\" varchar(100)\n \"short_headline\" varchar(100)\n}\n\nTable \"document_packages\" {\n \"id\" int4 [not null, increment]\n \"package_key\" varchar(100)\n \"package_name\" varchar(100)\n \"date_time\" date\n}\n\nTable \"documents\" {\n \"id\" int4 [not null, increment]\n \"doc_id\" varchar(75) [not null]\n \"publisher_id\" int4 [not null]\n \"date_time\" timestamp\n \"title\" varchar(255)\n \"language\" varchar(100)\n \"priority\" varchar(100)\n \"revision_id\" varchar(75)\n \"stats_coverage\" varchar(100)\n \"document_fixture_id\" int4 [not null]\n \"source_id\" int4\n \"db_loading_date_time\" timestamp\n}\n\nTable \"documents_media\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"media_id\" int4 [not null]\n \"media_caption_id\" int4 [not 
null]\n}\n\nTable \"events\" {\n \"id\" int4 [not null, increment]\n \"event_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"site_id\" int4\n \"site_alignment\" varchar(100)\n \"event_status\" varchar(100)\n \"duration\" varchar(100)\n \"attendance\" varchar(100)\n \"last_update\" timestamp\n}\n\nTable \"events_documents\" {\n \"event_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"events_media\" {\n \"event_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"events_sub_seasons\" {\n \"event_id\" int4 [not null]\n \"sub_season_id\" int4 [not null]\n}\n\nTable \"ice_hockey_action_participants\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"point_credit\" int4\n}\n\nTable \"ice_hockey_action_plays\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"play_result\" varchar(100)\n \"comment\" varchar(255)\n}\n\nTable \"ice_hockey_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_power_play_allowed\" varchar(100)\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_power_play_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"penalty_killing_amount\" varchar(100)\n \"penalty_killing_percentage\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"takeaways\" varchar(100)\n \"shutouts\" varchar(100)\n \"minutes_penalty_killing\" varchar(100)\n \"hits\" varchar(100)\n \"goals_empty_net_allowed\" varchar(100)\n \"goals_short_handed_allowed\" varchar(100)\n \"goals_shootout_allowed\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n}\n\nTable \"ice_hockey_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"ice_hockey_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_power_play\" varchar(100)\n \"goals_short_handed\" varchar(100)\n \"goals_even_strength\" varchar(100)\n \"goals_empty_net\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_penalty_shot\" varchar(100)\n \"assists\" varchar(100)\n \"points\" varchar(100)\n \"power_play_amount\" varchar(100)\n \"power_play_percentage\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(100)\n \"shots_penalty_shot_percentage\" varchar(100)\n \"giveaways\" varchar(100)\n \"minutes_power_play\" varchar(100)\n \"faceoff_wins\" varchar(100)\n \"faceoff_losses\" varchar(100)\n \"faceoff_win_percentage\" varchar(100)\n \"scoring_chances\" varchar(100)\n}\n\nTable \"ice_hockey_player_stats\" {\n \"id\" int4 [not null, increment]\n \"plus_minus\" varchar(100)\n}\n\nTable \"injury_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"injury_status\" varchar(100)\n \"injury_type\" varchar(100)\n \"injury_comment\" varchar(100)\n \"disabled_list\" varchar(100)\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n \"season_id\" int4\n \"phase_type\" varchar(100)\n 
\"injury_side\" varchar(100)\n}\n\nTable \"key_aliases\" {\n \"id\" int4 [not null, increment]\n \"key_id\" int4 [not null]\n \"key_root_id\" int4 [not null]\n}\n\nTable \"key_roots\" {\n \"id\" int4 [not null, increment]\n \"key_type\" varchar(100)\n}\n\nTable \"latest_revisions\" {\n \"id\" int4 [not null, increment]\n \"revision_id\" varchar(75) [not null]\n \"latest_document_id\" int4 [not null]\n}\n\nTable \"locations\" {\n \"id\" int4 [not null, increment]\n \"timezone\" varchar(100)\n \"latitude\" varchar(100)\n \"longitude\" varchar(100)\n \"country_code\" varchar(100)\n}\n\nTable \"media\" {\n \"id\" int4 [not null, increment]\n \"object_id\" int4\n \"source_id\" int4\n \"revision_id\" int4\n \"media_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"date_time\" varchar(100)\n \"credit_id\" int4 [not null]\n \"db_loading_date_time\" timestamp\n \"creation_location_id\" int4 [not null]\n}\n\nTable \"media_captions\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"caption_type\" varchar(100)\n \"caption\" varchar(100)\n \"caption_author_id\" int4 [not null]\n \"language\" varchar(100)\n \"caption_size\" varchar(100)\n}\n\nTable \"media_contents\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"object\" varchar(100)\n \"format\" varchar(100)\n \"mime_type\" varchar(100)\n \"height\" varchar(100)\n \"width\" varchar(100)\n \"duration\" varchar(100)\n \"file_size\" varchar(100)\n \"resolution\" varchar(100)\n}\n\nTable \"media_keywords\" {\n \"id\" int4 [not null, increment]\n \"keyword\" varchar(100)\n \"media_id\" int4 [not null]\n}\n\nTable \"motor_racing_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"lap\" varchar(100)\n \"laps_remaining\" varchar(100)\n \"time_elapsed\" varchar(100)\n \"flag_state\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"motor_racing_qualifying_stats\" {\n \"id\" int4 [not null, increment]\n \"grid\" varchar(100)\n \"pole_position\" varchar(100)\n \"pole_wins\" varchar(100)\n \"qualifying_speed\" varchar(100)\n \"qualifying_speed_units\" varchar(100)\n \"qualifying_time\" varchar(100)\n \"qualifying_position\" varchar(100)\n}\n\nTable \"motor_racing_race_stats\" {\n \"id\" int4 [not null, increment]\n \"time_behind_leader\" varchar(100)\n \"laps_behind_leader\" varchar(100)\n \"time_ahead_follower\" varchar(100)\n \"laps_ahead_follower\" varchar(100)\n \"time\" varchar(100)\n \"points\" varchar(100)\n \"points_rookie\" varchar(100)\n \"bonus\" varchar(100)\n \"laps_completed\" varchar(100)\n \"laps_leading_total\" varchar(100)\n \"distance_leading\" varchar(100)\n \"distance_completed\" varchar(100)\n \"distance_units\" varchar(40)\n \"speed_average\" varchar(40)\n \"speed_units\" varchar(40)\n \"status\" varchar(40)\n \"finishes_top_5\" varchar(40)\n \"finishes_top_10\" varchar(40)\n \"starts\" varchar(40)\n \"finishes\" varchar(40)\n \"non_finishes\" varchar(40)\n \"wins\" varchar(40)\n \"races_leading\" varchar(40)\n \"money\" varchar(40)\n \"money_units\" varchar(40)\n \"leads_total\" varchar(40)\n}\n\nTable \"outcome_totals\" {\n \"id\" int4 [not null, increment]\n \"standing_subgroup_id\" int4 [not null]\n \"outcome_holder_type\" varchar(100)\n \"outcome_holder_id\" int4\n \"rank\" varchar(100)\n \"wins\" varchar(100)\n \"losses\" varchar(100)\n \"ties\" varchar(100)\n \"undecideds\" varchar(100)\n \"winning_percentage\" varchar(100)\n \"points_scored_for\" varchar(100)\n 
\"points_scored_against\" varchar(100)\n \"points_difference\" varchar(100)\n \"standing_points\" varchar(100)\n \"streak_type\" varchar(100)\n \"streak_duration\" varchar(100)\n \"streak_total\" varchar(100)\n \"streak_start\" date\n \"streak_end\" date\n}\n\nTable \"participants_events\" {\n \"id\" int4 [not null, increment]\n \"participant_type\" varchar(100) [not null]\n \"participant_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"alignment\" varchar(100)\n \"score\" varchar(100)\n \"event_outcome\" varchar(100)\n \"rank\" int4\n}\n\nTable \"periods\" {\n \"id\" int4 [not null, increment]\n \"participant_event_id\" int4 [not null]\n \"period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"person_event_metadata\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"status\" varchar(100)\n \"health\" varchar(100)\n \"weight\" varchar(100)\n \"role_id\" int4\n \"position_id\" int4\n \"team_id\" int4\n \"lineup_slot\" int4\n \"lineup_slot_sequence\" int4\n}\n\nTable \"person_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"membership_type\" varchar(40) [not null]\n \"membership_id\" int4 [not null]\n \"role_id\" int4\n \"role_status\" varchar(40)\n \"phase_status\" varchar(40)\n \"uniform_number\" varchar(20)\n \"regular_position_id\" int4\n \"regular_position_depth\" varchar(40)\n \"height\" varchar(100)\n \"weight\" varchar(100)\n \"start_date_time\" timestamp\n \"start_season_id\" int4\n \"end_date_time\" timestamp\n \"end_season_id\" int4\n \"entry_reason\" varchar(40)\n \"exit_reason\" varchar(40)\n \"selection_level\" int4\n \"selection_sublevel\" int4\n \"selection_overall\" int4\n}\n\nTable \"persons\" {\n \"id\" int4 [not null, increment]\n \"person_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"gender\" varchar(20)\n \"birth_date\" varchar(30)\n \"death_date\" varchar(30)\n \"birth_location_id\" int4\n \"hometown_location_id\" int4\n \"residence_location_id\" int4\n \"death_location_id\" int4\n}\n\nTable \"persons_documents\" {\n \"person_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"persons_media\" {\n \"person_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"positions\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"abbreviation\" varchar(100) [not null]\n}\n\nTable \"publishers\" {\n \"id\" int4 [not null, increment]\n \"publisher_key\" varchar(100) [not null]\n \"publisher_name\" varchar(100)\n}\n\nTable \"roles\" {\n \"id\" int4 [not null, increment]\n \"role_key\" varchar(100) [not null]\n \"role_name\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"seasons\" {\n \"id\" int4 [not null, increment]\n \"season_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"league_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"sites\" {\n \"id\" int4 [not null, increment]\n \"site_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"soccer_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"goals_against_total\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"catches_punches\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_shootout_total\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n 
\"shots_blocked\" varchar(100)\n \"shutouts\" varchar(100)\n}\n\nTable \"soccer_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"minutes_elapsed\" varchar(100)\n \"period_minute_elapsed\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"soccer_foul_stats\" {\n \"id\" int4 [not null, increment]\n \"fouls_suffered\" varchar(100)\n \"fouls_commited\" varchar(100)\n \"cautions_total\" varchar(100)\n \"cautions_pending\" varchar(100)\n \"caution_points_total\" varchar(100)\n \"caution_points_pending\" varchar(100)\n \"ejections_total\" varchar(100)\n}\n\nTable \"soccer_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_total\" varchar(100)\n \"assists_game_winning\" varchar(100)\n \"assists_game_tying\" varchar(100)\n \"assists_overtime\" varchar(100)\n \"assists_total\" varchar(100)\n \"points\" varchar(100)\n \"shots_total\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_hit_frame\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_scored\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(40)\n \"shots_penalty_shot_percentage\" varchar(40)\n \"shots_shootout_taken\" varchar(40)\n \"shots_shootout_scored\" varchar(40)\n \"shots_shootout_missed\" varchar(40)\n \"shots_shootout_percentage\" varchar(40)\n \"giveaways\" varchar(40)\n \"offsides\" varchar(40)\n \"corner_kicks\" varchar(40)\n \"hat_tricks\" varchar(40)\n}\n\nTable \"standing_subgroups\" {\n \"id\" int4 [not null, increment]\n \"standing_id\" int4 [not null]\n \"affiliation_id\" int4 [not null]\n}\n\nTable \"standings\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"standing_type\" varchar(100)\n \"sub_season_id\" int4 [not null]\n \"last_updated\" varchar(100)\n \"duration_scope\" varchar(100)\n \"competition_scope\" varchar(100)\n \"competition_scope_id\" varchar(100)\n \"alignment_scope\" varchar(100)\n \"site_scope\" varchar(100)\n \"scoping_label\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"source\" varchar(100)\n}\n\nTable \"stats\" {\n \"id\" int4 [not null, increment]\n \"stat_repository_type\" varchar(100)\n \"stat_repository_id\" int4 [not null]\n \"stat_holder_type\" varchar(100)\n \"stat_holder_id\" int4\n \"stat_coverage_type\" varchar(100)\n \"stat_coverage_id\" int4\n \"context\" varchar(40) [not null]\n}\n\nTable \"sub_periods\" {\n \"id\" int4 [not null, increment]\n \"period_id\" int4 [not null]\n \"sub_period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"sub_seasons\" {\n \"id\" int4 [not null, increment]\n \"sub_season_key\" varchar(100) [not null]\n \"season_id\" int4 [not null]\n \"sub_season_type\" varchar(100) [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"team_american_football_stats\" {\n \"id\" int4 [not null, increment]\n \"yards_per_attempt\" varchar(100)\n \"average_starting_position\" varchar(100)\n \"timeouts\" varchar(100)\n \"time_of_possession\" varchar(100)\n \"turnover_ratio\" varchar(100)\n}\n\nTable \"team_phases\" {\n \"id\" int4 [not null, increment]\n \"team_id\" int4 [not null]\n \"start_season_id\" int4\n \"end_season_id\" int4\n \"affiliation_id\" int4 [not null]\n \"start_date_time\" 
varchar(100)\n \"end_date_time\" varchar(100)\n \"phase_status\" varchar(40)\n \"role_id\" int4\n}\n\nTable \"teams\" {\n \"id\" int4 [not null, increment]\n \"team_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"home_site_id\" int4\n}\n\nTable \"teams_documents\" {\n \"team_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"teams_media\" {\n \"team_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"tennis_action_points\" {\n \"id\" int4 [not null, increment]\n \"sub_period_id\" varchar(100)\n \"sequence_number\" varchar(100)\n \"win_type\" varchar(100)\n}\n\nTable \"tennis_action_volleys\" {\n \"id\" int4 [not null, increment]\n \"sequence_number\" varchar(100)\n \"tennis_action_points_id\" int4\n \"landing_location\" varchar(100)\n \"swing_type\" varchar(100)\n \"result\" varchar(100)\n \"spin_type\" varchar(100)\n \"trajectory_details\" varchar(100)\n}\n\nTable \"tennis_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"tennis_set\" varchar(100)\n \"game\" varchar(100)\n \"server_person_id\" int4\n \"server_score\" varchar(100)\n \"receiver_person_id\" int4\n \"receiver_score\" varchar(100)\n \"service_number\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"tennis_return_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"first_service_return_points_won\" varchar(100)\n \"first_service_return_points_won_pct\" varchar(100)\n \"second_service_return_points_won\" varchar(100)\n \"second_service_return_points_won_pct\" varchar(100)\n \"return_games_played\" varchar(100)\n \"return_games_won\" varchar(100)\n \"return_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_converted\" varchar(100)\n \"break_points_converted_pct\" varchar(100)\n}\n\nTable \"tennis_service_stats\" {\n \"id\" int4 [not null, increment]\n \"services_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"aces\" varchar(100)\n \"first_services_good\" varchar(100)\n \"first_services_good_pct\" varchar(100)\n \"first_service_points_won\" varchar(100)\n \"first_service_points_won_pct\" varchar(100)\n \"second_service_points_won\" varchar(100)\n \"second_service_points_won_pct\" varchar(100)\n \"service_games_played\" varchar(100)\n \"service_games_won\" varchar(100)\n \"service_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_saved\" varchar(100)\n \"break_points_saved_pct\" varchar(100)\n}\n\nTable \"wagering_moneylines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_odds_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"numerator\" varchar(100)\n \"denominator\" varchar(100)\n \"prediction\" varchar(100)\n \"payout_calculation\" varchar(100)\n \"payout_amount\" varchar(100)\n}\n\nTable \"wagering_runlines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n 
\"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"line_value\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_straight_spread_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_value\" varchar(100)\n \"line_value_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_total_score_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_over\" varchar(100)\n \"line_under\" varchar(100)\n \"total\" varchar(100)\n \"total_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"weather_conditions\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"temperature\" varchar(100)\n \"temperature_units\" varchar(40)\n \"humidity\" varchar(100)\n \"clouds\" varchar(100)\n \"wind_direction\" varchar(100)\n \"wind_velocity\" varchar(100)\n \"weather_code\" varchar(100)\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/yugabyte/yugabyte-db/blob/master/sample/sportsdb_tables.sql"
}
}
================================================
FILE: tasks/postgres/standard/sports/participant_report_optimization/verify.py
================================================
"""
Verification script for PostgreSQL Sports Task 3: Query Performance Optimization
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.001 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.001:
return False
elif isinstance(actual, float) and isinstance(expected, float):
if abs(actual - expected) > 0.001:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE", "sports"),
"user": os.getenv("POSTGRES_USERNAME", "postgres"),
"password": os.getenv("POSTGRES_PASSWORD", "postgres")
}
def verify_report_data(conn) -> bool:
"""Verify the report table contains the expected data."""
with conn.cursor() as cur:
# Get actual results from the report table
cur.execute("""
SELECT participant_id, event_count, stat_count, stat_type_count, last_event_date
FROM participant_performance_report
ORDER BY participant_id
""")
actual_results = cur.fetchall()
if len(actual_results) == 0:
print("❌ Report table is empty")
return False
# Execute ground truth query
cur.execute("""
SELECT
pe.participant_id,
COUNT(pe.event_id) as event_count,
(SELECT COUNT(*) FROM stats s WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons') as stat_count,
(SELECT COUNT(DISTINCT s.stat_repository_type) FROM stats s WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons') as stat_type_count,
(SELECT MAX(e.start_date_time) FROM events e JOIN participants_events pe2 ON e.id = pe2.event_id WHERE pe2.participant_id = pe.participant_id) as last_event_date
FROM participants_events pe
WHERE pe.participant_id <= 50
GROUP BY pe.participant_id
ORDER BY pe.participant_id
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} report records, got {len(actual_results)}")
return False
mismatches = 0
for actual, expected in zip(actual_results, expected_results):
if not rows_match(actual, expected):
if mismatches < 5:
print(f"❌ Row mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches in report data: {mismatches}")
return False
print(f"✅ Report data is correct ({len(actual_results)} records)")
return True
def verify_performance_optimization(conn) -> bool:
"""Verify that key performance optimization indexes have been implemented."""
with conn.cursor() as cur:
print("\n🔍 Checking for critical performance indexes...")
# Check 1: participants_events.participant_id index (critical for subqueries)
cur.execute("""
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'participants_events'
AND indexdef LIKE '%participant_id%'
""")
participant_indexes = cur.fetchall()
has_participant_index = len(participant_indexes) > 0
# Check 2: stats table optimization (critical for subquery filtering)
cur.execute("""
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'stats'
AND indexdef LIKE '%stat_holder_type%'
AND indexdef LIKE '%stat_holder_id%'
""")
stats_indexes = cur.fetchall()
has_stats_index = len(stats_indexes) > 0
# Report findings
critical_indexes_found = 0
if has_participant_index:
print("✅ Found participant filtering index on participants_events.participant_id")
critical_indexes_found += 1
else:
print("❌ Missing critical index on participants_events.participant_id")
if has_stats_index:
print("✅ Found subquery optimization index on stats table")
critical_indexes_found += 1
else:
print("❌ Missing critical index on stats table")
# Must have both critical indexes for this subquery-heavy query
if critical_indexes_found >= 2:
print(f"\n✅ Performance optimization: PASS ({critical_indexes_found}/2 critical indexes found)")
return True
else:
print(f"\n❌ Performance optimization: FAIL ({critical_indexes_found}/2 critical indexes found)")
print(" Create these critical indexes:")
print(" - CREATE INDEX ON participants_events(participant_id);")
print(" - CREATE INDEX ON stats(stat_holder_type, stat_holder_id);")
return False
def main():
"""Main verification function."""
print("=" * 50)
print("Verifying Sports Task 3: Query Performance Optimization")
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all components
success = (
verify_report_data(conn) and
verify_performance_optimization(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/sports/team_roster_management/description.md
================================================
# Team Roster Management Operations
## Background
You need to manage team rosters for the upcoming season, including player transfers, injury tracking, and performance evaluations.
## Requirements
Complete the following 5 operations in order:
### 1. Set Up Player Performance Tracking
Create a table called `player_evaluation` with the following structure:
- performance_id (serial primary key)
- person_id (integer not null, references persons(id))
- batting_avg (decimal)
- home_runs (integer)
- rbis (integer)
- games_played (integer)
- performance_score (decimal)
- evaluation_date (date)
Add constraint: CHECK (batting_avg BETWEEN 0 AND 1)
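For orientation, one possible table definition matching this structure (a minimal sketch; any equivalent DDL works):
```sql
-- Sketch only: one possible DDL for step 1.
CREATE TABLE player_evaluation (
    performance_id    SERIAL PRIMARY KEY,
    person_id         INTEGER NOT NULL REFERENCES persons(id),
    batting_avg       DECIMAL,
    home_runs         INTEGER,
    rbis              INTEGER,
    games_played      INTEGER,
    performance_score DECIMAL,
    evaluation_date   DATE,
    CHECK (batting_avg BETWEEN 0 AND 1)
);
```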
### 2. Load Historical Player Statistics
Insert player performance data into `player_evaluation`:
- Select all players who have offensive statistics
- Calculate batting_avg as hits/at_bats (handle division by zero)
- Sum up home_runs, rbi from baseball_offensive_stats
- Count games_played from person_event_metadata
- Calculate performance_score as: (batting_avg * 1000) + (home_runs * 5) + (rbi * 2)
- Only include players with at least 10 games played
- Set evaluation_date to '2024-01-01'
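A minimal sketch of one way to write this load; the column sources follow the requirements above, and your exact query may differ:
```sql
-- Sketch only: one way to populate player_evaluation for step 2.
-- Offensive totals come from stats -> baseball_offensive_stats, game counts
-- from person_event_metadata; players with fewer than 10 games are excluded.
INSERT INTO player_evaluation
    (person_id, batting_avg, home_runs, rbis, games_played, performance_score, evaluation_date)
WITH offensive AS (
    SELECT s.stat_holder_id AS person_id,
           CASE WHEN SUM(bos.at_bats) > 0
                THEN 1.0 * SUM(bos.hits) / SUM(bos.at_bats)
                ELSE 0 END AS batting_avg,
           SUM(bos.home_runs) AS home_runs,
           SUM(bos.rbi)       AS rbis
    FROM stats s
    JOIN baseball_offensive_stats bos ON s.stat_repository_id = bos.id
    WHERE s.stat_holder_type = 'persons'
      AND s.stat_repository_type = 'baseball_offensive_stats'
    GROUP BY s.stat_holder_id
),
games AS (
    SELECT person_id, COUNT(DISTINCT event_id) AS games_played
    FROM person_event_metadata
    GROUP BY person_id
)
SELECT o.person_id,
       o.batting_avg,
       o.home_runs,
       o.rbis,
       COALESCE(g.games_played, 0),
       (o.batting_avg * 1000) + (COALESCE(o.home_runs, 0) * 5) + (COALESCE(o.rbis, 0) * 2),
       DATE '2024-01-01'
FROM offensive o
LEFT JOIN games g ON g.person_id = o.person_id
WHERE COALESCE(g.games_played, 0) >= 10;
```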
### 3. Track Player Health Status
Create a table called `player_injury_status`:
- status_id (serial primary key)
- person_id (integer unique not null)
- injury_count (integer default 0)
- last_injury_date (date)
- current_status (varchar check in ('healthy', 'injured', 'recovering'))
Insert data by:
- Include all players from player_evaluation
- Count injuries from injury_phases for each player
- Get the most recent injury start_date as last_injury_date
- Set current_status: 'injured' if injury has no end_date, otherwise 'healthy'
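One possible shape for this step, sketched as a CREATE TABLE plus an INSERT ... SELECT (a sketch only, not the required formulation):
```sql
-- Sketch only: one possible DDL and load for step 3.
CREATE TABLE player_injury_status (
    status_id        SERIAL PRIMARY KEY,
    person_id        INTEGER UNIQUE NOT NULL,
    injury_count     INTEGER DEFAULT 0,
    last_injury_date DATE,
    current_status   VARCHAR CHECK (current_status IN ('healthy', 'injured', 'recovering'))
);

INSERT INTO player_injury_status (person_id, injury_count, last_injury_date, current_status)
SELECT pe.person_id,
       COALESCE(ic.injury_count, 0),
       ic.last_injury_date,
       CASE WHEN COALESCE(ic.has_active_injury, 0) = 1 THEN 'injured' ELSE 'healthy' END
FROM (SELECT DISTINCT person_id FROM player_evaluation) pe
LEFT JOIN (
    SELECT person_id,
           COUNT(*) AS injury_count,
           MAX(start_date_time::date) AS last_injury_date,
           MAX(CASE WHEN end_date_time IS NULL THEN 1 ELSE 0 END) AS has_active_injury
    FROM injury_phases
    GROUP BY person_id
) ic ON ic.person_id = pe.person_id;
```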
### 4. Adjust Scores Based on Health
Update `player_evaluation` to reduce performance scores for injured players:
- Reduce performance_score by 20% for players with current_status = 'injured'
- Reduce performance_score by 10% for players with injury_count > 2 (both reductions apply cumulatively when a player meets both conditions)
- Set minimum performance_score to 0 (no negative scores)
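A short sketch of one way to apply these adjustments in a single statement (assumes both reductions compound for players matching both conditions):
```sql
-- Sketch only: step 4 update; both reductions compound, floored at 0.
UPDATE player_evaluation pe
SET performance_score = GREATEST(
        pe.performance_score
            * CASE WHEN pis.current_status = 'injured' THEN 0.8 ELSE 1 END
            * CASE WHEN pis.injury_count > 2 THEN 0.9 ELSE 1 END,
        0)
FROM player_injury_status pis
WHERE pis.person_id = pe.person_id;
```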
### 5. Generate Performance Summary Report
Create a summary table called `team_performance_summary`:
- summary_id (serial primary key)
- metric_name (varchar unique)
- metric_value (decimal)
Insert the following metrics:
- 'total_players' - count of players in player_evaluation
- 'avg_batting_average' - average batting_avg
- 'total_home_runs' - sum of all home_runs
- 'avg_performance_score' - average performance_score
- 'injured_player_count' - count of injured players
- 'healthy_player_count' - count of healthy players
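One possible way to build and populate the summary (a sketch; any statements producing the same six metrics are fine):
```sql
-- Sketch only: one possible DDL and load for step 5.
CREATE TABLE team_performance_summary (
    summary_id   SERIAL PRIMARY KEY,
    metric_name  VARCHAR UNIQUE,
    metric_value DECIMAL
);

INSERT INTO team_performance_summary (metric_name, metric_value)
SELECT 'total_players',         COUNT(*)               FROM player_evaluation
UNION ALL
SELECT 'avg_batting_average',   AVG(batting_avg)       FROM player_evaluation
UNION ALL
SELECT 'total_home_runs',       SUM(home_runs)         FROM player_evaluation
UNION ALL
SELECT 'avg_performance_score', AVG(performance_score) FROM player_evaluation
UNION ALL
SELECT 'injured_player_count',
       SUM(CASE WHEN current_status = 'injured' THEN 1 ELSE 0 END) FROM player_injury_status
UNION ALL
SELECT 'healthy_player_count',
       SUM(CASE WHEN current_status = 'healthy' THEN 1 ELSE 0 END) FROM player_injury_status;
```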
## Important Notes
- Handle NULL values appropriately (treat as 0 where needed)
- Ensure foreign key constraints are properly set
- Do NOT use ROUND functions in calculations
- Use COALESCE to handle NULL values in calculations
================================================
FILE: tasks/postgres/standard/sports/team_roster_management/meta.json
================================================
{
"task_id": "team_roster_management",
"task_name": "Team Roster Management",
"category_id": "sports",
"category_name": "Sports",
"description": "Manage team rosters with player transfers, injury tracking, performance evaluations, and health status adjustments.",
"author": "Lingxiao Du",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"schema design",
"data migration",
"statistical aggregation"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"addresses\" {\n \"id\" int4 [not null, increment]\n \"location_id\" int4 [not null]\n \"language\" varchar(100)\n \"suite\" varchar(100)\n \"floor\" varchar(100)\n \"building\" varchar(100)\n \"street_number\" varchar(100)\n \"street_prefix\" varchar(100)\n \"street\" varchar(100)\n \"street_suffix\" varchar(100)\n \"neighborhood\" varchar(100)\n \"district\" varchar(100)\n \"locality\" varchar(100)\n \"county\" varchar(100)\n \"region\" varchar(100)\n \"postal_code\" varchar(100)\n \"country\" varchar(100)\n}\n\nTable \"affiliation_phases\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"ancestor_affiliation_id\" int4\n \"start_season_id\" int4\n \"start_date_time\" timestamp\n \"end_season_id\" int4\n \"end_date_time\" timestamp\n}\n\nTable \"affiliations\" {\n \"id\" int4 [not null, increment]\n \"affiliation_key\" varchar(100) [not null]\n \"affiliation_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n}\n\nTable \"affiliations_documents\" {\n \"affiliation_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"affiliations_events\" {\n \"affiliation_id\" int4 [not null]\n \"event_id\" int4 [not null]\n}\n\nTable \"affiliations_media\" {\n \"affiliation_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"american_football_action_participants\" {\n \"id\" int4 [not null, increment]\n \"american_football_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"score_type\" varchar(100)\n \"field_line\" int4\n \"yardage\" int4\n \"score_credit\" int4\n \"yards_gained\" int4\n}\n\nTable \"american_football_action_plays\" {\n \"id\" int4 [not null, increment]\n \"american_football_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"drive_result\" varchar(100)\n \"points\" int4\n \"comment\" varchar(255)\n}\n\nTable \"american_football_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"tackles_total\" varchar(100)\n \"tackles_solo\" varchar(100)\n \"tackles_assists\" varchar(100)\n \"interceptions_total\" varchar(100)\n \"interceptions_yards\" varchar(100)\n \"interceptions_average\" varchar(100)\n \"interceptions_longest\" varchar(100)\n \"interceptions_touchdown\" varchar(100)\n \"quarterback_hurries\" varchar(100)\n \"sacks_total\" varchar(100)\n \"sacks_yards\" varchar(100)\n \"passes_defensed\" varchar(100)\n}\n\nTable \"american_football_down_progress_stats\" {\n \"id\" int4 [not null, increment]\n \"first_downs_total\" varchar(100)\n \"first_downs_pass\" varchar(100)\n \"first_downs_run\" varchar(100)\n \"first_downs_penalty\" varchar(100)\n \"conversions_third_down\" varchar(100)\n \"conversions_third_down_attempts\" varchar(100)\n \"conversions_third_down_percentage\" varchar(100)\n \"conversions_fourth_down\" varchar(100)\n \"conversions_fourth_down_attempts\" varchar(100)\n \"conversions_fourth_down_percentage\" varchar(100)\n}\n\nTable \"american_football_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"period_value\" int4\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"clock_state\" varchar(100)\n \"down\" int4\n \"team_in_possession_id\" int4\n \"distance_for_1st_down\" int4\n \"field_side\" varchar(100)\n \"field_line\" int4\n \"context\" varchar(40)\n}\n\nTable \"american_football_fumbles_stats\" {\n \"id\" int4 [not null, increment]\n 
\"fumbles_committed\" varchar(100)\n \"fumbles_forced\" varchar(100)\n \"fumbles_recovered\" varchar(100)\n \"fumbles_lost\" varchar(100)\n \"fumbles_yards_gained\" varchar(100)\n \"fumbles_own_committed\" varchar(100)\n \"fumbles_own_recovered\" varchar(100)\n \"fumbles_own_lost\" varchar(100)\n \"fumbles_own_yards_gained\" varchar(100)\n \"fumbles_opposing_committed\" varchar(100)\n \"fumbles_opposing_recovered\" varchar(100)\n \"fumbles_opposing_lost\" varchar(100)\n \"fumbles_opposing_yards_gained\" varchar(100)\n}\n\nTable \"american_football_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"offensive_plays_yards\" varchar(100)\n \"offensive_plays_number\" varchar(100)\n \"offensive_plays_average_yards_per\" varchar(100)\n \"possession_duration\" varchar(100)\n \"turnovers_giveaway\" varchar(100)\n}\n\nTable \"american_football_passing_stats\" {\n \"id\" int4 [not null, increment]\n \"passes_attempts\" varchar(100)\n \"passes_completions\" varchar(100)\n \"passes_percentage\" varchar(100)\n \"passes_yards_gross\" varchar(100)\n \"passes_yards_net\" varchar(100)\n \"passes_yards_lost\" varchar(100)\n \"passes_touchdowns\" varchar(100)\n \"passes_touchdowns_percentage\" varchar(100)\n \"passes_interceptions\" varchar(100)\n \"passes_interceptions_percentage\" varchar(100)\n \"passes_longest\" varchar(100)\n \"passes_average_yards_per\" varchar(100)\n \"passer_rating\" varchar(100)\n \"receptions_total\" varchar(100)\n \"receptions_yards\" varchar(100)\n \"receptions_touchdowns\" varchar(100)\n \"receptions_first_down\" varchar(100)\n \"receptions_longest\" varchar(100)\n \"receptions_average_yards_per\" varchar(100)\n}\n\nTable \"american_football_penalties_stats\" {\n \"id\" int4 [not null, increment]\n \"penalties_total\" varchar(100)\n \"penalty_yards\" varchar(100)\n \"penalty_first_downs\" varchar(100)\n}\n\nTable \"american_football_rushing_stats\" {\n \"id\" int4 [not null, increment]\n \"rushes_attempts\" varchar(100)\n \"rushes_yards\" varchar(100)\n \"rushes_touchdowns\" varchar(100)\n \"rushing_average_yards_per\" varchar(100)\n \"rushes_first_down\" varchar(100)\n \"rushes_longest\" varchar(100)\n}\n\nTable \"american_football_sacks_against_stats\" {\n \"id\" int4 [not null, increment]\n \"sacks_against_yards\" varchar(100)\n \"sacks_against_total\" varchar(100)\n}\n\nTable \"american_football_scoring_stats\" {\n \"id\" int4 [not null, increment]\n \"touchdowns_total\" varchar(100)\n \"touchdowns_passing\" varchar(100)\n \"touchdowns_rushing\" varchar(100)\n \"touchdowns_special_teams\" varchar(100)\n \"touchdowns_defensive\" varchar(100)\n \"extra_points_attempts\" varchar(100)\n \"extra_points_made\" varchar(100)\n \"extra_points_missed\" varchar(100)\n \"extra_points_blocked\" varchar(100)\n \"field_goal_attempts\" varchar(100)\n \"field_goals_made\" varchar(100)\n \"field_goals_missed\" varchar(100)\n \"field_goals_blocked\" varchar(100)\n \"safeties_against\" varchar(100)\n \"two_point_conversions_attempts\" varchar(100)\n \"two_point_conversions_made\" varchar(100)\n \"touchbacks_total\" varchar(100)\n}\n\nTable \"american_football_special_teams_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_punt_total\" varchar(100)\n \"returns_punt_yards\" varchar(100)\n \"returns_punt_average\" varchar(100)\n \"returns_punt_longest\" varchar(100)\n \"returns_punt_touchdown\" varchar(100)\n \"returns_kickoff_total\" varchar(100)\n \"returns_kickoff_yards\" varchar(100)\n \"returns_kickoff_average\" varchar(100)\n \"returns_kickoff_longest\" varchar(100)\n 
\"returns_kickoff_touchdown\" varchar(100)\n \"returns_total\" varchar(100)\n \"returns_yards\" varchar(100)\n \"punts_total\" varchar(100)\n \"punts_yards_gross\" varchar(100)\n \"punts_yards_net\" varchar(100)\n \"punts_longest\" varchar(100)\n \"punts_inside_20\" varchar(100)\n \"punts_inside_20_percentage\" varchar(100)\n \"punts_average\" varchar(100)\n \"punts_blocked\" varchar(100)\n \"touchbacks_total\" varchar(100)\n \"touchbacks_total_percentage\" varchar(100)\n \"touchbacks_kickoffs\" varchar(100)\n \"touchbacks_kickoffs_percentage\" varchar(100)\n \"touchbacks_punts\" varchar(100)\n \"touchbacks_punts_percentage\" varchar(100)\n \"touchbacks_interceptions\" varchar(100)\n \"touchbacks_interceptions_percentage\" varchar(100)\n \"fair_catches\" varchar(100)\n}\n\nTable \"baseball_action_contact_details\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_pitch_id\" int4 [not null]\n \"location\" varchar(100)\n \"strength\" varchar(100)\n \"velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n}\n\nTable \"baseball_action_pitches\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_play_id\" int4 [not null]\n \"sequence_number\" int4\n \"baseball_defensive_group_id\" int4\n \"umpire_call\" varchar(100)\n \"pitch_location\" varchar(100)\n \"pitch_type\" varchar(100)\n \"pitch_velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n \"ball_type\" varchar(40)\n \"strike_type\" varchar(40)\n}\n\nTable \"baseball_action_plays\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"notation\" varchar(100)\n \"notation_yaml\" text\n \"baseball_defensive_group_id\" int4\n \"comment\" varchar(255)\n \"runner_on_first_advance\" int4\n \"runner_on_second_advance\" int4\n \"runner_on_third_advance\" int4\n \"outs_recorded\" int4\n \"rbi\" int4\n \"runs_scored\" int4\n \"earned_runs_scored\" varchar(100)\n}\n\nTable \"baseball_action_substitutions\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"sequence_number\" int4\n \"person_type\" varchar(100)\n \"person_original_id\" int4\n \"person_original_position_id\" int4\n \"person_original_lineup_slot\" int4\n \"person_replacing_id\" int4\n \"person_replacing_position_id\" int4\n \"person_replacing_lineup_slot\" int4\n \"substitution_reason\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"baseball_defensive_group\" {\n \"id\" int4 [not null, increment]\n}\n\nTable \"baseball_defensive_players\" {\n \"id\" int4 [not null, increment]\n \"baseball_defensive_group_id\" int4 [not null]\n \"player_id\" int4 [not null]\n \"position_id\" int4 [not null]\n}\n\nTable \"baseball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"double_plays\" int4\n \"triple_plays\" int4\n \"putouts\" int4\n \"assists\" int4\n \"errors\" int4\n \"fielding_percentage\" numeric\n \"defensive_average\" numeric\n \"errors_passed_ball\" int4\n \"errors_catchers_interference\" int4\n}\n\nTable \"baseball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"at_bat_number\" int4\n \"inning_value\" int4\n \"inning_half\" varchar(100)\n \"outs\" int4\n \"balls\" int4\n \"strikes\" int4\n \"runner_on_first_id\" int4\n \"runner_on_second_id\" int4\n \"runner_on_third_id\" int4\n \"runner_on_first\" int2\n \"runner_on_second\" int2\n 
\"runner_on_third\" int2\n \"runs_this_inning_half\" int4\n \"pitcher_id\" int4\n \"batter_id\" int4\n \"batter_side\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"baseball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"average\" numeric\n \"runs_scored\" int4\n \"at_bats\" int4\n \"hits\" int4\n \"rbi\" int4\n \"total_bases\" int4\n \"slugging_percentage\" numeric\n \"bases_on_balls\" int4\n \"strikeouts\" int4\n \"left_on_base\" int4\n \"left_in_scoring_position\" int4\n \"singles\" int4\n \"doubles\" int4\n \"triples\" int4\n \"home_runs\" int4\n \"grand_slams\" int4\n \"at_bats_per_rbi\" numeric\n \"plate_appearances_per_rbi\" numeric\n \"at_bats_per_home_run\" numeric\n \"plate_appearances_per_home_run\" numeric\n \"sac_flies\" int4\n \"sac_bunts\" int4\n \"grounded_into_double_play\" int4\n \"moved_up\" int4\n \"on_base_percentage\" numeric\n \"stolen_bases\" int4\n \"stolen_bases_caught\" int4\n \"stolen_bases_average\" numeric\n \"hit_by_pitch\" int4\n \"defensive_interferance_reaches\" int4\n \"on_base_plus_slugging\" numeric\n \"plate_appearances\" int4\n \"hits_extra_base\" int4\n}\n\nTable \"baseball_pitching_stats\" {\n \"id\" int4 [not null, increment]\n \"runs_allowed\" int4\n \"singles_allowed\" int4\n \"doubles_allowed\" int4\n \"triples_allowed\" int4\n \"home_runs_allowed\" int4\n \"innings_pitched\" varchar(20)\n \"hits\" int4\n \"earned_runs\" int4\n \"unearned_runs\" int4\n \"bases_on_balls\" int4\n \"bases_on_balls_intentional\" int4\n \"strikeouts\" int4\n \"strikeout_to_bb_ratio\" numeric\n \"number_of_pitches\" int4\n \"era\" numeric\n \"inherited_runners_scored\" int4\n \"pick_offs\" int4\n \"errors_hit_with_pitch\" int4\n \"errors_wild_pitch\" int4\n \"balks\" int4\n \"wins\" int4\n \"losses\" int4\n \"saves\" int4\n \"shutouts\" int4\n \"games_complete\" int4\n \"games_finished\" int4\n \"winning_percentage\" numeric\n \"event_credit\" varchar(40)\n \"save_credit\" varchar(40)\n}\n\nTable \"basketball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"steals_total\" varchar(100)\n \"steals_per_game\" varchar(100)\n \"blocks_total\" varchar(100)\n \"blocks_per_game\" varchar(100)\n}\n\nTable \"basketball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"basketball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"field_goals_made\" int4\n \"field_goals_attempted\" int4\n \"field_goals_percentage\" varchar(100)\n \"field_goals_per_game\" varchar(100)\n \"field_goals_attempted_per_game\" varchar(100)\n \"field_goals_percentage_adjusted\" varchar(100)\n \"three_pointers_made\" int4\n \"three_pointers_attempted\" int4\n \"three_pointers_percentage\" varchar(100)\n \"three_pointers_per_game\" varchar(100)\n \"three_pointers_attempted_per_game\" varchar(100)\n \"free_throws_made\" varchar(100)\n \"free_throws_attempted\" varchar(100)\n \"free_throws_percentage\" varchar(100)\n \"free_throws_per_game\" varchar(100)\n \"free_throws_attempted_per_game\" varchar(100)\n \"points_scored_total\" varchar(100)\n \"points_scored_per_game\" varchar(100)\n \"assists_total\" varchar(100)\n \"assists_per_game\" varchar(100)\n \"turnovers_total\" varchar(100)\n \"turnovers_per_game\" varchar(100)\n \"points_scored_off_turnovers\" varchar(100)\n \"points_scored_in_paint\" varchar(100)\n 
\"points_scored_on_second_chance\" varchar(100)\n \"points_scored_on_fast_break\" varchar(100)\n}\n\nTable \"basketball_rebounding_stats\" {\n \"id\" int4 [not null, increment]\n \"rebounds_total\" varchar(100)\n \"rebounds_per_game\" varchar(100)\n \"rebounds_defensive\" varchar(100)\n \"rebounds_offensive\" varchar(100)\n \"team_rebounds_total\" varchar(100)\n \"team_rebounds_per_game\" varchar(100)\n \"team_rebounds_defensive\" varchar(100)\n \"team_rebounds_offensive\" varchar(100)\n}\n\nTable \"basketball_team_stats\" {\n \"id\" int4 [not null, increment]\n \"timeouts_left\" varchar(100)\n \"largest_lead\" varchar(100)\n \"fouls_total\" varchar(100)\n \"turnover_margin\" varchar(100)\n}\n\nTable \"bookmakers\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"core_person_stats\" {\n \"id\" int4 [not null, increment]\n \"time_played_event\" varchar(40)\n \"time_played_total\" varchar(40)\n \"time_played_event_average\" varchar(40)\n \"events_played\" int4\n \"events_started\" int4\n \"position_id\" int4\n}\n\nTable \"core_stats\" {\n \"id\" int4 [not null, increment]\n \"score\" varchar(100)\n \"score_opposing\" varchar(100)\n \"score_attempts\" varchar(100)\n \"score_attempts_opposing\" varchar(100)\n \"score_percentage\" varchar(100)\n \"score_percentage_opposing\" varchar(100)\n}\n\nTable \"db_info\" {\n \"version\" varchar(100) [not null, default: 16]\n}\n\nTable \"display_names\" {\n \"id\" int4 [not null, increment]\n \"language\" varchar(100) [not null]\n \"entity_type\" varchar(100) [not null]\n \"entity_id\" int4 [not null]\n \"full_name\" varchar(100)\n \"first_name\" varchar(100)\n \"middle_name\" varchar(100)\n \"last_name\" varchar(100)\n \"alias\" varchar(100)\n \"abbreviation\" varchar(100)\n \"short_name\" varchar(100)\n \"prefix\" varchar(20)\n \"suffix\" varchar(20)\n}\n\nTable \"document_classes\" {\n \"id\" int4 [not null, increment]\n \"name\" varchar(100)\n}\n\nTable \"document_contents\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"sportsml\" varchar(200)\n \"abstract\" text\n}\n\nTable \"document_fixtures\" {\n \"id\" int4 [not null, increment]\n \"fixture_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"name\" varchar(100)\n \"document_class_id\" int4 [not null]\n}\n\nTable \"document_fixtures_events\" {\n \"id\" int4 [not null, increment]\n \"document_fixture_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"latest_document_id\" int4 [not null]\n \"last_update\" timestamp\n}\n\nTable \"document_package_entry\" {\n \"id\" int4 [not null, increment]\n \"document_package_id\" int4 [not null]\n \"rank\" varchar(100)\n \"document_id\" int4 [not null]\n \"headline\" varchar(100)\n \"short_headline\" varchar(100)\n}\n\nTable \"document_packages\" {\n \"id\" int4 [not null, increment]\n \"package_key\" varchar(100)\n \"package_name\" varchar(100)\n \"date_time\" date\n}\n\nTable \"documents\" {\n \"id\" int4 [not null, increment]\n \"doc_id\" varchar(75) [not null]\n \"publisher_id\" int4 [not null]\n \"date_time\" timestamp\n \"title\" varchar(255)\n \"language\" varchar(100)\n \"priority\" varchar(100)\n \"revision_id\" varchar(75)\n \"stats_coverage\" varchar(100)\n \"document_fixture_id\" int4 [not null]\n \"source_id\" int4\n \"db_loading_date_time\" timestamp\n}\n\nTable \"documents_media\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"media_id\" int4 [not null]\n \"media_caption_id\" int4 [not 
null]\n}\n\nTable \"events\" {\n \"id\" int4 [not null, increment]\n \"event_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"site_id\" int4\n \"site_alignment\" varchar(100)\n \"event_status\" varchar(100)\n \"duration\" varchar(100)\n \"attendance\" varchar(100)\n \"last_update\" timestamp\n}\n\nTable \"events_documents\" {\n \"event_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"events_media\" {\n \"event_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"events_sub_seasons\" {\n \"event_id\" int4 [not null]\n \"sub_season_id\" int4 [not null]\n}\n\nTable \"ice_hockey_action_participants\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"point_credit\" int4\n}\n\nTable \"ice_hockey_action_plays\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"play_result\" varchar(100)\n \"comment\" varchar(255)\n}\n\nTable \"ice_hockey_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_power_play_allowed\" varchar(100)\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_power_play_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"penalty_killing_amount\" varchar(100)\n \"penalty_killing_percentage\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"takeaways\" varchar(100)\n \"shutouts\" varchar(100)\n \"minutes_penalty_killing\" varchar(100)\n \"hits\" varchar(100)\n \"goals_empty_net_allowed\" varchar(100)\n \"goals_short_handed_allowed\" varchar(100)\n \"goals_shootout_allowed\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n}\n\nTable \"ice_hockey_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"ice_hockey_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_power_play\" varchar(100)\n \"goals_short_handed\" varchar(100)\n \"goals_even_strength\" varchar(100)\n \"goals_empty_net\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_penalty_shot\" varchar(100)\n \"assists\" varchar(100)\n \"points\" varchar(100)\n \"power_play_amount\" varchar(100)\n \"power_play_percentage\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(100)\n \"shots_penalty_shot_percentage\" varchar(100)\n \"giveaways\" varchar(100)\n \"minutes_power_play\" varchar(100)\n \"faceoff_wins\" varchar(100)\n \"faceoff_losses\" varchar(100)\n \"faceoff_win_percentage\" varchar(100)\n \"scoring_chances\" varchar(100)\n}\n\nTable \"ice_hockey_player_stats\" {\n \"id\" int4 [not null, increment]\n \"plus_minus\" varchar(100)\n}\n\nTable \"injury_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"injury_status\" varchar(100)\n \"injury_type\" varchar(100)\n \"injury_comment\" varchar(100)\n \"disabled_list\" varchar(100)\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n \"season_id\" int4\n \"phase_type\" varchar(100)\n 
\"injury_side\" varchar(100)\n}\n\nTable \"key_aliases\" {\n \"id\" int4 [not null, increment]\n \"key_id\" int4 [not null]\n \"key_root_id\" int4 [not null]\n}\n\nTable \"key_roots\" {\n \"id\" int4 [not null, increment]\n \"key_type\" varchar(100)\n}\n\nTable \"latest_revisions\" {\n \"id\" int4 [not null, increment]\n \"revision_id\" varchar(75) [not null]\n \"latest_document_id\" int4 [not null]\n}\n\nTable \"locations\" {\n \"id\" int4 [not null, increment]\n \"timezone\" varchar(100)\n \"latitude\" varchar(100)\n \"longitude\" varchar(100)\n \"country_code\" varchar(100)\n}\n\nTable \"media\" {\n \"id\" int4 [not null, increment]\n \"object_id\" int4\n \"source_id\" int4\n \"revision_id\" int4\n \"media_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"date_time\" varchar(100)\n \"credit_id\" int4 [not null]\n \"db_loading_date_time\" timestamp\n \"creation_location_id\" int4 [not null]\n}\n\nTable \"media_captions\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"caption_type\" varchar(100)\n \"caption\" varchar(100)\n \"caption_author_id\" int4 [not null]\n \"language\" varchar(100)\n \"caption_size\" varchar(100)\n}\n\nTable \"media_contents\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"object\" varchar(100)\n \"format\" varchar(100)\n \"mime_type\" varchar(100)\n \"height\" varchar(100)\n \"width\" varchar(100)\n \"duration\" varchar(100)\n \"file_size\" varchar(100)\n \"resolution\" varchar(100)\n}\n\nTable \"media_keywords\" {\n \"id\" int4 [not null, increment]\n \"keyword\" varchar(100)\n \"media_id\" int4 [not null]\n}\n\nTable \"motor_racing_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"lap\" varchar(100)\n \"laps_remaining\" varchar(100)\n \"time_elapsed\" varchar(100)\n \"flag_state\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"motor_racing_qualifying_stats\" {\n \"id\" int4 [not null, increment]\n \"grid\" varchar(100)\n \"pole_position\" varchar(100)\n \"pole_wins\" varchar(100)\n \"qualifying_speed\" varchar(100)\n \"qualifying_speed_units\" varchar(100)\n \"qualifying_time\" varchar(100)\n \"qualifying_position\" varchar(100)\n}\n\nTable \"motor_racing_race_stats\" {\n \"id\" int4 [not null, increment]\n \"time_behind_leader\" varchar(100)\n \"laps_behind_leader\" varchar(100)\n \"time_ahead_follower\" varchar(100)\n \"laps_ahead_follower\" varchar(100)\n \"time\" varchar(100)\n \"points\" varchar(100)\n \"points_rookie\" varchar(100)\n \"bonus\" varchar(100)\n \"laps_completed\" varchar(100)\n \"laps_leading_total\" varchar(100)\n \"distance_leading\" varchar(100)\n \"distance_completed\" varchar(100)\n \"distance_units\" varchar(40)\n \"speed_average\" varchar(40)\n \"speed_units\" varchar(40)\n \"status\" varchar(40)\n \"finishes_top_5\" varchar(40)\n \"finishes_top_10\" varchar(40)\n \"starts\" varchar(40)\n \"finishes\" varchar(40)\n \"non_finishes\" varchar(40)\n \"wins\" varchar(40)\n \"races_leading\" varchar(40)\n \"money\" varchar(40)\n \"money_units\" varchar(40)\n \"leads_total\" varchar(40)\n}\n\nTable \"outcome_totals\" {\n \"id\" int4 [not null, increment]\n \"standing_subgroup_id\" int4 [not null]\n \"outcome_holder_type\" varchar(100)\n \"outcome_holder_id\" int4\n \"rank\" varchar(100)\n \"wins\" varchar(100)\n \"losses\" varchar(100)\n \"ties\" varchar(100)\n \"undecideds\" varchar(100)\n \"winning_percentage\" varchar(100)\n \"points_scored_for\" varchar(100)\n 
\"points_scored_against\" varchar(100)\n \"points_difference\" varchar(100)\n \"standing_points\" varchar(100)\n \"streak_type\" varchar(100)\n \"streak_duration\" varchar(100)\n \"streak_total\" varchar(100)\n \"streak_start\" date\n \"streak_end\" date\n}\n\nTable \"participants_events\" {\n \"id\" int4 [not null, increment]\n \"participant_type\" varchar(100) [not null]\n \"participant_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"alignment\" varchar(100)\n \"score\" varchar(100)\n \"event_outcome\" varchar(100)\n \"rank\" int4\n}\n\nTable \"periods\" {\n \"id\" int4 [not null, increment]\n \"participant_event_id\" int4 [not null]\n \"period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"person_event_metadata\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"status\" varchar(100)\n \"health\" varchar(100)\n \"weight\" varchar(100)\n \"role_id\" int4\n \"position_id\" int4\n \"team_id\" int4\n \"lineup_slot\" int4\n \"lineup_slot_sequence\" int4\n}\n\nTable \"person_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"membership_type\" varchar(40) [not null]\n \"membership_id\" int4 [not null]\n \"role_id\" int4\n \"role_status\" varchar(40)\n \"phase_status\" varchar(40)\n \"uniform_number\" varchar(20)\n \"regular_position_id\" int4\n \"regular_position_depth\" varchar(40)\n \"height\" varchar(100)\n \"weight\" varchar(100)\n \"start_date_time\" timestamp\n \"start_season_id\" int4\n \"end_date_time\" timestamp\n \"end_season_id\" int4\n \"entry_reason\" varchar(40)\n \"exit_reason\" varchar(40)\n \"selection_level\" int4\n \"selection_sublevel\" int4\n \"selection_overall\" int4\n}\n\nTable \"persons\" {\n \"id\" int4 [not null, increment]\n \"person_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"gender\" varchar(20)\n \"birth_date\" varchar(30)\n \"death_date\" varchar(30)\n \"birth_location_id\" int4\n \"hometown_location_id\" int4\n \"residence_location_id\" int4\n \"death_location_id\" int4\n}\n\nTable \"persons_documents\" {\n \"person_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"persons_media\" {\n \"person_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"positions\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"abbreviation\" varchar(100) [not null]\n}\n\nTable \"publishers\" {\n \"id\" int4 [not null, increment]\n \"publisher_key\" varchar(100) [not null]\n \"publisher_name\" varchar(100)\n}\n\nTable \"roles\" {\n \"id\" int4 [not null, increment]\n \"role_key\" varchar(100) [not null]\n \"role_name\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"seasons\" {\n \"id\" int4 [not null, increment]\n \"season_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"league_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"sites\" {\n \"id\" int4 [not null, increment]\n \"site_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"soccer_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"goals_against_total\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"catches_punches\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_shootout_total\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n 
\"shots_blocked\" varchar(100)\n \"shutouts\" varchar(100)\n}\n\nTable \"soccer_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"minutes_elapsed\" varchar(100)\n \"period_minute_elapsed\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"soccer_foul_stats\" {\n \"id\" int4 [not null, increment]\n \"fouls_suffered\" varchar(100)\n \"fouls_commited\" varchar(100)\n \"cautions_total\" varchar(100)\n \"cautions_pending\" varchar(100)\n \"caution_points_total\" varchar(100)\n \"caution_points_pending\" varchar(100)\n \"ejections_total\" varchar(100)\n}\n\nTable \"soccer_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_total\" varchar(100)\n \"assists_game_winning\" varchar(100)\n \"assists_game_tying\" varchar(100)\n \"assists_overtime\" varchar(100)\n \"assists_total\" varchar(100)\n \"points\" varchar(100)\n \"shots_total\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_hit_frame\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_scored\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(40)\n \"shots_penalty_shot_percentage\" varchar(40)\n \"shots_shootout_taken\" varchar(40)\n \"shots_shootout_scored\" varchar(40)\n \"shots_shootout_missed\" varchar(40)\n \"shots_shootout_percentage\" varchar(40)\n \"giveaways\" varchar(40)\n \"offsides\" varchar(40)\n \"corner_kicks\" varchar(40)\n \"hat_tricks\" varchar(40)\n}\n\nTable \"standing_subgroups\" {\n \"id\" int4 [not null, increment]\n \"standing_id\" int4 [not null]\n \"affiliation_id\" int4 [not null]\n}\n\nTable \"standings\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"standing_type\" varchar(100)\n \"sub_season_id\" int4 [not null]\n \"last_updated\" varchar(100)\n \"duration_scope\" varchar(100)\n \"competition_scope\" varchar(100)\n \"competition_scope_id\" varchar(100)\n \"alignment_scope\" varchar(100)\n \"site_scope\" varchar(100)\n \"scoping_label\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"source\" varchar(100)\n}\n\nTable \"stats\" {\n \"id\" int4 [not null, increment]\n \"stat_repository_type\" varchar(100)\n \"stat_repository_id\" int4 [not null]\n \"stat_holder_type\" varchar(100)\n \"stat_holder_id\" int4\n \"stat_coverage_type\" varchar(100)\n \"stat_coverage_id\" int4\n \"context\" varchar(40) [not null]\n}\n\nTable \"sub_periods\" {\n \"id\" int4 [not null, increment]\n \"period_id\" int4 [not null]\n \"sub_period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"sub_seasons\" {\n \"id\" int4 [not null, increment]\n \"sub_season_key\" varchar(100) [not null]\n \"season_id\" int4 [not null]\n \"sub_season_type\" varchar(100) [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"team_american_football_stats\" {\n \"id\" int4 [not null, increment]\n \"yards_per_attempt\" varchar(100)\n \"average_starting_position\" varchar(100)\n \"timeouts\" varchar(100)\n \"time_of_possession\" varchar(100)\n \"turnover_ratio\" varchar(100)\n}\n\nTable \"team_phases\" {\n \"id\" int4 [not null, increment]\n \"team_id\" int4 [not null]\n \"start_season_id\" int4\n \"end_season_id\" int4\n \"affiliation_id\" int4 [not null]\n \"start_date_time\" 
varchar(100)\n \"end_date_time\" varchar(100)\n \"phase_status\" varchar(40)\n \"role_id\" int4\n}\n\nTable \"teams\" {\n \"id\" int4 [not null, increment]\n \"team_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"home_site_id\" int4\n}\n\nTable \"teams_documents\" {\n \"team_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"teams_media\" {\n \"team_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"tennis_action_points\" {\n \"id\" int4 [not null, increment]\n \"sub_period_id\" varchar(100)\n \"sequence_number\" varchar(100)\n \"win_type\" varchar(100)\n}\n\nTable \"tennis_action_volleys\" {\n \"id\" int4 [not null, increment]\n \"sequence_number\" varchar(100)\n \"tennis_action_points_id\" int4\n \"landing_location\" varchar(100)\n \"swing_type\" varchar(100)\n \"result\" varchar(100)\n \"spin_type\" varchar(100)\n \"trajectory_details\" varchar(100)\n}\n\nTable \"tennis_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"tennis_set\" varchar(100)\n \"game\" varchar(100)\n \"server_person_id\" int4\n \"server_score\" varchar(100)\n \"receiver_person_id\" int4\n \"receiver_score\" varchar(100)\n \"service_number\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"tennis_return_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"first_service_return_points_won\" varchar(100)\n \"first_service_return_points_won_pct\" varchar(100)\n \"second_service_return_points_won\" varchar(100)\n \"second_service_return_points_won_pct\" varchar(100)\n \"return_games_played\" varchar(100)\n \"return_games_won\" varchar(100)\n \"return_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_converted\" varchar(100)\n \"break_points_converted_pct\" varchar(100)\n}\n\nTable \"tennis_service_stats\" {\n \"id\" int4 [not null, increment]\n \"services_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"aces\" varchar(100)\n \"first_services_good\" varchar(100)\n \"first_services_good_pct\" varchar(100)\n \"first_service_points_won\" varchar(100)\n \"first_service_points_won_pct\" varchar(100)\n \"second_service_points_won\" varchar(100)\n \"second_service_points_won_pct\" varchar(100)\n \"service_games_played\" varchar(100)\n \"service_games_won\" varchar(100)\n \"service_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_saved\" varchar(100)\n \"break_points_saved_pct\" varchar(100)\n}\n\nTable \"wagering_moneylines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_odds_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"numerator\" varchar(100)\n \"denominator\" varchar(100)\n \"prediction\" varchar(100)\n \"payout_calculation\" varchar(100)\n \"payout_amount\" varchar(100)\n}\n\nTable \"wagering_runlines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n 
\"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"line_value\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_straight_spread_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_value\" varchar(100)\n \"line_value_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_total_score_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_over\" varchar(100)\n \"line_under\" varchar(100)\n \"total\" varchar(100)\n \"total_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"weather_conditions\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"temperature\" varchar(100)\n \"temperature_units\" varchar(40)\n \"humidity\" varchar(100)\n \"clouds\" varchar(100)\n \"wind_direction\" varchar(100)\n \"wind_velocity\" varchar(100)\n \"weather_code\" varchar(100)\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/yugabyte/yugabyte-db/blob/master/sample/sportsdb_tables.sql"
}
}
================================================
FILE: tasks/postgres/standard/sports/team_roster_management/verify.py
================================================
"""
Verification script for PostgreSQL Sports Task 2: Team Roster Management Operations
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.001 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.001:
return False
elif isinstance(actual, float) and isinstance(expected, float):
if abs(actual - expected) > 0.001:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE", "sports"),
"user": os.getenv("POSTGRES_USERNAME", "postgres"),
"password": os.getenv("POSTGRES_PASSWORD", "postgres")
}
def verify_player_evaluation_table(conn) -> bool:
"""Verify the final state of player_evaluation table after all operations."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT person_id, batting_avg, home_runs, rbis, games_played, performance_score
FROM player_evaluation
ORDER BY person_id
""")
actual_results = cur.fetchall()
# Execute ground truth query that simulates all steps:
# 1. Initial insert (step 2)
# 2. Update based on injuries (step 4)
cur.execute("""
WITH initial_players AS (
SELECT
s.stat_holder_id AS person_id,
SUM(bos.hits) AS total_hits,
SUM(bos.at_bats) AS total_at_bats,
CASE
WHEN SUM(bos.at_bats) > 0
THEN 1.0 * SUM(bos.hits) / SUM(bos.at_bats)
ELSE 0
END AS batting_avg,
SUM(bos.home_runs) AS home_runs,
SUM(bos.rbi) AS rbis
FROM stats s
JOIN baseball_offensive_stats bos
ON s.stat_repository_id = bos.id
WHERE s.stat_holder_type = 'persons'
AND s.stat_repository_type = 'baseball_offensive_stats'
GROUP BY s.stat_holder_id
),
game_counts AS (
SELECT
person_id,
COUNT(DISTINCT event_id) AS games_played
FROM person_event_metadata
GROUP BY person_id
),
players_with_games AS (
SELECT
ip.person_id,
ip.batting_avg,
ip.home_runs,
ip.rbis,
COALESCE(gc.games_played, 0) AS games_played,
(ip.batting_avg * 1000)
+ (COALESCE(ip.home_runs, 0) * 5)
+ (COALESCE(ip.rbis, 0) * 2) AS initial_score
FROM initial_players ip
LEFT JOIN game_counts gc ON ip.person_id = gc.person_id
WHERE COALESCE(gc.games_played, 0) >= 10
),
injury_info AS (
SELECT
person_id,
COUNT(*) AS injury_count,
MAX(CASE WHEN end_date_time IS NULL THEN 1 ELSE 0 END) AS has_active_injury
FROM injury_phases
GROUP BY person_id
),
adjusted_scores AS (
SELECT
pwg.person_id,
pwg.batting_avg,
pwg.home_runs,
pwg.rbis,
pwg.games_played,
GREATEST(
CASE
WHEN COALESCE(ii.has_active_injury, 0) = 1 AND COALESCE(ii.injury_count, 0) > 2
THEN pwg.initial_score * 0.8 * 0.9
WHEN COALESCE(ii.has_active_injury, 0) = 1
THEN pwg.initial_score * 0.8
WHEN COALESCE(ii.injury_count, 0) > 2
THEN pwg.initial_score * 0.9
ELSE pwg.initial_score
END,
0
) AS performance_score
FROM players_with_games pwg
LEFT JOIN injury_info ii ON ii.person_id = pwg.person_id
)
SELECT
person_id,
batting_avg,
home_runs,
rbis,
games_played,
performance_score
FROM adjusted_scores
ORDER BY person_id;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} player evaluation records, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches in player_evaluation: {mismatches}")
return False
print(f"✅ Player evaluation table is correct ({len(actual_results)} records)")
return True
def verify_injury_status_table(conn) -> bool:
"""Verify the player_injury_status table and data."""
with conn.cursor() as cur:
# Get actual results
cur.execute("""
SELECT person_id, injury_count, last_injury_date, current_status
FROM player_injury_status
ORDER BY person_id
""")
actual_results = cur.fetchall()
# Execute ground truth query - get players from player_evaluation
cur.execute("""
WITH player_list AS (
SELECT DISTINCT person_id
FROM player_evaluation
),
injury_counts AS (
SELECT
person_id,
COUNT(*) as injury_count,
MAX(start_date_time::date) as last_injury_date,
MAX(CASE WHEN end_date_time IS NULL THEN 1 ELSE 0 END) as has_active_injury
FROM injury_phases
GROUP BY person_id
)
SELECT
pl.person_id,
COALESCE(ic.injury_count, 0) as injury_count,
ic.last_injury_date,
CASE
WHEN COALESCE(ic.has_active_injury, 0) = 1 THEN 'injured'
ELSE 'healthy'
END as current_status
FROM player_list pl
LEFT JOIN injury_counts ic ON pl.person_id = ic.person_id
ORDER BY pl.person_id
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} injury status records, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5:
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches in player_injury_status: {mismatches}")
return False
print(f"✅ Player injury status table is correct ({len(actual_results)} records)")
return True
def verify_summary_table(conn) -> bool:
"""Verify the team_performance_summary table."""
with conn.cursor() as cur:
# Get actual results
cur.execute("""
SELECT metric_name, metric_value
FROM team_performance_summary
ORDER BY metric_name
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH player_data AS (
SELECT
COUNT(*) as total_players,
AVG(batting_avg) as avg_batting_average,
SUM(home_runs) as total_home_runs,
AVG(performance_score) as avg_performance_score
FROM player_evaluation
),
health_data AS (
SELECT
SUM(CASE WHEN current_status = 'injured' THEN 1 ELSE 0 END) as injured_count,
SUM(CASE WHEN current_status = 'healthy' THEN 1 ELSE 0 END) as healthy_count
FROM player_injury_status
WHERE person_id IN (SELECT person_id FROM player_evaluation)
)
SELECT metric_name, metric_value::DECIMAL
FROM (
SELECT 'avg_batting_average' as metric_name, avg_batting_average as metric_value FROM player_data
UNION ALL
SELECT 'avg_performance_score', avg_performance_score FROM player_data
UNION ALL
SELECT 'healthy_player_count', healthy_count FROM health_data
UNION ALL
SELECT 'injured_player_count', injured_count FROM health_data
UNION ALL
SELECT 'total_home_runs', total_home_runs FROM player_data
UNION ALL
SELECT 'total_players', total_players FROM player_data
) metrics
ORDER BY metric_name
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} metrics, got {len(actual_results)}")
return False
mismatches = 0
for actual, expected in zip(actual_results, expected_results):
if not rows_match(actual, expected):
if mismatches < 5:
print(f"❌ Metric mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches in summary table: {mismatches}")
return False
print(f"✅ Team performance summary table is correct ({len(actual_results)} metrics)")
return True
def main():
"""Main verification function."""
print("=" * 50)
print("Verifying Sports Task 2: Team Roster Management Operations")
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all steps
success = (
verify_player_evaluation_table(conn) and
verify_injury_status_table(conn) and
verify_summary_table(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/vectors/dba_vector_analysis/description.md
================================================
# PostgreSQL Vector Database Analysis
> Analyze and optimize a pgvector-powered database to understand storage patterns, performance characteristics, and data quality for embeddings in production workloads.
## What's this about?
You've got a PostgreSQL database running with the vector extension that stores embeddings for RAG (document similarity search, image recognition) or other ML workloads.
Your job is to dive deep into this vector database and figure out what's going on under the hood.
You need to understand:
- how vectors are stored
- how much space they're taking up
- whether indexes are working properly
- if there are any data quality issues lurking around
## What you need to investigate
First, get familiar with what you're working with:
- Check vector extension status: ensure it's installed properly, check the version, and identify any configuration issues
- Identify all vector columns across the entire database: report the columns, their data types, and their vector dimensions
- Map the vector landscape: understand relationships between vector tables and regular tables, including foreign keys and dependencies
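As a starting point, a sketch of catalog queries covering the first two bullets (the typmod-as-dimension reading is a pgvector convention worth double-checking):
```sql
-- Sketch only: confirm the extension and enumerate vector columns.
SELECT extname, extversion
FROM pg_extension
WHERE extname = 'vector';

SELECT n.nspname AS table_schema,
       c.relname AS table_name,
       a.attname AS column_name,
       format_type(a.atttypid, a.atttypmod) AS data_type,  -- e.g. vector(1536)
       a.atttypmod AS declared_dimensions                   -- pgvector keeps the dimension in the typmod
FROM pg_attribute a
JOIN pg_class c     ON c.oid = a.attrelid
JOIN pg_namespace n ON n.oid = c.relnamespace
JOIN pg_type t      ON t.oid = a.atttypid
WHERE t.typname = 'vector'
  AND a.attnum > 0
  AND NOT a.attisdropped
  AND c.relkind = 'r';
```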
Vectors can eat up a lot of storage, so let's see where the bytes are going:
- Calculate vector storage overhead: measure how much space vectors take compared to regular columns in the same tables
- Analyze table sizes: identify which vector tables are the biggest storage consumers, broken down by table
- Understand growth patterns: examine record counts and project future storage needs based on current data
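A rough sketch of a per-table breakdown, using `documents` / `embedding` purely as an illustrative table and column; treat the regular-data figure as an approximation, since total relation size also includes indexes and page overhead:
```sql
-- Sketch only: size breakdown for one table (swap in columns from your inventory).
SELECT pg_total_relation_size('documents') AS total_size_bytes,
       SUM(pg_column_size(embedding))      AS vector_data_bytes,
       pg_total_relation_size('documents')
         - SUM(pg_column_size(embedding))  AS regular_data_bytes,  -- approximation
       COUNT(*)                            AS row_count
FROM documents;
```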
Vectors without proper indexes are painfully slow, so investigate:
- Catalog vector indexes: find all HNSW and IVFFlat indexes, document their configurations and parameters
- Measure index effectiveness: determine if indexes are actually being used and helping query performance
- Identify optimization opportunities: spot missing indexes, suboptimal configurations, unused indexes
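A sketch of one way to catalogue the vector indexes from `pg_indexes`:
```sql
-- Sketch only: find HNSW / IVFFlat indexes with their definitions and sizes.
SELECT schemaname,
       tablename,
       indexname,
       indexdef,
       pg_relation_size(format('%I.%I', schemaname, indexname)::regclass) AS index_size_bytes
FROM pg_indexes
WHERE indexdef ILIKE '%USING hnsw%'
   OR indexdef ILIKE '%USING ivfflat%';
```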
Bad vector data makes everything worse:
- Hunt for data issues: locate NULL vectors, dimension mismatches, corrupted embeddings that could break queries
- Validate consistency: ensure vectors in each column have consistent dimensions across all rows
- Check for outliers: find vectors that might be skewing similarity calculations or causing performance issues
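A per-column quality sketch (the table and column names here are illustrative; `vector_dims()` is provided by pgvector):
```sql
-- Repeat for each vector table/column you discover
SELECT COUNT(*) AS total_rows,
       COUNT(*) FILTER (WHERE embedding IS NULL) AS null_vectors,
       COUNT(DISTINCT vector_dims(embedding)) AS distinct_dimensions
FROM documents;
```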
## Your deliverables
Create these analysis tables and populate them with your findings:
### `vector_analysis_columns`
Complete catalog of every vector column you find:
```sql
CREATE TABLE vector_analysis_columns (
schema VARCHAR(50),
table_name VARCHAR(100),
column_name VARCHAR(100),
dimensions INTEGER,
data_type VARCHAR(50),
has_constraints BOOLEAN,
rows BIGINT
);
```
### `vector_analysis_storage_consumption`
Show exactly where storage is being consumed:
```sql
CREATE TABLE vector_analysis_storage_consumption (
schema VARCHAR(50),
table_name VARCHAR(100),
total_size_bytes BIGINT,
vector_data_bytes BIGINT,
regular_data_bytes BIGINT,
vector_storage_pct NUMERIC(5,2),
row_count BIGINT
);
```
### `vector_analysis_indices`
Document all vector indexes and their characteristics:
```sql
CREATE TABLE vector_analysis_indices (
schema VARCHAR(50),
table_name VARCHAR(100),
column_name VARCHAR(100),
index_name VARCHAR(100),
index_type VARCHAR(50), -- 'hnsw', 'ivfflat', etc.
index_size_bytes BIGINT
);
```
Use PostgreSQL system catalogs, pgvector-specific functions, and storage analysis functions to gather comprehensive metrics about the vector database implementation.
================================================
FILE: tasks/postgres/standard/vectors/dba_vector_analysis/ground_truth.sql
================================================
-- Ground Truth Data for Vector Database Analysis Task
-- This defines the exact expected results that candidates should discover and report
/*
================================================================================
EXPECTED VECTOR DATABASE STRUCTURE (created by vectors_setup.py)
================================================================================
Tables with Vector Columns:
1. documents.embedding (vector(1536))
2. document_chunks.embedding (vector(1536))
3. user_queries.embedding (vector(1536))
Vector Indexes:
1. documents_embedding_idx (HNSW on documents.embedding)
2. chunks_embedding_idx (HNSW on document_chunks.embedding)
3. queries_embedding_idx (HNSW on user_queries.embedding)
Expected Data Counts:
- documents: 10 records
- document_chunks: ~40-70 records (3-7 chunks per document)
- user_queries: 10 records
- embedding_models: 5 records (metadata)
- knowledge_base: 5 records (metadata)
- search_cache: 5 records (metadata)
================================================================================
DEFINITIVE GROUND TRUTH VERIFICATION DATA
================================================================================
*/
BEGIN;
-- Create expected analysis result structure
CREATE TABLE IF NOT EXISTS expected_vector_column_inventory (
table_schema VARCHAR(50) DEFAULT 'public',
table_name VARCHAR(100),
column_name VARCHAR(100),
vector_dimensions INTEGER,
data_type VARCHAR(50) DEFAULT 'USER-DEFINED',
has_constraints BOOLEAN DEFAULT false,
min_estimated_rows BIGINT
);
-- Insert expected vector column inventory
INSERT INTO expected_vector_column_inventory (table_name, column_name, vector_dimensions, min_estimated_rows) VALUES
('documents', 'embedding', 1536, 10),
('document_chunks', 'embedding', 1536, 30),
('user_queries', 'embedding', 1536, 10);
-- Create expected storage analysis structure
CREATE TABLE IF NOT EXISTS expected_vector_storage_analysis (
table_name VARCHAR(100),
has_vector_data BOOLEAN,
min_row_count BIGINT,
vector_column_exists BOOLEAN,
should_have_storage_metrics BOOLEAN DEFAULT true
);
-- Insert expected storage analysis
INSERT INTO expected_vector_storage_analysis (table_name, has_vector_data, min_row_count, vector_column_exists) VALUES
('documents', true, 10, true),
('document_chunks', true, 30, true),
('user_queries', true, 10, true),
('embedding_models', false, 5, false),
('knowledge_base', false, 5, false),
('search_cache', false, 5, false);
-- Create expected index analysis structure
CREATE TABLE IF NOT EXISTS expected_vector_index_analysis (
index_name_pattern VARCHAR(100),
table_name VARCHAR(100),
column_name VARCHAR(100),
expected_index_type VARCHAR(50),
should_exist BOOLEAN DEFAULT true
);
-- Insert expected vector index analysis
INSERT INTO expected_vector_index_analysis (index_name_pattern, table_name, column_name, expected_index_type) VALUES
('%documents%embedding%', 'documents', 'embedding', 'hnsw'),
('%chunks%embedding%', 'document_chunks', 'embedding', 'hnsw'),
('%queries%embedding%', 'user_queries', 'embedding', 'hnsw');
-- Create storage analysis table
CREATE TABLE vector_storage_analysis (
table_name VARCHAR(100),
total_size_bytes BIGINT,
vector_data_bytes BIGINT,
regular_data_bytes BIGINT,
vector_storage_pct NUMERIC(5,2),
row_count BIGINT,
avg_vector_size_bytes INTEGER
);
-- Populate storage analysis with actual storage metrics
DO $$
DECLARE
rec RECORD;
total_size BIGINT;
row_cnt BIGINT;
vector_size INTEGER := 1536 * 4; -- 1536 dimensions * 4 bytes per float
BEGIN
FOR rec IN SELECT tablename FROM pg_tables WHERE tablename IN ('documents', 'document_chunks', 'user_queries') LOOP
EXECUTE format('SELECT COUNT(*) FROM %I', rec.tablename) INTO row_cnt;
SELECT pg_total_relation_size(format('public.%I', rec.tablename)) INTO total_size;
INSERT INTO vector_storage_analysis (
table_name, total_size_bytes, row_count, avg_vector_size_bytes,
vector_data_bytes, regular_data_bytes, vector_storage_pct
) VALUES (
rec.tablename,
total_size,
row_cnt,
vector_size,
row_cnt * vector_size,
GREATEST(total_size - (row_cnt * vector_size), 0),
ROUND((row_cnt * vector_size * 100.0) / NULLIF(total_size, 0), 2)
);
END LOOP;
END $$;
-- Create index analysis table
CREATE TABLE vector_index_analysis (
index_name VARCHAR(100),
table_name VARCHAR(100),
column_name VARCHAR(100),
index_type VARCHAR(50),
index_size_bytes BIGINT,
index_parameters TEXT,
is_valid BOOLEAN
);
-- Populate index analysis with actual vector indexes
INSERT INTO vector_index_analysis (index_name, table_name, column_name, index_type, index_size_bytes, is_valid)
SELECT
i.indexname as index_name,
i.tablename as table_name,
'embedding' as column_name, -- Known from our setup
CASE
WHEN i.indexdef ILIKE '%hnsw%' THEN 'hnsw'
WHEN i.indexdef ILIKE '%ivfflat%' THEN 'ivfflat'
ELSE 'unknown'
END as index_type,
pg_relation_size(format('public.%I', i.indexname)) as index_size_bytes,
true as is_valid
FROM pg_indexes i
WHERE (i.indexdef ILIKE '%vector%' OR i.indexdef ILIKE '%hnsw%' OR i.indexdef ILIKE '%ivfflat%')
AND i.tablename IN ('documents', 'document_chunks', 'user_queries')
ORDER BY i.tablename, i.indexname;
-- Create data quality analysis table
CREATE TABLE vector_data_quality (
table_name VARCHAR(100),
column_name VARCHAR(100),
quality_check_type VARCHAR(50),
total_records BIGINT,
issue_count BIGINT,
quality_status VARCHAR(20),
details TEXT
);
-- Populate data quality analysis with actual checks
DO $$
DECLARE
rec RECORD;
total_cnt BIGINT;
null_cnt BIGINT;
BEGIN
FOR rec IN SELECT tablename FROM pg_tables WHERE tablename IN ('documents', 'document_chunks', 'user_queries') LOOP
-- Count total records
EXECUTE format('SELECT COUNT(*) FROM %I', rec.tablename) INTO total_cnt;
-- Count NULL vectors
EXECUTE format('SELECT COUNT(*) FROM %I WHERE embedding IS NULL', rec.tablename) INTO null_cnt;
-- Insert NULL_CHECK result
INSERT INTO vector_data_quality (
table_name, column_name, quality_check_type,
total_records, issue_count, quality_status
) VALUES (
rec.tablename, 'embedding', 'NULL_CHECK',
total_cnt, null_cnt,
CASE WHEN null_cnt = 0 THEN 'GOOD' ELSE 'WARNING' END
);
-- Insert DIMENSION_CHECK result (all vectors in our setup are 1536-dimensional)
INSERT INTO vector_data_quality (
table_name, column_name, quality_check_type,
total_records, issue_count, quality_status
) VALUES (
rec.tablename, 'embedding', 'DIMENSION_CHECK',
total_cnt - null_cnt, 0, 'GOOD'
);
END LOOP;
END $$;
-- ============================================================================
-- GROUND TRUTH IMPLEMENTATION
-- ============================================================================
-- This is the correct analysis implementation that candidates should produce
-- Create vector_analysis_columns table and populate it
CREATE TABLE vector_analysis_columns (
schema VARCHAR(50),
table_name VARCHAR(100),
column_name VARCHAR(100),
dimensions INTEGER,
data_type VARCHAR(50),
has_constraints BOOLEAN,
rows BIGINT
);
-- Discover and insert vector columns
INSERT INTO vector_analysis_columns (schema, table_name, column_name, dimensions, data_type, has_constraints, rows)
SELECT
'public' as schema,
c.table_name,
c.column_name,
1536 as dimensions, -- pgvector embedding dimension
'USER-DEFINED' as data_type,
false as has_constraints,
-- Get actual row count using dynamic query
CASE c.table_name
WHEN 'documents' THEN (SELECT COUNT(*) FROM documents)
WHEN 'document_chunks' THEN (SELECT COUNT(*) FROM document_chunks)
WHEN 'user_queries' THEN (SELECT COUNT(*) FROM user_queries)
ELSE 0
END as rows
FROM information_schema.columns c
WHERE c.data_type = 'USER-DEFINED'
AND c.udt_name = 'vector'
ORDER BY c.table_name, c.column_name;
-- Create vector_analysis_storage_consumption table
CREATE TABLE vector_analysis_storage_consumption (
schema VARCHAR(50),
table_name VARCHAR(100),
total_size_bytes BIGINT,
vector_data_bytes BIGINT,
regular_data_bytes BIGINT,
vector_storage_pct NUMERIC(5,2),
row_count BIGINT
);
-- Populate storage analysis for vector tables
DO $$
DECLARE
rec RECORD;
total_size BIGINT;
row_cnt BIGINT;
vector_size INTEGER := 1536 * 4; -- 1536 dimensions * 4 bytes per float
BEGIN
FOR rec IN
SELECT DISTINCT c.table_name
FROM information_schema.columns c
WHERE c.data_type = 'USER-DEFINED'
AND c.udt_name = 'vector'
LOOP
-- Get actual row count
EXECUTE format('SELECT COUNT(*) FROM %I', rec.table_name) INTO row_cnt;
-- Get actual table size
SELECT pg_total_relation_size(format('public.%I', rec.table_name)) INTO total_size;
-- Insert analysis results
INSERT INTO vector_analysis_storage_consumption (
schema, table_name, total_size_bytes, vector_data_bytes,
regular_data_bytes, vector_storage_pct, row_count
) VALUES (
'public',
rec.table_name,
total_size,
row_cnt * vector_size,
GREATEST(total_size - (row_cnt * vector_size), 0),
ROUND((row_cnt * vector_size * 100.0) / NULLIF(total_size, 0), 2),
row_cnt
);
END LOOP;
END $$;
-- Create vector_analysis_indices table
CREATE TABLE vector_analysis_indices (
schema VARCHAR(50),
table_name VARCHAR(100),
column_name VARCHAR(100),
index_name VARCHAR(100),
index_type VARCHAR(50),
index_size_bytes BIGINT
);
-- Populate index analysis for vector indexes
INSERT INTO vector_analysis_indices (schema, table_name, column_name, index_name, index_type, index_size_bytes)
SELECT
i.schemaname as schema,
i.tablename as table_name,
'embedding' as column_name, -- known from our setup
i.indexname as index_name,
CASE
WHEN i.indexdef ILIKE '%hnsw%' THEN 'hnsw'
WHEN i.indexdef ILIKE '%ivfflat%' THEN 'ivfflat'
ELSE 'unknown'
END as index_type,
pg_relation_size(format('public.%I', i.indexname)) as index_size_bytes
FROM pg_indexes i
WHERE (i.indexdef ILIKE '%hnsw%' OR i.indexdef ILIKE '%ivfflat%')
AND i.tablename IN (
SELECT DISTINCT table_name
FROM information_schema.columns
WHERE data_type = 'USER-DEFINED' AND udt_name = 'vector'
)
ORDER BY i.tablename, i.indexname;
COMMIT;
-- ============================================================================
-- VERIFICATION HELPER QUERIES
-- ============================================================================
-- Query to check actual vector columns in the database
/*
SELECT
table_schema,
table_name,
column_name,
data_type,
udt_name
FROM information_schema.columns
WHERE data_type = 'USER-DEFINED'
AND udt_name = 'vector'
ORDER BY table_name, column_name;
*/
-- Query to check actual vector indexes
/*
SELECT
schemaname,
tablename,
indexname,
indexdef
FROM pg_indexes
WHERE indexdef ILIKE '%vector%'
OR indexdef ILIKE '%hnsw%'
OR indexdef ILIKE '%ivfflat%'
ORDER BY tablename, indexname;
*/
-- Query to check table row counts
/*
SELECT
'documents' as table_name, COUNT(*) as row_count FROM documents
UNION ALL
SELECT
'document_chunks' as table_name, COUNT(*) as row_count FROM document_chunks
UNION ALL
SELECT
'user_queries' as table_name, COUNT(*) as row_count FROM user_queries
ORDER BY table_name;
*/
-- Query to check pgvector extension
/*
SELECT extname, extversion
FROM pg_extension
WHERE extname = 'vector';
*/
================================================
FILE: tasks/postgres/standard/vectors/dba_vector_analysis/meta.json
================================================
{
"task_id": "dba_vector_analysis",
"task_name": "DBA Vector Analysis",
"category_id": "vectors",
"category_name": "Vectors",
"description": "Analyze pgvector database storage, identify vector columns, assess space utilization and performance for RAG applications.",
"author": "Fanshi Zhang",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"performance optimization",
"audit and compliance",
"statistical aggregation"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"documents\" {\n \"id\" int4 [pk, not null, increment]\n \"title\" text [not null]\n \"content\" text [not null]\n \"source_url\" text\n \"document_type\" varchar(50) [default: 'article']\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"word_count\" int4\n \"embedding\" public.vector\n\n Indexes {\n created_at [type: btree, name: \"documents_created_idx\"]\n embedding [type: hnsw, name: \"documents_embedding_idx\"]\n title [type: btree, name: \"documents_title_idx\"]\n document_type [type: btree, name: \"documents_type_idx\"]\n }\n}\n\nTable \"document_chunks\" {\n \"id\" int4 [pk, not null, increment]\n \"document_id\" int4\n \"chunk_index\" int4 [not null]\n \"chunk_text\" text [not null]\n \"chunk_size\" int4\n \"overlap_size\" int4 [default: 0]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"embedding\" public.vector\n\n Indexes {\n document_id [type: btree, name: \"chunks_doc_id_idx\"]\n embedding [type: hnsw, name: \"chunks_embedding_idx\"]\n chunk_index [type: btree, name: \"chunks_index_idx\"]\n }\n}\n\nTable \"user_queries\" {\n \"id\" int4 [pk, not null, increment]\n \"query_text\" text [not null]\n \"user_id\" varchar(100)\n \"session_id\" varchar(100)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"response_time_ms\" int4\n \"embedding\" public.vector\n\n Indexes {\n created_at [type: btree, name: \"queries_created_idx\"]\n embedding [type: hnsw, name: \"queries_embedding_idx\"]\n user_id [type: btree, name: \"queries_user_idx\"]\n }\n}\n\nTable \"embedding_models\" {\n \"id\" int4 [pk, not null, increment]\n \"model_name\" varchar(100) [unique, not null]\n \"provider\" varchar(50) [not null]\n \"dimensions\" int4 [not null]\n \"max_tokens\" int4\n \"cost_per_token\" numeric(10,8)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"is_active\" bool [default: true]\n}\n\nTable \"knowledge_base\" {\n \"id\" int4 [pk, not null, increment]\n \"kb_name\" varchar(100) [not null]\n \"description\" text\n \"domain\" varchar(50)\n \"language\" varchar(10) [default: 'en']\n \"total_documents\" int4 [default: 0]\n \"total_chunks\" int4 [default: 0]\n \"total_storage_mb\" numeric(10,2)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n}\n\nTable \"search_cache\" {\n \"id\" int4 [pk, not null, increment]\n \"query_hash\" varchar(64) [not null]\n \"query_text\" text [not null]\n \"results_json\" jsonb\n \"result_count\" int4\n \"search_time_ms\" int4\n \"similarity_threshold\" numeric(4,3)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"expires_at\" timestamp\n\n Indexes {\n expires_at [type: btree, name: \"cache_expires_idx\"]\n query_hash [type: btree, name: \"cache_hash_idx\"]\n }\n}\n\nRef \"document_chunks_document_id_fkey\":\"documents\".\"id\" < \"document_chunks\".\"document_id\" [delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/postgres/standard/vectors/dba_vector_analysis/prepare_environment.py
================================================
"""
Environment preparation script for Vector Database DBA Analysis task.
This script imports and uses the shared vector database setup utilities.
"""
import sys
import logging
from pathlib import Path
# Add the vectors directory to import the shared utilities
sys.path.append(str(Path(__file__).resolve().parents[1]))
from vectors_setup import prepare_vector_environment
logger = logging.getLogger(__name__)
def prepare_environment():
"""Main function to prepare the vector database environment."""
prepare_vector_environment()
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
prepare_environment()
================================================
FILE: tasks/postgres/standard/vectors/dba_vector_analysis/verify.py
================================================
"""
Verification script for Vector Database DBA Analysis task.
This script verifies that the candidate has properly analyzed the vector database
and stored their findings in appropriate result tables.
"""
import logging
import psycopg2
import os
import sys
from typing import Dict, Any
logger = logging.getLogger(__name__)
def get_connection_params():
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def verify_vector_analysis_columns(conn) -> Dict[str, Any]:
"""Verify the vector_analysis_columns table exists, has correct columns, and contains actual vector columns from the database."""
results = {'passed': False, 'issues': []}
expected_columns = [
'schema', 'table_name', 'column_name', 'dimensions', 'data_type', 'has_constraints', 'rows'
]
try:
with conn.cursor() as cur:
# Check if table exists
cur.execute("""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'vector_analysis_columns'
);
""")
if not cur.fetchone()[0]:
results['issues'].append("vector_analysis_columns table not found")
return results
# Check columns
cur.execute("""
SELECT column_name FROM information_schema.columns
WHERE table_name = 'vector_analysis_columns'
ORDER BY column_name;
""")
actual_columns = {row[0] for row in cur.fetchall()}
missing = set(expected_columns) - actual_columns
extra = actual_columns - set(expected_columns)
if missing:
results['issues'].append(f"Missing columns: {missing}")
if extra:
results['issues'].append(f"Unexpected columns: {extra}")
# Check for data
cur.execute("SELECT COUNT(*) FROM vector_analysis_columns;")
count = cur.fetchone()[0]
if count == 0:
results['issues'].append("No rows found in vector_analysis_columns")
return results
# Get actual vector columns from the database
cur.execute("""
SELECT table_name, column_name
FROM information_schema.columns
WHERE data_type = 'USER-DEFINED'
AND udt_name = 'vector'
ORDER BY table_name, column_name;
""")
actual_vector_columns = set(cur.fetchall())
# Get what the agent found
cur.execute("""
SELECT table_name, column_name
FROM vector_analysis_columns
ORDER BY table_name, column_name;
""")
found_vector_columns = set(cur.fetchall())
# Check if agent found the actual vector columns
missing_vectors = actual_vector_columns - found_vector_columns
extra_vectors = found_vector_columns - actual_vector_columns
if missing_vectors:
results['issues'].append(f"Missing vector columns: {missing_vectors}")
if extra_vectors:
results['issues'].append(f"Reported vector columns that do not exist: {extra_vectors}")
if not missing and not extra and count > 0 and not missing_vectors and not extra_vectors:
results['passed'] = True
except psycopg2.Error as e:
results['issues'].append(f"Database error: {e}")
except Exception as e:
results['issues'].append(f"Verification error: {e}")
return results
def verify_vector_analysis_storage_consumption(conn) -> Dict[str, Any]:
"""Verify the vector_analysis_storage_consumption table exists, has correct columns, and analyzes actual vector tables."""
results = {'passed': False, 'issues': []}
expected_columns = [
'schema', 'table_name', 'total_size_bytes', 'vector_data_bytes', 'regular_data_bytes', 'vector_storage_pct', 'row_count'
]
try:
with conn.cursor() as cur:
cur.execute("""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'vector_analysis_storage_consumption'
);
""")
if not cur.fetchone()[0]:
results['issues'].append("vector_analysis_storage_consumption table not found")
return results
cur.execute("""
SELECT column_name FROM information_schema.columns
WHERE table_name = 'vector_analysis_storage_consumption'
ORDER BY column_name;
""")
actual_columns = {row[0] for row in cur.fetchall()}
missing = set(expected_columns) - actual_columns
extra = actual_columns - set(expected_columns)
if missing:
results['issues'].append(f"Missing columns: {missing}")
if extra:
results['issues'].append(f"Unexpected columns: {extra}")
cur.execute("SELECT COUNT(*) FROM vector_analysis_storage_consumption;")
count = cur.fetchone()[0]
if count == 0:
results['issues'].append("No rows found in vector_analysis_storage_consumption")
return results
# Get actual tables with vector columns
cur.execute("""
SELECT DISTINCT table_name
FROM information_schema.columns
WHERE data_type = 'USER-DEFINED'
AND udt_name = 'vector'
ORDER BY table_name;
""")
actual_vector_tables = {row[0] for row in cur.fetchall()}
# Get what the agent analyzed
cur.execute("""
SELECT DISTINCT table_name
FROM vector_analysis_storage_consumption
ORDER BY table_name;
""")
analyzed_tables = {row[0] for row in cur.fetchall()}
# Check if agent analyzed the actual vector tables
missing_tables = actual_vector_tables - analyzed_tables
if missing_tables:
results['issues'].append(f"Agent missed analyzing vector tables: {missing_tables}")
# Check that analyzed tables actually have vector columns
extra_tables = analyzed_tables - actual_vector_tables
if extra_tables:
results['issues'].append(f"Agent analyzed non-vector tables: {extra_tables}")
if not missing and not extra and count > 0 and not missing_tables and not extra_tables:
results['passed'] = True
except psycopg2.Error as e:
results['issues'].append(f"Database error: {e}")
except Exception as e:
results['issues'].append(f"Verification error: {e}")
return results
def verify_vector_analysis_indices(conn) -> Dict[str, Any]:
"""Verify the vector_analysis_indices table exists, has correct columns, and identifies actual vector indexes."""
results = {'passed': False, 'issues': []}
expected_columns = [
'schema', 'table_name', 'column_name', 'index_name', 'index_type', 'index_size_bytes'
]
try:
with conn.cursor() as cur:
cur.execute("""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'vector_analysis_indices'
);
""")
if not cur.fetchone()[0]:
results['issues'].append("vector_analysis_indices table not found")
return results
cur.execute("""
SELECT column_name FROM information_schema.columns
WHERE table_name = 'vector_analysis_indices'
ORDER BY column_name;
""")
actual_columns = {row[0] for row in cur.fetchall()}
missing = set(expected_columns) - actual_columns
extra = actual_columns - set(expected_columns)
if missing:
results['issues'].append(f"Missing columns: {missing}")
if extra:
results['issues'].append(f"Unexpected columns: {extra}")
cur.execute("SELECT COUNT(*) FROM vector_analysis_indices;")
count = cur.fetchone()[0]
if count == 0:
results['issues'].append("No rows found in vector_analysis_indices")
return results
# Get actual vector indexes from the database (exclude ground truth table indexes)
cur.execute("""
SELECT schemaname, tablename, indexname
FROM pg_indexes
WHERE (indexdef ILIKE '%hnsw%' OR indexdef ILIKE '%ivfflat%')
AND tablename NOT LIKE '%analysis%'
ORDER BY tablename, indexname;
""")
actual_vector_indexes = set(cur.fetchall())
# Get what the agent found
cur.execute("""
SELECT schema, table_name, index_name
FROM vector_analysis_indices
ORDER BY table_name, index_name;
""")
found_indexes = set(cur.fetchall())
# Check if agent found the actual vector indexes
missing_indexes = actual_vector_indexes - found_indexes
if missing_indexes:
results['issues'].append(f"Agent missed vector indexes: {missing_indexes}")
# Allow agent to find more indexes than just vector ones (they might include related indexes)
# but at least they should find the vector-specific ones
if not missing and not extra and count > 0 and not missing_indexes:
results['passed'] = True
except psycopg2.Error as e:
results['issues'].append(f"Database error: {e}")
except Exception as e:
results['issues'].append(f"Verification error: {e}")
return results
def verify_no_extra_analysis_tables(conn) -> Dict[str, Any]:
"""Check that only the required analysis tables exist (no legacy/extra analysis tables)."""
results = {'passed': True, 'issues': []} # Start with passed=True, more lenient
required = {
'vector_analysis_columns',
'vector_analysis_storage_consumption',
'vector_analysis_indices',
}
try:
with conn.cursor() as cur:
cur.execute("""
SELECT table_name FROM information_schema.tables
WHERE table_schema = 'public'
AND table_name LIKE 'vector_analysis_%';
""")
analysis_tables = {row[0] for row in cur.fetchall()}
# Only flag as issue if there are analysis tables that don't match our required set
# Exclude ground truth tables from this check
analysis_tables_filtered = {t for t in analysis_tables if not t.startswith('expected_') and not t.startswith('vector_analysis_results')}
extra = analysis_tables_filtered - required
if extra:
results['issues'].append(f"Found unexpected analysis tables: {extra}")
results['passed'] = False
except Exception as e:
results['issues'].append(f"Verification error: {e}")
results['passed'] = False
return results
def main():
"""Main verification function for vector analysis deliverables."""
conn_params = get_connection_params()
if not conn_params["database"]:
print("No database specified")
sys.exit(1)
try:
conn = psycopg2.connect(**conn_params)
checks = [
("vector_analysis_columns", verify_vector_analysis_columns),
("vector_analysis_storage_consumption", verify_vector_analysis_storage_consumption),
("vector_analysis_indices", verify_vector_analysis_indices),
("no_extra_analysis_tables", verify_no_extra_analysis_tables),
]
passed_checks = 0
all_issues = []
for i, (desc, check_func) in enumerate(checks, 1):
result = check_func(conn)
if result['passed']:
print(f" PASSED")
passed_checks += 1
else:
print(f" FAILED")
for issue in result['issues']:
print(f" - {issue}")
all_issues.extend(result['issues'])
print()
conn.close()
total_checks = len(checks)
print(f"Results: {passed_checks}/{total_checks} checks passed")
if passed_checks == total_checks:
sys.exit(0)
elif passed_checks >= total_checks * 0.75:
sys.exit(0)
else:
sys.exit(1)
except psycopg2.Error as e:
print(f"Database connection error: {e}")
sys.exit(1)
except Exception as e:
print(f"Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/vectors/vectors_setup.py
================================================
"""
Shared Vector Database Setup Utilities
This module provides utilities for setting up a complete PostgreSQL database
with pgvector extension and sample RAG-related tables with vector data.
Used by all vector database tasks.
"""
import os
import logging
import psycopg2
import json
import random
import numpy as np
from typing import List
logger = logging.getLogger(__name__)
def get_connection_params():
"""Get database connection parameters from environment variables."""
return {
'host': os.getenv('POSTGRES_HOST', 'localhost'),
'port': os.getenv('POSTGRES_PORT', '5432'),
'user': os.getenv('POSTGRES_USERNAME', 'postgres'),
'password': os.getenv('POSTGRES_PASSWORD', 'password'),
'database': os.getenv('POSTGRES_DATABASE', 'postgres')
}
def generate_mock_embedding(dimensions: int = 1536) -> List[float]:
"""Generate a mock embedding vector with specified dimensions."""
# Generate random values between -1 and 1, then normalize
vector = np.random.uniform(-1, 1, dimensions)
# Normalize to unit vector (common practice for embeddings)
norm = np.linalg.norm(vector)
if norm > 0:
vector = vector / norm
return vector.tolist()
def create_vector_extension():
"""Create the pgvector extension."""
conn_params = get_connection_params()
try:
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
with conn.cursor() as cur:
logger.info("Creating pgvector extension...")
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
logger.info("pgvector extension created successfully")
conn.close()
except psycopg2.Error as e:
logger.error(f"Failed to create pgvector extension: {e}")
raise
def create_vector_tables():
"""Create sample tables with vector columns for RAG applications."""
conn_params = get_connection_params()
try:
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
with conn.cursor() as cur:
logger.info("Creating vector database tables...")
# Create documents table for document embeddings
cur.execute("""
CREATE TABLE IF NOT EXISTS documents (
id SERIAL PRIMARY KEY,
title TEXT NOT NULL,
content TEXT NOT NULL,
source_url TEXT,
document_type VARCHAR(50) DEFAULT 'article',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
word_count INTEGER,
embedding vector(1536)
);
""")
# Create chunks table for document chunks (common in RAG)
cur.execute("""
CREATE TABLE IF NOT EXISTS document_chunks (
id SERIAL PRIMARY KEY,
document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
chunk_text TEXT NOT NULL,
chunk_size INTEGER,
overlap_size INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
embedding vector(1536)
);
""")
# Create queries table for storing user queries and their embeddings
cur.execute("""
CREATE TABLE IF NOT EXISTS user_queries (
id SERIAL PRIMARY KEY,
query_text TEXT NOT NULL,
user_id VARCHAR(100),
session_id VARCHAR(100),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
response_time_ms INTEGER,
embedding vector(1536)
);
""")
# Create embeddings metadata table
cur.execute("""
CREATE TABLE IF NOT EXISTS embedding_models (
id SERIAL PRIMARY KEY,
model_name VARCHAR(100) NOT NULL UNIQUE,
provider VARCHAR(50) NOT NULL,
dimensions INTEGER NOT NULL,
max_tokens INTEGER,
cost_per_token DECIMAL(10, 8),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
is_active BOOLEAN DEFAULT TRUE
);
""")
# Create knowledge base table
cur.execute("""
CREATE TABLE IF NOT EXISTS knowledge_base (
id SERIAL PRIMARY KEY,
kb_name VARCHAR(100) NOT NULL,
description TEXT,
domain VARCHAR(50),
language VARCHAR(10) DEFAULT 'en',
total_documents INTEGER DEFAULT 0,
total_chunks INTEGER DEFAULT 0,
total_storage_mb DECIMAL(10, 2),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
# Create similarity search results cache
cur.execute("""
CREATE TABLE IF NOT EXISTS search_cache (
id SERIAL PRIMARY KEY,
query_hash VARCHAR(64) NOT NULL,
query_text TEXT NOT NULL,
results_json JSONB,
result_count INTEGER,
search_time_ms INTEGER,
similarity_threshold DECIMAL(4, 3),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
expires_at TIMESTAMP
);
""")
logger.info("Vector database tables created successfully")
conn.close()
except psycopg2.Error as e:
logger.error(f"Failed to create vector tables: {e}")
raise
def create_vector_indexes():
"""Create indexes for vector columns and other frequently queried fields."""
conn_params = get_connection_params()
try:
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
with conn.cursor() as cur:
logger.info("Creating vector indexes...")
# Vector indexes using HNSW (Hierarchical Navigable Small World)
indexes = [
("documents_embedding_idx", "documents", "embedding", "hnsw"),
("chunks_embedding_idx", "document_chunks", "embedding", "hnsw"),
("queries_embedding_idx", "user_queries", "embedding", "hnsw"),
]
for idx_name, table_name, column_name, method in indexes:
try:
if method == "hnsw":
cur.execute(f"""
CREATE INDEX IF NOT EXISTS {idx_name}
ON {table_name} USING hnsw ({column_name} vector_cosine_ops);
""")
else:
cur.execute(f"""
CREATE INDEX IF NOT EXISTS {idx_name}
ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100);
""")
logger.info(f"Created index {idx_name} on {table_name}")
except psycopg2.Error as e:
logger.warning(f"Could not create {method} index {idx_name}: {e}")
# Try with IVFFlat as fallback
if method == "hnsw":
try:
cur.execute(f"""
CREATE INDEX IF NOT EXISTS {idx_name}_ivf
ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100);
""")
logger.info(f"Created fallback IVFFlat index {idx_name}_ivf on {table_name}")
except psycopg2.Error as e2:
logger.warning(f"Could not create fallback index: {e2}")
# Regular indexes for performance
regular_indexes = [
("documents_title_idx", "documents", "title"),
("documents_type_idx", "documents", "document_type"),
("documents_created_idx", "documents", "created_at"),
("chunks_doc_id_idx", "document_chunks", "document_id"),
("chunks_index_idx", "document_chunks", "chunk_index"),
("queries_user_idx", "user_queries", "user_id"),
("queries_created_idx", "user_queries", "created_at"),
("cache_hash_idx", "search_cache", "query_hash"),
("cache_expires_idx", "search_cache", "expires_at"),
]
for idx_name, table_name, column_name in regular_indexes:
try:
cur.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table_name} ({column_name});")
logger.debug(f"Created regular index {idx_name}")
except psycopg2.Error as e:
logger.warning(f"Could not create regular index {idx_name}: {e}")
logger.info("Vector indexes created successfully")
conn.close()
except psycopg2.Error as e:
logger.error(f"Failed to create vector indexes: {e}")
raise
def insert_sample_data():
"""Insert sample data into vector tables."""
conn_params = get_connection_params()
try:
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
with conn.cursor() as cur:
logger.info("Inserting sample data...")
# Insert embedding models
embedding_models = [
('text-embedding-3-small', 'OpenAI', 1536, 8192, 0.00000002, True),
('text-embedding-3-large', 'OpenAI', 3072, 8192, 0.00000013, True),
('text-embedding-ada-002', 'OpenAI', 1536, 8192, 0.00000010, False),
('all-MiniLM-L6-v2', 'Sentence-Transformers', 384, 512, 0.0, True),
('all-mpnet-base-v2', 'Sentence-Transformers', 768, 514, 0.0, True),
]
for model_data in embedding_models:
cur.execute("""
INSERT INTO embedding_models (model_name, provider, dimensions, max_tokens, cost_per_token, is_active)
VALUES (%s, %s, %s, %s, %s, %s)
ON CONFLICT (model_name) DO NOTHING;
""", model_data)
# Insert knowledge bases
knowledge_bases = [
('Technical Documentation', 'Software engineering and API documentation', 'technology'),
('Research Papers', 'Academic papers and research publications', 'research'),
('Customer Support', 'FAQ and troubleshooting guides', 'support'),
('Product Catalog', 'Product descriptions and specifications', 'commerce'),
('Legal Documents', 'Contracts, policies, and legal texts', 'legal'),
]
kb_ids = []
for kb_data in knowledge_bases:
cur.execute("""
INSERT INTO knowledge_base (kb_name, description, domain, total_documents, total_chunks, total_storage_mb)
VALUES (%s, %s, %s, %s, %s, %s)
RETURNING id;
""", kb_data + (random.randint(50, 500), random.randint(200, 2000), round(random.uniform(10.5, 250.8), 2)))
kb_ids.append(cur.fetchone()[0])
# Insert sample documents
sample_documents = [
("PostgreSQL Performance Tuning", "Comprehensive guide to optimizing PostgreSQL database performance including indexing strategies, query optimization, and configuration tuning.", "https://example.com/pg-performance", "technical_guide"),
("Vector Similarity Search", "Understanding vector embeddings and similarity search algorithms for AI applications and recommendation systems.", "https://example.com/vector-search", "technical_guide"),
("RAG Implementation Best Practices", "Best practices for implementing Retrieval-Augmented Generation systems using vector databases and large language models.", "https://example.com/rag-practices", "best_practices"),
("Database Security Guidelines", "Security considerations and implementation guidelines for PostgreSQL databases in production environments.", "https://example.com/db-security", "security_guide"),
("Machine Learning with SQL", "Integrating machine learning workflows with SQL databases and leveraging database extensions for AI applications.", "https://example.com/ml-sql", "tutorial"),
("API Documentation Standards", "Standards and best practices for creating comprehensive and user-friendly API documentation.", "https://example.com/api-docs", "documentation"),
("Microservices Architecture", "Design patterns and implementation strategies for microservices architecture in modern applications.", "https://example.com/microservices", "architecture_guide"),
("Data Pipeline Optimization", "Optimizing data processing pipelines for scalability, reliability, and performance in enterprise environments.", "https://example.com/data-pipelines", "optimization_guide"),
("Cloud Database Migration", "Step-by-step guide for migrating on-premises databases to cloud infrastructure with minimal downtime.", "https://example.com/cloud-migration", "migration_guide"),
("NoSQL vs SQL Comparison", "Detailed comparison of NoSQL and SQL databases, including use cases, performance characteristics, and selection criteria.", "https://example.com/nosql-sql", "comparison_guide"),
]
doc_ids = []
for title, content, url, doc_type in sample_documents:
embedding = generate_mock_embedding(1536)
word_count = len(content.split())
cur.execute("""
INSERT INTO documents (title, content, source_url, document_type, word_count, embedding)
VALUES (%s, %s, %s, %s, %s, %s)
RETURNING id;
""", (title, content, url, doc_type, word_count, embedding))
doc_ids.append(cur.fetchone()[0])
# Insert document chunks
chunk_count = 0
for doc_id in doc_ids:
# Generate 3-7 chunks per document
num_chunks = random.randint(3, 7)
for chunk_idx in range(num_chunks):
chunk_text = f"This is chunk {chunk_idx + 1} of document {doc_id}. " + \
"It contains relevant information that would be useful for similarity search and RAG applications. " + \
"The content includes technical details, examples, and best practices."
chunk_size = len(chunk_text)
overlap_size = random.randint(20, 50) if chunk_idx > 0 else 0
embedding = generate_mock_embedding(1536)
cur.execute("""
INSERT INTO document_chunks (document_id, chunk_index, chunk_text, chunk_size, overlap_size, embedding)
VALUES (%s, %s, %s, %s, %s, %s);
""", (doc_id, chunk_idx, chunk_text, chunk_size, overlap_size, embedding))
chunk_count += 1
# Insert sample user queries
sample_queries = [
("How to optimize PostgreSQL performance?", "user123", "session_abc1"),
("What are vector embeddings?", "user456", "session_def2"),
("Best practices for RAG implementation", "user789", "session_ghi3"),
("Database security checklist", "user123", "session_abc2"),
("Machine learning with databases", "user456", "session_def3"),
("API documentation examples", "user321", "session_jkl1"),
("Microservices design patterns", "user654", "session_mno2"),
("Data pipeline best practices", "user987", "session_pqr3"),
("Cloud migration strategies", "user111", "session_stu4"),
("NoSQL vs SQL databases", "user222", "session_vwx5"),
]
for query_text, user_id, session_id in sample_queries:
embedding = generate_mock_embedding(1536)
response_time = random.randint(50, 500)
cur.execute("""
INSERT INTO user_queries (query_text, user_id, session_id, response_time_ms, embedding)
VALUES (%s, %s, %s, %s, %s);
""", (query_text, user_id, session_id, response_time, embedding))
# Insert some search cache entries
for i in range(5):
query_hash = f"hash_{random.randint(100000, 999999)}"
query_text = f"Sample cached query {i + 1}"
results = [{"doc_id": random.randint(1, len(doc_ids)), "similarity": round(random.uniform(0.7, 0.95), 3)} for _ in range(3)]
result_count = len(results)
search_time = random.randint(10, 100)
threshold = round(random.uniform(0.6, 0.8), 3)
cur.execute("""
INSERT INTO search_cache (query_hash, query_text, results_json, result_count, search_time_ms, similarity_threshold)
VALUES (%s, %s, %s, %s, %s, %s);
""", (query_hash, query_text, json.dumps(results), result_count, search_time, threshold))
logger.info(f"Sample data inserted successfully:")
logger.info(f" {len(sample_documents)} documents")
logger.info(f" {chunk_count} document chunks")
logger.info(f" {len(sample_queries)} user queries")
logger.info(f" {len(embedding_models)} embedding models")
logger.info(f" {len(knowledge_bases)} knowledge bases")
conn.close()
except psycopg2.Error as e:
logger.error(f"Failed to insert sample data: {e}")
raise
def verify_vector_setup():
"""Verify that the vector database was set up correctly."""
conn_params = get_connection_params()
try:
conn = psycopg2.connect(**conn_params)
with conn.cursor() as cur:
logger.info("Verifying vector database setup...")
# Check extension
cur.execute("SELECT extname FROM pg_extension WHERE extname = 'vector';")
if cur.fetchone():
logger.info("pgvector extension is installed")
else:
logger.error("pgvector extension not found")
raise RuntimeError("pgvector extension not found")
# Check tables and record counts
tables_to_check = [
'documents', 'document_chunks', 'user_queries',
'embedding_models', 'knowledge_base', 'search_cache'
]
table_counts = {}
for table in tables_to_check:
cur.execute(f'SELECT COUNT(*) FROM {table}')
count = cur.fetchone()[0]
table_counts[table] = count
logger.info(f"Table {table}: {count} records")
# Check vector columns
cur.execute("""
SELECT table_name, column_name, data_type
FROM information_schema.columns
WHERE data_type = 'USER-DEFINED'
AND udt_name = 'vector'
ORDER BY table_name, column_name;
""")
vector_columns = cur.fetchall()
logger.info(f"Found {len(vector_columns)} vector columns:")
for table, column, dtype in vector_columns:
logger.info(f" {table}.{column} ({dtype})")
# Check indexes
cur.execute("""
SELECT schemaname, tablename, indexname, indexdef
FROM pg_indexes
WHERE indexdef LIKE '%vector%' OR indexdef LIKE '%hnsw%' OR indexdef LIKE '%ivfflat%'
ORDER BY tablename, indexname;
""")
vector_indexes = cur.fetchall()
logger.info(f"Found {len(vector_indexes)} vector indexes:")
for schema, table, index, definition in vector_indexes:
logger.info(f" {index} on {table}")
# Test a simple vector similarity query
mock_embedding = generate_mock_embedding(1536)
cur.execute("""
SELECT id, title, embedding <-> %s::vector as distance
FROM documents
ORDER BY embedding <-> %s::vector
LIMIT 3;
""", (mock_embedding, mock_embedding))
results = cur.fetchall()
logger.info(f"Vector similarity query returned {len(results)} results")
conn.close()
logger.info("Vector database verification completed successfully")
return table_counts, vector_columns, vector_indexes
except psycopg2.Error as e:
logger.error(f"Verification failed: {e}")
raise
def prepare_vector_environment():
"""Main function to prepare the vector database environment."""
logger.info("Preparing vector database environment...")
try:
# Create pgvector extension
create_vector_extension()
# Create vector tables
create_vector_tables()
# Insert sample data first
insert_sample_data()
# Create indexes after data insertion for better performance
create_vector_indexes()
# Verify the setup
table_counts, vector_columns, vector_indexes = verify_vector_setup()
logger.info("Vector database environment prepared successfully!")
logger.info(f"Total tables created: {len(table_counts)}")
logger.info(f"Total vector columns: {len(vector_columns)}")
logger.info(f"Total vector indexes: {len(vector_indexes)}")
return {
'table_counts': table_counts,
'vector_columns': vector_columns,
'vector_indexes': vector_indexes
}
except Exception as e:
logger.error(f"Failed to prepare vector environment: {e}")
raise
if __name__ == "__main__":
# Allow running this module directly for testing
logging.basicConfig(level=logging.INFO)
prepare_vector_environment()
================================================
FILE: tasks/utils/__init__.py
================================================
================================================
FILE: tasks/utils/notion_utils.py
================================================
import os
from notion_client import Client
import sys
from dotenv import load_dotenv
def get_notion_client():
# Load environment variables from the .mcp_env file in the project root
load_dotenv(dotenv_path=".mcp_env")
api_key = os.getenv("EVAL_NOTION_API_KEY")
if not api_key:
print(
"Error: EVAL_NOTION_API_KEY not found in environment variables.",
file=sys.stderr,
)
sys.exit(1)
return Client(auth=api_key)
def _find_object(notion: Client, title: str, object_type: str):
"""Generic helper to find a Notion page or database by title.
Args:
notion: Authenticated Notion Client.
title: Title (or partial title) to search for.
object_type: Either "page" or "database".
Returns:
The ID string if found, otherwise None.
"""
search_results = (
notion.search(
query=title, filter={"property": "object", "value": object_type}
).get("results")
or []
)
if not search_results:
return None
# Shortcut when there is only one match
if len(search_results) == 1:
return search_results[0]["id"]
# Attempt to find a case-insensitive match on the title field
for result in search_results:
if object_type == "page":
# Pages store their title inside the "properties.title.title" rich text list
title_rich_texts = (
result.get("properties", {}).get("title", {}).get("title", [])
)
else: # database
title_rich_texts = result.get("title", [])
for text_obj in title_rich_texts:
if title.lower() in text_obj.get("plain_text", "").lower():
return result["id"]
# Fallback: return the first result
return search_results[0]["id"]
def find_page(notion: Client, page_title: str):
"""Finds a page by title. Wrapper around _find_object with object_type='page'."""
return _find_object(notion, page_title, "page")
def get_page_by_id(notion: Client, page_id: str):
"""Gets a page by its ID. Returns the page object if found, None otherwise."""
try:
return notion.pages.retrieve(page_id=page_id)
except Exception:
return None
def find_page_by_id(notion: Client, page_id: str):
"""Finds a page by its ID and returns the ID if it exists, None otherwise."""
try:
notion.pages.retrieve(page_id=page_id)
return page_id
except Exception:
return None
def find_database_by_id(notion: Client, database_id: str):
"""Finds a database by its ID and returns the ID if it exists, None otherwise."""
try:
notion.databases.retrieve(database_id=database_id)
return database_id
except Exception:
return None
def find_page_or_database_by_id(notion: Client, object_id: str):
"""
Finds either a page or database by ID. Returns a tuple (object_id, object_type)
where object_type is either 'page' or 'database', or (None, None) if not found.
"""
# Try as page first
try:
notion.pages.retrieve(page_id=object_id)
return (object_id, "page")
except Exception:
pass
# Try as database
try:
notion.databases.retrieve(database_id=object_id)
return (object_id, "database")
except Exception:
pass
return (None, None)
def find_database(notion: Client, db_title: str):
"""Finds a database by title. Wrapper around _find_object with object_type='database'."""
return _find_object(notion, db_title, "database")
def find_database_in_block(notion: Client, block_id: str, db_title: str):
"""
Recursively find a database by title within a block.
"""
blocks = notion.blocks.children.list(block_id=block_id).get("results")
for block in blocks:
if (
block.get("type") == "child_database"
and block.get("child_database", {}).get("title") == db_title
):
return block["id"]
if block.get("has_children"):
db_id = find_database_in_block(notion, block["id"], db_title)
if db_id:
return db_id
return None
def get_all_blocks_recursively(notion: Client, block_id: str):
"""
Recursively fetches all blocks from a starting block ID and its children,
returning a single flat list of block objects.
"""
all_blocks = []
try:
direct_children = notion.blocks.children.list(block_id=block_id).get(
"results", []
)
except Exception:
return []
for block in direct_children:
all_blocks.append(block)
if block.get("has_children"):
all_blocks.extend(get_all_blocks_recursively(notion, block["id"]))
return all_blocks
def get_block_plain_text(block):
"""
Safely extract plain_text from a block (paragraph, heading, etc.).
"""
block_type = block.get("type")
if not block_type:
return ""
block_content = block.get(block_type)
if not block_content:
return ""
rich_text_list = block_content.get("rich_text", [])
plain_text = "".join([rt.get("plain_text", "") for rt in rich_text_list])
return plain_text
================================================
FILE: tasks/utils/postgres_utils.py
================================================
"""
PostgreSQL Data Loading Utilities for MCPMark Tasks
===================================================
Common utilities for loading data into PostgreSQL databases from CSV files
and setting up schemas in prepare_environment.py scripts.
"""
import csv
import os
import psycopg2
from pathlib import Path
from typing import Dict, List, Any, Optional
import logging
logger = logging.getLogger(__name__)
def get_connection_params() -> dict:
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def execute_schema_sql(conn, schema_sql: str):
"""Execute schema SQL with proper error handling."""
with conn.cursor() as cur:
cur.execute(schema_sql)
conn.commit()
logger.info("✅ Database schema created successfully")
def load_csv_to_table(
conn,
csv_file_path: Path,
table_name: str,
columns: Optional[List[str]] = None,
skip_header: bool = True
):
"""
Load CSV data into a PostgreSQL table.
Args:
conn: Database connection
csv_file_path: Path to CSV file
table_name: Target table name
columns: List of column names (if None, uses all columns)
skip_header: Whether to skip the first row
"""
if not csv_file_path.exists():
raise FileNotFoundError(f"CSV file not found: {csv_file_path}")
with conn.cursor() as cur:
with open(csv_file_path, 'r', encoding='utf-8') as f:
csv_reader = csv.reader(f)
# Skip header if needed
if skip_header:
next(csv_reader)
# Build COPY command
if columns:
copy_sql = f"COPY {table_name} ({', '.join(columns)}) FROM STDIN WITH CSV"
else:
copy_sql = f"COPY {table_name} FROM STDIN WITH CSV"
# Reset file pointer and copy data
f.seek(0)
if skip_header:
next(csv.reader(f)) # Skip header again
cur.copy_expert(copy_sql, f)
conn.commit()
logger.info(f"✅ Loaded data from {csv_file_path.name} into {table_name}")
def insert_data_from_dict(conn, table_name: str, data: List[Dict[str, Any]]):
"""
Insert data from a list of dictionaries into a table.
Args:
conn: Database connection
table_name: Target table name
data: List of dictionaries with column_name: value pairs
"""
if not data:
return
# Get column names from first record
columns = list(data[0].keys())
placeholders = ', '.join(['%s'] * len(columns))
columns_str = ', '.join(columns)
insert_sql = f"INSERT INTO {table_name} ({columns_str}) VALUES ({placeholders}) ON CONFLICT DO NOTHING"
with conn.cursor() as cur:
for row in data:
values = [row[col] for col in columns]
cur.execute(insert_sql, values)
conn.commit()
logger.info(f"✅ Inserted {len(data)} rows into {table_name}")
def create_table_with_data(
conn,
table_name: str,
schema_sql: str,
data: Optional[List[Dict[str, Any]]] = None,
data_from_csv: Optional[Path] = None
):
"""
Create a table and optionally load data.
Args:
conn: Database connection
table_name: Table name
schema_sql: CREATE TABLE SQL statement
data: Optional list of dictionaries to insert
data_from_csv: Optional CSV file to load
"""
with conn.cursor() as cur:
# Create table and commit so the DDL persists even when no data is loaded afterwards
cur.execute(schema_sql)
conn.commit()
logger.info(f"✅ Created table {table_name}")
# Load data if provided
if data:
insert_data_from_dict(conn, table_name, data)
elif data_from_csv:
load_csv_to_table(conn, data_from_csv, table_name)
def setup_database_with_config(setup_config: Dict[str, Any]):
"""
Set up database using a configuration dictionary.
Args:
setup_config: Dictionary with 'tables' key containing table configurations
Example config:
{
"tables": {
"artists": {
"schema": "CREATE TABLE artists (id SERIAL PRIMARY KEY, name VARCHAR(120))",
"data": [{"id": 1, "name": "Iron Maiden"}],
"data_from_csv": "data/artists.csv" # alternative to data
}
}
}
"""
conn_params = get_connection_params()
if not conn_params["database"]:
raise ValueError("❌ No database specified in POSTGRES_DATABASE environment variable")
try:
conn = psycopg2.connect(**conn_params)
for table_name, config in setup_config["tables"].items():
schema_sql = config["schema"]
data = config.get("data")
csv_file_path = None
# Handle CSV file path
if "data_from_csv" in config:
csv_file_path = Path(config["data_from_csv"])
if not csv_file_path.is_absolute():
# Assume relative to current working directory (task directory)
csv_file_path = Path.cwd() / csv_file_path
create_table_with_data(
conn,
table_name,
schema_sql,
data=data,
data_from_csv=csv_file_path
)
conn.close()
logger.info("🎉 Database setup completed successfully")
except psycopg2.Error as e:
logger.error(f"❌ Database error during setup: {e}")
raise
except Exception as e:
logger.error(f"❌ Setup error: {e}")
raise