Repository: eval-sys/mcpmark Branch: main Commit: adc5e6558f05 Files: 670 Total size: 3.1 MB Directory structure: gitextract_5znolca_/ ├── .dockerignore ├── .editorconfig ├── .gitattributes ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── 1_bug_report.yml │ │ ├── 2_feature_request.yml │ │ └── config.yml │ ├── PULL_REQUEST_TEMPLATE.md │ ├── scripts/ │ │ └── pr-comment.js │ └── workflows/ │ └── publish-docker-image.yml ├── .gitignore ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── README.md ├── build-docker.sh ├── cspell.config.yaml ├── docs/ │ ├── contributing/ │ │ └── make-contribution.md │ ├── datasets/ │ │ └── task.md │ ├── installation_and_docker_usage.md │ ├── introduction.md │ ├── mcp/ │ │ ├── filesystem.md │ │ ├── github.md │ │ ├── notion.md │ │ ├── playwright.md │ │ └── postgres.md │ └── quickstart.md ├── pipeline.py ├── pyproject.toml ├── run-benchmark.sh ├── run-task.sh ├── src/ │ ├── agents/ │ │ ├── __init__.py │ │ ├── base_agent.py │ │ ├── mcp/ │ │ │ ├── __init__.py │ │ │ ├── http_server.py │ │ │ └── stdio_server.py │ │ ├── mcpmark_agent.py │ │ ├── react_agent.py │ │ └── utils/ │ │ ├── __init__.py │ │ └── token_usage.py │ ├── aggregators/ │ │ ├── aggregate_results.py │ │ ├── aggregate_specific_results.py │ │ ├── aggregate_task_meta.py │ │ └── pricing.py │ ├── base/ │ │ ├── __init__.py │ │ ├── login_helper.py │ │ ├── state_manager.py │ │ └── task_manager.py │ ├── config/ │ │ ├── __init__.py │ │ └── config_schema.py │ ├── errors.py │ ├── evaluator.py │ ├── factory.py │ ├── logger.py │ ├── mcp_services/ │ │ ├── filesystem/ │ │ │ ├── __init__.py │ │ │ ├── filesystem_login_helper.py │ │ │ ├── filesystem_state_manager.py │ │ │ └── filesystem_task_manager.py │ │ ├── github/ │ │ │ ├── __init__.py │ │ │ ├── github_login_helper.py │ │ │ ├── github_state_manager.py │ │ │ ├── github_task_manager.py │ │ │ ├── repo_exporter.py │ │ │ ├── repo_importer.py │ │ │ └── token_pool.py │ │ ├── insforge/ │ │ │ ├── __init__.py │ │ │ ├── insforge_login_helper.py │ │ │ ├── insforge_state_manager.py │ │ │ └── insforge_task_manager.py │ │ ├── notion/ │ │ │ ├── __init__.py │ │ │ ├── notion_login_helper.py │ │ │ ├── notion_state_manager.py │ │ │ └── notion_task_manager.py │ │ ├── playwright/ │ │ │ ├── __init__.py │ │ │ ├── playwright_login_helper.py │ │ │ ├── playwright_state_manager.py │ │ │ └── playwright_task_manager.py │ │ ├── playwright_webarena/ │ │ │ ├── playwright_login_helper.py │ │ │ ├── playwright_state_manager.py │ │ │ ├── playwright_task_manager.py │ │ │ └── reddit_env_setup.md │ │ ├── postgres/ │ │ │ ├── __init__.py │ │ │ ├── postgres_login_helper.py │ │ │ ├── postgres_state_manager.py │ │ │ └── postgres_task_manager.py │ │ └── supabase/ │ │ ├── __init__.py │ │ ├── supabase_login_helper.py │ │ ├── supabase_state_manager.py │ │ └── supabase_task_manager.py │ ├── model_config.py │ ├── results_reporter.py │ └── services.py └── tasks/ ├── __init__.py ├── filesystem/ │ ├── easy/ │ │ ├── .gitkeep │ │ ├── file_context/ │ │ │ ├── file_splitting/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ ├── pattern_matching/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ └── uppercase/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── file_property/ │ │ │ ├── largest_rename/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ └── txt_merging/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── folder_structure/ │ │ │ └── structure_analysis/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── 
verify.py │ │ ├── legal_document/ │ │ │ └── file_reorganize/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── papers/ │ │ │ └── papers_counting/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── student_database/ │ │ ├── duplicate_name/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── recommender_name/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── standard/ │ ├── desktop/ │ │ ├── music_report/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── project_management/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── timeline_extraction/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── desktop_template/ │ │ ├── budget_computation/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── contact_information/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── file_arrangement/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── file_context/ │ │ ├── duplicates_searching/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── file_merging/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── file_splitting/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── pattern_matching/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── uppercase/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── file_property/ │ │ ├── size_classification/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── time_classification/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── folder_structure/ │ │ ├── structure_analysis/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── structure_mirror/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── legal_document/ │ │ ├── dispute_review/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── individual_comments/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── solution_tracing/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── papers/ │ │ ├── author_folders/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── find_math_paper/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── organize_legacy_papers/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── student_database/ │ │ ├── duplicate_name/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── english_talent/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── gradebased_score/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── threestudio/ │ │ ├── code_locating/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── output_analysis/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── requirements_completion/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── votenet/ │ ├── dataset_comparison/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── debugging/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── requirements_writing/ │ ├── description.md │ ├── meta.json │ └── verify.py ├── github/ │ ├── easy/ │ │ ├── build-your-own-x/ │ │ │ ├── close_commented_issues/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ └── record_recent_commits/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py 
│ │ ├── claude-code/ │ │ │ ├── add_terminal_shortcuts_doc/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ ├── thank_docker_pr_author/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ └── triage_missing_tool_result_issue/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── mcpmark-cicd/ │ │ │ ├── basic_ci_checks/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ ├── issue_lint_guard/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ └── nightly_health_check/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── missing-semester/ │ │ ├── count_translations/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── find_ga_tracking_id/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── standard/ │ ├── build_your_own_x/ │ │ ├── find_commit_date/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── find_rag_commit/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── claude-code/ │ │ ├── automated_changelog_generation/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── claude_collaboration_analysis/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── critical_issue_hotfix_workflow/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── feature_commit_tracking/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── label_color_standardization/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── easyr1/ │ │ ├── advanced_branch_strategy/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── config_parameter_audit/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── performance_regression_investigation/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── qwen3_issue_management/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── harmony/ │ │ ├── fix_conflict/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── issue_pr_commit_workflow/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── issue_tagging_pr_closure/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── multi_branch_commit_aggregation/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── release_management_workflow/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── mcpmark-cicd/ │ │ ├── deployment_status_workflow/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── issue_management_workflow/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── linting_ci_workflow/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── pr_automation_workflow/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── missing-semester/ │ ├── assign_contributor_labels/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── find_legacy_name/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── find_salient_file/ │ ├── description.md │ ├── meta.json │ └── verify.py ├── notion/ │ ├── easy/ │ │ ├── .gitkeep │ │ ├── computer_science_student_dashboard/ │ │ │ ├── simple__code_snippets_go/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ └── simple__study_session_tracker/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── it_trouble_shooting_hub/ │ │ │ └── 
simple__asset_retirement_migration/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── japan_travel_planner/ │ │ │ └── simple__remove_osaka_itinerary/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── online_resume/ │ │ │ └── simple__skills_development_tracker/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── python_roadmap/ │ │ │ └── simple__expert_level_lessons/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── self_assessment/ │ │ │ └── simple__faq_column_layout/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── standard_operating_procedure/ │ │ │ └── simple__section_organization/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── team_projects/ │ │ │ └── simple__swap_tasks/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── toronto_guide/ │ │ └── simple__change_color/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── standard/ │ ├── company_in_a_box/ │ │ ├── employee_onboarding/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── goals_restructure/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── quarterly_review_dashboard/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── computer_science_student_dashboard/ │ │ ├── code_snippets_go/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── courses_internships_relation/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── study_session_tracker/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── it_trouble_shooting_hub/ │ │ ├── asset_retirement_migration/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── security_audit_ticket/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── verification_expired_update/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── japan_travel_planner/ │ │ ├── daily_itinerary_overview/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── packing_progress_summary/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── remove_osaka_itinerary/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── restaurant_expenses_sync/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── online_resume/ │ │ ├── layout_adjustment/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── projects_section_update/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── skills_development_tracker/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── work_history_addition/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── python_roadmap/ │ │ ├── expert_level_lessons/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── learning_metrics_dashboard/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── self_assessment/ │ │ ├── faq_column_layout/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── hyperfocus_analysis_report/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── numbered_list_emojis/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── standard_operating_procedure/ │ │ ├── deployment_process_sop/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── section_organization/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── team_projects/ │ │ 
├── priority_tasks_table/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── swap_tasks/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── toronto_guide/ │ ├── change_color/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── weekend_adventure_planner/ │ ├── description.md │ ├── meta.json │ └── verify.py ├── playwright/ │ ├── easy/ │ │ └── .gitkeep │ └── standard/ │ ├── eval_web/ │ │ ├── cloudflare_turnstile_challenge/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── extraction_table/ │ │ ├── data.csv │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── web_search/ │ ├── birth_of_arvinxu/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── r1_arxiv/ │ ├── content.txt │ ├── description.md │ ├── meta.json │ └── verify.py ├── playwright_webarena/ │ ├── easy/ │ │ ├── .gitkeep │ │ ├── reddit/ │ │ │ ├── ai_data_analyst/ │ │ │ │ ├── description.md │ │ │ │ ├── label.txt │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ ├── llm_research_summary/ │ │ │ │ ├── description.md │ │ │ │ ├── label.txt │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ ├── movie_reviewer_analysis/ │ │ │ │ ├── description.md │ │ │ │ ├── label.txt │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ ├── nba_statistics_analysis/ │ │ │ │ ├── description.md │ │ │ │ ├── label.txt │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ └── routine_tracker_forum/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── shopping_admin/ │ │ ├── fitness_promotion_strategy/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── ny_expansion_analysis/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── products_sales_analysis/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── sales_inventory_analysis/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── search_filtering_operations/ │ │ ├── description.md │ │ ├── label.txt │ │ ├── meta.json │ │ └── verify.py │ └── standard/ │ ├── reddit/ │ │ ├── ai_data_analyst/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── budget_europe_travel/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── buyitforlife_research/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── llm_research_summary/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── movie_reviewer_analysis/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── nba_statistics_analysis/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── routine_tracker_forum/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── shopping/ │ │ ├── advanced_product_analysis/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── gaming_accessories_analysis/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── health_routine_optimization/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── holiday_baking_competition/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── multi_category_budget_analysis/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── 
printer_keyboard_search/ │ │ │ ├── description.md │ │ │ ├── label.txt │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── running_shoes_purchase/ │ │ ├── description.md │ │ ├── label.txt │ │ ├── meta.json │ │ └── verify.py │ └── shopping_admin/ │ ├── customer_segmentation_setup/ │ │ ├── description.md │ │ ├── label.txt │ │ ├── meta.json │ │ └── verify.py │ ├── fitness_promotion_strategy/ │ │ ├── description.md │ │ ├── label.txt │ │ ├── meta.json │ │ └── verify.py │ ├── marketing_customer_analysis/ │ │ ├── description.md │ │ ├── label.txt │ │ ├── meta.json │ │ └── verify.py │ ├── ny_expansion_analysis/ │ │ ├── description.md │ │ ├── label.txt │ │ ├── meta.json │ │ └── verify.py │ ├── products_sales_analysis/ │ │ ├── description.md │ │ ├── label.txt │ │ ├── meta.json │ │ └── verify.py │ ├── sales_inventory_analysis/ │ │ ├── description.md │ │ ├── label.txt │ │ ├── meta.json │ │ └── verify.py │ └── search_filtering_operations/ │ ├── description.md │ ├── label.txt │ ├── meta.json │ └── verify.py ├── postgres/ │ ├── easy/ │ │ ├── .gitkeep │ │ ├── chinook/ │ │ │ ├── customer_data_migration_basic/ │ │ │ │ ├── customer_data.pkl │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ └── update_employee_info/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── dvdrental/ │ │ │ └── create_payment_index/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── employees/ │ │ │ ├── department_summary_view/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ ├── employee_gender_statistics/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ ├── employee_projects_basic/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ └── hiring_year_summary/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── lego/ │ │ │ ├── basic_security_setup/ │ │ │ │ ├── description.md │ │ │ │ ├── meta.json │ │ │ │ └── verify.py │ │ │ └── fix_data_inconsistencies/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── sports/ │ │ └── create_performance_indexes/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── standard/ │ ├── chinook/ │ │ ├── customer_data_migration/ │ │ │ ├── customer_data.pkl │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── employee_hierarchy_management/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── sales_and_music_charts/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── dvdrental/ │ │ ├── customer_analysis_fix/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── customer_analytics_optimization/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── film_inventory_management/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── employees/ │ │ ├── employee_demographics_report/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── employee_performance_analysis/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── employee_project_tracking/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── employee_retention_analysis/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── executive_dashboard_automation/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── management_structure_analysis/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── lego/ │ │ ├── consistency_enforcement/ │ │ │ ├── description.md │ │ │ 
├── meta.json │ │ │ └── verify.py │ │ ├── database_security_policies/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── transactional_inventory_transfer/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ ├── security/ │ │ ├── rls_business_access/ │ │ │ ├── description.md │ │ │ ├── ground_truth.sql │ │ │ ├── meta.json │ │ │ ├── prepare_environment.py │ │ │ └── verify.py │ │ └── user_permission_audit/ │ │ ├── description.md │ │ ├── ground_truth.sql │ │ ├── meta.json │ │ ├── prepare_environment.py │ │ └── verify.py │ ├── sports/ │ │ ├── baseball_player_analysis/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ ├── participant_report_optimization/ │ │ │ ├── description.md │ │ │ ├── meta.json │ │ │ └── verify.py │ │ └── team_roster_management/ │ │ ├── description.md │ │ ├── meta.json │ │ └── verify.py │ └── vectors/ │ ├── dba_vector_analysis/ │ │ ├── description.md │ │ ├── ground_truth.sql │ │ ├── meta.json │ │ ├── prepare_environment.py │ │ └── verify.py │ └── vectors_setup.py └── utils/ ├── __init__.py ├── notion_utils.py └── postgres_utils.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ # Git .git .gitignore # Python __pycache__ *.pyc *.pyo *.pyd .Python *.egg *.egg-info/ dist/ build/ .eggs/ *.so # Virtual environments venv/ env/ ENV/ .venv/ # IDE .vscode/ .idea/ *.swp *.swo *~ .DS_Store # Environment files (contain secrets) .env .mcp_env notion_state.json # Test and development files .pytest_cache/ .coverage htmlcov/ .tox/ .mypy_cache/ .ruff_cache/ tests/ test_environments/ # Results and logs results/ *.log logs/ # PostgreSQL data .postgres/ # Playwright playwright-report/ test-results/ # Documentation images asset/ # Temporary files *.tmp tmp/ temp/ # Docker Dockerfile docker-compose.yml .dockerignore # Node modules (if any locally installed) node_modules/ # Pixi lock file pixi.lock .pixi/ # GitHub state files github_state/ github_template_repo/ # Backup directories .mcpbench_backups/ ================================================ FILE: .editorconfig ================================================ root = true ; Always use Unix style new lines with new line ending on every file and trim whitespace [*] end_of_line = lf insert_final_newline = true trim_trailing_whitespace = true ; Python: PEP8 defines 4 spaces for indentation [*.py] indent_style = space indent_size = 4 ================================================ FILE: .gitattributes ================================================ # SCM syntax highlighting & preventing 3-way merges pixi.lock merge=binary linguist-language=YAML linguist-generated=true ================================================ FILE: .github/ISSUE_TEMPLATE/1_bug_report.yml ================================================ name: '🐛 Bug Report' description: 'Report an bug' labels: ['unconfirm'] type: Bug body: - type: textarea attributes: label: '🐛 Bug Description' description: A clear and concise description of the bug, if the above option is `Other`, please also explain in detail. validations: required: true - type: textarea attributes: label: '📷 Recurrence Steps' description: A clear and concise description of how to recurrence. - type: textarea attributes: label: '🚦 Expected Behavior' description: A clear and concise description of what you expected to happen. 
- type: textarea attributes: label: '📝 Additional Information' description: If your problem needs further explanation, or if the issue you're seeing cannot be reproduced in a gist, please add more information here. ================================================ FILE: .github/ISSUE_TEMPLATE/2_feature_request.yml ================================================ name: '🌠 Feature Request' description: 'Suggest an idea' title: '[Request] ' type: Feature body: - type: textarea attributes: label: '🥰 Feature Description' description: Please add a clear and concise description of the problem you are seeking to solve with this feature request. validations: required: true - type: textarea attributes: label: '🧐 Proposed Solution' description: Describe the solution you'd like in a clear and concise manner. validations: required: true - type: textarea attributes: label: '📝 Additional Information' description: Add any other context about the problem here. ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ contact_links: - name: Questions and ideas url: https://github.com/eval-sys/mcpmark/discussions/new/choose about: Please post questions, and ideas in discussions. ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ #### Change Type - [ ] ✨ feat - [ ] 🐛 fix - [ ] ♻️ refactor - [ ] 💄 style - [ ] 👷 build - [ ] ⚡️ perf - [ ] 📝 docs - [ ] 🔨 chore #### Description of Change #### Additional Information ================================================ FILE: .github/scripts/pr-comment.js ================================================ /** * Generate or update PR comment with Docker build info */ module.exports = async ({ github, context, dockerMetaJson, image, version, dockerhubUrl, platforms }) => { const COMMENT_IDENTIFIER = ''; const parseTags = () => { try { if (dockerMetaJson) { const parsed = JSON.parse(dockerMetaJson); if (Array.isArray(parsed.tags) && parsed.tags.length > 0) { return parsed.tags; } } } catch (e) { // ignore parsing error, fallback below } if (image && version) { return [`${image}:${version}`]; } return []; }; const generateCommentBody = () => { const tags = parseTags(); const buildTime = new Date().toISOString(); // Use the first tag as the main version const mainTag = tags.length > 0 ? tags[0] : `${image}:${version}`; const tagVersion = mainTag.includes(':') ? mainTag.split(':')[1] : version; return [ COMMENT_IDENTIFIER, '', '### 🐳 Docker Build Completed!', `**Version**: \`${tagVersion || 'N/A'}\``, `**Build Time**: \`${buildTime}\``, '', dockerhubUrl ? 
`🔗 View all tags on Docker Hub: ${dockerhubUrl}` : '', '', '### Pull Image', 'Download the Docker image to your local machine:', '', '```bash', `docker pull ${mainTag}`, '```', '', '### Run Eval', 'Execute evaluation tasks using the built image:', '', '```bash', `DOCKER_IMAGE_VERSION=${tagVersion} ./run-task.sh --models gpt-4.1-mini --tasks file_context/uppercase`, '```', '', '> [!IMPORTANT]', '> This build is for testing and validation purposes.', ] .filter(Boolean) .join('\n'); }; const body = generateCommentBody(); // List comments on the PR const { data: comments } = await github.rest.issues.listComments({ issue_number: context.issue.number, owner: context.repo.owner, repo: context.repo.repo, }); const existing = comments.find((c) => c.body && c.body.includes(COMMENT_IDENTIFIER)); if (existing) { await github.rest.issues.updateComment({ comment_id: existing.id, owner: context.repo.owner, repo: context.repo.repo, body, }); return { updated: true, id: existing.id }; } const result = await github.rest.issues.createComment({ issue_number: context.issue.number, owner: context.repo.owner, repo: context.repo.repo, body, }); return { updated: false, id: result.data.id }; }; ================================================ FILE: .github/workflows/publish-docker-image.yml ================================================ name: Publish Docker Image on: workflow_dispatch: release: types: [ published ] pull_request: types: [ synchronize, labeled, unlabeled ] permissions: contents: read pull-requests: write concurrency: group: ${{ github.ref }}-${{ github.workflow }} cancel-in-progress: true env: REGISTRY_IMAGE: evalsysorg/mcpmark PR_TAG_PREFIX: pr- jobs: build: if: | (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'Build Docker')) || github.event_name != 'pull_request' strategy: matrix: include: - platform: linux/amd64 os: ubuntu-latest - platform: linux/arm64 os: ubuntu-24.04-arm runs-on: ${{ matrix.os }} name: Build ${{ matrix.platform }} Image steps: - name: Prepare run: | platform=${{ matrix.platform }} echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV - name: Checkout base uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Generate PR metadata if: github.event_name == 'pull_request' id: pr_meta run: | branch_name="${{ github.head_ref }}" sanitized_branch=$(echo "${branch_name}" | sed -E 's/[^a-zA-Z0-9_.-]+/-/g') echo "pr_tag=${sanitized_branch}-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT - name: Docker meta id: meta uses: docker/metadata-action@v5 with: images: ${{ env.REGISTRY_IMAGE }} tags: | type=raw,value=${{ env.PR_TAG_PREFIX }}${{ steps.pr_meta.outputs.pr_tag }},enable=${{ github.event_name == 'pull_request' }} type=semver,pattern={{version}},enable=${{ github.event_name != 'pull_request' }} type=raw,value=latest,enable=${{ github.event_name != 'pull_request' }} - name: Docker login uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_REGISTRY_USER }} password: ${{ secrets.DOCKER_REGISTRY_PASSWORD }} - name: Get commit SHA if: github.ref == 'refs/heads/main' id: vars run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT - name: Build and export id: build uses: docker/build-push-action@v6 with: platforms: ${{ matrix.platform }} context: . 
file: ./Dockerfile labels: ${{ steps.meta.outputs.labels }} build-args: | SHA=${{ steps.vars.outputs.sha_short }} outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true - name: Export digest run: | rm -rf /tmp/digests mkdir -p /tmp/digests digest="${{ steps.build.outputs.digest }}" touch "/tmp/digests/${digest#sha256:}" - name: Upload artifact uses: actions/upload-artifact@v4 with: name: digest-${{ env.PLATFORM_PAIR }} path: /tmp/digests/* if-no-files-found: error retention-days: 1 merge: name: Merge needs: build runs-on: ubuntu-latest steps: - name: Checkout base uses: actions/checkout@v4 with: fetch-depth: 0 - name: Download digests uses: actions/download-artifact@v5 with: path: /tmp/digests pattern: digest-* merge-multiple: true - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Generate PR metadata if: github.event_name == 'pull_request' id: pr_meta run: | branch_name="${{ github.head_ref }}" sanitized_branch=$(echo "${branch_name}" | sed -E 's/[^a-zA-Z0-9_.-]+/-/g') echo "pr_tag=${sanitized_branch}-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT - name: Docker meta id: meta uses: docker/metadata-action@v5 with: images: ${{ env.REGISTRY_IMAGE }} tags: | type=raw,value=${{ env.PR_TAG_PREFIX }}${{ steps.pr_meta.outputs.pr_tag }},enable=${{ github.event_name == 'pull_request' }} type=semver,pattern={{version}},enable=${{ github.event_name != 'pull_request' }} type=raw,value=latest,enable=${{ github.event_name != 'pull_request' }} - name: Docker login uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_REGISTRY_USER }} password: ${{ secrets.DOCKER_REGISTRY_PASSWORD }} - name: Create manifest list and push working-directory: /tmp/digests run: | docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ $(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *) - name: Inspect image run: | docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }} - name: Comment on PR with Docker build info if: github.event_name == 'pull_request' uses: actions/github-script@v7 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | const prComment = require('${{ github.workspace }}/.github/scripts/pr-comment.js'); const result = await prComment({ github, context, dockerMetaJson: ${{ toJSON(steps.meta.outputs.json) }}, image: "${{ env.REGISTRY_IMAGE }}", version: "${{ steps.meta.outputs.version }}", dockerhubUrl: "https://hub.docker.com/r/${{ env.REGISTRY_IMAGE }}/tags", platforms: "linux/amd64, linux/arm64", }); core.info(`Status: ${result.updated ? 'Updated' : 'Created'}, ID: ${result.id}`); ================================================ FILE: .gitignore ================================================ logs .claude CLAUDE.md .gemini results materials scripts !.github/scripts .nfs* .mcp_env .idea # Byte-compiled / optimized / DLL files __pycache__/ *.py[codz] *$py.class logs logs/* .DS_Store notion-sdk-py/ github_state/* # for playwright cookies notion_state.json # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py.cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # UV # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. #uv.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock #poetry.toml # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. # https://pdm-project.org/en/latest/usage/project/#working-with-version-control #pdm.lock #pdm.toml .pdm-python .pdm-build/ # pixi # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. #pixi.lock # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one # in the .venv directory. It is recommended not to include this directory in version control. .pixi # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .envrc .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ # Abstra # Abstra is an AI-powered process automation framework. # Ignore directories containing user credentials, local state, and settings. 
# Learn more at https://abstra.io/docs .abstra/ # Visual Studio Code # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore # and can be added to the global gitignore or merged into this file. However, if you prefer, # you could uncomment the following to ignore the entire vscode folder # .vscode/ # Ruff stuff: .ruff_cache/ # PyPI configuration file .pypirc # Cursor # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data # refer to https://docs.cursor.com/context/ignore-files .cursorignore .cursorindexingignore # Marimo marimo/_static/ marimo/_lsp/ __marimo__/ # pixi environments .pixi *.egg-info .postgres # MCPMark backup directories .mcpmark_backups/* test_environments/ postgres_state ================================================ FILE: CHANGELOG.md ================================================ # Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## v1.2.0 - 2025-09-20 This version includes multiple important feature enhancements, particularly improvements in cost calculation, error handling, and Notion integration. Added per-model cost calculation, comprehensive aggregator functionality, and more robust error recovery mechanisms. ### ✨ Features - **Add 1m parameter & improve log** (#198) - Added claude-1m-context option and enhanced logging functionality - **Refine Notion parent resolution and duplicate recovery** (#197) - Improved Notion parent page resolution and duplicate content recovery mechanism - **Comprehensive aggregator, enable push to new branch** (#185) - Implemented comprehensive aggregator functionality with support for pushing to new branches - **Support price cost calculating per model** (#186) - Added per-model price cost calculation functionality - **Improve agent end log** (#183) - Enhanced agent end logging - **Improve litellm error handling** (#181) - Enhanced LiteLLM error handling mechanism ### ♻️ Refactoring - **Use notion child block list to locate page** (#196) - Refactored page location logic to use Notion child block list approach ### 🐛 Bug Fixes - **Fix verification in Notion task company_in_a_box/goals_restructure** (#194) - Fixed verification logic for specific Notion tasks - **Improve claude error handling** (#195) - Improved error handling for Claude API interactions - **Fix tailing slash issue for find_legacy_name** - Resolved trailing slash issues in find_legacy_name path handling - **Recover when duplication lands on parent** (#189) - Fixed recovery mechanism when duplicate content affects parent pages - **Correctly handle playwright parser** (#184) - Properly handle Playwright parser - **Handle timeout error, add timeout error for resuming** (#182) - Handle timeout errors and add timeout error handling for resume operations ### 📝 Documentation - **Better readme, notion language guide** (#190) - Improved README documentation and added comprehensive Notion language guide ### 🔨 Maintenance - **Update price info** (#188) - Updated pricing information - **Update desktop_template/file_arrangement/verify.py** (#187) - Maintenance updates to verification scripts 
================================================ FILE: Dockerfile ================================================ # MCPMark Docker image with optimized layer caching # Stage 1: Builder for Python dependencies only FROM python:3.12-slim AS builder RUN apt-get update && apt-get install -y --no-install-recommends \ gcc \ g++ \ libpq-dev \ && rm -rf /var/lib/apt/lists/* WORKDIR /build # Copy project files needed for pip install COPY pyproject.toml ./ COPY src/ ./src/ COPY tasks/ ./tasks/ # Install dependencies RUN pip install --no-cache-dir --user . # Stage 2: Final image with all runtime dependencies FROM python:3.12-slim # Layer 1: Core system dependencies (very stable, rarely changes) RUN apt-get update && apt-get install -y --no-install-recommends \ ca-certificates \ && rm -rf /var/lib/apt/lists/* # Layer 2: PostgreSQL runtime and client tools (stable, only changes with postgres version) RUN apt-get update && apt-get install -y --no-install-recommends \ libpq5 \ postgresql-client \ && rm -rf /var/lib/apt/lists/* # Layer 3: Git (stable) RUN apt-get update && apt-get install -y --no-install-recommends \ git \ && rm -rf /var/lib/apt/lists/* # Layer 4: Playwright system dependencies (changes with browser requirements) RUN apt-get update && apt-get install -y --no-install-recommends \ libnss3 \ libnspr4 \ libatk1.0-0 \ libatk-bridge2.0-0 \ libcups2 \ libdrm2 \ libxkbcommon0 \ libatspi2.0-0 \ libx11-6 \ libxcomposite1 \ libxdamage1 \ libxfixes3 \ libxrandr2 \ libgbm1 \ libxcb1 \ libpango-1.0-0 \ libcairo2 \ libasound2 \ && rm -rf /var/lib/apt/lists/* # Layer 5: Download tools and Node.js (changes with Node version) RUN apt-get update && \ apt-get install -y --no-install-recommends curl wget unzip && \ curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \ apt-get install -y --no-install-recommends nodejs && \ apt-get autoremove -y && \ rm -rf /var/lib/apt/lists/* # Layer 6: pipx (rarely changes) RUN pip install --no-cache-dir pipx && \ pipx ensurepath # Layer 7: Copy Python packages from builder (changes with dependencies) COPY --from=builder /root/.local /root/.local # Layer 8: Playwright browsers (changes with browser versions) RUN python3 -m playwright install chromium && \ npx -y playwright install chromium # Layer 9: Install PostgreSQL MCP server (Python, used via `pipx run postgres-mcp`) RUN pipx install postgres-mcp # Set working directory WORKDIR /app # Layer 9: Create directory structure (rarely changes) RUN mkdir -p /app/results # Layer 10: Application code (changes frequently) COPY . . # Set environment ENV PATH="/root/.local/bin:/root/.local/pipx/venvs/*/bin:${PATH}" ENV PYTHONPATH="/app" ENV PLAYWRIGHT_BROWSERS_PATH=/root/.cache/ms-playwright ENV PIPX_HOME=/root/.local/pipx ENV PIPX_BIN_DIR=/root/.local/bin # Default command CMD ["python3", "-m", "pipeline", "--help"] ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================
# MCPMark: Stress-Testing Comprehensive MCP Use

[![Website](https://img.shields.io/badge/Website-mcpmark.ai-4285F4?style=for-the-badge&logo=google-chrome&logoColor=white)](https://mcpmark.ai) [![arXiv](https://img.shields.io/badge/arXiv-2509.24002-b31b1b?style=for-the-badge&logo=arxiv&logoColor=white)](https://arxiv.org/abs/2509.24002) [![Discord](https://img.shields.io/badge/Join_our_discord-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/HrKkJAxDnA) [![Docs](https://img.shields.io/badge/Docs-000000?style=for-the-badge&logo=mdbook&color=105864)](https://mcpmark.ai/docs) [![Hugging Face](https://img.shields.io/badge/Trajectory_Logs-FFD21E?style=for-the-badge&logo=huggingface&logoColor=black)](https://huggingface.co/datasets/Jakumetsu/mcpmark-trajectory-log)

An evaluation suite for agentic models in real MCP tool environments (Notion / GitHub / Filesystem / Postgres / Playwright). MCPMark provides a reproducible, extensible benchmark for researchers and engineers: one-command tasks, isolated sandboxes, auto-resume for failures, unified metrics, and aggregated reports.

[![MCPMark](https://github.com/user-attachments/assets/dfc06a41-e387-45e3-bc98-db7097ffa3dc)](https://mcpmark.ai)

## News

- 📌 **21 Jan** — Pinned MCP server versions for reproducible benchmarks: GitHub MCP Server `v0.15.0` (switched to Docker for version control), Notion MCP Server `@1.9.1` (Notion released 2.0, but it has many bugs and is not recommended). See [#246](https://github.com/eval-sys/mcpmark/pull/246).
- 🔥 **13 Dec** — Added auto-compaction support (`--compaction-token`) to summarize long conversations and avoid context overflow during evaluation ([#236](https://github.com/eval-sys/mcpmark/pull/236)).
- 🏅 **02 Dec** — Evaluated `gemini-3-pro-preview` (thinking: low): **Pass@1 50.6%** ± 2.3% — so close to `gpt-5-high` (51.6%)! Also `deepseek-v3.2-thinking` 36.8% and `deepseek-v3.2-chat` 29.7%.
- 🔥 **02 Dec** — Obfuscate GitHub @mentions to prevent notification spam during evaluation ([#229](https://github.com/eval-sys/mcpmark/pull/229)).
- 🏅 **01 Dec** — DeepSeek v3.2 uses MCPMark! Kudos on securing the best open-source model. [X Post](https://x.com/deepseek_ai/status/1995452650557763728) | [Technical Report](https://huggingface.co/deepseek-ai/DeepSeek-V3.2/resolve/main/assets/paper.pdf)
- 🔥 **17 Nov** — Added 50 easy tasks (10 per MCP server) for smaller open-source models ([#225](https://github.com/eval-sys/mcpmark/pull/225)).
- 🤝 **31 Oct** — Community PR from insforge: better MCP servers achieve better results with fewer tokens! ([#214](https://github.com/eval-sys/mcpmark/pull/214))
- 🔥 **13 Oct** — Added ReAct agent support. PRs for new agent scaffolds welcome! ([#209](https://github.com/eval-sys/mcpmark/pull/209))
- 🏅 **10 Sep** — `qwen-3-coder-plus` is the best open-source model! Kudos to the Qwen team. [X Post](https://x.com/Alibaba_Qwen/status/1965457023438651532)

---

## What you can do with MCPMark

- **Evaluate real tool usage** across multiple MCP services: `Notion`, `GitHub`, `Filesystem`, `Postgres`, `Playwright`.
- **Use ready-to-run tasks** covering practical workflows, each with strict automated verification.
- **Reliable and reproducible**: isolated environments that do not pollute your accounts/data; failed tasks auto-retry and resume.
- **Unified metrics and aggregation**: single/multi-run (pass@k, avg@k, etc.) with automated results aggregation.
- **Flexible deployment**: local or Docker; fully validated on macOS and Linux.

---

## Quickstart (5 minutes)

### 1) Clone the repository

```bash
git clone https://github.com/eval-sys/mcpmark.git
cd mcpmark
```

### 2) Set environment variables (create `.mcp_env` at repo root)

Only set what you need. Add service credentials when running tasks for that service.

```env
# Example: OpenAI
OPENAI_BASE_URL="https://api.openai.com/v1"
OPENAI_API_KEY="sk-..."
# Optional: Notion (only for Notion tasks) SOURCE_NOTION_API_KEY="your-source-notion-api-key" EVAL_NOTION_API_KEY="your-eval-notion-api-key" EVAL_PARENT_PAGE_TITLE="MCPMark Eval Hub" PLAYWRIGHT_BROWSER="chromium" # chromium | firefox PLAYWRIGHT_HEADLESS="True" # Optional: GitHub (only for GitHub tasks) GITHUB_TOKENS="token1,token2" # token pooling for rate limits GITHUB_EVAL_ORG="your-eval-org" # Optional: Postgres (only for Postgres tasks) POSTGRES_HOST="localhost" POSTGRES_PORT="5432" POSTGRES_USERNAME="postgres" POSTGRES_PASSWORD="password" ``` See `docs/introduction.md` and the service guides below for more details. ### 3) Install and run a minimal example Local (Recommended) ```bash pip install -e . # If you'll use browser-based tasks, install Playwright browsers first playwright install ``` MCPMark defaults to the built-in orchestration agent (`MCPMarkAgent`). To experiment with the ReAct-style agent, pass `--agent react` to `pipeline.py` (other settings stay the same). Docker ```bash ./build-docker.sh ``` Run a filesystem task (no external accounts required): ```bash python -m pipeline \ --mcp filesystem \ --k 1 \ # run once for a quick start --models gpt-5 \ # or any model you configured --tasks file_property/size_classification # Add --task-suite easy to run the lightweight dataset (where available) ``` Results are saved to `./results/{exp_name}/{model}__{mcp}/run-*/...` for the standard suite and `./results/{exp_name}/{model}__{mcp}-easy/run-*/...` when you run `--task-suite easy` (e.g., `./results/test-run/gpt-5__filesystem/run-1/...` or `./results/test-run/gpt-5__github-easy/run-1/...`). --- ## Run your evaluations ### Task suites (standard vs easy) - Each MCP service now stores tasks under `tasks/<mcp_service>/<task_suite>/<category_id>/<task_id>/`. - `standard` (default) covers the full benchmark (127 tasks today). - `easy` hosts 10 lightweight tasks per MCP, ideal for smoke tests and CI (GitHub’s are already available under `tasks/github/easy`). - Switch suites with `--task-suite easy` (defaults to `--task-suite standard`). ### Single run (k=1) ```bash # Run ALL tasks for a service python -m pipeline --exp-name exp --mcp notion --tasks all --models MODEL --k 1 # Run a task group python -m pipeline --exp-name exp --mcp notion --tasks online_resume --models MODEL --k 1 # Run a specific task python -m pipeline --exp-name exp --mcp notion --tasks online_resume/daily_itinerary_overview --models MODEL --k 1 # Evaluate multiple models python -m pipeline --exp-name exp --mcp notion --tasks all --models MODEL1,MODEL2,MODEL3 --k 1 ``` ### Multiple runs (k>1) for pass@k ```bash # Run k=4 to compute stability metrics (requires --exp-name to aggregate final results) python -m pipeline --exp-name exp --mcp notion --tasks all --models MODEL --k 4 # Aggregate results (pass@1 / pass@k / pass^k / avg@k) python -m src.aggregators.aggregate_results --exp-name exp ``` ### Run with Docker ```bash # Run all tasks for a service ./run-task.sh --mcp notion --models MODEL --exp-name exp --tasks all # Cross-service benchmark ./run-benchmark.sh --models MODEL --exp-name exp --docker ``` Please visit `docs/introduction.md` for the available choices of *MODEL*. Tip: MCPMark supports **auto-resume**. When re-running, only unfinished tasks will execute. Failures matching our retryable patterns (see [RETRYABLE_PATTERNS](src/errors.py)) are retried automatically. Models may emit different error strings—if you encounter a new resumable error, please open a PR or issue.
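For illustration, retryable-error matching of this kind can be expressed as a simple substring check. The snippet below is a hypothetical sketch, not the contents of `src/errors.py`; the real patterns and matching logic may differ.

```python
# Hypothetical sketch of retryable-error matching; the actual RETRYABLE_PATTERNS and
# matching logic live in src/errors.py and may differ (e.g. regexes or error classes).
RETRYABLE_PATTERNS = ["rate limit", "connection reset", "timed out"]  # illustrative values only


def is_retryable(error_message: str) -> bool:
    """Return True if the error message matches any known retryable pattern."""
    msg = error_message.lower()
    return any(pattern in msg for pattern in RETRYABLE_PATTERNS)


print(is_retryable("HTTP 429: rate limit exceeded"))  # True
```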
Tip: MCPMark supports **auto-compaction**; pass `--compaction-token N` to enable automatic context summarization when prompt tokens reach `N` (use `999999999` to disable). --- ## Service setup and authentication | Service | Setup summary | Docs | |-------------|---------------------------------------------------------------------------------------------------------------|-----------------------------------| | Notion | Environment isolation (Source Hub / Eval Hub), integration creation and grants, browser login verification. | [Guide](docs/mcp/notion.md) | | GitHub | Multi-account token pooling recommended; import pre-exported repo state if needed. | [Guide](docs/mcp/github.md) | | Postgres | Start via Docker and import sample databases. | [Setup](docs/mcp/postgres.md) | | Playwright | Install browsers before first run; defaults to `chromium`. | [Setup](docs/mcp/playwright.md) | | Filesystem | Zero-configuration, run directly. | [Config](docs/mcp/filesystem.md) | You can also follow [Quickstart](docs/quickstart.md) for the shortest end-to-end path. ### Important Notice: GitHub Repository Privacy > **Please ensure your evaluation repositories are set to PRIVATE.** GitHub state templates are now automatically downloaded from our CDN during evaluation — no manual download is required. However, because these templates contain issues and pull requests from real open-source repositories, the recreation process includes `@username` mentions of the original authors. **We have received feedback from original GitHub authors who were inadvertently notified** when evaluation repositories were created as public. To be a responsible member of the open-source community, we urge all users to: 1. **Always keep evaluation repositories private** during the evaluation process. 2. **In the latest version**, we have added random suffixes to all `@username` mentions (e.g., `@user` becomes `@user_x7k2`) and implemented a safety check that prevents importing templates to public repositories. 3. **If you are using an older version of MCPMark**, please either: - Pull the latest code immediately, or - Manually ensure all GitHub evaluation repositories are set to private. Thank you for helping us maintain a respectful relationship with the open-source community. --- ## Results and metrics - Results are organized under `./results/{exp_name}/{model}__{mcp}/run-*/` (JSON + CSV per task). - Generate a summary with: ```bash # Basic usage python -m src.aggregators.aggregate_results --exp-name exp # For k-run experiments with single-run models python -m src.aggregators.aggregate_results --exp-name exp --k 4 --single-run-models claude-opus-4-1 ``` - Only models with complete results across all tasks and runs are included in the final summary. - Includes multi-run metrics (pass@k, pass^k) for stability comparisons when k > 1. --- ## Model and Tasks - **Model support**: MCPMark calls models via LiteLLM — see the LiteLLM docs: [`LiteLLM Doc`](https://docs.litellm.ai/docs/). For Anthropic (Claude) extended thinking mode (enabled via `--reasoning-effort`), we use Anthropic’s native API. - See `docs/introduction.md` for details and configuration of supported models in MCPMark. - To add a new model, edit `src/model_config.py`. Before adding, check LiteLLM supported models/providers. See [`LiteLLM Doc`](https://docs.litellm.ai/docs/). - Task design principles and details are in `docs/datasets/task.md`. Each task ships with an automated `verify.py` for objective, reproducible evaluation.
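For reference, the multi-run metrics mentioned above can be computed from per-run outcomes roughly as follows. This is an illustrative sketch of the standard definitions, not the implementation in `src/aggregators/aggregate_results.py`.

```python
# Illustrative sketch of multi-run metrics over per-task run outcomes (k runs per task).
# The official aggregator (src/aggregators/aggregate_results.py) may differ in details.
outcomes = {  # hypothetical per-task results for k = 4 runs
    "task_a": [True, False, True, True],
    "task_b": [False, False, False, False],
}
k = 4

pass_at_k = sum(any(runs) for runs in outcomes.values()) / len(outcomes)     # at least 1 success in k runs
pass_pow_k = sum(all(runs) for runs in outcomes.values()) / len(outcomes)    # all k runs succeed
avg_at_k = sum(sum(runs) / k for runs in outcomes.values()) / len(outcomes)  # mean success rate

print(f"pass@{k}={pass_at_k:.2f}  pass^{k}={pass_pow_k:.2f}  avg@{k}={avg_at_k:.2f}")
```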
--- ## Contributing Contributions are welcome: 1. Add a new task under `tasks/<mcp_service>/<task_suite>/<category_id>/<task_id>/` with `meta.json`, `description.md`, and `verify.py`. 2. Ensure local checks pass and open a PR. 3. See `docs/contributing/make-contribution.md`. --- ## Citation If you find our work useful for your research, please consider citing: ```bibtex @misc{wu2025mcpmark, title={MCPMark: A Benchmark for Stress-Testing Realistic and Comprehensive MCP Use}, author={Zijian Wu and Xiangyan Liu and Xinyuan Zhang and Lingjun Chen and Fanqing Meng and Lingxiao Du and Yiran Zhao and Fanshi Zhang and Yaoqi Ye and Jiawei Wang and Zirui Wang and Jinjie Ni and Yufan Yang and Arvin Xu and Michael Qizhe Shieh}, year={2025}, eprint={2509.24002}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2509.24002}, } ``` ## License This project is licensed under the Apache License 2.0 — see `LICENSE`. ================================================ FILE: build-docker.sh ================================================ #!/bin/bash # Build Docker image for MCPMark set -e # Color codes for output GREEN='\033[0;32m' YELLOW='\033[1;33m' NC='\033[0m' # No Color echo -e "${YELLOW}Building MCPMark Docker image locally...${NC}" # Build the Docker image with the same tag as Docker Hub for local testing docker build -t evalsysorg/mcpmark:latest . "$@" # Check if build was successful if [ $? -eq 0 ]; then echo -e "${GREEN}✓ Docker image built successfully${NC}" echo " Tag: evalsysorg/mcpmark:latest" # Show image info echo "" echo "Image details:" docker images evalsysorg/mcpmark:latest --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}\t{{.CreatedAt}}" echo "" echo "You can now run tasks using:" echo " ./run-task.sh --mcp notion --models o3 --exp-name test --tasks all" else echo "Docker build failed!" exit 1 fi ================================================ FILE: cspell.config.yaml ================================================ version: "0.2" ignorePaths: [] dictionaryDefinitions: [] dictionaries: [] words: - datname - domcontentloaded - modelcontextprotocol - pgdumplib - pixi - pypi - topbar - usename ignoreWords: [] import: [] ================================================ FILE: docs/contributing/make-contribution.md ================================================ # Contributing 1. Fork the repository and create a feature branch. 2. Add new tasks under `tasks/<mcp_service>/<task_suite>/<category_id>/<task_id>/` with the files `meta.json`, `description.md`, and `verify.py`. Please refer to [Task Page](../datasets/task.md) for detailed instructions. 3. Ensure all tests pass. 4. Submit a pull request — contributions are welcome! ================================================ FILE: docs/datasets/task.md ================================================ # Task The tasks in MCPMark follow two major principles - The tasks are based on realistic digital environments that are also used by human programmers. - The task outcome can be robustly verified in Python scripts. Therefore, each MCPMark task consists of three files - `meta.json` - `description.md` - `verify.py` Here, `meta.json` includes the meta information of the task, and `description.md` describes the purpose and setting of the task, as well as the instructions for completing it. `verify.py` checks whether the task is completed successfully. For example, you can ask the model agent to create a file with a specific name and write specific content to it, which belongs to the category of operating on file context.
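As a rough sketch of what such a task's `verify.py` could check, consider the following; the directory, file name, and expected content here are illustrative, not taken from an actual MCPMark task.

```python
"""Illustrative verify.py sketch for a create-and-write task (hypothetical paths and content)."""
import sys
from pathlib import Path

TEST_DIR = Path("./test_environments/file_context")  # hypothetical test directory
TARGET_FILE = TEST_DIR / "hello_world.txt"
EXPECTED_PATTERNS = ["Hello, World"]


def verify() -> bool:
    # The test directory must exist.
    if not TEST_DIR.is_dir():
        return False
    # The target file must exist inside it.
    if not TARGET_FILE.is_file():
        return False
    # The file must contain every expected pattern.
    content = TARGET_FILE.read_text()
    return all(pattern in content for pattern in EXPECTED_PATTERNS)


if __name__ == "__main__":
    sys.exit(0 if verify() else 1)
```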
The structure looks like ``` tasks │ └───filesystem │ └───standard # task_suite (also supports `easy`) │ └───file_context # category_id │ └───create_file_write │ meta.json │ description.md │ verify.py ``` All tasks live under `tasks/<mcp_service>/<task_suite>/<category_id>/<task_id>/`. `filesystem` refers to the MCP service and `task_suite` captures the difficulty slice (`standard` benchmark vs `easy` smoke tests). `meta.json` includes the meta information about the task, including the following keys - task_id: the id of the task. - task_name: full name of the task. - description: task description. - category_id: the id of the task category. - category_name: the full name of the task category. - author: the author of the task. - difficulty: the task difficulty level. - created_at: the timestamp of task creation. - tags: a list of tags that describe the task. - mcp: a list of MCP services it belongs to. - metadata: other meta information. Here `category_name` describes the shared feature or the environment across different tasks (e.g. the GitHub repository or Notion page the task is built on). In this running example, `category_name` refers to `file_context`. `description.md` could include the following information - Task name - Create and Write File. - Task description - Use the filesystem MCP tools to create a new file and write content to it. - Task Objectives - Create a new file named `hello_world.txt` in the test directory. - Write the following content to the file: ``` Hello, World``` - Verify the file was created successfully - Verification Criteria - File `hello_world.txt` exists in the test directory - File contains the expected content structure - File includes "Hello, World!" on the first line - Tips - Use the `write_file` tool to create and write content to the file - The test directory path will be provided in the task context The entire content of `description.md` will be read by the model agent for completing the task. Accordingly, the `verify.py` contains the following functionalities - Check whether the target directory exists. [![Check Target Directory](https://i.postimg.cc/SQfBYvby/task-sample-verify-get-test-dir.png)](https://postimg.cc/4nnLrw3M) - Check whether the target directory contains the file with the target file name. [![Check Target File Existence](https://i.postimg.cc/Qx0Zwnf6/task-sample-verify-file-existence.png)](https://postimg.cc/7fGRTX87) - Check whether the target file contains the desired content `EXPECTED_PATTERNS = ["Hello, World"]`. [![Check Content in Target File](https://i.postimg.cc/JzzMhWyV/task-sample-verify-check-content.png)](https://postimg.cc/w7ZSWZc0) - If the outcome passes **all the above verification checks**, the task is marked as successfully completed. ================================================ FILE: docs/installation_and_docker_usage.md ================================================ # Installation and Docker Task Usage Guideline ## Overview The MCPMark setup supports installation through either pip or MCPMark Docker (recommended) after cloning the code repository. ### Pip Installation ```bash pip install -e . ``` The MCPMark Docker setup provides a simple way to run evaluation tasks in isolated containers. PostgreSQL is automatically handled when needed. ## 1. Quick Start ### 1.1 Docker Image The official Docker image is automatically pulled from Docker Hub on first use.
The image is hosted at: https://hub.docker.com/r/evalsysorg/mcpmark **Image Management:** - The scripts automatically download the image when it's not found locally - To manually update to the latest version: ```bash docker pull evalsysorg/mcpmark:latest ``` - For local development/testing, you can build your own docker: ```bash # Creates evalsysorg/mcpmark:latest locally ./build-docker.sh ``` ## 2. Running MCP Experiments ### 2.1 Running Individual MCP Experiment The `run-task.sh` script provides simplified Docker usage: ```bash # Run filesystem tasks (filesystem is the default mcp service) ./run-task.sh --models MODEL_NAME --k K # Run github/notion/postgres/playwright/playwright_webarena with specific task ./run-task.sh --mcp MCPSERVICE --models MODEL_NAME --exp-name EXPNAME --tasks TASK --k K ``` where *MODEL_NAME* refers to the model choice from the supported models (see [Introduction Page](./introduction.md) for more information), *EXPNAME* refers to customized experiment name, *TASK* refers to specific task or task group (see `tasks///...` for more information), *K* refers to the time of independent experiments. Additionally, the `run-benchmark.sh` script evaluates models across all MCP services: ```bash # Run all services with Docker (recommended) ./run-benchmark.sh --models MODEL --exp-name EXPNAME --docker # Run specific services ./run-benchmark.sh --models MODEL --exp-name EXPNAME --mcps MCPSERVICES --docker # Run with parallel execution for faster results ./run-benchmark.sh --models MODEL --exp-name EXPNAME --docker --parallel # Run locally without Docker ./run-benchmark.sh --models MODEL --exp-name EXPNAME --mcps MCPSERVICES ``` Here *MCPSERVICES* refers to group of MCP services, separated by comma (e.g. *filesystem,postgres*) The benchmark script: - Runs all or selected MCP services automatically - Supports progress tracking and timing - Generates summary reports and logs - Supports parallel service execution - Continues running even if some services fail - Automatically generates performance dashboards ### Manual Docker Commands #### For Non-Postgres Services Suppose Notion is the service: ```bash # Build the image first ./build-docker.sh # Run a task docker run --rm \ -v $(pwd)/results:/app/results \ -v $(pwd)/.mcp_env:/app/.mcp_env:ro \ -v $(pwd)/notion_state.json:/app/notion_state.json:ro \ evalsysorg/mcpmark:latest \ python3 -m pipeline --mcp notion --models MODEL --exp-name EXPNAME --tasks TASK --k K ``` #### For Postgres Service ```bash # The run-task.sh script handles postgres automatically, but if doing manually: # Start postgres container docker run -d \ --name mcp-postgres \ --network mcp-network \ -e POSTGRES_DATABASE=postgres \ -e POSTGRES_USER=postgres \ -e POSTGRES_PASSWORD=123456 \ ghcr.io/cloudnative-pg/postgresql:17-bookworm # Run postgres task docker run --rm \ --network mcp-network \ -e POSTGRES_HOST=mcp-postgres \ -v $(pwd)/results:/app/results \ -v $(pwd)/.mcp_env:/app/.mcp_env:ro \ evalsysorg/mcpmark:latest \ python3 -m pipeline --mcp postgres --models MODEL --exp-name EXPNAME --tasks TASK --k K # Stop and remove postgres when done docker stop mcp-postgres && docker rm mcp-postgres ``` ## Script Usage ### Benchmark Runner (`run-benchmark.sh`) ``` ./run-benchmark.sh --models MODELS --exp-name NAME [OPTIONS] Required Options: --models MODELS Comma-separated list of models to evaluate --exp-name NAME Experiment name for organizing results Optional Options: --docker Run tasks in Docker containers (recommended) --mcps SERVICES Comma-separated list of services 
to test Default: filesystem,notion,github,postgres,playwright --parallel Run services in parallel (experimental) --timeout SECONDS Timeout per task in seconds (default: 300) ``` ### Individual Task Runner (`run-task.sh`) ``` ./run-task.sh [--mcp SERVICE] [PIPELINE_ARGS] Options: --mcp SERVICE MCP service (notion|github|filesystem|playwright|postgres) Default: filesystem Environment Variables: DOCKER_MEMORY_LIMIT Memory limit for container (default: 4g) DOCKER_CPU_LIMIT CPU limit for container (default: 2) DOCKER_IMAGE_VERSION Docker image tag to use (default: latest) All other arguments are passed directly to the pipeline command. Pipeline arguments (see python3 -m pipeline --help): --mcp {notion,github,filesystem,playwright,postgres,playwright_webarena} MCP service to use (default: filesystem) --models MODELS Comma-separated list of models to evaluate (e.g., 'o3,k2,gpt-4.1') --tasks TASKS Tasks to run: "all", a category name, or "category/task_name" --exp-name EXP_NAME Experiment name; results are saved under results// (default: YYYY-MM-DD-HH-MM-SS) --k K Number of evaluation runs for pass@k metrics (default: 1) --timeout TIMEOUT Timeout in seconds for each task --output-dir OUTPUT_DIR Directory to save results ``` ## Docker Benefits 1. **Efficiency**: Only starts necessary containers 2. **Isolation**: Each task runs in a fresh container 3. **Resource Management**: Automatic cleanup of containers and networks 4. **Smart Dependencies**: PostgreSQL only starts for postgres service 5. **Parallel Support**: Can run multiple services simultaneously for faster benchmarks 6. **Comprehensive Testing**: Benchmark script runs all services with one command 7. **Progress Tracking**: Colored output with timing and status information 8. **Automatic Reporting**: Generates summary reports and performance dashboards ## Common Troubleshooting ### Permission Issues ```bash chmod +x run-task.sh ``` ### Docker Build Issues ```bash # Force rebuild with no cache ./run-task.sh --build --mcp MCPSERVICE --models MODEL_NAME --exp-name EXPNAME --tasks TASK ``` ### PostgreSQL Connection Issues ```bash # Check if postgres is running docker ps | grep postgres # View postgres logs docker logs mcp-postgres-task ``` ### Cleanup Stuck Resources ```bash # Stop all containers docker stop $(docker ps -q) # Remove task network docker network rm mcp-task-network # Remove postgres data volume (careful!) docker volume rm mcp-postgres-data ``` ## Environment Variables Create `.mcp_env` file with your credentials: ```env # Service credentials SOURCE_NOTION_API_KEY=your-key EVAL_NOTION_API_KEY=your-key GITHUB_TOKEN=your-token POSTGRES_PASSWORD=your-password # Model API keys OPENAI_API_KEY=your-key ANTHROPIC_API_KEY=your-key # ... etc ``` Please refer to [Quick Start](./quickstart.md) for setting up API key for specific model. ## Docker Compose Files - `docker-compose.yml` - Full stack with postgres (for development/testing) ## Notes - Results are saved under `./results//`. - Each task runs in an ephemeral container. - Docker image is shared across all tasks. - PostgreSQL data persists in Docker volume. ================================================ FILE: docs/introduction.md ================================================ # MCPMark MCPMark is a comprehensive evaluation suite for evaluating the agentic ability of frontier models. 
MCPMark includes Model Context Protocol (MCP) services in the following environments - Notion - GitHub - Filesystem - Postgres - Playwright - Playwright-WebArena ### General Procedure MCPMark is designed to run agentic tasks in complex environments **safely**. Specifically, it sets up an isolated environment for the experiment, completes the task, and then destroys the environment without affecting existing user profiles or data. ### How to Use MCPMark 1. Install MCPMark. 2. Authorize services (for GitHub and Notion). 3. Configure the environment variables in `.mcp_env`. 4. Run MCPMark experiments. Please refer to [Quick Start](./quickstart.md) for details on how to properly start a sample filesystem experiment, and [Task Page](./datasets/task.md) for task details. Please visit [Installation and Docker Usage](./installation_and_docker_usage.md) for information on the full MCPMark setup. ### Running MCPMark MCPMark supports the following modes of running experiments (suppose the experiment is named new_exp, the models used are o3 and gpt-4.1, and the environment is Notion), with K repeated runs. #### MCPMark in Pip Installation ```bash # Evaluate ALL tasks python -m pipeline --exp-name new_exp --mcp notion --tasks all --models o3 --k K # Evaluate a single task group (online_resume) python -m pipeline --exp-name new_exp --mcp notion --tasks online_resume --models o3 --k K # Evaluate one specific task (task_1 in online_resume) python -m pipeline --exp-name new_exp --mcp notion --tasks online_resume/task_1 --models o3 --k K # Evaluate multiple models python -m pipeline --exp-name new_exp --mcp notion --tasks all --models o3,gpt-4.1 --k K ``` #### MCPMark in Docker Installation ```bash # Run all tasks for one service ./run-task.sh --mcp notion --models o3 --exp-name new_exp --tasks all # Run comprehensive benchmark across all services ./run-benchmark.sh --models o3,gpt-4.1 --exp-name new_exp --docker ``` #### Experiment Auto-Resume When re-running experiments, only unfinished tasks will be executed. Tasks that previously failed due to pipeline errors (such as State Duplication Error or MCP Network Error) will also be retried automatically. ### Results The experiment results are written to `./results/` (JSON + CSV). #### Result Aggregation (for K > 1) MCPMark supports aggregated metrics of pass@1, pass@K, pass^K, and avg@K. ```bash python -m src.aggregators.aggregate_results --exp-name new_exp ``` ### Model Support MCPMark supports the following models with their corresponding providers (model codes in brackets). #### OpenAI - GPT-5 (gpt-5) - o3 (o3) #### Anthropic - Claude-4.1-Opus (claude-4.1-opus) - Claude-4-Sonnet (claude-4-sonnet) #### Google - Gemini-2.5-Pro (gemini-2.5-pro) #### Grok - Grok-4 (grok-4) #### Deepseek - DeepSeek-Chat (deepseek-chat) #### Alibaba - Qwen3-Coder (qwen-3-coder) #### Kimi - Kimi-K2 (k2) ### Want to contribute? Visit the [Contributing Page](./contributing) to learn how to contribute to MCPMark. ================================================ FILE: docs/mcp/filesystem.md ================================================ # Filesystem This guide walks you through preparing your filesystem environment for MCPMark.
## 1 · Configure Environment Variables Set the `FILESYSTEM_TEST_ROOT` environment variable in your `.mcp_env` file: ```env ## Filesystem FILESYSTEM_TEST_ROOT=./test_environments ``` **Recommended**: Use `FILESYSTEM_TEST_ROOT=./test_environments` (relative to the project root). --- ## 2 · Automatic Test Environment Download Our code automatically downloads test folders to your specified `FILESYSTEM_TEST_ROOT` directory when the pipeline starts running. **Downloaded Structure**: ``` ./test_environments/ ├── desktop/ # Desktop environment ├── desktop_template/ # Template files for desktop ├── file_context/ # File content understanding tasks ├── file_property/ # File metadata and properties related tasks ├── folder_structure/ # Directory organization tasks ├── legal_document/ # Legal document processing ├── papers/ # Academic paper tasks ├── student_database/ # Database management tasks ├── threestudio/ # 3D Generation codebase └── votenet/ # 3D Object Detection codebase ``` --- ## 3 · Running Filesystem Tasks **Basic Command**: ```bash python -m pipeline --exp-name EXPNAME --mcp filesystem --tasks FILESYSTEMTASK --models MODEL --k K ``` **Docker Usage (Recommended)** Docker is recommended to avoid library version conflicts: ```bash # Build Docker image ./build-docker.sh # Run with Docker ./run-task.sh --mcp filesystem --models MODEL --exp-name EXPNAME --tasks FILESYSTEMTASK --k K ``` Here *EXPNAME* refers to a customized experiment name, *FILESYSTEMTASK* refers to the filesystem task or task group selected (see [Task Page](../datasets/task.md) for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for supported models), and *K* refers to the number of independent runs. --- ## 4 · Troubleshooting **Common Issues**: - **Test Environment Not Found**: Ensure `FILESYSTEM_TEST_ROOT` is set correctly - **Prerequisites**: Make sure your terminal has `wget` and `unzip` commands available - **Recommended**: Use Docker to prevent library version conflicts ================================================ FILE: docs/mcp/github.md ================================================ # GitHub This guide walks you through preparing your GitHub environment for MCPMark and authenticating the CLI tools, with support for **token pooling** to mitigate rate limits. ## 1 · Prepare an Evaluation Organization in GitHub 1. **Create a free GitHub Organization** - In GitHub, click your avatar → **Your organizations** → **New organization**. - We recommend a name like `mcpmark-eval-xxx`. (Check that it does not conflict with existing organization names.) - This keeps all benchmark repositories isolated from your personal and work code. - [![Create Org](https://i.postimg.cc/CxqJkRnj/github-create-org.png)](https://postimg.cc/k27xdXc4) 2. **Create Multiple GitHub Accounts (Recommended for Rate Limit Relief)** To effectively distribute API load and avoid rate limiting, we recommend creating **2-4 additional GitHub accounts**: - Create new GitHub accounts (e.g., `your-name-eval-1`, `your-name-eval-2`, etc.) - **Important**: Add all these accounts as **Owners** to your evaluation organization - This allows the token pooling system to distribute requests across multiple accounts 3.
**Generate Fine-Grained Personal Access Tokens (PATs) for Each Account** **Repeat the following process for each GitHub account (including your main account):** - Navigate to *Settings → Developer settings → Personal access tokens → Fine-grained tokens* - Click **Generate new token**, select the evaluation organization you created - [![Create Token](https://i.postimg.cc/Z5SjPT82/github-create-token.png)](https://postimg.cc/Mv9yqJrm) - Give the token a descriptive name (e.g., *MCPMark Eval Token - Account 1*) - Under **Repository permissions** and **Organization permissions**, enable **All permissions** (read and write if applicable) - [![Token Permissions](https://i.postimg.cc/nc81ZHPr/github-token-permissions.png)](https://postimg.cc/14HFrZP1) - Copy the generated token and save it safely — you'll need all tokens for the next step 4. **Configure Token Pooling in `.mcp_env`** In your project root, edit (or create) the `.mcp_env` file and add your tokens: **For single token (Basic setup):** ```env ## GitHub - Single Token Configuration GITHUB_TOKENS="your-single-token-here" GITHUB_EVAL_ORG="your-eval-org-name" ``` **For multiple tokens (Recommended for handling rate limits):** ```env ## GitHub - Token Pooling Configuration GITHUB_TOKENS="token1,token2,token3,token4" GITHUB_EVAL_ORG="your-eval-org-name" ``` **Important Notes:** - Replace `token1,token2,token3,token4` with your actual tokens (comma-separated, no spaces) - **2-4 tokens** is recommended for optimal rate limit distribution - All tokens must have **the same permissions** on the evaluation organization - The system automatically rotates between tokens to distribute API load --- ## 2 · Download the Sample Repository State We have pre-exported several popular open-source repositories along with curated Issues and PRs. 1. Download the archive from [Google Drive](https://drive.google.com/drive/folders/16bFDjdtqJYzYJlqKcjKBGomo8DwOhWcN?usp=drive_link). 2. Extract it so that the directory `./github_state/` appears in the project root: ```bash mkdir -p github_state unzip github_state.zip -d ./github_state ``` --- ## 3 · Add New Repositories (Optional) If you want to benchmark additional repositories: 1. Export the desired repository state: ```bash python -m src.mcp_services.github.repo_exporter --source_repo_url owner/name --max-issues 20 --max-pulls 5 ``` 2. Open `src/mcp_services/github/state_manager.py` and add a new entry to `self.initial_state_mapping` pointing to the exported folder. 
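For illustration only, the kind of entry that step 2 refers to might look roughly like the snippet below; the key and folder name are hypothetical, and the actual attribute lives inside the GitHub state manager class, so its real structure may differ.

```python
# Hypothetical sketch of an initial-state mapping entry (illustrative names only);
# the real self.initial_state_mapping in the GitHub state manager may be structured differently.
initial_state_mapping = {
    "owner__name": "./github_state/owner__name",  # folder produced by repo_exporter
}
```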
--- ## 4 · GitHub Rate Limits & Token Pooling Benefits ### Understanding Rate Limits Fine-grained tokens are subject to GitHub API rate limits: - **Read operations**: 5,000 requests per hour per token - **General write operations**: 80 writes per minute and 500 writes per hour per token - **Content creation (Issues, PRs, Comments)**: **500 requests per hour per token** (Secondary Rate Limit) ### How Token Pooling Helps With **token pooling**, MCPMark automatically: - **Distributes requests** across multiple tokens to multiply your rate limits - **Rotates tokens** for each task execution to balance load - **Handles rate limit failures** by trying the next available token - **Ensures consistency** between agent execution and verification ### Example: Rate Limit Multiplication **Read Operations:** - **Single token**: 5,000 requests/hour - **4 tokens**: ~20,000 requests/hour total capacity **Content Creation (Critical for MCPMark):** - **Single token**: 500 content creation requests/hour - **4 tokens**: ~2,000 content creation requests/hour total capacity - **Automatic failover**: If one token hits limits, others continue working This dramatically improves evaluation performance, especially for large task batches or frequent testing cycles. **The content creation limit is often the bottleneck**, making token pooling essential for efficient evaluations. ### Repository Limits MCPMark places a cap on the number of PRs and issues (≤ 50 in total) per repository to ensure reasonable evaluation times and to stay within rate limits. ## 2. Running Github Tasks 1. Configure environment variables: make sure `GITHUB_TOKENS` and `GITHUB_EVAL_ORG` are properly set in `.mcp_env`. 2. For single task or task group, run ```bash python -m pipeline --exp-name EXPNAME --mcp github --tasks GITHUBTASK --models MODEL --k K ``` Here *EXPNAME* refers to customized experiment name, *GITHUBTASK* refers to the github task or task group selected (see [Task Page](../datasets/task.md) for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for model supported), *K* refers to the time of independent experiments. ================================================ FILE: docs/mcp/notion.md ================================================ # Notion This guide walks you through preparing your Notion environment for MCPMark and authenticating the CLI tools. > Note: Set your Notion app and workspace interface language to English. We use Playwright for browser automation and our locator logic relies on raw English text in the UI. Non-English interfaces can cause element selection to fail. ## 1 · Set up Notion Environment 1. **Duplicate the MCPMark Source Pages** Copy the template database and pages into your workspace from the public template following this tutorial: [Duplicate MCPMark Source](https://painted-tennis-ebc.notion.site/MCPBench-Source-Hub-23181626b6d7805fb3a7d59c63033819). 2. **Set up the Source and Eval Hub for Environment Isolation** - Prepare **two separate Notion pages**: - **Source Hub**: Stores all the template databases/pages. Managed by `SOURCE_NOTION_API_KEY`. - **Eval Hub**: Only contains the duplicated templates for the current evaluation. Managed by `EVAL_NOTION_API_KEY`. - In Notion, create an **empty page** in your Eval Hub. The page name **must exactly match** the value you set for `EVAL_PARENT_PAGE_TITLE` in your environment variables (e.g., `MCPMark Eval Hub`). 
- Name your **Source Hub** page to match `SOURCE_PARENT_PAGE_TITLE` (default: `MCPMark Source Hub`). This is where all initial-state templates live; we enumerate this page’s first-level children by exact title. - In Notion's **Connections** settings: - Bind the integration corresponding to `EVAL_NOTION_API_KEY` to the Eval Hub parent page you just created. - Bind the integration corresponding to `SOURCE_NOTION_API_KEY` to your Source Hub (where the templates are stored). 3. **Create Notion Integrations & Grant Access** a. Visit [Notion Integrations](https://www.notion.so/profile/integrations) and create **two internal integrations** (one for Source Hub, one for Eval Hub). b. Copy the generated **Internal Integration Tokens** (these will be your `SOURCE_NOTION_API_KEY` and `EVAL_NOTION_API_KEY`). c. Share the **Source Hub** with the Source integration, and the **Eval Hub parent page** with the Eval integration (*Full Access*). [![Source Page](https://i.postimg.cc/pVjDswLH/source-page.png)](https://postimg.cc/XXVGJD5H) [![Create Integration](https://i.postimg.cc/vZ091M3W/create-integration.png)](https://postimg.cc/NKrLShhM) [![Notion API Access](https://i.postimg.cc/YCDGrRCR/api-access.png)](https://postimg.cc/CRDLJjDn) [![Grant Access Source](https://i.postimg.cc/2yxyPFt4/grant-access-source.png)](https://postimg.cc/n9Cnm7pz) [![Grant Access Eval](https://i.postimg.cc/1RM91ttc/grant-access-eval.png)](https://postimg.cc/s1QFp35v) --- ## 2 · Authenticate with Notion ```bash # First, install Playwright and the browser binaries playwright install # Then, run the Notion login helper with your preferred browser python -m src.mcp_services.notion.notion_login_helper --browser {firefox|chromium} ``` The verification script will tell you which browser is working properly. The pipeline defaults to using **chromium**. Our pipeline has been **fully tested on macOS and Linux**. ## 3. Running Notion Tasks 1. Configure environment variables: make sure the following service credentials are added in `.mcp_env`. ```env ## Notion SOURCE_NOTION_API_KEY="your-source-notion-api-key" # For Source Hub (templates) EVAL_NOTION_API_KEY="your-eval-notion-api-key" # For Eval Hub (active evaluation) SOURCE_PARENT_PAGE_TITLE="MCPMark Source Hub" # Source hub page name (exact match) EVAL_PARENT_PAGE_TITLE="MCPMark Eval Hub" # Must match the name of the empty page you created in Eval Hub PLAYWRIGHT_BROWSER="chromium" # default to chromium, you can also choose firefox PLAYWRIGHT_HEADLESS="True" ``` 2. For single task or task group, run ```bash python -m pipeline --exp-name EXPNAME --mcp notion --tasks NOTIONTASK --models MODEL --k K ``` Here *EXPNAME* refers to customized experiment name, *NOTIONTASK* refers to the notion task or task group selected (see [Task Page](../datasets/task.md) for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for model supported), *K* refers to the time of independent experiments. ================================================ FILE: docs/mcp/playwright.md ================================================ # Playwright This guide walks you through setting up WebArena environments for Playwright MCP automated testing, including Shopping, Shopping Admin, and Reddit instances. Section 1 is designed mainly for completing the Playwright-WebArena tasks. ## 1. 
Setup WebArena Environment (For Playwright-WebArena Tasks) ### 1.1 Download Docker Images [WebArena](https://github.com/web-arena-x/webarena/tree/main/environment_docker) provides Docker images from multiple sources. Choose the fastest one for your network: ### Shopping Environment (Port 7770) ```bash # Option 1: Google Drive (Recommended) pip install gdown gdown 1gxXalk9O0p9eu1YkIJcmZta1nvvyAJpA # Option 2: Archive.org wget https://archive.org/download/webarena-env-shopping-image/shopping_final_0712.tar # Option 3: CMU Server wget http://metis.lti.cs.cmu.edu/webarena-images/shopping_final_0712.tar ``` ### Shopping Admin Environment (Port 7780) ```bash # Option 1: Google Drive (Recommended) gdown 1See0ZhJRw0WTTL9y8hFlgaduwPZ_nGfd # Option 2: Archive.org wget https://archive.org/download/webarena-env-shopping-admin-image/shopping_admin_final_0719.tar # Option 3: CMU Server wget http://metis.lti.cs.cmu.edu/webarena-images/shopping_admin_final_0719.tar ``` ### Reddit Environment (Port 9999) ```bash # Option 1: Google Drive (Recommended) gdown 17Qpp1iu_mPqzgO_73Z9BnFjHrzmX9DGf # Option 2: Archive.org wget https://archive.org/download/webarena-env-forum-image/postmill-populated-exposed-withimg.tar # Option 3: CMU Server wget http://metis.lti.cs.cmu.edu/webarena-images/postmill-populated-exposed-withimg.tar ``` ### 1.2 Deploy Environments #### Shopping (E-commerce Site) ```bash docker load --input shopping_final_0712.tar # Start container docker run --name shopping -p 7770:80 -d shopping_final_0712 # Wait for service initialization (2-3 minutes) sleep 180 # Configure for local access docker exec shopping /var/www/magento2/bin/magento setup:store-config:set --base-url="http://localhost:7770" docker exec shopping mysql -u magentouser -pMyPassword magentodb -e "UPDATE core_config_data SET value='http://localhost:7770/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');" docker exec shopping /var/www/magento2/bin/magento cache:flush ``` **Access**: `http://localhost:7770` #### Shopping Admin (Management Panel) ```bash docker load --input shopping_admin_final_0719.tar # Start container docker run --name shopping_admin -p 7780:80 -d shopping_admin_final_0719 # Wait for service initialization sleep 120 # Configure for local access docker exec shopping_admin /var/www/magento2/bin/magento setup:store-config:set --base-url="http://localhost:7780" docker exec shopping_admin mysql -u magentouser -pMyPassword magentodb -e "UPDATE core_config_data SET value='http://localhost:7780/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');" docker exec shopping_admin php /var/www/magento2/bin/magento config:set admin/security/password_is_forced 0 docker exec shopping_admin php /var/www/magento2/bin/magento config:set admin/security/password_lifetime 0 docker exec shopping_admin /var/www/magento2/bin/magento cache:flush ``` **Access**: `http://localhost:7780/admin` **Admin Credentials**: `admin / admin1234` #### Reddit (Forum) ```bash docker load --input postmill-populated-exposed-withimg.tar # Start container docker run --name forum -p 9999:80 -d postmill-populated-exposed-withimg # Wait for PostgreSQL initialization sleep 120 # Verify service status docker logs forum | grep "database system is ready" curl -I http://localhost:9999 ``` **Access**: `http://localhost:9999` ### 1.3 External Access Configuration For cloud deployments (GCP, AWS, etc.), configure external access: #### Configure Firewall (GCP Example) ```bash # Shopping environment gcloud compute firewall-rules create 
allow-shopping-7770 \ --allow tcp:7770 --source-ranges 0.0.0.0/0 # Shopping Admin gcloud compute firewall-rules create allow-shopping-admin-7780 \ --allow tcp:7780 --source-ranges 0.0.0.0/0 # Reddit gcloud compute firewall-rules create allow-reddit-9999 \ --allow tcp:9999 --source-ranges 0.0.0.0/0 ``` #### Update Base URLs for External Access ```bash # Get external IP EXTERNAL_IP=$(curl -s ifconfig.me) # Shopping docker exec shopping /var/www/magento2/bin/magento setup:store-config:set --base-url="http://${EXTERNAL_IP}:7770" docker exec shopping mysql -u magentouser -pMyPassword magentodb -e "UPDATE core_config_data SET value='http://${EXTERNAL_IP}:7770/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');" docker exec shopping /var/www/magento2/bin/magento cache:flush # Shopping Admin docker exec shopping_admin /var/www/magento2/bin/magento setup:store-config:set --base-url="http://${EXTERNAL_IP}:7780" docker exec shopping_admin mysql -u magentouser -pMyPassword magentodb -e "UPDATE core_config_data SET value='http://${EXTERNAL_IP}:7780/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');" docker exec shopping_admin /var/www/magento2/bin/magento cache:flush ``` ### 1.4 Alternative Access Methods (Not Verified) #### Cloudflared Tunnel (Free & Persistent) ```bash # Install cloudflared wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 sudo mv cloudflared-linux-amd64 /usr/local/bin/cloudflared sudo chmod +x /usr/local/bin/cloudflared # Create tunnels cloudflared tunnel --url http://localhost:7770 # Shopping cloudflared tunnel --url http://localhost:7780 # Admin cloudflared tunnel --url http://localhost:9999 # Reddit ``` #### ngrok (Quick Sharing) ```bash # Install ngrok wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz tar xvzf ngrok-v3-stable-linux-amd64.tgz sudo mv ngrok /usr/local/bin # Create tunnel (choose port) ngrok http 7770 # For Shopping ``` ## 2. Running Playwright Tasks 1. Configure environment variables: make sure the following service credentials are added in `.mcp_env`. ```env PLAYWRIGHT_BROWSER="chromium" # default to chromium, you can also choose firefox PLAYWRIGHT_HEADLESS="True" ``` 2. For single task or task group, run ```bash python -m pipeline --exp-name EXPNAME --mcp MCP --tasks PLAYWRIGHTTASK --models MODEL ``` Here *EXPNAME* refers to customized experiment name, *MCP* refers to playwright or playwright_webarena denpending on the task, *PLAYWRIGHTTASK* refers to the task or task group selected (see [Task Page](../datasets/task.md) for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for model supported), *K* refers to the time of independent experiments. ## 3. Troubleshooting ### Container Issues ```bash # Check status docker ps -a | grep -E "shopping|forum" # View logs docker logs [container_name] --tail 50 # Restart container docker restart [container_name] ``` ### Access Problems - **First load is slow** (1-2 minutes for Magento) - this is normal - **Ensure ports are available**: `netstat -tlnp | grep -E "7770|7780|9999"` - **Clear cache after URL changes**: Required for Magento environments ### Reset Environment ```bash # Stop and remove container docker stop [container_name] docker rm [container_name] # Re-deploy (follow steps in Section 3) ``` ## 4. 
Important Notes - **Service startup time**: Allow 2-3 minutes for Magento, 1-2 minutes for Reddit - **Memory requirements**: Ensure Docker has at least 4GB RAM allocated per container - **URL configuration**: Must reconfigure base URLs after container restart for external access - **Port assignments**: - 7770: Shopping - 7780: Shopping Admin - 9999: Reddit ================================================ FILE: docs/mcp/postgres.md ================================================ # PostgreSQL This guide walks you through preparing your PostgreSQL environment for MCPMark evaluation. ## 1. Setup PostgreSQL Environment ### 1.1 Start PostgreSQL with Docker 1. **Run PostgreSQL Container** Start a PostgreSQL instance using Docker: ```bash docker run -d \ --name mcpmark-postgres \ -e POSTGRES_PASSWORD=password \ -e POSTGRES_USER=postgres \ -p 5432:5432 \ pgvector/pgvector:0.8.0-pg17-bookworm ``` 2. **Verify Container is Running** ```bash docker ps | grep mcpmark-postgres ``` --- ### 1.2 Import Sample Databases 1. **Download Database Backups** Download the backup files and place them in `./postgres_state/` directory: ```bash mkdir -p ./postgres_state cd ./postgres_state # Download all database backups wget https://storage.mcpmark.ai/postgres/employees.backup wget https://storage.mcpmark.ai/postgres/chinook.backup wget https://storage.mcpmark.ai/postgres/dvdrental.backup wget https://storage.mcpmark.ai/postgres/sports.backup wget https://storage.mcpmark.ai/postgres/lego.backup cd .. ``` 2. **Create Databases and Restore from Backups** > Make sure your Postgres client version matches the server's version (e.g., pg17). ```bash # Set the password environment variable export PGPASSWORD=password # Create and restore each database createdb -h localhost -U postgres employees pg_restore -h localhost -U postgres -d employees -v ./postgres_state/employees.backup createdb -h localhost -U postgres chinook pg_restore -h localhost -U postgres -d chinook -v ./postgres_state/chinook.backup createdb -h localhost -U postgres dvdrental pg_restore -h localhost -U postgres -d dvdrental -v ./postgres_state/dvdrental.backup createdb -h localhost -U postgres sports pg_restore -h localhost -U postgres -d sports -v ./postgres_state/sports.backup createdb -h localhost -U postgres lego pg_restore -h localhost -U postgres -d lego -v ./postgres_state/lego.backup ``` 3. **Verify Databases are Imported** ```bash # List all databases PGPASSWORD=password psql -h localhost -U postgres -c "\l" ``` --- ## 2. Configure Environment Variables Configure environment variables: make sure the following enservice credentials are added in `.mcp_env`: ```env ## PostgreSQL Configuration POSTGRES_HOST="localhost" POSTGRES_PORT="5432" POSTGRES_USERNAME="postgres" POSTGRES_PASSWORD="password" ``` ## 3. Verify Connection Verify the PostgreSQL setup is working correctly: ```bash # Test connection using psql PGPASSWORD=password psql -h localhost -U postgres -c "SELECT version();" ``` ## 4. Common Operations ### Stop PostgreSQL Container ```bash docker stop mcpmark-postgres ``` ### Start PostgreSQL Container ```bash docker start mcpmark-postgres ``` ### Remove PostgreSQL Container (Clean Setup) ```bash docker stop mcpmark-postgres docker rm mcpmark-postgres ``` ### Access PostgreSQL Shell ```bash PGPASSWORD=mysecretpassword psql -h localhost -U postgres ``` ## 5. 
Running Postgres Experiment For single task or task group, run ```bash python -m pipeline --exp-name EXPNAME --mcp postgres --tasks POSTGRESTASK --models MODEL --k K ``` Here *EXPNAME* refers to a customized experiment name, *POSTGRESTASK* refers to the postgres task or task group selected (see `tasks/` for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for supported models), and *K* refers to the number of independent runs. ## 6. Troubleshooting ### Port Already in Use If port 5432 is already in use, you can use a different port: ```bash docker run -d \ --name mcpmark-postgres \ -e POSTGRES_PASSWORD=password \ -e POSTGRES_USER=postgres \ -p 5433:5432 \ pgvector/pgvector:0.8.0-pg17-bookworm ``` Remember to update `POSTGRES_PORT="5433"` in your `.mcp_env` file. ### Connection Refused Ensure the Docker container is running and the port mapping is correct: ```bash docker ps docker logs mcpmark-postgres ``` ================================================ FILE: docs/quickstart.md ================================================ # Quick Start To quickly experience MCPMark, we recommend first preparing the environment and then executing a sample filesystem task. ### 1. Clone MCPMark ```bash git clone https://github.com/eval-sys/mcpmark.git cd mcpmark ``` ### 2. Setup Environment Variables To set up model access via environment variables, edit the `.mcp_env` file in `mcpmark/`. ```env # Model Providers (set only those you need) ## Google Gemini GEMINI_BASE_URL="https://your-gemini-base-url.com/v1" GEMINI_API_KEY="your-gemini-api-key" ## DeepSeek DEEPSEEK_BASE_URL="https://your-deepseek-base-url.com/v1" DEEPSEEK_API_KEY="your-deepseek-api-key" ## OpenAI OPENAI_BASE_URL="https://your-openai-base-url.com/v1" OPENAI_API_KEY="your-openai-api-key" ## Anthropic ANTHROPIC_BASE_URL="https://your-anthropic-base-url.com/v1" ANTHROPIC_API_KEY="your-anthropic-api-key" ## Moonshot MOONSHOT_BASE_URL="https://your-moonshot-base-url.com/v1" MOONSHOT_API_KEY="your-moonshot-api-key" ## xAI XAI_BASE_URL="https://your-xai-base-url.com/v1" XAI_API_KEY="your-xai-api-key" ``` ### 3. Run Quick Example in MCPMark Suppose you want to run an experiment with gemini-2.5-flash and name it test-run-1; you can use the following command to test the `size_classification` task in `file_property`, which categorizes files by their sizes. ```bash python -m pipeline --exp-name test-run-1 --mcp filesystem --tasks file_property/size_classification --models gemini-2.5-flash ``` Here is the expected output (verification may fail depending on the model). [![Sample Experiment Output](https://i.postimg.cc/4NRDYRS2/task-sample-file-property-size-classification.png)](https://postimg.cc/Yj8nPZkQ) The results are saved under `results/{exp_name}/{mcp}_{model}/{tasks}`; if `exp-name` is not specified, the default name is the timestamp of the experiment (but specifying `exp-name` is useful for resuming experiments). For other MCP services, please refer to the [Installation and Docker Usage Page](./installation_and_docker_usage.md) for detailed instructions.
================================================ FILE: pipeline.py ================================================ #!/usr/bin/env python3 """ MCPMark Unified Evaluation Pipeline =================================== This script provides an automated evaluation pipeline for testing Large Language Models (LLMs) on various Model Context Protocol (MCP) services like Notion, GitHub, and PostgreSQL. """ import argparse import sys from datetime import datetime from pathlib import Path from dotenv import load_dotenv from src.logger import get_logger from src.evaluator import MCPEvaluator from src.agents import AGENT_REGISTRY from src.factory import MCPServiceFactory from src.model_config import ModelConfig # Suppress httpcore/anyio cleanup exceptions that don't affect functionality. # These "Exception ignored" messages are caused by MCP library's streamablehttp_client # timing issues during cleanup, but don't impact actual task execution. def _suppress_cleanup_exceptions(unraisable): """Suppress known cleanup exceptions from httpcore/anyio.""" msg = str(unraisable.exc_value) if any( pattern in msg for pattern in [ "async generator ignored GeneratorExit", "cancel scope in a different task", "no running event loop", ] ): return # Silently ignore # Use default handler for other exceptions sys.__unraisablehook__(unraisable) sys.unraisablehook = _suppress_cleanup_exceptions # Initialize logger logger = get_logger(__name__) def main(): """Main entry point for the evaluation pipeline.""" parser = argparse.ArgumentParser(description="MCPMark Unified Evaluation Pipeline.") supported_mcp_services = MCPServiceFactory.get_supported_mcp_services() supported_models = ModelConfig.get_supported_models() # Main configuration parser.add_argument( "--mcp", default="filesystem", choices=supported_mcp_services, help="MCP service to use (default: filesystem)", ) parser.add_argument( "--models", required=True, help="Comma-separated list of models to evaluate (e.g., 'o3,k2,gpt-4.1')", ) parser.add_argument( "--agent", default="mcpmark", choices=sorted(AGENT_REGISTRY.keys()), help="Agent implementation to use (default: mcpmark)", ) parser.add_argument( "--tasks", default="all", help='Tasks to run: (1). "all"; (2). "category"; or (3). "category/task".', ) parser.add_argument( "--task-suite", default="standard", choices=["standard", "easy"], help="Task suite to run (default: standard). Use 'easy' to run the lightweight dataset.", ) parser.add_argument( "--exp-name", default=None, help="Experiment name; results are saved under results/<exp_name>/ (default: YYYY-MM-DD-HH-MM-SS)", ) parser.add_argument( "--k", type=int, default=4, help="Number of evaluation runs (default: 4)", ) # Execution configuration parser.add_argument( "--timeout", type=int, default=3600, help="Timeout in seconds for agent execution", ) parser.add_argument( "--compaction-token", type=int, default=999_999_999, help=( "Auto-compact conversation when prompt tokens (from API usage) reach this limit. " "Use 999999999 to disable compaction."
), ) parser.add_argument( "--reasoning-effort", default="default", choices=["default", "minimal", "low", "medium", "high"], help="Reasoning effort level for supported models (default: None)", ) # Output configuration parser.add_argument( "--output-dir", type=Path, default=Path("./results"), help="Directory to save results", ) # Load arguments and environment variables args = parser.parse_args() load_dotenv(dotenv_path=".mcp_env", override=False) # Validate k parameter and exp-name requirement if args.k > 1 and args.exp_name is None: parser.error("--exp-name is required when k > 1") # Generate default exp-name if not provided if args.exp_name is None: args.exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") # Parse models (no validation - allow unsupported models) model_list = [m.strip() for m in args.models.split(",") if m.strip()] if not model_list: parser.error("No valid models provided") # Log warning for unsupported models but don't error unsupported_models = [m for m in model_list if m not in supported_models] if unsupported_models: logger.warning( f"Using unsupported models: {', '.join(unsupported_models)}. Will use OPENAI_BASE_URL and OPENAI_API_KEY from environment." ) logger.info("MCPMark Evaluation") logger.info( f"Experiment: {args.exp_name} | {len(model_list)} Model(s): {', '.join(model_list)}" ) logger.info(f"Task suite: {args.task_suite}") if args.k > 1: logger.info(f"Running {args.k} evaluation runs for pass@k metrics") # Run k evaluation runs for run_idx in range(1, args.k + 1): if args.k > 1: logger.info(f"\n{'=' * 80}") logger.info(f"Starting Run {run_idx}/{args.k}") logger.info(f"{'=' * 80}\n") # For k-runs, results/{exp}/{mcp}__{model}/run-N run_exp_name = f"run-{run_idx}" run_output_dir = args.output_dir / args.exp_name else: # For single run, still use run-1 under service_model run_exp_name = "run-1" run_output_dir = args.output_dir / args.exp_name # Run evaluation for each model for i, model in enumerate(model_list, 1): logger.info(f"\n{'=' * 60}") if args.k > 1: logger.info( f"Run {run_idx}/{args.k} | Model {i}/{len(model_list)}: {model}" ) else: logger.info(f"Starting evaluation {i}/{len(model_list)}: {model}") logger.info(f"{'=' * 60}\n") # Initialize and run the evaluation pipeline for this model pipeline = MCPEvaluator( mcp_service=args.mcp, model=model, timeout=args.timeout, exp_name=run_exp_name, output_dir=run_output_dir, reasoning_effort=args.reasoning_effort, agent_name=args.agent, task_suite=args.task_suite, compaction_token=args.compaction_token, ) pipeline.run_evaluation(args.tasks) logger.info(f"📁 Results: {pipeline.base_experiment_dir}") logger.info(f"\n{'=' * 60}") if args.k > 1: logger.info(f"✓ All {args.k} runs completed for {len(model_list)} model(s)") logger.info( f"Run `python -m src.aggregators.aggregate_results --exp-name {args.exp_name}` to compute all metrics" ) else: logger.info(f"✓ All evaluations completed for {len(model_list)} model(s)") logger.info(f"{'=' * 60}") if __name__ == "__main__": main() ================================================ FILE: pyproject.toml ================================================ [project] authors = [] name = "MCPMark" requires-python = ">= 3.11" version = "0.0.1" dependencies = [ "notion-client==2.4.0", "playwright>=1.43.0", "seaborn>=0.12.0", "matplotlib>=3.7.0", "numpy>=1.23.0", "openai-agents>=0.2.3,<0.3", "openai>=1.96.1", "python-dotenv>=1.1.1,<2", "ruff>=0.12.4,<0.13", "psycopg2-binary>=2.9.10,<3", "pyyaml>=6.0.2,<7", "nest-asyncio>=1.6.0,<2", "pixi", "pipx>=1.7.1,<2", "pgdumplib>=3.1.0,<4", 
"litellm==1.80.0" ] [build-system] build-backend = "hatchling.build" requires = ["hatchling"] [tool.pixi.workspace] channels = ["conda-forge"] platforms = [ "osx-arm64", "linux-aarch64", "linux-64", "win-64", "osx-64", ] [tool.pixi.tasks] fmt = "ruff" [tool.ruff.format] indent-style = "space" line-ending = "auto" [tool.hatch.build.targets.wheel] packages = ["src", "tasks"] ================================================ FILE: run-benchmark.sh ================================================ #!/bin/bash # MCPMark Full Benchmark Runner # Runs all tasks across all MCP services for comprehensive model evaluation set -e # Default values MODELS="" EXP_NAME="" USE_DOCKER=false SERVICES="filesystem,notion,github,postgres,playwright" PARALLEL=false TIMEOUT=3600 K=4 # Color codes for output RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' NC='\033[0m' # No Color # Function to print colored output print_status() { echo -e "${BLUE}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $1" } print_success() { echo -e "${GREEN}✓${NC} $1" } print_warning() { echo -e "${YELLOW}⚠${NC} $1" } print_error() { echo -e "${RED}✗${NC} $1" } # Parse arguments while [[ $# -gt 0 ]]; do case $1 in --models) MODELS="$2" shift 2 ;; --exp-name) EXP_NAME="$2" shift 2 ;; --docker) USE_DOCKER=true shift ;; --mcps) SERVICES="$2" shift 2 ;; --parallel) PARALLEL=true shift ;; --timeout) TIMEOUT="$2" shift 2 ;; --k) K="$2" shift 2 ;; --help) cat << EOF Usage: $0 --models MODELS --exp-name NAME [OPTIONS] Run comprehensive benchmark across all MCP services. Required Options: --models MODELS Comma-separated list of models to evaluate (e.g., "o3,gpt-4.1,claude-4-sonnet") --exp-name NAME Experiment name for organizing results Optional Options: --docker Run tasks in Docker containers (recommended) --mcps SERVICES Comma-separated list of services to test Default: filesystem,notion,github,postgres,playwright --parallel Run services in parallel (experimental) --timeout SECONDS Timeout per task in seconds (default: 300) --k RUNS Repeat runs per service for pass@k (default: 4) Examples: # Run all services with Docker $0 --models o3,gpt-4.1 --exp-name benchmark-1 --docker # Run specific services locally $0 --models o3 --exp-name test-1 --mcps filesystem,postgres # Run with parallel execution $0 --models claude-4 --exp-name parallel-test --docker --parallel EOF exit 0 ;; *) print_error "Unknown option: $1" echo "Use --help for usage information" exit 1 ;; esac done # Validate required arguments if [ -z "$MODELS" ]; then print_error "Error: --models is required" exit 1 fi if [ -z "$EXP_NAME" ]; then print_error "Error: --exp-name is required" exit 1 fi # Check prerequisites if [ "$USE_DOCKER" = true ]; then if ! command -v docker &> /dev/null; then print_error "Docker is not installed" exit 1 fi # Always use Docker Hub image DOCKER_IMAGE="evalsysorg/mcpmark:latest" # Check if Docker image exists locally, pull only if not found if ! docker image inspect "$DOCKER_IMAGE" >/dev/null 2>&1; then print_status "Docker image not found locally, pulling from Docker Hub..." docker pull "$DOCKER_IMAGE" || { print_error "Failed to pull Docker image from Docker Hub" exit 1 } else print_status "Using local Docker image: $DOCKER_IMAGE" fi else # Check Python installation if ! command -v python3 &> /dev/null; then print_error "Python 3 is not installed" exit 1 fi # Check if dependencies are installed if ! python3 -c "import src.evaluator" 2>/dev/null; then print_warning "Python dependencies not installed" echo "Installing dependencies..." 
pip install -e . || { print_error "Failed to install dependencies" exit 1 } fi fi # Check .mcp_env file if [ ! -f .mcp_env ]; then print_warning ".mcp_env file not found. Some tasks may fail without API credentials." echo "Create one from .mcp_env.example: cp .mcp_env.example .mcp_env" fi # Convert comma-separated services to array IFS=',' read -ra SERVICE_ARRAY <<< "$SERVICES" # Summary echo "" print_status "MCPMark Benchmark Configuration" echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" echo "Models: $MODELS" echo "Experiment: $EXP_NAME" echo "Services: ${SERVICE_ARRAY[*]}" echo "Docker: $USE_DOCKER" echo "Parallel: $PARALLEL" echo "Timeout: ${TIMEOUT}s per task" echo "K-Runs: $K" echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" echo "" # Create results directory RESULTS_DIR="./results/${EXP_NAME}" mkdir -p "$RESULTS_DIR" # Log file for this run with timestamp and models TIMESTAMP=$(date '+%Y%m%d_%H%M%S') LOG_FILE="${RESULTS_DIR}/benchmark_${TIMESTAMP}.log" echo "Benchmark started at $(date '+%Y-%m-%d %H:%M:%S')" > "$LOG_FILE" echo "Models: $MODELS" >> "$LOG_FILE" echo "Services: ${SERVICE_ARRAY[*]}" >> "$LOG_FILE" echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" >> "$LOG_FILE" # Function to run a single service run_service() { local service=$1 local start_time=$(date +%s) local start_time_formatted=$(date '+%Y-%m-%d %H:%M:%S') print_status "[$start_time_formatted] Starting $service tasks..." if [ "$USE_DOCKER" = true ]; then # Run with Docker ./run-task.sh --mcp "$service" \ --models "$MODELS" \ --exp-name "$EXP_NAME" \ --tasks all \ --timeout "$TIMEOUT" \ --k "$K" 2>&1 | tee -a "$LOG_FILE" else # Run locally python3 -m pipeline \ --mcp "$service" \ --models "$MODELS" \ --exp-name "$EXP_NAME" \ --tasks all \ --timeout "$TIMEOUT" \ --k "$K" 2>&1 | tee -a "$LOG_FILE" fi local exit_code=$? local end_time=$(date +%s) local duration=$((end_time - start_time)) if [ $exit_code -eq 0 ]; then print_success "$service completed in ${duration}s" echo "[$(date '+%Y-%m-%d %H:%M:%S')] $service: SUCCESS (${duration}s)" >> "${RESULTS_DIR}/summary.txt" else print_error "$service failed with exit code $exit_code" echo "[$(date '+%Y-%m-%d %H:%M:%S')] $service: FAILED (exit code $exit_code)" >> "${RESULTS_DIR}/summary.txt" fi return $exit_code } # Track overall results TOTAL_SERVICES=${#SERVICE_ARRAY[@]} COMPLETED_SERVICES=0 FAILED_SERVICES=0 # Main execution BENCHMARK_START=$(date +%s) if [ "$PARALLEL" = true ]; then print_status "Running services in parallel..." # Run all services in background for service in "${SERVICE_ARRAY[@]}"; do ( run_service "$service" ) & pids+=($!) done # Wait for all background jobs and collect exit codes for pid in "${pids[@]}"; do if wait $pid; then ((COMPLETED_SERVICES++)) else ((FAILED_SERVICES++)) fi done else print_status "Running services sequentially..." 
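    # Sequential mode: run each service to completion; a failure is recorded in
    # summary.txt but does not abort the remaining services.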
for service in "${SERVICE_ARRAY[@]}"; do if run_service "$service"; then ((COMPLETED_SERVICES++)) else ((FAILED_SERVICES++)) print_warning "Continuing despite failure in $service" fi done fi BENCHMARK_END=$(date +%s) TOTAL_DURATION=$((BENCHMARK_END - BENCHMARK_START)) # Generate final summary echo "" echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" print_status "Benchmark Summary" echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" echo "Completed at: $(date '+%Y-%m-%d %H:%M:%S')" echo "Total Services: $TOTAL_SERVICES" echo "Completed: $COMPLETED_SERVICES" echo "Failed: $FAILED_SERVICES" echo "Total Duration: ${TOTAL_DURATION}s ($(($TOTAL_DURATION / 60))m $(($TOTAL_DURATION % 60))s)" echo "Results saved to: $RESULTS_DIR" echo "Log file: $LOG_FILE" echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" # Final status if [ $FAILED_SERVICES -eq 0 ]; then print_success "Benchmark completed successfully!" exit 0 else print_warning "Benchmark completed with $FAILED_SERVICES failed service(s)" exit 1 fi ================================================ FILE: run-task.sh ================================================ #!/bin/bash # MCPMark Task Runner # Enable strict error handling set -euo pipefail # Default values SERVICE="filesystem" NETWORK_NAME="mcp-network" POSTGRES_CONTAINER="mcp-postgres" # Resource limits (can be overridden by environment variables) DOCKER_MEMORY_LIMIT="${DOCKER_MEMORY_LIMIT:-4g}" DOCKER_CPU_LIMIT="${DOCKER_CPU_LIMIT:-2}" # Cleanup function cleanup() { if [ "${SERVICE:-}" = "postgres" ]; then if docker ps --format '{{.Names}}' | grep -q "^${POSTGRES_CONTAINER}$"; then echo "Cleaning up PostgreSQL container..." docker stop "$POSTGRES_CONTAINER" >/dev/null 2>&1 || true docker rm "$POSTGRES_CONTAINER" >/dev/null 2>&1 || true fi fi } # Set up cleanup on exit trap cleanup EXIT # Parse arguments while [[ $# -gt 0 ]]; do case $1 in --mcp) SERVICE="$2"; shift 2 ;; --help) cat << EOF Usage: $0 [--mcp SERVICE] [PIPELINE_ARGS] Run MCPMark tasks in Docker containers. Options: --mcp SERVICE MCP service (notion|github|filesystem|playwright|postgres) Default: filesystem Environment Variables: DOCKER_MEMORY_LIMIT Memory limit for container (default: 4g) DOCKER_CPU_LIMIT CPU limit for container (default: 2) DOCKER_IMAGE_VERSION Docker image tag to use (default: latest) All other arguments are passed directly to the pipeline. Examples: $0 --mcp notion --models o3 --exp-name test-1 --tasks all $0 --mcp postgres --models gpt-4 --exp-name pg-test --tasks basic_queries EOF exit 0 ;; *) break ;; # Stop parsing, rest goes to pipeline esac done # Docker image tag can be overridden by environment variable DOCKER_IMAGE_REPO="evalsysorg/mcpmark" DOCKER_IMAGE_VERSION="${DOCKER_IMAGE_VERSION:-latest}" DOCKER_IMAGE="${DOCKER_IMAGE_REPO}:${DOCKER_IMAGE_VERSION}" # Check if Docker image exists locally, pull only if not found if ! docker image inspect "$DOCKER_IMAGE" >/dev/null 2>&1; then echo "Docker image not found locally, pulling from Docker Hub..." docker pull "$DOCKER_IMAGE" || { echo "Error: Failed to pull Docker image from Docker Hub" echo "Please check your internet connection or Docker Hub access" exit 1 } else echo "Using local Docker image: $DOCKER_IMAGE" fi # Check if .mcp_env exists (warn but don't fail) if [ ! -f .mcp_env ]; then echo "Warning: .mcp_env file not found. Some tasks may fail without API credentials." fi # Create network if doesn't exist if ! 
docker network ls --format '{{.Name}}' | grep -q "^${NETWORK_NAME}$"; then echo "Creating Docker network: $NETWORK_NAME" docker network create "$NETWORK_NAME" || { echo "Error: Failed to create Docker network" exit 1 } fi # Service-specific configurations if [ "$SERVICE" = "postgres" ]; then # For postgres service, ensure PostgreSQL container is running if ! docker ps --format '{{.Names}}' | grep -q "^${POSTGRES_CONTAINER}$"; then echo "Starting PostgreSQL container..." docker run -d \ --name "$POSTGRES_CONTAINER" \ --network "$NETWORK_NAME" \ -e POSTGRES_DATABASE=postgres \ -e POSTGRES_USER=postgres \ -e POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-password}" \ pgvector/pgvector:0.8.0-pg17-bookworm echo "Waiting for PostgreSQL to be ready..." for i in {1..10}; do if docker exec "$POSTGRES_CONTAINER" pg_isready -U postgres >/dev/null 2>&1; then echo "PostgreSQL is ready!" break fi sleep 1 done else echo "PostgreSQL container already running" fi # Run task with network connection to postgres docker run --rm \ --memory="$DOCKER_MEMORY_LIMIT" \ --cpus="$DOCKER_CPU_LIMIT" \ --network "$NETWORK_NAME" \ -e POSTGRES_HOST="$POSTGRES_CONTAINER" \ -e POSTGRES_PORT=5432 \ -e POSTGRES_USERNAME=postgres \ -e POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-password}" \ -e POSTGRES_DATABASE=postgres \ -v "$(pwd)/results:/app/results" \ -v "$(pwd)/postgres_state:/app/postgres_state" \ $([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \ "$DOCKER_IMAGE" \ python3 -m pipeline --mcp "$SERVICE" --k 1 "$@" elif [ "$SERVICE" = "filesystem" ]; then # For filesystem service, mount test_environments docker run --rm \ --memory="$DOCKER_MEMORY_LIMIT" \ --cpus="$DOCKER_CPU_LIMIT" \ -v "$(pwd)/results:/app/results" \ -v "$(pwd)/test_environments:/app/test_environments" \ $([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \ "$DOCKER_IMAGE" \ python3 -m pipeline --mcp "$SERVICE" --k 1 "$@" elif [ "$SERVICE" = "insforge" ]; then # For Insforge service, use host network to access Insforge backend on host docker run --rm \ --memory="$DOCKER_MEMORY_LIMIT" \ --cpus="$DOCKER_CPU_LIMIT" \ --add-host=host.docker.internal:host-gateway \ -v "$(pwd)/results:/app/results" \ $([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \ "$DOCKER_IMAGE" \ python3 -m pipeline --mcp "$SERVICE" --k 1 "$@" else # For other services (notion, github, playwright, etc.) docker run --rm \ --memory="$DOCKER_MEMORY_LIMIT" \ --cpus="$DOCKER_CPU_LIMIT" \ -v "$(pwd)/results:/app/results" \ -v "$(pwd)/test_environments:/app/test_environments" \ $([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \ $([ -f notion_state.json ] && echo "-v $(pwd)/notion_state.json:/app/notion_state.json") \ "$DOCKER_IMAGE" \ python3 -m pipeline --mcp "$SERVICE" --k 1 "$@" fi echo "Task completed!" ================================================ FILE: src/agents/__init__.py ================================================ """ MCPMark Agent Module ==================== Provides agent implementations and registry for MCPMark. 
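
Example (illustrative sketch; constructor arguments follow BaseMCPAgent.__init__,
and the model name, key, and directory values are placeholders):

    from src.agents import AGENT_REGISTRY

    agent_cls = AGENT_REGISTRY["mcpmark"]  # or "react"
    agent = agent_cls(
        litellm_input_model_name="gpt-4.1",
        api_key="sk-...",
        base_url="",
        mcp_service="filesystem",
        service_config={"test_directory": "/tmp/work"},
    )
    result = agent.execute_sync("Rename the largest file to biggest.txt")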
""" from .base_agent import BaseMCPAgent from .mcpmark_agent import MCPMarkAgent from .react_agent import ReActAgent AGENT_REGISTRY = { "mcpmark": MCPMarkAgent, "react": ReActAgent, } __all__ = ["BaseMCPAgent", "MCPMarkAgent", "ReActAgent", "AGENT_REGISTRY"] ================================================ FILE: src/agents/base_agent.py ================================================ """Shared base agent functionality for MCPMark agents.""" from __future__ import annotations import asyncio import copy import json import uuid from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Callable from src.logger import get_logger from .mcp import MCPStdioServer, MCPHttpServer from .utils import TokenUsageTracker logger = get_logger(__name__) class BaseMCPAgent(ABC): """Base class with shared functionality for MCPMark agents.""" STDIO_SERVICES = [ "notion", "filesystem", "playwright", "playwright_webarena", "postgres", "insforge", "github", ] HTTP_SERVICES = ["supabase"] DEFAULT_TIMEOUT = 600 COMPACTION_DISABLED_TOKEN = 999_999_999 CLAUDE_THINKING_BUDGETS = { "low": 1024, "medium": 2048, "high": 4096, } def __init__( self, litellm_input_model_name: str, api_key: str, base_url: str, mcp_service: str, timeout: int = DEFAULT_TIMEOUT, service_config: Optional[Dict[str, Any]] = None, service_config_provider: Optional[Callable[[], Dict[str, Any]]] = None, reasoning_effort: Optional[str] = "default", compaction_token: int = COMPACTION_DISABLED_TOKEN, ): self.litellm_input_model_name = litellm_input_model_name self.api_key = api_key self.base_url = base_url self.mcp_service = mcp_service self.timeout = timeout self.service_config = service_config or {} self._service_config_provider = service_config_provider self.reasoning_effort = reasoning_effort or "default" self.compaction_token = int(compaction_token) self.is_claude = self._is_anthropic_model(litellm_input_model_name) self.use_claude_thinking = self.is_claude and self.reasoning_effort != "default" self.usage_tracker = TokenUsageTracker() self.litellm_run_model_name = None self._partial_messages: List[Dict[str, Any]] = [] self._partial_token_usage: Dict[str, int] = {} self._partial_turn_count: int = 0 logger.debug( "Initialized %s for service '%s' with model '%s'", self.__class__.__name__, self.mcp_service, self.litellm_input_model_name, ) # Warn if Gemini 3 model uses unsupported reasoning_effort value if self._is_gemini_3_model() and self.reasoning_effort not in [ "default", "low", "high", ]: logger.warning( "Gemini 3 models only support reasoning_effort 'low' or 'high', " "got '%s'. 
LiteLLM may map this to the nearest supported value.", self.reasoning_effort, ) def __repr__(self) -> str: # pragma: no cover - debug helper return ( f"{self.__class__.__name__}(service='{self.mcp_service}', " f"model='{self.litellm_input_model_name}')" ) @abstractmethod async def execute( self, instruction: str, tool_call_log_file: Optional[str] = None, ) -> Dict[str, Any]: """Execute the agent logic and return execution metadata.""" def execute_sync( self, instruction: str, tool_call_log_file: Optional[str] = None, ) -> Dict[str, Any]: """Synchronous wrapper for async execution.""" return asyncio.run(self.execute(instruction, tool_call_log_file)) def get_usage_stats(self) -> Dict[str, Any]: """Return aggregated usage statistics.""" return self.usage_tracker.get_stats() def reset_usage_stats(self): """Clear usage statistics.""" self.usage_tracker.reset() # ------------------------------------------------------------------ # Shared helpers # ------------------------------------------------------------------ def _is_anthropic_model(self, model_name: str) -> bool: return "claude" in model_name.lower() def _get_claude_thinking_budget(self) -> Optional[int]: if not self.use_claude_thinking: return None return self.CLAUDE_THINKING_BUDGETS.get(self.reasoning_effort, 2048) def _refresh_service_config(self): if not self._service_config_provider: return try: latest_cfg = self._service_config_provider() or {} self.service_config.update(latest_cfg) except Exception as exc: # pragma: no cover - best effort refresh logger.warning("Failed to refresh service config: %s", exc) def _reset_progress(self): self._partial_messages = [] self._partial_token_usage = {} self._partial_turn_count = 0 def _update_progress( self, messages: List[Dict[str, Any]], token_usage: Dict[str, Any], turn_count: int, ): try: self._partial_messages = copy.deepcopy(messages) self._partial_token_usage = dict(token_usage or {}) self._partial_turn_count = int(turn_count or 0) except Exception: # pragma: no cover - defensive copy pass # ------------------------------------------------------------------ # MCP server management # ------------------------------------------------------------------ async def _create_mcp_server(self) -> Any: if self.mcp_service in self.STDIO_SERVICES: return self._create_stdio_server() if self.mcp_service in self.HTTP_SERVICES: return self._create_http_server() raise ValueError(f"Unsupported MCP service: {self.mcp_service}") def _create_stdio_server(self) -> MCPStdioServer: if self.mcp_service == "notion": notion_key = self.service_config.get("notion_key") if not notion_key: raise ValueError("Notion API key required") return MCPStdioServer( command="npx", args=["-y", "@notionhq/notion-mcp-server"], env={ "OPENAPI_MCP_HEADERS": ( '{"Authorization": "Bearer ' + notion_key + '", ' '"Notion-Version": "2022-06-28"}' ) }, ) if self.mcp_service == "filesystem": test_directory = self.service_config.get("test_directory") if not test_directory: raise ValueError("Test directory required for filesystem service") return MCPStdioServer( command="npx", args=[ "-y", "@modelcontextprotocol/server-filesystem", str(test_directory), ], ) if self.mcp_service in ("playwright", "playwright_webarena"): browser = self.service_config.get("browser", "chromium") headless = self.service_config.get("headless", True) viewport_width = self.service_config.get("viewport_width", 1280) viewport_height = self.service_config.get("viewport_height", 720) args = ["-y", "@playwright/mcp@latest"] if headless: args.append("--headless") args.extend( [ 
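                    # Remaining @playwright/mcp CLI flags: isolated browser profile,
                    # no OS sandbox, explicit browser choice and viewport size.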
"--isolated", "--no-sandbox", "--browser", browser, "--viewport-size", f"{viewport_width},{viewport_height}", ] ) return MCPStdioServer(command="npx", args=args) if self.mcp_service == "postgres": host = self.service_config.get("host", "localhost") port = self.service_config.get("port", 5432) username = self.service_config.get("username") password = self.service_config.get("password") database = self.service_config.get( "current_database" ) or self.service_config.get("database") if not all([username, password, database]): raise ValueError("PostgreSQL requires username, password, and database") database_url = ( f"postgresql://{username}:{password}@{host}:{port}/{database}" ) return MCPStdioServer( command="pipx", args=["run", "postgres-mcp", "--access-mode=unrestricted"], env={"DATABASE_URI": database_url}, ) if self.mcp_service == "insforge": api_key = self.service_config.get("api_key") backend_url = self.service_config.get("backend_url") if not all([api_key, backend_url]): raise ValueError("Insforge requires api_key and backend_url") return MCPStdioServer( command="npx", args=["-y", "@insforge/mcp@dev"], env={ "INSFORGE_API_KEY": api_key, "INSFORGE_BACKEND_URL": backend_url, }, ) raise ValueError(f"Unsupported stdio service: {self.mcp_service}") def _create_http_server(self) -> MCPHttpServer: if self.mcp_service == "github": github_token = self.service_config.get("github_token") if not github_token: raise ValueError("GitHub token required") return MCPHttpServer( url="https://api.githubcopilot.com/mcp/", headers={ "Authorization": f"Bearer {github_token}", "User-Agent": "MCPMark/1.0", }, ) raise ValueError(f"Unsupported HTTP service: {self.mcp_service}") # ------------------------------------------------------------------ # Message/Tool formatting helpers # ------------------------------------------------------------------ def _compaction_enabled(self) -> bool: return 0 < self.compaction_token < self.COMPACTION_DISABLED_TOKEN def _count_prompt_tokens_litellm(self, messages: List[Dict[str, Any]]) -> int: try: from litellm import token_counter return int( token_counter(model=self.litellm_input_model_name, messages=messages) or 0 ) except Exception: # pragma: no cover - best effort return 0 def _convert_to_sdk_format( self, messages: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: sdk_format: List[Dict[str, Any]] = [] function_call_map: Dict[str, str] = {} for msg in messages: role = msg.get("role") if role == "user": user_content = msg.get("content", "") if isinstance(user_content, list): tool_results = [ item for item in user_content if isinstance(item, dict) and item.get("type") == "tool_result" ] if tool_results: for tr in tool_results: content_items = tr.get("content", []) text_content = "" for ci in content_items: if isinstance(ci, dict) and ci.get("type") == "text": text_content = ci.get("text", "") break sdk_format.append( { "call_id": tr.get("tool_use_id", ""), "output": json.dumps( { "type": "text", "text": text_content, "annotations": None, "meta": None, } ), "type": "function_call_output", } ) else: text_parts = [] for item in user_content: if isinstance(item, dict) and item.get("type") == "text": text_parts.append(item.get("text", "")) sdk_format.append( {"content": "\n".join(text_parts), "role": "user"} ) else: sdk_format.append({"content": user_content, "role": "user"}) elif role == "assistant": tool_calls = msg.get("tool_calls", []) function_call = msg.get("function_call") content = msg.get("content") if isinstance(content, list): text_parts = [] claude_tool_uses = [] for 
block in content: if isinstance(block, dict): if block.get("type") == "text": text_parts.append(block.get("text", "")) elif block.get("type") == "thinking": thinking_text = block.get("thinking", "") if thinking_text: text_parts.append( f"\n{thinking_text}\n" ) elif block.get("type") == "tool_use": claude_tool_uses.append(block) content = "\n".join(text_parts) if claude_tool_uses and not tool_calls: tool_calls = [] for tu in claude_tool_uses: tool_calls.append( { "id": tu.get("id"), "function": { "name": tu.get("name"), "arguments": json.dumps(tu.get("input", {})), }, } ) if content: sdk_format.append( { "id": "__fake_id__", "content": [ { "annotations": [], "text": content, "type": "output_text", } ], "role": "assistant", "status": "completed", "type": "message", } ) if tool_calls: for tool_call in tool_calls: call_id = tool_call.get("id", f"call_{uuid.uuid4().hex}") func_name = tool_call.get("function", {}).get("name", "") sdk_format.append( { "arguments": tool_call.get("function", {}).get( "arguments", "{}" ), "call_id": call_id, "name": func_name, "type": "function_call", "id": "__fake_id__", } ) if function_call: func_name = function_call.get("name", "") call_id = f"call_{uuid.uuid4().hex}" function_call_map[func_name] = call_id sdk_format.append( { "arguments": function_call.get("arguments", "{}"), "call_id": call_id, "name": func_name, "type": "function_call", "id": "__fake_id__", } ) elif role == "tool": sdk_format.append( { "call_id": msg.get("tool_call_id", ""), "output": json.dumps( { "type": "text", "text": msg.get("content", ""), "annotations": None, "meta": None, } ), "type": "function_call_output", } ) elif role == "function": func_name = msg.get("name", "") call_id = function_call_map.get(func_name, f"call_{uuid.uuid4().hex}") sdk_format.append( { "call_id": call_id, "output": json.dumps( { "type": "text", "text": msg.get("content", ""), "annotations": None, "meta": None, } ), "type": "function_call_output", } ) return sdk_format def _convert_to_anthropic_format( self, tools: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: anthropic_tools = [] for tool in tools: anthropic_tool = { "name": tool.get("name"), "description": tool.get("description", ""), "input_schema": tool.get( "inputSchema", {"type": "object", "properties": {}, "required": []}, ), } anthropic_tools.append(anthropic_tool) return anthropic_tools def _is_gemini_model(self) -> bool: model_lower = self.litellm_input_model_name.lower() return "gemini" in model_lower or "bison" in model_lower def _is_gemini_3_model(self) -> bool: """Check if this is a Gemini 3 series model.""" model_lower = self.litellm_input_model_name.lower() return "gemini-3" in model_lower or "gemini/gemini-3" in model_lower def _simplify_schema_for_gemini( self, schema: Optional[Dict[str, Any]] ) -> Dict[str, Any]: if not isinstance(schema, dict): return schema or {} simplified: Dict[str, Any] = {} for key, value in schema.items(): if key == "type" and isinstance(value, list): simplified[key] = value[0] if value else "string" elif key == "items" and isinstance(value, dict): simplified[key] = self._simplify_schema_for_gemini(value) elif key == "properties" and isinstance(value, dict): simplified[key] = { prop_key: self._simplify_schema_for_gemini(prop_val) for prop_key, prop_val in value.items() } elif isinstance(value, dict): simplified[key] = self._simplify_schema_for_gemini(value) elif isinstance(value, list) and key not in ("required", "enum"): simplified[key] = [ self._simplify_schema_for_gemini(item) if isinstance(item, dict) else item for 
item in value ] else: simplified[key] = value return simplified def _convert_to_openai_format( self, tools: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: functions = [] is_gemini = self._is_gemini_model() if is_gemini: logger.debug( "Detected Gemini model '%s' – simplifying tool schemas", self.litellm_input_model_name, ) for tool in tools: input_schema = tool.get( "inputSchema", {"type": "object", "properties": {}, "required": []} ) if is_gemini: simplified = self._simplify_schema_for_gemini(input_schema) if simplified != input_schema: input_schema = simplified logger.debug("Simplified schema for tool '%s'", tool.get("name")) functions.append( { "name": tool.get("name"), "description": tool.get("description", ""), "parameters": input_schema, } ) if is_gemini: logger.info("Converted %d tools for Gemini compatibility", len(functions)) return functions ================================================ FILE: src/agents/mcp/__init__.py ================================================ """ MCP (Model Context Protocol) Components ======================================== Minimal MCP server implementations for MCPMark. """ from .stdio_server import MCPStdioServer from .http_server import MCPHttpServer __all__ = ["MCPStdioServer", "MCPHttpServer"] ================================================ FILE: src/agents/mcp/http_server.py ================================================ """ Minimal MCP HTTP Server Implementation ======================================= Provides HTTP-based MCP server communication for services like GitHub. """ import asyncio from contextlib import AsyncExitStack from typing import Any, Dict, List, Optional from mcp import ClientSession from mcp.client.streamable_http import streamablehttp_client class MCPHttpServer: """ HTTP-based MCP client using the official MCP Python SDK (Streamable HTTP transport). 
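
    Example (illustrative sketch; the URL and header mirror the GitHub configuration
    in BaseMCPAgent._create_http_server, and the tool name/arguments are hypothetical):

        async with MCPHttpServer(
            url="https://api.githubcopilot.com/mcp/",
            headers={"Authorization": "Bearer <github_token>"},
        ) as server:
            tools = await server.list_tools()
            result = await server.call_tool("search_repositories", {"query": "mcpmark"})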
""" def __init__( self, url: str, headers: Optional[Dict[str, str]] = None, timeout: int = 30, ): self.url = url.rstrip("/") self.headers = headers or {} self.timeout = timeout self._stack: Optional[AsyncExitStack] = None self.session: Optional[ClientSession] = None self._tools_cache: Optional[List[Dict[str, Any]]] = None async def __aenter__(self): await self.start() return self async def __aexit__(self, exc_type, exc, tb): await self.stop() async def start(self): """Open Streamable HTTP transport and initialize MCP session.""" self._stack = AsyncExitStack() read_stream, write_stream, _ = await self._stack.enter_async_context( streamablehttp_client(self.url, headers=self.headers) ) self.session = await self._stack.enter_async_context(ClientSession(read_stream, write_stream)) await asyncio.wait_for(self.session.initialize(), timeout=self.timeout) async def stop(self): """Close the session/transport cleanly.""" if self._stack: await self._stack.aclose() self._stack = None self.session = None self._tools_cache = None async def list_tools(self) -> List[Dict[str, Any]]: """Return tool definitions (cached).""" if self._tools_cache is not None: return self._tools_cache if not self.session: raise RuntimeError("MCP HTTP client not started") resp = await asyncio.wait_for(self.session.list_tools(), timeout=self.timeout) self._tools_cache = [t.model_dump() for t in resp.tools] return self._tools_cache async def call_tool(self, name: str, arguments: Dict[str, Any]) -> Any: """Invoke a remote tool and return the structured result.""" if not self.session: raise RuntimeError("MCP HTTP client not started") result = await asyncio.wait_for(self.session.call_tool(name, arguments), timeout=self.timeout) return result.model_dump() ================================================ FILE: src/agents/mcp/stdio_server.py ================================================ """ Minimal MCP Stdio Server Implementation ======================================== Provides stdio-based MCP server communication for services like Notion, Filesystem, Playwright, and Postgres. 
""" import asyncio import os from contextlib import AsyncExitStack from typing import Any, Dict, List, Optional from mcp import ClientSession, StdioServerParameters from mcp.client.stdio import stdio_client class MCPStdioServer: """Lightweight wrapper around the official MCP Python SDK.""" def __init__(self, command: str, args: List[str], env: Optional[Dict[str, str]] = None, timeout: int = 120): self.params = StdioServerParameters(command=command, args=args, env={**os.environ, **(env or {})}) self.timeout = timeout self._stack: Optional[AsyncExitStack] = None self._streams = None self.session: Optional[ClientSession] = None async def __aenter__(self): self._stack = AsyncExitStack() read, write = await self._stack.enter_async_context(stdio_client(self.params)) self.session = await self._stack.enter_async_context(ClientSession(read, write)) await asyncio.wait_for(self.session.initialize(), timeout=self.timeout) return self async def __aexit__(self, exc_type, exc, tb): if self._stack: await self._stack.aclose() self._stack = None self.session = None async def list_tools(self) -> List[Dict[str, Any]]: resp = await asyncio.wait_for(self.session.list_tools(), timeout=self.timeout) return [t.model_dump() for t in resp.tools] async def call_tool(self, name: str, arguments: Dict[str, Any]) -> Any: result = await asyncio.wait_for(self.session.call_tool(name, arguments), timeout=self.timeout) return result.model_dump() # 同上,转成 dict ================================================ FILE: src/agents/mcpmark_agent.py ================================================ """ MCPMark Agent Implementation ============================ Unified agent using LiteLLM for all model interactions with minimal MCP support. """ import asyncio import json import time from typing import Any, Dict, List, Optional, Callable from pydantic import AnyUrl import httpx import litellm import nest_asyncio from src.logger import get_logger from .base_agent import BaseMCPAgent from .mcp import MCPStdioServer, MCPHttpServer # Apply nested asyncio support nest_asyncio.apply() # Configure LiteLLM litellm.suppress_debug_info = True logger = get_logger(__name__) # To fix the "Object of type AnyUrl is not JSON serializable" error in the find_file_contents function. class CustomJSONEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, AnyUrl): return str(obj) return super().default(obj) class MCPMarkAgent(BaseMCPAgent): """ Unified agent for LLM and MCP server management using LiteLLM. - Anthropic models: Native MCP support via extra_body - Other models: Manual MCP server management with function calling """ MAX_TURNS = 100 SYSTEM_PROMPT = ( "You are a helpful agent that uses tools iteratively to complete the user's task, " 'and when finished, provides the final answer or simply states "Task completed" without further tool calls.' ) COMPACTION_PROMPT = ( "You are performing a CONTEXT CHECKPOINT COMPACTION.\n" "Summarize the conversation so far for another model to continue.\n\n" "Include:\n" "- Current progress and key decisions made\n" "- Important context, constraints, or user preferences\n" "- What remains to be done (clear next steps)\n" "- Any critical data, examples, or references needed to continue\n\n" "Be concise and structured. Do NOT call tools." 
) DEFAULT_TIMEOUT = BaseMCPAgent.DEFAULT_TIMEOUT def __init__( self, litellm_input_model_name: str, api_key: str, base_url: str, mcp_service: str, timeout: int = DEFAULT_TIMEOUT, service_config: Optional[Dict[str, Any]] = None, service_config_provider: Optional[Callable[[], Dict[str, Any]]] = None, reasoning_effort: Optional[str] = "default", compaction_token: int = BaseMCPAgent.COMPACTION_DISABLED_TOKEN, ): super().__init__( litellm_input_model_name=litellm_input_model_name, api_key=api_key, base_url=base_url, mcp_service=mcp_service, timeout=timeout, service_config=service_config, service_config_provider=service_config_provider, reasoning_effort=reasoning_effort, compaction_token=compaction_token, ) logger.debug( "Initialized MCPMarkAgent for '%s' with model '%s' (Claude: %s, Thinking: %s, Reasoning: %s)", mcp_service, litellm_input_model_name, self.is_claude, self.use_claude_thinking, reasoning_effort, ) # ==================== Public Interface Methods ==================== async def execute( self, instruction: str, tool_call_log_file: Optional[str] = None ) -> Dict[str, Any]: """ Execute instruction with the agent. Args: instruction: The instruction/prompt to execute tool_call_log_file: Optional path to log tool calls Returns: Dictionary containing execution results """ start_time = time.time() try: # Reset partial progress for this run self._reset_progress() # Refresh service configuration self._refresh_service_config() # Execute with timeout control async def _execute_with_strategy(): if self.use_claude_thinking: # Claude with thinking -> native Anthropic API with tools return await self._execute_claude_native_with_tools( instruction, tool_call_log_file ) else: # All other cases -> LiteLLM with tools return await self._execute_litellm_with_tools( instruction, tool_call_log_file ) # Apply timeout to the entire execution result = await asyncio.wait_for( _execute_with_strategy(), timeout=self.timeout ) execution_time = time.time() - start_time # Update usage statistics self.usage_tracker.update( success=result["success"], token_usage=result.get("token_usage", {}), turn_count=result.get("turn_count", 0), execution_time=execution_time, ) result["execution_time"] = execution_time return result except Exception as e: execution_time = time.time() - start_time if isinstance(e, asyncio.TimeoutError): error_msg = f"Execution timed out after {self.timeout} seconds" logger.error(error_msg) else: error_msg = f"Agent execution failed: {e}" logger.error(error_msg, exc_info=True) self.usage_tracker.update( success=False, token_usage=self._partial_token_usage or {}, turn_count=self._partial_turn_count or 0, execution_time=execution_time, ) if self._partial_messages: if not self.is_claude: final_msg = self._convert_to_sdk_format(self._partial_messages) else: final_msg = self._partial_messages else: final_msg = [] return { "success": False, "output": final_msg, "token_usage": self._partial_token_usage or {}, "turn_count": self._partial_turn_count or 0, "execution_time": execution_time, "error": error_msg, "litellm_run_model_name": self.litellm_run_model_name, } def execute_sync( self, instruction: str, tool_call_log_file: Optional[str] = None ) -> Dict[str, Any]: """ Synchronous wrapper for execute method. 
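
        Example (illustrative; instruction text is a placeholder):

            result = agent.execute_sync("List the files in the working directory")
            print(result["success"], result["turn_count"])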
""" return asyncio.run(self.execute(instruction, tool_call_log_file)) def get_usage_stats(self) -> Dict[str, Any]: """Get usage statistics.""" return self.usage_tracker.get_stats() def reset_usage_stats(self): """Reset usage statistics.""" self.usage_tracker.reset() # ==================== Claude Native API Execution Path ==================== async def _execute_claude_native_with_tools( self, instruction: str, tool_call_log_file: Optional[str] = None ) -> Dict[str, Any]: """ Execute Claude with thinking using native Anthropic API. Creates MCP server, gets tools, and executes with thinking. """ logger.debug("Using Claude native API with thinking") thinking_budget = self._get_claude_thinking_budget() # Create and start MCP server mcp_server = await self._create_mcp_server() async with mcp_server: # Get available tools tools = await mcp_server.list_tools() # Convert MCP tools to Anthropic format anthropic_tools = self._convert_to_anthropic_format(tools) # Execute with function calling loop return await self._execute_anthropic_native_tool_loop( instruction, anthropic_tools, mcp_server, thinking_budget, tool_call_log_file, ) async def _call_claude_native_api( self, messages: List[Dict], thinking_budget: int, tools: Optional[List[Dict]] = None, mcp_servers: Optional[List[Dict]] = None, system: Optional[str] = None, ) -> Dict[str, Any]: """ Call Claude's native API directly using httpx. Args: messages: Conversation messages thinking_budget: Token budget for thinking tools: Tool definitions for function calling mcp_servers: MCP server configurations system: System prompt Returns: API response as dictionary """ # Get API base and headers import os api_base = os.getenv("ANTHROPIC_API_BASE", "https://api.anthropic.com") headers = { "x-api-key": self.api_key, "anthropic-version": "2023-06-01", "content-type": "application/json", "anthropic-beta": "context-1m-2025-08-07", # by default } # Build payload max_tokens = max(thinking_budget + 4096, 4096) payload = { "model": self.litellm_input_model_name.replace("anthropic/", ""), "max_tokens": max_tokens, "messages": messages, } # Add thinking configuration if thinking_budget: payload["thinking"] = {"type": "enabled", "budget_tokens": thinking_budget} # Add tools if provided if tools: payload["tools"] = tools payload["tool_choice"] = {"type": "auto"} # Add MCP servers if provided if mcp_servers: headers["anthropic-beta"] = "mcp-client-2025-04-04" payload["mcp_servers"] = mcp_servers # Add system prompt if provided if system: payload["system"] = system # Make the API call async with httpx.AsyncClient() as client: try: response = await client.post( f"{api_base}/v1/messages", headers=headers, json=payload, timeout=self.timeout, ) response.raise_for_status() return response.json(), None except httpx.HTTPStatusError as e: return None, e.response.text except Exception as e: return None, e async def _count_claude_input_tokens( self, messages: List[Dict[str, Any]], tools: Optional[List[Dict]] = None, system: Optional[str] = None, ) -> int: import os api_base = os.getenv("ANTHROPIC_API_BASE", "https://api.anthropic.com") headers = { "x-api-key": self.api_key, "anthropic-version": "2023-06-01", "content-type": "application/json", } payload: Dict[str, Any] = { "model": self.litellm_input_model_name.replace("anthropic/", ""), "messages": messages, } if tools: payload["tools"] = tools if system: payload["system"] = system async with httpx.AsyncClient() as client: response = await client.post( f"{api_base}/v1/messages/count_tokens", headers=headers, json=payload, 
timeout=self.timeout, ) response.raise_for_status() data = response.json() or {} return int(data.get("input_tokens", 0) or 0) def _extract_litellm_text(self, response: Any) -> str: try: choices = getattr(response, "choices", None) or [] if not choices: return "" msg = getattr(choices[0], "message", None) if msg is not None: return str(getattr(msg, "content", "") or "") return str(getattr(choices[0], "text", "") or "") except Exception: # pragma: no cover - best effort return "" def _extract_anthropic_text(self, response_json: Dict[str, Any]) -> str: pieces: List[str] = [] for block in response_json.get("content", []) or []: if isinstance(block, dict) and block.get("type") == "text": text = block.get("text") if text: pieces.append(str(text)) return "\n".join(pieces).strip() def _merge_usage(self, total_tokens: Dict[str, int], usage: Dict[str, Any]) -> None: try: input_tokens = int(usage.get("input_tokens", 0) or 0) output_tokens = int(usage.get("output_tokens", 0) or 0) total_tokens_count = int( usage.get("total_tokens", 0) or (input_tokens + output_tokens) ) total_tokens["input_tokens"] += input_tokens total_tokens["output_tokens"] += output_tokens total_tokens["total_tokens"] += total_tokens_count except Exception: # pragma: no cover - best effort return async def _maybe_compact_litellm_messages( self, messages: List[Dict[str, Any]], total_tokens: Dict[str, int], tool_call_log_file: Optional[str], current_prompt_tokens: int, ) -> List[Dict[str, Any]]: if not self._compaction_enabled(): return messages if current_prompt_tokens < self.compaction_token: return messages logger.info( f"| [compaction] Triggered at prompt tokens: {current_prompt_tokens:,}" ) if tool_call_log_file: try: with open(tool_call_log_file, "a", encoding="utf-8") as f: f.write( f"| [compaction] Triggered at prompt tokens: {current_prompt_tokens:,}\n" ) except Exception: pass compact_messages = [ {"role": "system", "content": self.COMPACTION_PROMPT}, {"role": "user", "content": json.dumps(messages, ensure_ascii=False)}, ] completion_kwargs = { "model": self.litellm_input_model_name, "messages": compact_messages, "api_key": self.api_key, } if self.base_url: completion_kwargs["base_url"] = self.base_url response = await litellm.acompletion(**completion_kwargs) usage = getattr(response, "usage", None) if usage: input_tokens = ( getattr(usage, "prompt_tokens", None) or getattr(usage, "input_tokens", None) or 0 ) output_tokens = ( getattr(usage, "completion_tokens", None) or getattr(usage, "output_tokens", None) or 0 ) total_tokens_count = getattr(usage, "total_tokens", None) if total_tokens_count is None: total_tokens_count = input_tokens + output_tokens total_tokens["input_tokens"] += int(input_tokens or 0) total_tokens["output_tokens"] += int(output_tokens or 0) total_tokens["total_tokens"] += int(total_tokens_count or 0) summary = self._extract_litellm_text(response).strip() or "(no summary)" system_msg = ( messages[0] if messages else {"role": "system", "content": self.SYSTEM_PROMPT} ) first_user = ( messages[1] if len(messages) > 1 else {"role": "user", "content": ""} ) return [ system_msg, first_user, { "role": "user", "content": f"Context summary (auto-compacted due to token limit):\n{summary}", }, ] async def _maybe_compact_anthropic_messages( self, messages: List[Dict[str, Any]], total_tokens: Dict[str, int], thinking_budget: int, tool_call_log_file: Optional[str], current_input_tokens: int, ) -> List[Dict[str, Any]]: if not self._compaction_enabled(): return messages if current_input_tokens < self.compaction_token: 
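            # Still under the compaction threshold: keep the conversation as-is.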
return messages logger.info( f"| [compaction] Triggered at input tokens: {current_input_tokens:,}" ) if tool_call_log_file: try: with open(tool_call_log_file, "a", encoding="utf-8") as f: f.write( f"| [compaction] Triggered at input tokens: {current_input_tokens:,}\n" ) except Exception: pass compact_messages = [ {"role": "user", "content": self.COMPACTION_PROMPT}, {"role": "user", "content": json.dumps(messages, ensure_ascii=False)}, ] response, error_msg = await self._call_claude_native_api( messages=compact_messages, thinking_budget=thinking_budget, tools=None, system=None, ) if error_msg or not response: logger.warning(f"| [compaction] Failed: {error_msg}") return messages usage = response.get("usage", {}) or {} input_tokens = usage.get("input_tokens", 0) or 0 output_tokens = usage.get("output_tokens", 0) or 0 total_tokens["input_tokens"] += int(input_tokens) total_tokens["output_tokens"] += int(output_tokens) total_tokens["total_tokens"] += int(input_tokens + output_tokens) summary = self._extract_anthropic_text(response) or "(no summary)" first_user = messages[0] if messages else {"role": "user", "content": ""} return [ first_user, { "role": "user", "content": f"Context summary (auto-compacted due to token limit):\n{summary}", }, ] async def _execute_anthropic_native_tool_loop( self, instruction: str, tools: List[Dict], mcp_server: Any, thinking_budget: int, tool_call_log_file: Optional[str] = None, ) -> Dict[str, Any]: """ Execute Claude thinking loop with function calling. Handles thinking blocks, tool calls, and message formatting. """ messages = [{"role": "user", "content": instruction}] total_tokens = { "input_tokens": 0, "output_tokens": 0, "total_tokens": 0, "reasoning_tokens": 0, } turn_count = 0 max_turns = self.MAX_TURNS hit_turn_limit = False ended_normally = False system_text = self.SYSTEM_PROMPT # Record initial state self._update_progress(messages, total_tokens, turn_count) for _ in range(max_turns): turn_count += 1 current_input_tokens = 0 if self._compaction_enabled(): try: current_input_tokens = await self._count_claude_input_tokens( messages=messages, tools=tools, system=system_text, ) except Exception as exc: # noqa: BLE001 logger.debug("Claude token counting failed: %s", exc) messages = await self._maybe_compact_anthropic_messages( messages=messages, total_tokens=total_tokens, thinking_budget=thinking_budget, tool_call_log_file=tool_call_log_file, current_input_tokens=current_input_tokens, ) self._update_progress(messages, total_tokens, turn_count) # Call Claude native API response, error_msg = await self._call_claude_native_api( messages=messages, thinking_budget=thinking_budget, tools=tools, system=system_text, ) if turn_count == 1: self.litellm_run_model_name = response["model"].split("/")[-1] if error_msg: break # Update token usage if "usage" in response: usage = response["usage"] input_tokens = usage.get("input_tokens", 0) output_tokens = usage.get("output_tokens", 0) # Calculate output tokens as total - input for consistency total_tokens_count = output_tokens + input_tokens total_tokens["input_tokens"] += input_tokens total_tokens["output_tokens"] += output_tokens total_tokens["total_tokens"] += total_tokens_count ## TODO: add reasoning tokens for claude # Extract blocks from response blocks = response.get("content", []) tool_uses = [b for b in blocks if b.get("type") == "tool_use"] thinking_blocks = [b for b in blocks if b.get("type") == "thinking"] text_blocks = [b for b in blocks if b.get("type") == "text"] # Log text output for tb in text_blocks: if 
tb.get("text") and tool_call_log_file: with open(tool_call_log_file, "a", encoding="utf-8") as f: f.write(f"{tb['text']}\n") if tb.get("text"): for line in tb["text"].splitlines(): logger.info(f"| {line}") # Build assistant message with all blocks assistant_content = [] # Add thinking blocks for tb in thinking_blocks: assistant_content.append( { "type": "thinking", "thinking": tb.get("thinking", ""), "signature": tb.get("signature", ""), } ) # Add text blocks for tb in text_blocks: if tb.get("text"): assistant_content.append({"type": "text", "text": tb["text"]}) # Add tool_use blocks for tu in tool_uses: assistant_content.append( { "type": "tool_use", "id": tu.get("id"), "name": tu.get("name"), "input": tu.get("input", {}), } ) messages.append({"role": "assistant", "content": assistant_content}) # Update partial progress after assistant response self._update_progress(messages, total_tokens, turn_count) # If no tool calls, we're done if not tool_uses: ended_normally = True break # Execute tools and add results tool_results = [] for tu in tool_uses: name = tu.get("name") inputs = tu.get("input", {}) # Log tool call args_str = json.dumps(inputs, separators=(",", ": ")) display_args = ( args_str[:140] + "..." if len(args_str) > 140 else args_str ) logger.info(f"| \033[1m{name}\033[0m \033[2;37m{display_args}\033[0m") if tool_call_log_file: with open(tool_call_log_file, "a", encoding="utf-8") as f: f.write(f"| {name} {args_str}\n") # Execute tool try: result = await asyncio.wait_for( mcp_server.call_tool(name, inputs), timeout=60 ) tool_results.append( { "type": "tool_result", "tool_use_id": tu["id"], "content": [ { "type": "text", "text": json.dumps(result, cls=CustomJSONEncoder), } ], } ) except Exception as e: logger.error(f"Tool call failed: {e}") tool_results.append( { "type": "tool_result", "tool_use_id": tu["id"], "content": [{"type": "text", "text": f"Error: {str(e)}"}], } ) messages.append({"role": "user", "content": tool_results}) # Update partial progress after tool results self._update_progress(messages, total_tokens, turn_count) # Detect if we exited due to hitting the turn limit if not ended_normally: if turn_count >= max_turns: hit_turn_limit = True logger.warning( f"| Max turns ({max_turns}) exceeded; returning failure with partial output." 
) if tool_call_log_file: try: with open(tool_call_log_file, "a", encoding="utf-8") as f: f.write(f"| Max turns ({max_turns}) exceeded\n") except Exception: pass elif error_msg: logger.warning(f"| {error_msg}\n") if tool_call_log_file: try: with open(tool_call_log_file, "a", encoding="utf-8") as f: f.write(f"| {error_msg}\n") except Exception: pass # Display final token usage if total_tokens["total_tokens"] > 0: log_msg = ( f"|\n| Token usage: Total: {total_tokens['total_tokens']:,} | " f"Input: {total_tokens['input_tokens']:,} | " f"Output: {total_tokens['output_tokens']:,}" ) if total_tokens.get("reasoning_tokens", 0) > 0: log_msg += f" | Reasoning: {total_tokens['reasoning_tokens']:,}" logger.info(log_msg) logger.info(f"| Turns: {turn_count}") # Convert messages to SDK format sdk_format_messages = self._convert_to_sdk_format(messages) if hit_turn_limit: return { "success": False, "output": sdk_format_messages, "token_usage": total_tokens, "turn_count": turn_count, "error": f"Max turns ({max_turns}) exceeded", "litellm_run_model_name": self.litellm_run_model_name, } if error_msg: return { "success": False, "output": sdk_format_messages, "token_usage": total_tokens, "turn_count": turn_count, "error": error_msg, "litellm_run_model_name": self.litellm_run_model_name, } return { "success": True, "output": sdk_format_messages, "token_usage": total_tokens, "turn_count": turn_count, "error": None, "litellm_run_model_name": self.litellm_run_model_name, } # ==================== LiteLLM Execution Path ==================== async def _execute_litellm_with_tools( self, instruction: str, tool_call_log_file: Optional[str] = None ) -> Dict[str, Any]: """ Execute with manual MCP server management. Used for all non-Anthropic models and Anthropic models with STDIO services. 
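
        Flow: start the MCP server, convert its tools to OpenAI function format,
        then drive the LiteLLM tool-calling loop until the model stops requesting tools.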
""" logger.debug("Using manual MCP execution with function calling loop") # Create and start MCP server mcp_server = await self._create_mcp_server() try: async with mcp_server: # Get available tools tools = await mcp_server.list_tools() # Convert MCP tools to OpenAI function format functions = self._convert_to_openai_format(tools) # Execute with function calling loop return await self._execute_litellm_tool_loop( instruction, functions, mcp_server, tool_call_log_file ) except Exception as e: logger.error(f"Manual MCP execution failed: {e}") raise async def _execute_litellm_tool_loop( self, instruction: str, functions: List[Dict], mcp_server: Any, tool_call_log_file: Optional[str] = None, ) -> Dict[str, Any]: """Execute function calling loop with LiteLLM.""" messages = [ {"role": "system", "content": self.SYSTEM_PROMPT}, {"role": "user", "content": instruction}, ] total_tokens = { "input_tokens": 0, "output_tokens": 0, "total_tokens": 0, "reasoning_tokens": 0, } turn_count = 0 max_turns = self.MAX_TURNS # Limit turns to prevent infinite loops consecutive_failures = 0 max_consecutive_failures = 3 hit_turn_limit = False ended_normally = False # Convert functions to tools format for newer models tools = ( [{"type": "function", "function": func} for func in functions] if functions else None ) if tool_call_log_file and tools: max_name_length = ( max(len(tool.get("function", {}).get("name", "")) for tool in tools) if tools else 15 ) with open(tool_call_log_file, "a", encoding="utf-8") as f: f.write("===== Available Tools =====\n") for tool in tools: function_info = tool.get("function", {}) tool_name = function_info.get("name", "N/A") description = function_info.get("description", "N/A") f.write( f"- ToolName: {tool_name:<{max_name_length}} Description: {description}\n" ) f.write("\n===== Execution Logs =====\n") # Record initial state self._update_progress(messages, total_tokens, turn_count) try: while turn_count < max_turns: current_prompt_tokens = 0 if self._compaction_enabled(): current_prompt_tokens = self._count_prompt_tokens_litellm(messages) messages = await self._maybe_compact_litellm_messages( messages=messages, total_tokens=total_tokens, tool_call_log_file=tool_call_log_file, current_prompt_tokens=current_prompt_tokens, ) self._update_progress(messages, total_tokens, turn_count) # Build completion kwargs completion_kwargs = { "model": self.litellm_input_model_name, "messages": messages, "api_key": self.api_key, } # Always use tools format if available - LiteLLM will handle conversion if tools: completion_kwargs["tools"] = tools completion_kwargs["tool_choice"] = "auto" # Add reasoning_effort and base_url if specified if self.reasoning_effort != "default": completion_kwargs["reasoning_effort"] = self.reasoning_effort if self.base_url: completion_kwargs["base_url"] = self.base_url try: # Call LiteLLM with timeout for individual call response = await asyncio.wait_for( litellm.acompletion(**completion_kwargs), timeout=self.timeout / 2, # Use half of total timeout ) consecutive_failures = 0 # Reset failure counter on success except asyncio.TimeoutError: logger.warning(f"| ✗ LLM call timed out on turn {turn_count + 1}") consecutive_failures += 1 if consecutive_failures >= max_consecutive_failures: raise Exception( f"Too many consecutive failures ({consecutive_failures})" ) await asyncio.sleep(8**consecutive_failures) # Exponential backoff continue except Exception as e: logger.error(f"| ✗ LLM call failed on turn {turn_count + 1}: {e}") consecutive_failures += 1 if consecutive_failures >= 
max_consecutive_failures: raise if "ContextWindowExceededError" in str(e): # Best-effort fallback: compact and retry once. messages = await self._maybe_compact_litellm_messages( messages=messages, total_tokens=total_tokens, tool_call_log_file=tool_call_log_file, current_prompt_tokens=self.compaction_token, ) self._update_progress(messages, total_tokens, turn_count) continue elif "RateLimitError" in str(e): await asyncio.sleep(12**consecutive_failures) else: await asyncio.sleep(2**consecutive_failures) continue # Extract actual model name from response (first turn only) if turn_count == 0 and hasattr(response, "model") and response.model: self.litellm_run_model_name = response.model.split("/")[-1] # Update token usage including reasoning tokens if hasattr(response, "usage") and response.usage: input_tokens = response.usage.prompt_tokens or 0 total_tokens_count = response.usage.total_tokens or 0 # Calculate output tokens as total - input for consistency output_tokens = ( total_tokens_count - input_tokens if total_tokens_count > 0 else (response.usage.completion_tokens or 0) ) total_tokens["input_tokens"] += input_tokens total_tokens["output_tokens"] += output_tokens total_tokens["total_tokens"] += total_tokens_count # Extract reasoning tokens if available if hasattr(response.usage, "completion_tokens_details"): details = response.usage.completion_tokens_details if hasattr(details, "reasoning_tokens"): total_tokens["reasoning_tokens"] += ( details.reasoning_tokens or 0 ) # Get response message choices = response.choices if len(choices): message = choices[0].message # deeply dump the message to ensure we capture all fields message_dict = ( message.model_dump() if hasattr(message, "model_dump") else dict(message) ) # Explicitly preserve function_call if present (even if tool_calls exists), # as it may contain provider-specific metadata (e.g. 
Gemini thought_signature) if hasattr(message, "function_call") and message.function_call: # Ensure it's in the dict if model_dump missed it or it was excluded if ( "function_call" not in message_dict or not message_dict["function_call"] ): fc = message.function_call message_dict["function_call"] = ( fc.model_dump() if hasattr(fc, "model_dump") else fc ) # Log assistant's text content if present if hasattr(message, "content") and message.content: # Display the content with line prefix for line in message.content.splitlines(): logger.info(f"| {line}") # Also log to file if specified if tool_call_log_file: with open(tool_call_log_file, "a", encoding="utf-8") as f: f.write(f"{message.content}\n") # Check for tool calls (newer format) if hasattr(message, "tool_calls") and message.tool_calls: messages.append(message_dict) turn_count += 1 # Update progress after assistant with tool calls self._update_progress(messages, total_tokens, turn_count) # Process tool calls for tool_call in message.tool_calls: func_name = tool_call.function.name func_args = json.loads(tool_call.function.arguments) try: result = await asyncio.wait_for( mcp_server.call_tool(func_name, func_args), timeout=60 ) messages.append( { "role": "tool", "tool_call_id": tool_call.id, "content": json.dumps( result, cls=CustomJSONEncoder ), } ) except asyncio.TimeoutError: error_msg = ( f"Tool call '{func_name}' timed out after 60 seconds" ) logger.error(error_msg) messages.append( { "role": "tool", "tool_call_id": tool_call.id, "content": f"Error: {error_msg}", } ) except Exception as e: logger.error(f"Tool call failed: {e}") messages.append( { "role": "tool", "tool_call_id": tool_call.id, "content": f"Error: {str(e)}", } ) # Format arguments for display (truncate if too long) args_str = json.dumps(func_args, separators=(",", ": ")) display_arguments = ( args_str[:140] + "..." if len(args_str) > 140 else args_str ) # Log with ANSI color codes (bold tool name, dim gray arguments) logger.info( f"| \033[1m{func_name}\033[0m \033[2;37m{display_arguments}\033[0m" ) if tool_call_log_file: with open(tool_call_log_file, "a", encoding="utf-8") as f: f.write(f"| {func_name} {args_str}\n") # Update progress after tool results appended self._update_progress(messages, total_tokens, turn_count) continue else: # Log end reason if not choices: logger.info( "|\n|\n| Task ended with no messages generated by the model." ) elif choices[0].finish_reason == "stop": logger.info( "|\n|\n| Task ended with the finish reason from messages being 'stop'." ) # No tool/function call, add message and we're done messages.append(message_dict) turn_count += 1 # Update progress before exiting self._update_progress(messages, total_tokens, turn_count) ended_normally = True break except Exception as loop_error: # On any error, return partial conversation, token usage, and turn count logger.error(f"Manual MCP loop failed: {loop_error}", exc_info=True) sdk_format_messages = self._convert_to_sdk_format(messages) return { "success": False, "output": sdk_format_messages, "token_usage": total_tokens, "turn_count": turn_count, "error": str(loop_error), "litellm_run_model_name": self.litellm_run_model_name, } # Detect if we exited due to hitting the turn limit if (not ended_normally) and (turn_count >= max_turns): hit_turn_limit = True logger.warning( f"| Max turns ({max_turns}) exceeded); returning failure with partial output." 
) if tool_call_log_file: try: with open(tool_call_log_file, "a", encoding="utf-8") as f: f.write(f"| Max turns ({max_turns}) exceeded\n") except Exception: pass # Display final token usage if total_tokens["total_tokens"] > 0: log_msg = ( f"| Token usage: Total: {total_tokens['total_tokens']:,} | " f"Input: {total_tokens['input_tokens']:,} | " f"Output: {total_tokens['output_tokens']:,}" ) if total_tokens.get("reasoning_tokens", 0) > 0: log_msg += f" | Reasoning: {total_tokens['reasoning_tokens']:,}" logger.info(log_msg) logger.info(f"| Turns: {turn_count}") # Convert messages to SDK format for backward compatibility sdk_format_messages = self._convert_to_sdk_format(messages) return { "success": not hit_turn_limit, "output": sdk_format_messages, "token_usage": total_tokens, "turn_count": turn_count, "error": (f"Max turns ({max_turns}) exceeded" if hit_turn_limit else None), "litellm_run_model_name": self.litellm_run_model_name, } # ==================== MCP Server Management ==================== async def _create_mcp_server(self) -> Any: """Create and return an MCP server instance.""" if self.mcp_service in self.STDIO_SERVICES: return self._create_stdio_server() elif self.mcp_service in self.HTTP_SERVICES: return self._create_http_server() else: raise ValueError(f"Unsupported MCP service: {self.mcp_service}") def _create_stdio_server(self) -> MCPStdioServer: """Create stdio-based MCP server.""" if self.mcp_service == "notion": notion_key = self.service_config.get("notion_key") if not notion_key: raise ValueError("Notion API key required") return MCPStdioServer( command="npx", args=["-y", "@notionhq/notion-mcp-server@1.9.1"], env={ "OPENAPI_MCP_HEADERS": ( '{"Authorization": "Bearer ' + notion_key + '", ' '"Notion-Version": "2022-06-28"}' ) }, ) elif self.mcp_service == "filesystem": test_directory = self.service_config.get("test_directory") if not test_directory: raise ValueError("Test directory required for filesystem service") return MCPStdioServer( command="npx", args=[ "-y", "@modelcontextprotocol/server-filesystem", str(test_directory), ], ) elif self.mcp_service in ["playwright", "playwright_webarena"]: browser = self.service_config.get("browser", "chromium") headless = self.service_config.get("headless", True) viewport_width = self.service_config.get("viewport_width", 1280) viewport_height = self.service_config.get("viewport_height", 720) args = ["-y", "@playwright/mcp@latest"] if headless: args.append("--headless") args.extend( [ "--isolated", "--no-sandbox", "--browser", browser, "--viewport-size", f"{viewport_width},{viewport_height}", ] ) return MCPStdioServer(command="npx", args=args) elif self.mcp_service == "postgres": host = self.service_config.get("host", "localhost") port = self.service_config.get("port", 5432) username = self.service_config.get("username") password = self.service_config.get("password") database = self.service_config.get( "current_database" ) or self.service_config.get("database") if not all([username, password, database]): raise ValueError("PostgreSQL requires username, password, and database") database_url = ( f"postgresql://{username}:{password}@{host}:{port}/{database}" ) return MCPStdioServer( command="pipx", args=["run", "postgres-mcp", "--access-mode=unrestricted"], env={"DATABASE_URI": database_url}, ) elif self.mcp_service == "insforge": api_key = self.service_config.get("api_key") backend_url = self.service_config.get("backend_url") if not all([api_key, backend_url]): raise ValueError("Insforge requires api_key and backend_url") return 
MCPStdioServer( command="npx", args=["-y", "@insforge/mcp@dev"], env={ "INSFORGE_API_KEY": api_key, "INSFORGE_BACKEND_URL": backend_url, }, ) elif self.mcp_service == "github": github_token = self.service_config.get("github_token") if not github_token: raise ValueError("GitHub token required") return MCPStdioServer( command="docker", args=[ "run", "-i", "--rm", "-e", "GITHUB_PERSONAL_ACCESS_TOKEN", "ghcr.io/github/github-mcp-server:v0.15.0", ], env={"GITHUB_PERSONAL_ACCESS_TOKEN": github_token}, ) else: raise ValueError(f"Unsupported stdio service: {self.mcp_service}") def _create_http_server(self) -> MCPHttpServer: """Create HTTP-based MCP server.""" if self.mcp_service == "supabase": # Use built-in MCP server from Supabase CLI api_url = self.service_config.get("api_url", "http://localhost:54321") api_key = self.service_config.get("api_key", "") if not api_key: raise ValueError( "Supabase requires api_key (use secret key from 'supabase status')" ) # Supabase CLI exposes MCP at /mcp endpoint mcp_url = f"{api_url}/mcp" return MCPHttpServer( url=mcp_url, headers={ "apikey": api_key, "Authorization": f"Bearer {api_key}", }, ) else: raise ValueError(f"Unsupported HTTP service: {self.mcp_service}") ================================================ FILE: src/agents/react_agent.py ================================================ """ReAct agent implementation for the MCPMark pipeline.""" from __future__ import annotations import asyncio import json import time from typing import Any, Dict, List, Optional, Callable import litellm from src.logger import get_logger from .base_agent import BaseMCPAgent logger = get_logger(__name__) class ReActAgent(BaseMCPAgent): """ReAct-style agent that reuses MCPMark infrastructure.""" DEFAULT_SYSTEM_PROMPT = ( "You are a careful ReAct (reasoning and acting) agent. " "At each step you must decide whether to call a tool or provide a final response. " "Only use the tools that are listed for you. When you finish, respond with either the final answer " "or the phrase \"Task completed.\" if no further detail is required. " "Every reply must be valid JSON without code fences." ) COMPACTION_PROMPT = ( "You are performing a CONTEXT CHECKPOINT COMPACTION.\n" "Summarize the conversation so far for another model to continue.\n\n" "Include:\n" "- Current progress and key decisions made\n" "- Important context, constraints, or user preferences\n" "- What remains to be done (clear next steps)\n" "- Any critical data, examples, or references needed to continue\n\n" "Be concise and structured. Do NOT call tools." 
) def __init__( self, litellm_input_model_name: str, api_key: str, base_url: str, mcp_service: str, timeout: int = BaseMCPAgent.DEFAULT_TIMEOUT, service_config: Optional[Dict[str, Any]] = None, service_config_provider: Optional[Callable[[], Dict[str, Any]]] = None, reasoning_effort: Optional[str] = "default", max_iterations: int = 100, system_prompt: Optional[str] = None, compaction_token: int = BaseMCPAgent.COMPACTION_DISABLED_TOKEN, ): super().__init__( litellm_input_model_name=litellm_input_model_name, api_key=api_key, base_url=base_url, mcp_service=mcp_service, timeout=timeout, service_config=service_config, service_config_provider=service_config_provider, reasoning_effort=reasoning_effort, compaction_token=compaction_token, ) self.max_iterations = max_iterations self.react_system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT async def execute( self, instruction: str, tool_call_log_file: Optional[str] = None, ) -> Dict[str, Any]: start_time = time.time() try: self._reset_progress() self._refresh_service_config() async def _run_react(): return await self._execute_react_loop(instruction, tool_call_log_file) result = await asyncio.wait_for(_run_react(), timeout=self.timeout) execution_time = time.time() - start_time self.usage_tracker.update( success=result.get("success", False), token_usage=result.get("token_usage", {}), turn_count=result.get("turn_count", 0), execution_time=execution_time, ) result["execution_time"] = execution_time return result except Exception as exc: # noqa: BLE001 execution_time = time.time() - start_time if isinstance(exc, asyncio.TimeoutError): error_msg = f"Execution timed out after {self.timeout} seconds" logger.error(error_msg) else: error_msg = f"ReAct agent execution failed: {exc}" logger.error(error_msg, exc_info=True) self.usage_tracker.update( success=False, token_usage=self._partial_token_usage or {}, turn_count=self._partial_turn_count or 0, execution_time=execution_time, ) if self._partial_messages: final_msg = self._convert_to_sdk_format(self._partial_messages) else: final_msg = [] return { "success": False, "output": final_msg, "token_usage": self._partial_token_usage or {}, "turn_count": self._partial_turn_count or 0, "execution_time": execution_time, "error": error_msg, "litellm_run_model_name": self.litellm_run_model_name, } async def _execute_react_loop( self, instruction: str, tool_call_log_file: Optional[str], ) -> Dict[str, Any]: system_message = {"role": "system", "content": self.react_system_prompt} total_tokens = { "input_tokens": 0, "output_tokens": 0, "total_tokens": 0, "reasoning_tokens": 0, } turn_count = 0 success = False final_error: Optional[str] = None mcp_server = await self._create_mcp_server() async with mcp_server: tools = await mcp_server.list_tools() tool_map = {tool.get("name"): tool for tool in tools} tools_description = self._render_tools_description(tools) task_message = { "role": "user", "content": self._build_task_prompt( instruction=instruction, tools_description=tools_description, ), } messages: List[Dict[str, Any]] = [system_message, task_message] self._update_progress(messages, total_tokens, turn_count) for step in range(1, self.max_iterations + 1): current_prompt_tokens = 0 if self._compaction_enabled(): current_prompt_tokens = self._count_prompt_tokens_litellm(messages) if self._compaction_enabled() and current_prompt_tokens >= self.compaction_token: logger.info( f"| [compaction] Triggered at prompt tokens: {current_prompt_tokens:,}" ) if tool_call_log_file: try: with open(tool_call_log_file, "a", 
encoding="utf-8") as log_file: log_file.write( f"| [compaction] Triggered at prompt tokens: {current_prompt_tokens:,}\n" ) except Exception: # noqa: BLE001 pass compact_messages = [ {"role": "system", "content": self.COMPACTION_PROMPT}, {"role": "user", "content": json.dumps(messages, ensure_ascii=False)}, ] compact_kwargs = { "model": self.litellm_input_model_name, "messages": compact_messages, "api_key": self.api_key, } if self.base_url: compact_kwargs["base_url"] = self.base_url compact_response = await litellm.acompletion(**compact_kwargs) usage = getattr(compact_response, "usage", None) if usage: prompt_tokens = ( getattr(usage, "prompt_tokens", None) or getattr(usage, "input_tokens", None) or 0 ) completion_tokens = ( getattr(usage, "completion_tokens", None) or getattr(usage, "output_tokens", None) or 0 ) total_tokens_count = getattr(usage, "total_tokens", None) if total_tokens_count is None: total_tokens_count = prompt_tokens + completion_tokens total_tokens["input_tokens"] += int(prompt_tokens or 0) total_tokens["output_tokens"] += int(completion_tokens or 0) total_tokens["total_tokens"] += int(total_tokens_count or 0) summary = "" try: summary = compact_response.choices[0].message.content or "" except Exception: # noqa: BLE001 summary = "" summary = summary.strip() or "(no summary)" messages = [ system_message, task_message, { "role": "user", "content": ( "Context summary (auto-compacted due to token limit):\n" f"{summary}" ), }, ] self._update_progress(messages, total_tokens, turn_count) completion_kwargs = { "model": self.litellm_input_model_name, "messages": messages, "api_key": self.api_key, } if self.base_url: completion_kwargs["base_url"] = self.base_url if self.reasoning_effort != "default": completion_kwargs["reasoning_effort"] = self.reasoning_effort try: response = await asyncio.wait_for( litellm.acompletion(**completion_kwargs), timeout=self.timeout / 2, ) except asyncio.TimeoutError: final_error = f"LLM call timed out on step {step}" logger.error(final_error) break except Exception as exc: # noqa: BLE001 final_error = f"LLM call failed on step {step}: {exc}" logger.error(final_error) if "ContextWindowExceededError" in str(exc): continue break if turn_count == 0 and getattr(response, "model", None): self.litellm_run_model_name = response.model.split("/")[-1] usage = getattr(response, "usage", None) if usage: prompt_tokens = ( getattr(usage, "prompt_tokens", None) or getattr(usage, "input_tokens", None) or 0 ) completion_tokens = ( getattr(usage, "completion_tokens", None) or getattr(usage, "output_tokens", None) or 0 ) total_tokens_count = getattr(usage, "total_tokens", None) if total_tokens_count is None: total_tokens_count = prompt_tokens + completion_tokens total_tokens["input_tokens"] += prompt_tokens total_tokens["output_tokens"] += completion_tokens total_tokens["total_tokens"] += total_tokens_count # Extract reasoning tokens if available if hasattr(response.usage, 'completion_tokens_details'): details = response.usage.completion_tokens_details if hasattr(details, 'reasoning_tokens'): total_tokens["reasoning_tokens"] += details.reasoning_tokens or 0 choice = response.choices[0] message_obj = getattr(choice, "message", None) if message_obj is None and isinstance(choice, dict): message_obj = choice.get("message") if message_obj is None: content_raw = getattr(choice, "text", "") else: content_raw = message_obj.get("content", "") assistant_text = self._normalize_content(content_raw) assistant_message = {"role": "assistant", "content": assistant_text} 
messages.append(assistant_message) turn_count += 1 self._update_progress(messages, total_tokens, turn_count) parsed = self._parse_react_response(assistant_text) if not parsed or "thought" not in parsed: warning = ( "The previous response was not valid JSON following the required schema. " "Please respond again using the JSON formats provided." ) messages.append({"role": "user", "content": warning}) self._update_progress(messages, total_tokens, turn_count) final_error = "Model produced an invalid response format." continue thought = parsed.get("thought", "") action = parsed.get("action") answer = parsed.get("answer") result = parsed.get("result") logger.info(f"|\n| \033[1;3mThought\033[0m: {str(thought)}") if tool_call_log_file: try: with open(tool_call_log_file, "a", encoding="utf-8") as log_file: log_file.write(f"| {str(thought)}\n") except Exception: # noqa: BLE001 pass if action is not None: func_name = action.get("tool") arguments = action.get("arguments", {}) or {} args_str = json.dumps(arguments, separators=(",", ": ")) display_arguments = args_str[:140] + "..." if len(args_str) > 140 else args_str logger.info(f"| \033[1;3mAction\033[0m: \033[1m{func_name}\033[0m \033[2;37m{display_arguments}\033[0m") if answer is not None: success = True break if action is not None and isinstance(action, dict): tool_name = action.get("tool") arguments = action.get("arguments", {}) or {} if tool_name not in tool_map: observation = ( f"Invalid tool '{tool_name}'. Available tools: " f"{', '.join(tool_map)}" ) else: try: tool_response = await asyncio.wait_for( mcp_server.call_tool(tool_name, arguments), timeout=60, ) observation = self._tool_result_to_text(tool_response) except asyncio.TimeoutError: observation = f"Tool '{tool_name}' timed out" except Exception as tool_exc: # noqa: BLE001 observation = f"Tool '{tool_name}' failed: {tool_exc}" if tool_call_log_file: try: with open(tool_call_log_file, "a", encoding="utf-8") as log_file: log_file.write(f"| {tool_name} {json.dumps(arguments, ensure_ascii=False)}\n") except Exception: # noqa: BLE001 pass observation_message = { "role": "user", "content": ( f"Observation:\n{observation}\n" "Please continue reasoning and reply using the required JSON format." ), } messages.append(observation_message) self._update_progress(messages, total_tokens, turn_count) continue if result is not None: observation_message = { "role": "user", "content": ( f"Observation:\n{result}\n" "Please continue reasoning and reply using the required JSON format." ), } messages.append(observation_message) self._update_progress(messages, total_tokens, turn_count) continue # Unexpected structure: ask model to restate properly messages.append( { "role": "user", "content": ( "The previous reply did not include an action, result, or answer. " "Please respond again using the JSON formats provided." ), } ) self._update_progress(messages, total_tokens, turn_count) if not success and final_error is None: final_error = ( f"Max iterations ({self.max_iterations}) reached without a final answer." 
) if total_tokens["total_tokens"] > 0: log_msg = ( f"|\n|\n| Token usage: Total: {total_tokens['total_tokens']:,} | " f"Input: {total_tokens['input_tokens']:,} | " f"Output: {total_tokens['output_tokens']:,}" ) if total_tokens.get("reasoning_tokens", 0) > 0: log_msg += f" | Reasoning: {total_tokens['reasoning_tokens']:,}" logger.info(log_msg) logger.info(f"| Turns: {turn_count}") sdk_messages = self._convert_to_sdk_format(messages) return { "success": success, "output": sdk_messages, "token_usage": total_tokens, "turn_count": turn_count, "error": None if success else final_error, "litellm_run_model_name": self.litellm_run_model_name, } def _build_task_prompt( self, instruction: str, tools_description: str, ) -> str: return ( f"Task:\n{instruction}\n\n" f"Available MCP tools:\n{tools_description}\n\n" "Respond using the JSON formats below.\n\n" "If you need to use a tool:\n" "{\n" ' "thought": "Reasoning for the next action",\n' ' "action": {\n' ' "tool": "tool-name",\n' ' "arguments": {\n' ' "parameter": value\n' " }\n" " }\n" "}\n\n" "If you can provide the final answer:\n" "{\n" ' "thought": "Reasoning that justifies the answer",\n' ' "answer": "Either the final solution or \'Task completed.\' when no more detail is required"\n' "}\n\n" "Remember: omitting the action object ends the task, so only do this when finished." ) def _render_tools_description(self, tools: List[Dict[str, Any]]) -> str: descriptions = [] for tool in tools: name = tool.get("name", "unknown") description = tool.get("description", "No description provided.") input_schema = tool.get("inputSchema", {}) or {} properties = input_schema.get("properties", {}) or {} required = set(input_schema.get("required", []) or []) arg_lines = [] for prop_name, prop_details in properties.items(): details = json.dumps(prop_details, ensure_ascii=False, indent=2) suffix = " (required)" if prop_name in required else "" arg_lines.append(f"- {prop_name}{suffix}: {details}") if arg_lines: arguments_text = "\n".join(arg_lines) else: arguments_text = "(no arguments)" descriptions.append( f"Tool: {name}\nDescription: {description}\nArguments:\n{arguments_text}" ) return "\n\n".join(descriptions) if descriptions else "(no tools available)" def _normalize_content(self, content: Any) -> str: if isinstance(content, str): return content if isinstance(content, list): parts = [] for block in content: if isinstance(block, dict): if block.get("type") == "text": parts.append(block.get("text", "")) elif "text" in block: parts.append(str(block.get("text"))) else: parts.append(str(block)) return "\n".join(part for part in parts if part) return json.dumps(content, ensure_ascii=False) def _parse_react_response(self, payload: str) -> Dict[str, Any]: candidate = payload.strip().strip("`").strip() if candidate.lower().startswith("json"): candidate = candidate[4:].lstrip() try: return json.loads(candidate) except json.JSONDecodeError: return {} def _tool_result_to_text(self, result: Any) -> str: if result is None: return "" if isinstance(result, str): return result try: return json.dumps(result, ensure_ascii=False) except TypeError: return str(result) ================================================ FILE: src/agents/utils/__init__.py ================================================ """ Utility functions for MCPMark Agent ==================================== """ from .token_usage import TokenUsageTracker __all__ = ["TokenUsageTracker"] ================================================ FILE: src/agents/utils/token_usage.py 
================================================ """ Token Usage Tracking Utilities =============================== """ from typing import Dict, Any class TokenUsageTracker: """Track token usage across agent executions.""" def __init__(self): """Initialize token usage tracker.""" self.reset() def reset(self): """Reset all usage statistics.""" self._stats = { "total_input_tokens": 0, "total_output_tokens": 0, "total_tokens": 0, "total_turns": 0, "total_execution_time": 0.0, "successful_executions": 0, "failed_executions": 0, } def update(self, success: bool, token_usage: Dict[str, int], turn_count: int, execution_time: float): """ Update usage statistics. Args: success: Whether execution was successful token_usage: Token usage dict with input_tokens, output_tokens, total_tokens turn_count: Number of conversation turns execution_time: Execution time in seconds """ if success: self._stats["successful_executions"] += 1 else: self._stats["failed_executions"] += 1 self._stats["total_input_tokens"] += token_usage.get("input_tokens", 0) self._stats["total_output_tokens"] += token_usage.get("output_tokens", 0) self._stats["total_tokens"] += token_usage.get("total_tokens", 0) self._stats["total_turns"] += turn_count self._stats["total_execution_time"] += execution_time def get_stats(self) -> Dict[str, Any]: """ Get usage statistics with calculated averages. Returns: Dictionary containing usage statistics """ stats = self._stats.copy() # Calculate averages total_executions = stats["successful_executions"] + stats["failed_executions"] if total_executions > 0: stats["avg_input_tokens"] = stats["total_input_tokens"] / total_executions stats["avg_output_tokens"] = stats["total_output_tokens"] / total_executions stats["avg_total_tokens"] = stats["total_tokens"] / total_executions stats["avg_turns"] = stats["total_turns"] / total_executions stats["avg_execution_time"] = stats["total_execution_time"] / total_executions stats["success_rate"] = (stats["successful_executions"] / total_executions * 100) else: stats.update({ "avg_input_tokens": 0.0, "avg_output_tokens": 0.0, "avg_total_tokens": 0.0, "avg_turns": 0.0, "avg_execution_time": 0.0, "success_rate": 0.0, }) return stats ================================================ FILE: src/aggregators/aggregate_results.py ================================================ #!/usr/bin/env python3 """ Simplified MCPMark Results Aggregator Aggregates evaluation results and generates summary with pass@k metrics. 
""" import json import os import argparse import subprocess import shutil import tempfile from pathlib import Path from collections import defaultdict from typing import Dict, List, Any, Tuple, Optional from datetime import datetime import sys sys.path.append(str(Path(__file__).parent.parent.parent)) from src.errors import is_retryable_error from src.aggregators.pricing import compute_cost_usd # Supported difficulty splits in ./tasks/// SUPPORTED_TASK_SETS = {"standard", "easy"} def discover_tasks(task_set: str = "standard") -> Dict[str, List[str]]: """Discover all tasks from ./tasks directory filtered by task set.""" tasks_dir = Path("./tasks") all_tasks = {} # Handle each MCP service # Note: playwright and playwright_webarena both map to "playwright" MCP service_mappings = { "filesystem": ["filesystem"], "github": ["github"], "notion": ["notion"], "playwright": ["playwright", "playwright_webarena"], # Both count as playwright "postgres": ["postgres"], # supabase and insforge are variants with same tasks, don't merge } for mcp_service, task_dirs in service_mappings.items(): tasks: List[str] = [] for task_dir_name in task_dirs: service_path = tasks_dir / task_dir_name if not service_path.exists(): continue selected_root = service_path / task_set # Detect if this service has partitioned task sets (e.g. standard/easy) has_partitioned_layout = any( child.is_dir() and child.name in SUPPORTED_TASK_SETS for child in service_path.iterdir() ) if selected_root.exists(): search_roots = [selected_root] elif has_partitioned_layout: # Requested task set missing for this service; skip it for this run print(f" ⚠️ No '{task_set}' tasks found under {service_path}") search_roots = [] else: # Legacy layout without task sets – fall back to original structure search_roots = [service_path] for root in search_roots: for category_dir in root.iterdir(): if not category_dir.is_dir() or category_dir.name.startswith("__"): continue for task_dir in category_dir.iterdir(): if task_dir.is_dir() and not task_dir.name.startswith("__"): tasks.append(f"{category_dir.name}__{task_dir.name}") all_tasks[mcp_service] = sorted(tasks) return all_tasks def collect_results(exp_dir: Path, k: int) -> Dict[str, Dict[str, Any]]: """Collect all results from experiment directory.""" results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) # Current layout: results//__/run-N/__/ # Some pipelines include task-set suffix in service dir (e.g., "filesystem-easy"). # Normalize such names back to canonical service keys used by tasks/ (filesystem, github, notion, playwright, postgres). 
def normalize_service_name(name: str) -> str: # Strip known task-set suffixes like "-easy" or "-standard" if name.endswith("-easy") or name.endswith("-standard"): base = name.rsplit("-", 1)[0] else: base = name # Map variant names to canonical service if base == "playwright_webarena": return "playwright" return base for model_service_dir in exp_dir.iterdir(): if not model_service_dir.is_dir() or "__" not in model_service_dir.name: continue model, service = model_service_dir.name.split("__", 1) # Normalize service names if service == "playwright_webarena": service = "playwright" elif service in ["supabase", "insforge"]: service = "postgres" for run_idx in range(1, k + 1): run_dir = model_service_dir / f"run-{run_idx}" if not run_dir.exists(): continue for task_dir in run_dir.iterdir(): if not task_dir.is_dir() or "__" not in task_dir.name: continue meta_path = task_dir / "meta.json" if meta_path.exists(): with open(meta_path) as f: meta = json.load(f) task_name = task_dir.name results[model][service][f"run-{run_idx}"][task_name] = meta return results def check_completeness_and_validity( results: Dict, all_tasks: Dict, k: int, single_run_models: List[str] ) -> Tuple[Dict, Dict, Dict]: """Check completeness and validity of results.""" complete_models = {} incomplete_models = {} invalid_models = {} for model, model_results in results.items(): is_single_run = any(srm in model for srm in single_run_models) required_runs = 1 if is_single_run else k missing_info = [] invalid_info = [] # Check each service for service, service_tasks in all_tasks.items(): if service not in model_results: missing_info.append(f"Missing entire service: {service}") continue service_results = model_results[service] # Check runs for run_idx in range(1, required_runs + 1): run_name = f"run-{run_idx}" if run_name not in service_results: missing_info.append(f"Missing {run_name} for {service}") continue run_results = service_results[run_name] # Check tasks missing_tasks = [] invalid_tasks = [] for task in service_tasks: if task not in run_results: missing_tasks.append(task) else: # Check for retryable errors only if the task did not succeed meta = run_results[task] success = bool(meta.get("execution_result", {}).get("success", False)) error_msg = meta.get("execution_result", {}).get("error_message", "") if (not success) and error_msg and is_retryable_error(error_msg): invalid_tasks.append(f"{task}: {error_msg[:50]}...") if missing_tasks: missing_info.append(f"{service}/{run_name}: missing {len(missing_tasks)} tasks") if invalid_tasks: invalid_info.extend([f"{service}/{run_name}/{t}" for t in invalid_tasks]) if missing_info: incomplete_models[model] = missing_info elif invalid_info: invalid_models[model] = invalid_info else: complete_models[model] = model_results return complete_models, incomplete_models, invalid_models def calculate_metrics(complete_models: Dict, all_tasks: Dict, k: int, single_run_models: List[str]) -> Dict: """Calculate rich metrics (totals, averages, per-run aggregates, pass@k) for complete models.""" summary = { "generated_at": datetime.now().isoformat(), "k": k, "overall": {}, } # Initialize per-service sections mirroring overall structure for service in all_tasks.keys(): summary[service] = {} # Helper to safely extract token usage numbers def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]: tu = meta.get("token_usage", {}) or {} input_tokens = int(tu.get("input_tokens", 0) or 0) output_tokens = int(tu.get("output_tokens", 0) or 0) total_tokens = int(tu.get("total_tokens", input_tokens + 
output_tokens) or (input_tokens + output_tokens)) return input_tokens, output_tokens, total_tokens for model, model_results in complete_models.items(): is_single_run = any(srm in model for srm in single_run_models) runs_count = 1 if is_single_run else k total_tasks = sum(len(tasks) for tasks in all_tasks.values()) # Aggregates across all services and runs total_agent_execution_time = 0.0 total_input_tokens = 0 total_output_tokens = 0 total_tokens = 0 total_turns = 0 # For optional fields actual_model_name: Optional[str] = None # If cost info is not present in metas, leave as None per_run_cost: Optional[float] = None # Model-level flags (to be inferred from meta.json) is_open_source_model: Optional[bool] = None is_reasoning_model: Optional[bool] = None # For pass@1 per-run statistics across all services pass1_rates_per_run_overall: List[float] = [] # For pass@k and pass^k across all services pass_k_task_success_any = 0 pass_power_k_task_success_all = 0 # Precompute successes per task across runs for overall # Also accumulate totals for tokens/time/turns for run_idx in range(1, runs_count + 1): run_name = f"run-{run_idx}" successes_this_run = 0 for service, service_tasks in all_tasks.items(): # service-level aggregates for this model (will compute fully below) for task in service_tasks: meta = ( model_results .get(service, {}) .get(run_name, {}) .get(task) ) # In complete_models, meta should exist; still guard if not meta: continue success = bool(meta.get("execution_result", {}).get("success", False)) if success: successes_this_run += 1 # totals accumulation total_agent_execution_time += float(meta.get("agent_execution_time", 0.0) or 0.0) in_tok, out_tok, ttl_tok = get_token_counts(meta) total_input_tokens += in_tok total_output_tokens += out_tok total_tokens += ttl_tok total_turns += int(meta.get("turn_count", 0) or 0) # capture actual model name if present if actual_model_name is None: actual_model_name = meta.get("actual_model_name") or None # capture cost if present in any meta as per-run cost token (rare) if per_run_cost is None: # A few possible fields people use; if none present, stays None possible_cost = meta.get("per_run_cost") or meta.get("run_cost") or meta.get("cost") if isinstance(possible_cost, (int, float)): per_run_cost = float(possible_cost) # capture model flags if present if is_open_source_model is None and "is_open_source_model" in meta: is_open_source_model = bool(meta.get("is_open_source_model")) if is_reasoning_model is None and "is_reasoning_model" in meta: is_reasoning_model = bool(meta.get("is_reasoning_model")) pass1_rates_per_run_overall.append(round(successes_this_run / total_tasks, 6)) # Compute pass@k and pass^k across tasks (overall) if not is_single_run: for service, service_tasks in all_tasks.items(): for task in service_tasks: successes = [] for run_idx in range(1, runs_count + 1): run_name = f"run-{run_idx}" meta = ( model_results .get(service, {}) .get(run_name, {}) .get(task) ) success = bool(meta.get("execution_result", {}).get("success", False)) if meta else False successes.append(success) if any(successes): pass_k_task_success_any += 1 if all(successes): pass_power_k_task_success_all += 1 # Build overall metrics entry denom = total_tasks * runs_count if total_tasks > 0 else 1 avg_agent_execution_time = total_agent_execution_time / denom avg_input_tokens = total_input_tokens / denom avg_output_tokens = total_output_tokens / denom avg_total_tokens = total_tokens / denom avg_turns = total_turns / denom # pass@1 stats across runs if 
pass1_rates_per_run_overall: avg_pass1 = sum(pass1_rates_per_run_overall) / len(pass1_rates_per_run_overall) mean = avg_pass1 variance = ( sum((r - mean) ** 2 for r in pass1_rates_per_run_overall) / len(pass1_rates_per_run_overall) ) std_pass1 = variance ** 0.5 else: avg_pass1 = 0.0 std_pass1 = 0.0 # Compute per-run tokens and cost per_run_input_tokens = total_input_tokens / runs_count if runs_count else 0 per_run_output_tokens = total_output_tokens / runs_count if runs_count else 0 model_for_pricing = actual_model_name or model computed_per_run_cost = compute_cost_usd(model_for_pricing, per_run_input_tokens, per_run_output_tokens) overall_metrics = { "total_tasks": total_tasks, "total_agent_execution_time": total_agent_execution_time, "total_input_tokens": total_input_tokens, "total_output_tokens": total_output_tokens, "total_tokens": total_tokens, "total_turns": total_turns, "avg_agent_execution_time": round(avg_agent_execution_time, 4), "avg_input_tokens": round(avg_input_tokens, 4), "avg_output_tokens": round(avg_output_tokens, 4), "avg_total_tokens": round(avg_total_tokens, 4), "avg_turns": round(avg_turns, 4), "per_run_input_tokens": per_run_input_tokens, "per_run_output_tokens": per_run_output_tokens, "per_run_cost": computed_per_run_cost if computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None), "actual_model_name": actual_model_name or "", "is_open_source_model": (is_open_source_model if is_open_source_model is not None else False), "is_reasoning_model": (is_reasoning_model if is_reasoning_model is not None else False), "pass@1": { "avg": round(avg_pass1, 4), "std": round(std_pass1, 4), }, } if not is_single_run: overall_metrics[f"pass@{k}"] = round(pass_k_task_success_any / total_tasks, 4) overall_metrics[f"pass^{k}"] = round(pass_power_k_task_success_all / total_tasks, 4) summary["overall"][model] = overall_metrics # Per-service detailed metrics mirroring overall for service, service_tasks in all_tasks.items(): service_total_tasks = len(service_tasks) if service_total_tasks == 0: continue s_total_agent_execution_time = 0.0 s_total_input_tokens = 0 s_total_output_tokens = 0 s_total_tokens = 0 s_total_turns = 0 # per-run pass@1 for this service s_pass1_rates_per_run: List[float] = [] # pass@k for this service s_pass_k_task_success_any = 0 s_pass_power_k_task_success_all = 0 for run_idx in range(1, runs_count + 1): run_name = f"run-{run_idx}" s_successes_this_run = 0 for task in service_tasks: meta = ( model_results .get(service, {}) .get(run_name, {}) .get(task) ) if not meta: continue success = bool(meta.get("execution_result", {}).get("success", False)) if success: s_successes_this_run += 1 s_total_agent_execution_time += float(meta.get("agent_execution_time", 0.0) or 0.0) in_tok, out_tok, ttl_tok = get_token_counts(meta) s_total_input_tokens += in_tok s_total_output_tokens += out_tok s_total_tokens += ttl_tok s_total_turns += int(meta.get("turn_count", 0) or 0) s_pass1_rates_per_run.append(round(s_successes_this_run / service_total_tasks, 6)) if not is_single_run: for task in service_tasks: successes = [] for run_idx in range(1, runs_count + 1): run_name = f"run-{run_idx}" meta = ( model_results .get(service, {}) .get(run_name, {}) .get(task) ) success = bool(meta.get("execution_result", {}).get("success", False)) if meta else False successes.append(success) if any(successes): s_pass_k_task_success_any += 1 if all(successes): s_pass_power_k_task_success_all += 1 s_denom = service_total_tasks * runs_count if service_total_tasks > 0 else 1 
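# --- Editor's note (hedged worked example, not part of the original source) ---------
# Toy illustration of the three metrics tallied above, for one service with 2 tasks
# over k=3 runs; the True/False flags stand for execution_result.success and are
# made up.
#
#   task_a runs: [True,  True,  False]
#   task_b runs: [False, True,  False]
#
# pass@1 is a per-run rate: run-1 -> 1/2, run-2 -> 2/2, run-3 -> 0/2, avg = 0.5
# pass@3 counts tasks solved in ANY run:  2/2 = 1.0
# pass^3 counts tasks solved in ALL runs: 0/2 = 0.0
runs_by_task = {"task_a": [True, True, False], "task_b": [False, True, False]}
num_tasks, num_runs = len(runs_by_task), 3
per_run_pass1 = [
    sum(run_flags[i] for run_flags in runs_by_task.values()) / num_tasks
    for i in range(num_runs)
]
pass_at_k = sum(any(flags) for flags in runs_by_task.values()) / num_tasks
pass_pow_k = sum(all(flags) for flags in runs_by_task.values()) / num_tasks
assert per_run_pass1 == [0.5, 1.0, 0.0]
assert pass_at_k == 1.0 and pass_pow_k == 0.0
# -------------------------------------------------------------------------------------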
s_avg_agent_execution_time = s_total_agent_execution_time / s_denom s_avg_input_tokens = s_total_input_tokens / s_denom s_avg_output_tokens = s_total_output_tokens / s_denom s_avg_total_tokens = s_total_tokens / s_denom s_avg_turns = s_total_turns / s_denom if s_pass1_rates_per_run: s_mean = sum(s_pass1_rates_per_run) / len(s_pass1_rates_per_run) s_var = sum((r - s_mean) ** 2 for r in s_pass1_rates_per_run) / len(s_pass1_rates_per_run) s_std = s_var ** 0.5 else: s_mean = 0.0 s_std = 0.0 # Compute per-run tokens and cost for this service s_per_run_input_tokens = s_total_input_tokens / runs_count if runs_count else 0 s_per_run_output_tokens = s_total_output_tokens / runs_count if runs_count else 0 s_computed_per_run_cost = compute_cost_usd(model_for_pricing, s_per_run_input_tokens, s_per_run_output_tokens) service_metrics = { "total_tasks": service_total_tasks, "total_agent_execution_time": s_total_agent_execution_time, "total_input_tokens": s_total_input_tokens, "total_output_tokens": s_total_output_tokens, "total_tokens": s_total_tokens, "total_turns": s_total_turns, "avg_agent_execution_time": round(s_avg_agent_execution_time, 4), "avg_input_tokens": round(s_avg_input_tokens, 4), "avg_output_tokens": round(s_avg_output_tokens, 4), "avg_total_tokens": round(s_avg_total_tokens, 4), "avg_turns": round(s_avg_turns, 4), "per_run_input_tokens": s_per_run_input_tokens, "per_run_output_tokens": s_per_run_output_tokens, "per_run_cost": s_computed_per_run_cost if s_computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None), "actual_model_name": actual_model_name or "", "is_open_source_model": (is_open_source_model if is_open_source_model is not None else False), "is_reasoning_model": (is_reasoning_model if is_reasoning_model is not None else False), "pass@1": { "avg": round(s_mean, 4), "std": round(s_std, 4), }, } if not is_single_run: service_metrics[f"pass@{k}"] = round(s_pass_k_task_success_any / service_total_tasks, 4) service_metrics[f"pass^{k}"] = round(s_pass_power_k_task_success_all / service_total_tasks, 4) summary[service][model] = service_metrics return summary def generate_model_results(exp_dir: Path, complete_models: Dict, all_tasks: Dict): """Generate model_results directory.""" model_results_dir = exp_dir / "model_results" if model_results_dir.exists(): shutil.rmtree(model_results_dir) model_results_dir.mkdir() for model, model_data in complete_models.items(): model_dir = model_results_dir / model model_dir.mkdir() # Create a file for each task for service, service_tasks in all_tasks.items(): if service not in model_data: continue for task in service_tasks: task_data = { "model": model, "service": service, "task": task, "runs": {} } # Collect data from all runs for run_name, run_data in model_data[service].items(): if task in run_data: meta = run_data[task] task_data["runs"][run_name] = { "success": meta.get("execution_result", {}).get("success", False), "error_message": meta.get("execution_result", {}).get("error_message"), "execution_time": meta.get("agent_execution_time", 0), "token_usage": meta.get("token_usage", {}), "turn_count": meta.get("turn_count", 0) } # Save task file task_file = model_dir / f"{task}.json" with open(task_file, "w") as f: json.dump(task_data, f, indent=2) def generate_task_results(exp_dir: Path, complete_models: Dict, all_tasks: Dict): """Generate task_results directory.""" task_results_dir = exp_dir / "task_results" if task_results_dir.exists(): shutil.rmtree(task_results_dir) task_results_dir.mkdir() # For each task, 
collect results across all models for service, service_tasks in all_tasks.items(): for task in service_tasks: task_data = { "task": task, "service": service, "models": {} } for model, model_data in complete_models.items(): if service not in model_data: continue model_task_data = {"runs": []} for run_name, run_data in model_data[service].items(): if task in run_data: meta = run_data[task] agent_time = float(meta.get("agent_execution_time", 0.0) or 0.0) token_usage = meta.get("token_usage", {}) or {} turn_count = int(meta.get("turn_count", 0) or 0) success = bool(meta.get("execution_result", {}).get("success", False)) model_task_data["runs"].append({ "run": run_name, "success": success, "execution_time": agent_time, "agent_execution_time": agent_time, "token_usage": token_usage, "turn_count": turn_count, }) if model_task_data["runs"]: # Compute per-model summary across runs for this task runs_list = model_task_data["runs"] runs_count = len(runs_list) successful_runs = sum(1 for r in runs_list if r.get("success")) # Averages total_agent_time = sum(float(r.get("agent_execution_time", r.get("execution_time", 0.0)) or 0.0) for r in runs_list) avg_agent_time = round(total_agent_time / runs_count, 2) def _tok(r, key): tu = r.get("token_usage") or {} return int(tu.get(key, 0) or 0) total_input_tokens = 0 total_output_tokens = 0 total_total_tokens = 0 for r in runs_list: in_tok = _tok(r, "input_tokens") out_tok = _tok(r, "output_tokens") ttl_tok = int((r.get("token_usage") or {}).get("total_tokens", in_tok + out_tok) or (in_tok + out_tok)) total_input_tokens += in_tok total_output_tokens += out_tok total_total_tokens += ttl_tok avg_input_tokens = round(total_input_tokens / runs_count, 1) avg_output_tokens = round(total_output_tokens / runs_count, 1) avg_total_tokens = round(total_total_tokens / runs_count, 1) total_turns = sum(int(r.get("turn_count", 0) or 0) for r in runs_list) avg_turn_count = round(total_turns / runs_count, 2) summary_obj = { "total_runs": runs_count, "successful_runs": successful_runs, "avg_agent_execution_time": avg_agent_time, "avg_input_tokens": avg_input_tokens, "avg_output_tokens": avg_output_tokens, "avg_total_tokens": avg_total_tokens, "avg_turn_count": avg_turn_count, } # Include pass@k and pass^k only for multi-run models if runs_count > 1: summary_obj[f"pass@{runs_count}"] = 1.0 if successful_runs > 0 else 0.0 summary_obj[f"pass^{runs_count}"] = 1.0 if successful_runs == runs_count else 0.0 model_task_data["summary"] = summary_obj task_data["models"][model] = model_task_data # Save task file task_file = task_results_dir / f"{task}.json" with open(task_file, "w") as f: json.dump(task_data, f, indent=2) def generate_readme(exp_name: str, summary: Dict, k: int) -> str: """Generate README.md content with six tables: overall + 5 MCP services. Each table includes Total Tasks, Pass@1 (avg ± std), Avg Agent Time (s), and Pass@k/Pass^k (if k > 1). 
""" def get_pass1_avg_std(metrics: Dict[str, Any]) -> Tuple[float, float]: p1 = metrics.get("pass@1") if isinstance(p1, dict): return float(p1.get("avg", 0.0) or 0.0), float(p1.get("std", 0.0) or 0.0) # Back-compat if older summaries exist return float(p1 or 0.0), 0.0 def render_section(title: str, section_data: Dict[str, Any]) -> List[str]: lines_sec: List[str] = [ f"## {title}", "", ] header = "| Model | Total Tasks | Pass@1 (avg ± std) |" sep = "|-------|-------------|--------------------|" # include pass@k headers if present (k>1) include_k = k > 1 if include_k: header += f" Pass@{k} | Pass^{k} |" sep += "----------|----------|" # Add Per-Run Cost (USD) and Avg Agent Time (s) at the end header += " Per-Run Cost (USD) |" sep += "---------------------|" header += " Avg Agent Time (s) |" sep += "--------------------|" lines_sec.append(header) lines_sec.append(sep) # Sort by Pass@1 avg sorted_items = sorted( section_data.items(), key=lambda x: get_pass1_avg_std(x[1])[0], reverse=True ) for model, metrics in sorted_items: pass1_avg, pass1_std = get_pass1_avg_std(metrics) avg_time = float(metrics.get("avg_agent_execution_time", 0.0) or 0.0) # Format per-run cost (up to 2 decimal places, trim trailing zeros) cost_val = metrics.get("per_run_cost") if isinstance(cost_val, (int, float)): rounded_cost = round(float(cost_val), 2) formatted_cost = f"{rounded_cost:.2f}".rstrip('0').rstrip('.') cost_str = f"${formatted_cost}" else: cost_str = "/" row = ( f"| {model} | {metrics.get('total_tasks', 0)} | " f"{pass1_avg * 100:.1f}% ± {pass1_std * 100:.1f}% |" ) if include_k: if f"pass@{k}" in metrics and f"pass^{k}" in metrics: row += f" {metrics[f'pass@{k}'] * 100:.1f}% | {metrics[f'pass^{k}'] * 100:.1f}% |" else: # Single-run models do not have pass@k or pass^k; show placeholders row += " / | / |" # Append cost and avg agent time at the end row += f" {cost_str} |" row += f" {avg_time:.1f} |" lines_sec.append(row) lines_sec.append("") return lines_sec lines: List[str] = [ f"# {exp_name} - Evaluation Results", "", f"Generated: {summary['generated_at']}", ] task_set = summary.get("task_set") if task_set: lines.append(f"Task set: {task_set}") lines.append("") # Overall table lines.extend(render_section("Overall Performance", summary.get("overall", {}))) # Service tables: infer service keys from summary reserved = {"overall", "generated_at", "k", "experiment_name", "task_set"} service_keys = [key for key in summary.keys() if key not in reserved] # Keep stable order for service in sorted(service_keys): title = f"{service.capitalize()} Performance" lines.extend(render_section(title, summary.get(service, {}))) return "\n".join(lines) def push_to_github(exp_dir: Path, exp_name: str, branch: Optional[str] = None): """Push results to GitHub repository.""" try: with tempfile.TemporaryDirectory() as temp_dir: temp_path = Path(temp_dir) print("📥 Cloning experiments repository...") subprocess.run([ "git", "clone", "git@github.com:eval-sys/mcpmark-experiments.git", str(temp_path) ], check=True, capture_output=True) # Copy files for item in ["summary.json", "README.md", "model_results", "task_results"]: src = exp_dir / item if src.exists(): dst = temp_path / item if src.is_dir(): if dst.exists(): shutil.rmtree(dst) shutil.copytree(src, dst) else: shutil.copy2(src, dst) print(f" 📄 {item}") # Git operations os.chdir(temp_path) # If a branch is specified, create/checkout it before staging changes. Otherwise, ensure main. 
if branch: try: subprocess.run(["git", "fetch", "origin"], check=True) except subprocess.CalledProcessError: # Non-fatal if fetch fails in some environments pass subprocess.run(["git", "checkout", "-B", branch], check=True) print(f" 🌿 Using branch '{branch}'") else: # Default to main branch try: subprocess.run(["git", "fetch", "origin"], check=True) except subprocess.CalledProcessError: pass # Prefer main; if it doesn't exist locally, create tracking from origin/main result = subprocess.run(["git", "rev-parse", "--verify", "main"], capture_output=True) if result.returncode != 0: # Try to checkout origin/main try: subprocess.run(["git", "checkout", "-B", "main", "origin/main"], check=True) except subprocess.CalledProcessError: # Fallback: create main if no origin/main subprocess.run(["git", "checkout", "-B", "main"], check=True) else: subprocess.run(["git", "checkout", "main"], check=True) subprocess.run(["git", "add", "."], check=True) # Check for changes result = subprocess.run( ["git", "diff", "--staged", "--name-only"], capture_output=True, text=True ) if not result.stdout.strip(): print("✅ No changes to push") return True # Commit and push subprocess.run([ "git", "commit", "-m", f"Update results for {exp_name}" ], check=True) if branch: subprocess.run(["git", "push", "--set-upstream", "origin", branch], check=True) else: subprocess.run(["git", "push", "--set-upstream", "origin", "main"], check=True) print("✅ Successfully pushed to GitHub") return True except subprocess.CalledProcessError as e: print(f"❌ Git operation failed: {e}") return False def print_validation_report(complete: Dict, incomplete: Dict, invalid: Dict, all_tasks: Dict, k: int, single_run_models: List[str], raw_results: Dict): """Print structured validation report with summary table.""" # Combine all models all_models = {} for model in complete: all_models[model] = {"status": "complete", "data": complete[model]} for model in incomplete: all_models[model] = {"status": "incomplete", "issues": incomplete[model]} for model in invalid: all_models[model] = {"status": "invalid", "issues": invalid[model]} # Calculate expected counts total_expected_tasks = sum(len(tasks) for tasks in all_tasks.values()) # Summary table print("\n" + "=" * 100) print("COMPLETENESS SUMMARY TABLE") print("=" * 100) print() print(f"{'Model':<30} {'Expected':<12} {'Actual':<12} {'Missing':<12} {'Status':<30}") print("-" * 100) sorted_models = sorted(all_models.keys()) for model_name in sorted_models: model_info = all_models[model_name] # Determine expected runs and tasks is_single_run = any(srm in model_name for srm in single_run_models) expected_runs = 1 if is_single_run else k expected_total = total_expected_tasks * expected_runs if model_info["status"] == "complete": # Count actual tasks from complete model data actual_total = 0 for service, service_data in model_info["data"].items(): for run_name, run_data in service_data.items(): actual_total += len(run_data) missing = 0 status = "✅ Complete" else: # For incomplete/invalid models, count from raw results actual_total = 0 if model_name in raw_results: for service, service_data in raw_results[model_name].items(): for run_name, run_data in service_data.items(): actual_total += len(run_data) missing = expected_total - actual_total if model_info["status"] == "incomplete": # Find which services have issues problem_services = set() for issue in model_info["issues"]: if "Missing entire service:" in issue: service = issue.split(": ")[1] problem_services.add(service) elif "/" in issue: service = 
issue.split("/")[0] problem_services.add(service) elif "Missing run" in issue: service = issue.split(" for ")[1] problem_services.add(service) if problem_services: services_str = ", ".join(sorted(problem_services)) status = f"❌ Incomplete ({services_str})" else: status = "❌ Incomplete" else: # invalid status = "⚠️ Invalid (retryable errors)" # Format the row print(f"{model_name:<30} {expected_total:<12} {actual_total:<12} {missing:<12} {status:<30}") print() # Overall statistics complete_count = len(complete) incomplete_count = len(incomplete) invalid_count = len(invalid) total_models = complete_count + incomplete_count + invalid_count print("=" * 100) print("OVERALL STATISTICS") print("=" * 100) print(f"Total models analyzed: {total_models}") print(f"Complete models: {complete_count}") print(f"Incomplete models: {incomplete_count}") print(f"Invalid models (with retryable errors): {invalid_count}") print(f"Total tasks per MCP: {total_expected_tasks}") print(f"Expected runs (k): {k}") if not complete: print("\n❌ No models have complete and valid results!") else: print(f"\n✅ {complete_count} model(s) ready for aggregation: {', '.join(sorted(complete.keys()))}") def main(): # Extra parser for push-related options push_parent = argparse.ArgumentParser(add_help=False) push_parent.add_argument( "--branch", type=str, help="If provided with --push, push to this new branch" ) parser = argparse.ArgumentParser( description="Simplified MCPMark results aggregator" , parents=[push_parent]) parser.add_argument("--exp-name", required=True, help="Experiment name") parser.add_argument("--k", type=int, default=4, help="Number of runs (default: 4)") parser.add_argument( "--single-run-models", type=str, help="Comma-separated list of models that only need run-1" ) parser.add_argument( "--task-set", choices=sorted(SUPPORTED_TASK_SETS), default="standard", help="Which task subset to aggregate (default: standard)" ) parser.add_argument("--push", action="store_true", help="Push to GitHub (default to main)") args = parser.parse_args() # Parse single-run models single_run_models = [] if args.single_run_models: single_run_models = [m.strip() for m in args.single_run_models.split(",")] print(f"📌 Single-run models: {', '.join(single_run_models)}") # Setup paths exp_dir = Path("./results") / args.exp_name if not exp_dir.exists(): print(f"❌ Experiment directory {exp_dir} does not exist") return 1 print(f"🔄 Processing experiment: {args.exp_name}") # Discover all tasks print(f"📋 Discovering tasks (task set: {args.task_set})...") all_tasks = discover_tasks(args.task_set) total_tasks = sum(len(tasks) for tasks in all_tasks.values()) print(f" Found {total_tasks} tasks across {len(all_tasks)} services") print("📥 Collecting results...") results = collect_results(exp_dir, args.k) print(f" Found results for {len(results)} models") # Check completeness and validity print("✓ Checking completeness and validity...") complete_models, incomplete_models, invalid_models = check_completeness_and_validity( results, all_tasks, args.k, single_run_models ) # Print validation report with summary table print_validation_report(complete_models, incomplete_models, invalid_models, all_tasks, args.k, single_run_models, results) # Determine which models to include in output (strict: only complete models) models_for_output = dict(complete_models) if not models_for_output: return 1 # Calculate metrics print("\n📊 Calculating metrics...") summary = calculate_metrics(models_for_output, all_tasks, args.k, single_run_models) summary["experiment_name"] = 
args.exp_name summary["task_set"] = args.task_set # Save summary summary_path = exp_dir / "summary.json" with open(summary_path, "w") as f: json.dump(summary, f, indent=2) print(f" 📄 Saved summary.json") # Generate model_results print("📁 Generating model_results...") generate_model_results(exp_dir, models_for_output, all_tasks) print(f" Created {len(models_for_output)} model directories") # Generate task_results print("📁 Generating task_results...") generate_task_results(exp_dir, models_for_output, all_tasks) print(f" Created {total_tasks} task files") # Generate README readme_content = generate_readme(args.exp_name, summary, args.k) readme_path = exp_dir / "README.md" with open(readme_path, "w") as f: f.write(readme_content) print(" 📄 Generated README.md") # Push to GitHub if requested if args.push: print("\n🚀 Pushing to GitHub...") push_to_github(exp_dir, args.exp_name, branch=args.branch) print(f"\n🎉 Successfully processed {args.exp_name}") return 0 if __name__ == "__main__": exit(main()) ================================================ FILE: src/aggregators/aggregate_specific_results.py ================================================ #!/usr/bin/env python3 """ Simple Results Aggregator - Aggregate specific result directories Usage: python -m src.aggregators.aggregate_specific_results --result-dir results/exp/model__service --k 4 """ import json import argparse from pathlib import Path from collections import defaultdict from typing import Dict, Any, Tuple, List from datetime import datetime import sys sys.path.append(str(Path(__file__).parent.parent.parent)) from src.aggregators.pricing import compute_cost_usd def collect_results_from_dir(result_dir: Path, k: int) -> Dict[str, Any]: """Collect all results from a specific result directory.""" results = {} for run_idx in range(1, k + 1): run_dir = result_dir / f"run-{run_idx}" if not run_dir.exists(): print(f"⚠️ Warning: {run_dir} does not exist, skipping") continue run_results = {} for task_dir in run_dir.iterdir(): if not task_dir.is_dir(): continue meta_path = task_dir / "meta.json" if meta_path.exists(): with open(meta_path) as f: meta = json.load(f) run_results[task_dir.name] = meta results[f"run-{run_idx}"] = run_results return results def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]: """Extract token counts from meta.""" tu = meta.get("token_usage", {}) or {} input_tokens = int(tu.get("input_tokens", 0) or 0) output_tokens = int(tu.get("output_tokens", 0) or 0) total_tokens = int(tu.get("total_tokens", input_tokens + output_tokens) or (input_tokens + output_tokens)) return input_tokens, output_tokens, total_tokens def calculate_metrics(results: Dict, k: int, model_name: str) -> Dict: """Calculate metrics from results.""" # Get all unique task names all_tasks = set() for run_name, run_data in results.items(): all_tasks.update(run_data.keys()) all_tasks = sorted(all_tasks) total_tasks = len(all_tasks) actual_runs = len(results) print(f"\n📊 Analysis:") print(f" Total unique tasks: {total_tasks}") print(f" Runs found: {actual_runs} (expected: {k})") # Aggregates total_agent_execution_time = 0.0 total_input_tokens = 0 total_output_tokens = 0 total_tokens = 0 total_turns = 0 actual_model_name = None # Per-run pass@1 pass1_rates_per_run = [] # For pass@k pass_k_task_success_any = 0 pass_power_k_task_success_all = 0 for run_idx in range(1, actual_runs + 1): run_name = f"run-{run_idx}" successes_this_run = 0 for task in all_tasks: meta = results.get(run_name, {}).get(task) if not meta: continue success = 
bool(meta.get("execution_result", {}).get("success", False)) if success: successes_this_run += 1 total_agent_execution_time += float(meta.get("agent_execution_time", 0.0) or 0.0) in_tok, out_tok, ttl_tok = get_token_counts(meta) total_input_tokens += in_tok total_output_tokens += out_tok total_tokens += ttl_tok total_turns += int(meta.get("turn_count", 0) or 0) if actual_model_name is None: actual_model_name = meta.get("actual_model_name") or None pass1_rate = successes_this_run / total_tasks if total_tasks > 0 else 0 pass1_rates_per_run.append(pass1_rate) print(f" Run {run_idx}: {successes_this_run}/{total_tasks} = {pass1_rate*100:.1f}%") # Calculate pass@k for task in all_tasks: successes = [] for run_idx in range(1, actual_runs + 1): run_name = f"run-{run_idx}" meta = results.get(run_name, {}).get(task) success = bool(meta.get("execution_result", {}).get("success", False)) if meta else False successes.append(success) if any(successes): pass_k_task_success_any += 1 if all(successes): pass_power_k_task_success_all += 1 # Averages denom = total_tasks * actual_runs if total_tasks > 0 else 1 avg_agent_execution_time = total_agent_execution_time / denom avg_input_tokens = total_input_tokens / denom avg_output_tokens = total_output_tokens / denom avg_total_tokens = total_tokens / denom avg_turns = total_turns / denom # Pass@1 stats if pass1_rates_per_run: avg_pass1 = sum(pass1_rates_per_run) / len(pass1_rates_per_run) mean = avg_pass1 variance = sum((r - mean) ** 2 for r in pass1_rates_per_run) / len(pass1_rates_per_run) std_pass1 = variance ** 0.5 else: avg_pass1 = 0.0 std_pass1 = 0.0 # Cost calculation per_run_input_tokens = total_input_tokens / actual_runs if actual_runs else 0 per_run_output_tokens = total_output_tokens / actual_runs if actual_runs else 0 model_for_pricing = actual_model_name or model_name per_run_cost = compute_cost_usd(model_for_pricing, per_run_input_tokens, per_run_output_tokens) summary = { "generated_at": datetime.now().isoformat(), "model": model_name, "actual_model_name": actual_model_name or model_name, "runs": actual_runs, "total_tasks": total_tasks, "total_agent_execution_time": round(total_agent_execution_time, 2), "total_input_tokens": total_input_tokens, "total_output_tokens": total_output_tokens, "total_tokens": total_tokens, "total_turns": total_turns, "avg_agent_execution_time": round(avg_agent_execution_time, 4), "avg_input_tokens": round(avg_input_tokens, 2), "avg_output_tokens": round(avg_output_tokens, 2), "avg_total_tokens": round(avg_total_tokens, 2), "avg_turns": round(avg_turns, 2), "per_run_input_tokens": round(per_run_input_tokens, 2), "per_run_output_tokens": round(per_run_output_tokens, 2), "per_run_cost": round(per_run_cost, 4) if per_run_cost else None, "pass@1": { "avg": round(avg_pass1, 4), "std": round(std_pass1, 4), "per_run": [round(r, 4) for r in pass1_rates_per_run] }, } if actual_runs > 1: summary[f"pass@{actual_runs}"] = round(pass_k_task_success_any / total_tasks, 4) summary[f"pass^{actual_runs}"] = round(pass_power_k_task_success_all / total_tasks, 4) return summary def main(): parser = argparse.ArgumentParser(description="Simple results aggregator for specific directories") parser.add_argument("--result-dir", required=True, help="Path to result directory (e.g., results/exp/model__service)") parser.add_argument("--k", type=int, default=4, help="Number of runs (default: 4)") parser.add_argument("--output", help="Output JSON file path (default: /summary.json)") args = parser.parse_args() result_dir = Path(args.result_dir) if not 
result_dir.exists(): print(f"❌ Result directory {result_dir} does not exist") return 1 # Extract model name from directory name model_name = result_dir.name.replace("__", "-") print(f"🔄 Processing: {result_dir}") print(f"📋 Model: {model_name}") # Collect results results = collect_results_from_dir(result_dir, args.k) if not results: print("❌ No results found") return 1 # Calculate metrics summary = calculate_metrics(results, args.k, model_name) # Save summary output_path = Path(args.output) if args.output else result_dir / "summary.json" with open(output_path, "w") as f: json.dump(summary, f, indent=2) print(f"\n✅ Summary saved to: {output_path}") print(f"\n📈 Results:") print(f" Pass@1: {summary['pass@1']['avg']*100:.1f}% ± {summary['pass@1']['std']*100:.1f}%") if f"pass@{args.k}" in summary: print(f" Pass@{args.k}: {summary[f'pass@{args.k}']*100:.1f}%") print(f" Pass^{args.k}: {summary[f'pass^{args.k}']*100:.1f}%") print(f" Per-run cost: ${summary['per_run_cost']:.4f}" if summary['per_run_cost'] else " Per-run cost: N/A") print(f" Avg agent time: {summary['avg_agent_execution_time']:.2f}s") print(f" Avg turns: {summary['avg_turns']:.2f}") print(f"\n📊 Token Usage:") avg_tokens_per_run = summary['total_tokens'] / summary['runs'] if summary['runs'] > 0 else 0 print(f" Avg tokens per run: {avg_tokens_per_run:,.0f}") print(f" Avg tokens per turn: {summary['avg_total_tokens'] / summary['avg_turns']:.0f}" if summary['avg_turns'] > 0 else " Avg tokens per turn: N/A") print(f" Total tokens (all runs): {summary['total_tokens']:,}") print(f" Total turns (all runs): {summary['total_turns']:,}") return 0 if __name__ == "__main__": exit(main()) ================================================ FILE: src/aggregators/aggregate_task_meta.py ================================================ #!/usr/bin/env python3 """ Task Meta Aggregator for MCPMark Aggregates all meta.json files from the tasks directory into a single JSON file.
""" import json import os import argparse import subprocess import shutil from pathlib import Path from typing import Dict, List, Any, Set def find_all_meta_files(tasks_root: Path = Path("tasks")) -> List[Path]: """Find all meta.json files in the tasks directory""" meta_files = [] for root, dirs, files in os.walk(tasks_root): if "meta.json" in files: meta_files.append(Path(root) / "meta.json") return meta_files def parse_meta_file(meta_path: Path) -> Dict[str, Any]: """Parse a single meta.json file""" try: with open(meta_path, "r", encoding="utf-8") as f: return json.load(f) except Exception as e: print(f"Error parsing {meta_path}: {e}") return {} def aggregate_task_meta(meta_files: List[Path]) -> Dict[str, Any]: """Aggregate all meta.json files into the required structure""" all_data = [] categories_dict = {} # Use dict to track unique categories all_tags_set = set() # Set to collect all unique tags for meta_path in meta_files: meta_data = parse_meta_file(meta_path) if meta_data: # Exclude model_results field from aggregated data filtered_data = {k: v for k, v in meta_data.items() if k != "model_results"} all_data.append(filtered_data) # Collect categories using category_id and category_name if "category_id" in filtered_data and "category_name" in filtered_data: category_id = filtered_data["category_id"] category_name = filtered_data["category_name"] # Use category_id as the key to ensure uniqueness categories_dict[category_id] = { "id": category_id, "name": category_name, } # Collect all unique tags if "tags" in filtered_data and isinstance(filtered_data["tags"], list): all_tags_set.update(filtered_data["tags"]) # Convert categories dict to sorted list categories_list = sorted(categories_dict.values(), key=lambda x: x["id"]) # Convert tags set to sorted list all_tags_list = sorted(all_tags_set) return { "data": all_data, "count": len(all_data), "categories": categories_list, "tags": all_tags_list, } def create_individual_task_files(meta_files: List[Path]) -> List[Dict[str, Any]]: """Create individual task JSON files with instruction and verify content""" task_files = [] for meta_path in meta_files: meta_data = parse_meta_file(meta_path) if not meta_data or "task_id" not in meta_data: continue # Get the task directory task_dir = meta_path.parent # Read description.md if exists description_path = task_dir / "description.md" instruction_content = "" if description_path.exists(): try: with open(description_path, "r", encoding="utf-8") as f: instruction_content = f.read() except Exception as e: print(f"Warning: Could not read {description_path}: {e}") # Read verify.py if exists verify_path = task_dir / "verify.py" verify_content = "" if verify_path.exists(): try: with open(verify_path, "r", encoding="utf-8") as f: verify_content = f.read() except Exception as e: print(f"Warning: Could not read {verify_path}: {e}") # Create combined task data, excluding model_results task_data = { k: v for k, v in meta_data.items() if k != "model_results" } task_data["instruction"] = instruction_content task_data["verify"] = verify_content task_files.append({"filename": f"{meta_data['task_id']}.json", "data": task_data}) return task_files def push_to_file( output_file: Path, data: Dict[str, Any], task_files: List[Dict[str, Any]] = None, push_to_repo: bool = False, ) -> bool: """Save the aggregated data to file and optionally push to repo""" try: # Create parent directory if it doesn't exist output_file.parent.mkdir(parents=True, exist_ok=True) # Write the aggregated data with open(output_file, "w", 
encoding="utf-8") as f: json.dump(data, f, indent=2, ensure_ascii=False) print(f"✅ Task meta data saved to: {output_file}") print(f"📊 Summary:") print(f" - Total tasks with meta.json: {data['count']}") print(f" - Categories: {len(data['categories'])}") print(f" - Unique tags: {len(data['tags'])}") if push_to_repo: return push_to_experiments_repo(output_file, task_files) return True except Exception as e: print(f"❌ Error saving file: {e}") return False def push_to_experiments_repo( file_path: Path, task_files: List[Dict[str, Any]] = None ) -> bool: """Push the task meta file and individual task files to eval-sys/mcpmark-experiments repo""" if not file_path.exists(): print("⚠️ File does not exist") return False repo_url = "https://github.com/eval-sys/mcpmark-experiments.git" temp_dir = Path("./temp_experiments_repo") try: print(f"\n🔄 Preparing to push task meta to experiments repo...") # Clean up any existing temp directory if temp_dir.exists(): shutil.rmtree(temp_dir) # Clone the repo print("📥 Cloning experiments repo...") subprocess.run( ["git", "clone", repo_url, str(temp_dir)], check=True, capture_output=True ) # Copy the main task_meta.json file target_path = temp_dir / "task_meta.json" print(f"📁 Copying task meta file: task_meta.json") shutil.copy2(file_path, target_path) # Create tasks directory and copy individual task files if task_files: tasks_dir = temp_dir / "tasks" tasks_dir.mkdir(exist_ok=True) print(f"📁 Creating individual task files in ./tasks directory...") for task_file in task_files: task_file_path = tasks_dir / task_file["filename"] with open(task_file_path, "w", encoding="utf-8") as f: json.dump(task_file["data"], f, indent=2, ensure_ascii=False) print(f" - Created {len(task_files)} individual task files") # Change to repo directory for git operations original_dir = os.getcwd() os.chdir(temp_dir) # Add all changes subprocess.run(["git", "add", "."], check=True) # Check if there are changes to commit result = subprocess.run( ["git", "status", "--porcelain"], capture_output=True, text=True ) if not result.stdout.strip(): print("✅ No changes to push (files are up to date)") return True # Commit changes commit_msg = "Update task meta data and individual task files" subprocess.run(["git", "commit", "-m", commit_msg], check=True) # Push changes print("🚀 Pushing to remote repository...") subprocess.run(["git", "push"], check=True) print("✅ Successfully pushed task meta and individual task files to repo!") return True except subprocess.CalledProcessError as e: print(f"❌ Git operation failed: {e}") return False except Exception as e: print(f"❌ Error pushing to repo: {e}") return False finally: # Change back to original directory os.chdir(original_dir) # Clean up temp directory if temp_dir.exists(): shutil.rmtree(temp_dir) def main(): parser = argparse.ArgumentParser(description="Aggregate all task meta.json files") parser.add_argument( "--output", type=str, default="task_meta.json", help="Output file path (default: task_meta.json)", ) parser.add_argument( "--push", action="store_true", help="Push results to eval-sys/mcpmark-experiments repo", ) args = parser.parse_args() print("🔍 Searching for meta.json files in tasks directory...") # Find all meta.json files meta_files = find_all_meta_files() if not meta_files: print("❌ No meta.json files found in tasks directory") return 1 print(f"📁 Found {len(meta_files)} meta.json files") # Aggregate the data print("🔄 Aggregating task meta data...") aggregated_data = aggregate_task_meta(meta_files) # Create individual task files if pushing to 
repo task_files = None if args.push: print("🔄 Creating individual task files...") task_files = create_individual_task_files(meta_files) print(f"📝 Prepared {len(task_files)} individual task files") # Save to file output_path = Path(args.output) success = push_to_file(output_path, aggregated_data, task_files, args.push) if not success: return 1 if args.push: print( f"🚀 Task meta data and individual task files pushed to eval-sys/mcpmark-experiments repo" ) return 0 if __name__ == "__main__": exit(main()) ================================================ FILE: src/aggregators/pricing.py ================================================ """ Pricing utilities for computing per-run cost from token usage. All prices are specified per 1,000,000 tokens (M tokens) in USD. """ from __future__ import annotations from typing import Dict, Optional # Price map keyed by canonical model name (lowercased) # Values are dicts with per-M token prices for input and output tokens MODEL_PRICES_PER_M: Dict[str, Dict[str, float]] = { # Use exact actual_model_name keys (lowercased) provided by the user # Anthropic "claude-opus-4-1-20250805": {"input": 15.0, "output": 75.0}, "claude-opus-4-5-20251101": {"input": 5.0, "output": 25.0}, "claude-sonnet-4-20250514": {"input": 3.0, "output": 15.0}, "claude-sonnet-4-5-20250929": {"input": 3.0, "output": 15.0}, # DeepSeek "deepseek-v3.1-non-think": {"input": 0.56, "output": 1.68}, "deepseek-v3.2-chat": {"input": 0.27, "output": 0.40}, "deepseek-v3.2-reasoner": {"input": 0.27, "output": 0.40}, "deepseek-v3.1-terminus-thinking": {"input": 0.21, "output": 0.79}, "deepseek-v3.1-terminus": {"input": 0.21, "output": 0.79}, # Google Gemini "gemini-2.5-pro": {"input": 2.5, "output": 15.0}, "gemini-2.5-flash": {"input": 0.3, "output": 2.5}, "gemini-3-pro": {"input": 2.0, "output": 12.0}, # Z.AI "glm-4.5": {"input": 0.33, "output": 1.32}, # OpenAI "gpt-5-2025-08-07": {"input": 1.25, "output": 10.0}, "gpt-5.2-2025-12-11": {"input": 1.75, "output": 14.0}, "gpt-5-mini-2025-08-07": {"input": 0.25, "output": 2.0}, "gpt-5-nano-2025-08-07": {"input": 0.05, "output": 0.4}, "gpt-4.1-2025-04-14": {"input": 2.0, "output": 8.0}, "gpt-4.1-mini-2025-04-14": {"input": 0.4, "output": 1.6}, "gpt-4.1-nano-2025-04-14": {"input": 0.1, "output": 0.4}, "o3-2025-04-16": {"input": 2.0, "output": 8.0}, "o4-mini-2025-04-16": {"input": 1.1, "output": 4.4}, "gpt-oss-120b": {"input": 0.072, "output": 0.28}, # Qwen "qwen3-coder-480b-a35b-instruct": {"input": 0.2, "output": 0.8}, "qwen3-max-preview": {"input": 1.2, "output": 6}, # Xai "grok-4-0709": {"input": 3.0, "output": 15.0}, "grok-code-fast-1": {"input": 0.2, "output": 1.5}, "grok-4-fast": {"input": 0.2, "output": 0.5}, # Moonshot "kimi-k2-0711-preview": {"input": 0.6, "output": 2.5}, "kimi-k2-0905-preview": {"input": 0.6, "output": 2.5}, } def normalize_model_name(model_name: str) -> str: """Normalize model name for pricing lookup. Lowercases only. """ return (model_name or "").strip().lower() def get_price_per_m(model_name: str) -> Optional[Dict[str, float]]: """Return per-M token prices for given model, or None if unknown.""" key = normalize_model_name(model_name) return MODEL_PRICES_PER_M.get(key) def compute_cost_usd(model_name: str, input_tokens: float, output_tokens: float) -> Optional[float]: """Compute cost in USD given token usage and model pricing. Prices are per 1,000,000 tokens. If pricing unknown, returns None. 
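
    Example (a rough sketch based on the price table above; listed prices may drift):

        >>> compute_cost_usd("gpt-5-2025-08-07", 1_000_000, 200_000)
        3.25
        >>> compute_cost_usd("unknown-model", 1_000, 1_000) is None
        True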
""" prices = get_price_per_m(model_name) if not prices: return None input_cost = (input_tokens / 1_000_000.0) * prices["input"] output_cost = (output_tokens / 1_000_000.0) * prices["output"] return float(round(input_cost + output_cost, 6)) ================================================ FILE: src/base/__init__.py ================================================ ================================================ FILE: src/base/login_helper.py ================================================ from abc import ABC, abstractmethod class BaseLoginHelper(ABC): """Abstract base class for login helpers.""" def __init__(self): pass @abstractmethod def login(self, **kwargs): pass ================================================ FILE: src/base/state_manager.py ================================================ import time from abc import ABC, abstractmethod from dataclasses import dataclass from typing import Any, Dict, List, Optional from src.logger import get_logger from .task_manager import BaseTask # Initialize logger logger = get_logger(__name__) @dataclass class InitialStateInfo: """Information about created initial state for a task.""" state_id: str state_url: Optional[str] = None metadata: Optional[Dict[str, Any]] = None class BaseStateManager(ABC): """ Simplified abstract base class for state management in MCP services. This class provides essential functionality for initial state creation and cleanup while allowing service-specific implementations through template methods. """ def __init__(self, service_name: str): self.service_name = service_name # Simple resource tracking for cleanup self.tracked_resources: List[Dict[str, Any]] = [] # Note: Initialization is now handled in service-specific constructors def set_up(self, task: BaseTask) -> bool: """Set up initial state for a specific task. Args: task: The task for which to set up the initial state Returns: True if setup successful, False otherwise """ try: logger.info( f"| Setting up initial state for {self.service_name} task: {task.name}" ) # Create initial state initial_state_info = self._create_initial_state(task) if not initial_state_info: logger.error(f"| Failed to create initial state for {task.name}") return False # Store initial state info in task self._store_initial_state_info(task, initial_state_info) logger.info(f"| ✓ Initial state setup completed for {task.name}") return True except Exception as e: logger.error(f"| Setup failed for {task.name}: {e}") return False def clean_up(self, task: BaseTask = None) -> bool: """Clean up resources with common patterns and service-specific hooks. Args: task: Optional task to clean up specific resources for Returns: True if cleanup successful, False otherwise """ try: cleanup_success = True # Task-specific cleanup if task: logger.info( f"| ○ Cleaning up initial state for {self.service_name} task: {task.name}" ) if not self._cleanup_task_initial_state(task): cleanup_success = False # Clean up all tracked resources if not self._cleanup_tracked_resources(): cleanup_success = False if cleanup_success: logger.info(f"| ✓ Cleanup completed for {self.service_name}") else: logger.warning( f"| Cleanup completed with some failures for {self.service_name}" ) return cleanup_success except Exception as e: logger.error(f"Cleanup failed for {self.service_name}: {e}") return False def track_resource( self, resource_type: str, identifier: str, metadata: Optional[Dict[str, Any]] = None, ) -> None: """Track a resource for later cleanup. 
Args: resource_type: Type of resource (e.g., 'repository', 'page') identifier: Unique identifier for the resource metadata: Additional metadata about the resource """ resource = { "type": resource_type, "id": identifier, "created_at": time.time(), "metadata": metadata or {}, } self.tracked_resources.append(resource) logger.debug(f"Tracked {resource_type} resource: {identifier}") def get_service_config_for_agent(self) -> dict: """ Get service-specific configuration for agent execution. This method should be overridden by service implementations that need to provide additional configuration to the agent. Returns: Dictionary containing configuration needed by the agent/MCP server """ return {} def set_verification_environment(self, messages_path: str = None) -> None: """ Set environment variables needed for verification scripts. Args: messages_path: Optional path to messages.json file for verification This method can be overridden by service implementations that need to set specific environment variables for their verification scripts. The default implementation sets MCP_MESSAGES if provided. """ import os if messages_path: os.environ["MCP_MESSAGES"] = str(messages_path) def _cleanup_tracked_resources(self) -> bool: """Clean up all tracked resources.""" cleanup_success = True for resource in self.tracked_resources: try: if not self._cleanup_single_resource(resource): cleanup_success = False except Exception as e: logger.error(f"Failed to cleanup resource {resource}: {e}") cleanup_success = False # Clear resources after cleanup attempt self.tracked_resources.clear() return cleanup_success # ========================================================================= # Abstract methods for service-specific behavior (simplified) # ========================================================================= # Note: Service-specific initialization is now handled in constructors @abstractmethod def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: """Create initial state for a task (e.g., duplicate page, fork repo). Args: task: Task for which to create initial state Returns: InitialStateInfo object or None if creation failed """ pass @abstractmethod def _store_initial_state_info( self, task: BaseTask, state_info: InitialStateInfo ) -> None: """Store initial state information in the task object. Args: task: Task object to update state_info: Initial state information to store """ pass @abstractmethod def _cleanup_task_initial_state(self, task: BaseTask) -> bool: """Clean up initial state for a specific task. Args: task: Task whose initial state should be cleaned up Returns: True if cleanup successful, False otherwise """ pass @abstractmethod def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool: """Clean up a single tracked resource. Args: resource: Resource dictionary with type, id, and metadata Returns: True if cleanup successful, False otherwise """ pass ================================================ FILE: src/base/task_manager.py ================================================ #!/usr/bin/env python3 """ Enhanced Base Task Manager with Common Task Discovery Logic =========================================================== This module provides an improved base class for task managers that consolidates common task discovery patterns while maintaining flexibility for service-specific needs. 
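
A service-specific manager typically only supplies constructor defaults. A
hypothetical sketch (the class and service name below are made up for
illustration):

    class DemoTaskManager(BaseTaskManager):
        def __init__(self, tasks_root: Path):
            super().__init__(
                tasks_root,
                mcp_service="demo",
                task_organization="directory",
            )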
""" import json import subprocess import sys from abc import ABC from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional from src.logger import get_logger from src.results_reporter import TaskResult logger = get_logger(__name__) @dataclass class BaseTask: """Base class for evaluation tasks.""" task_instruction_path: Path task_verification_path: Path service: str category_id: str # From meta.json if available, otherwise directory name task_id: str # From meta.json if available, otherwise directory name @property def name(self) -> str: """Return the task name using '__' separator format: 'category_id__task_id'.""" return f"{self.category_id}__{self.task_id}" def get_task_instruction(self) -> str: """Return the full text content of the task instruction file.""" if not self.task_instruction_path.exists(): raise FileNotFoundError( f"Task instruction file not found: {self.task_instruction_path}" ) return self.task_instruction_path.read_text(encoding="utf-8") class BaseTaskManager(ABC): """Enhanced base class for service-specific task managers with common discovery logic.""" def __init__( self, tasks_root: Path, mcp_service: str = None, task_class: type = None, task_organization: str = None, task_suite: str | None = "standard", ): """Initialize the base task manager. Args: tasks_root: Root directory containing all tasks mcp_service: MCP service name (e.g., 'notion', 'github', 'filesystem') task_class: Custom task class to use (defaults to BaseTask) task_organization: 'file' or 'directory' based task organization task_suite: Logical task suite (e.g., 'standard', 'easy') """ self.tasks_root = tasks_root self.mcp_service = mcp_service or self.__class__.__name__.lower().replace( "taskmanager", "" ) self.task_class = task_class or BaseTask self.task_organization = task_organization self.task_suite = task_suite self._tasks_cache = None # ========================================================================= # Common Task Discovery Implementation # ========================================================================= def discover_all_tasks(self) -> List[BaseTask]: """Discover all available tasks for this service (common implementation).""" if self._tasks_cache is not None: return self._tasks_cache tasks = [] service_dir = self.tasks_root / ( self.mcp_service or self._get_service_directory_name() ) if self.task_suite: service_dir = service_dir / self.task_suite if not service_dir.exists(): logger.warning( f"{self.mcp_service.title()} tasks directory does not exist: {service_dir}" ) return tasks # Scan categories for category_dir in service_dir.iterdir(): if not self._is_valid_category_dir(category_dir): continue category_id = category_dir.name logger.info("Discovering tasks in category: %s", category_id) # Find tasks using service-specific logic task_files = self._find_task_files(category_dir) for task_files_info in task_files: task = self._create_task_from_files(category_id, task_files_info) if task: tasks.append(task) logger.debug("Found task: %s", task.name) # Sort and cache # Sort by category_id and a stringified task_id to handle both numeric IDs and slugs uniformly self._tasks_cache = sorted(tasks, key=lambda t: (t.category_id, str(t.task_id))) logger.info( "Discovered %d %s tasks across all categories (suite=%s)", len(self._tasks_cache), self.mcp_service.title(), self.task_suite or "default", ) return self._tasks_cache def get_categories(self) -> List[str]: """Get a list of all task categories (common implementation).""" tasks = 
self.discover_all_tasks() return sorted(list(set(task.category_id for task in tasks))) def filter_tasks(self, task_filter: str) -> List[BaseTask]: """Filter tasks based on category or specific task pattern (common implementation).""" all_tasks = self.discover_all_tasks() if not task_filter or task_filter.lower() == "all": return all_tasks # Check if it's a category filter categories = self.get_categories() if task_filter in categories: return [task for task in all_tasks if task.category_id == task_filter] # Check for specific task pattern (category_id/task_id) if "/" in task_filter: try: category, task_part = task_filter.split("/", 1) # First try to match by task_id (could be numeric or string) for task in all_tasks: if task.category_id == category: # Check if task_id matches (as string or as specific pattern) if str(task.task_id) == task_part: return [task] except (ValueError, IndexError): pass # Fallback: check for partial matches in task names or categories filtered_tasks = [] for task in all_tasks: if ( task_filter in task.category_id or task_filter in task.name or task_filter == str(task.task_id) ): filtered_tasks.append(task) return filtered_tasks # ========================================================================= # Common Helper Methods # ========================================================================= def get_task_instruction(self, task: BaseTask) -> str: """Get formatted task instruction (template method).""" base_instruction = self._read_task_instruction(task) return self._format_task_instruction(base_instruction) def execute_task(self, task: BaseTask, agent_result: Dict[str, Any]) -> TaskResult: """Execute task verification (template method).""" logger.info(f"| Verifying task ({self.mcp_service.title()}): {task.name}") # Track agent success separately agent_success = agent_result.get("success", False) agent_error = None verification_success = False verification_error = None verification_output = None # Handle agent failure (but still continue to verification) if not agent_success: agent_error = agent_result.get("error", "Agent execution failed") # Standardize MCP network errors agent_error = self._standardize_error_message(agent_error) logger.error(f"| ✗ Agent execution failed for task") logger.error(f"| ⚠️ Error: {agent_error}") logger.info(f"| - Proceeding with verification despite agent failure") try: # Always run verification regardless of agent success verify_result = self.run_verification(task) # Process verification results verification_success = verify_result.returncode == 0 verification_output = verify_result.stdout # Log verification output if verification_output: print(verification_output) # Capture verification error if failed if not verification_success: verification_error = verify_result.stderr if verify_result.stderr else "Verification failed with no error message" if verification_success: logger.info(f"| Verification Result: \033[92m✓ PASSED\033[0m") else: logger.error(f"| Verification Result: \033[91m✗ FAILED\033[0m") if verification_error: logger.error(f"| Verification Error: {verification_error}") return TaskResult( task_name=task.name, success=verification_success, error_message=agent_error, # Agent execution error verification_error=verification_error, # Verification error verification_output=verification_output, # Verification output model_output=agent_result.get("output", ""), category_id=task.category_id, task_id=task.task_id, token_usage=agent_result.get("token_usage", {}), turn_count=agent_result.get("turn_count", -1), ) except 
Exception as e: logger.error(f"| Task verification failed: {e}", exc_info=True) return TaskResult( task_name=task.name, success=False, error_message=agent_error, # Keep agent error if any verification_error=str(e), # Verification exception verification_output=None, category_id=task.category_id, task_id=task.task_id, model_output=agent_result.get("output", ""), token_usage=agent_result.get("token_usage", {}), turn_count=agent_result.get("turn_count", 0), ) def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess: """Run the verification script for a task (can be overridden). Default implementation runs the verification command. Services can override this to add environment variables or custom logic. """ return subprocess.run( self._get_verification_command(task), capture_output=True, # Capture stdout and stderr for logging text=True, timeout=300, ) # ========================================================================= # Abstract Methods - Minimal Set Required # ========================================================================= def _get_service_directory_name(self) -> str: """Return the service directory name (e.g., 'notion', 'github'). Default implementation uses the service parameter if provided. """ if self.mcp_service: return self.mcp_service raise NotImplementedError( "Must provide service parameter or implement _get_service_directory_name" ) def _get_task_organization(self) -> str: """Return task organization type: 'directory' or 'file'. - 'directory': Tasks organized as task_X/description.md (Notion) - 'file': Tasks organized as task_X.md (GitHub, Filesystem) Default implementation uses the task_organization parameter if provided. """ if self.task_organization: return self.task_organization raise NotImplementedError( "Must provide task_organization parameter or implement _get_task_organization" ) # Note: _create_task_instance is no longer needed - use task_class parameter instead # ========================================================================= # Hook Methods with Smart Defaults # ========================================================================= def _is_valid_category_dir(self, category_dir: Path) -> bool: """Check if a directory is a valid category directory.""" return ( category_dir.is_dir() and not category_dir.name.startswith(".") and category_dir.name != "utils" and category_dir.name != "__pycache__" ) def _find_task_files(self, category_dir: Path) -> List[Dict[str, Any]]: """Find task files in a category directory (smart default implementation). Automatically handles both directory-based and file-based organization. 
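
        Each returned entry is a plain dict shaped roughly like this (paths are illustrative):

            {
                "task_id": "uppercase",
                "instruction_path": Path(".../uppercase/description.md"),
                "verification_path": Path(".../uppercase/verify.py"),
            }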
""" task_files: List[Dict[str, Any]] = [] for task_dir in category_dir.iterdir(): # Skip anything that is not a directory or is hidden if not task_dir.is_dir() or task_dir.name.startswith("."): continue description_path = task_dir / "description.md" verify_path = task_dir / "verify.py" # We consider a directory a valid task only if the two mandatory files exist if not (description_path.exists() and verify_path.exists()): logger.warning( "Skipping %s – missing description.md or verify.py", task_dir ) continue task_files.append( { "task_id": task_dir.name, "instruction_path": description_path, "verification_path": verify_path, } ) return task_files def _create_task_from_files( self, category_id: str, task_files_info: Dict[str, Any] ) -> Optional[BaseTask]: """Create a task from file information with meta.json support.""" # Check for meta.json meta_path = task_files_info["instruction_path"].parent / "meta.json" # Default to directory names task_id = task_files_info["task_id"] final_category_id = category_id if meta_path.exists(): try: with open(meta_path, 'r') as f: meta_data = json.load(f) # Use values from meta.json if available final_category_id = meta_data.get("category_id", category_id) task_id = meta_data.get("task_id", task_id) except Exception as e: logger.warning(f"Failed to load meta.json from {meta_path}: {e}") return self.task_class( task_instruction_path=task_files_info["instruction_path"], task_verification_path=task_files_info["verification_path"], service=self.mcp_service, category_id=final_category_id, task_id=task_id, ) def _read_task_instruction(self, task: BaseTask) -> str: """Read and return the task instruction content.""" return task.get_task_instruction() def _format_task_instruction(self, base_instruction: str) -> str: """Format task instruction with Notion-specific additions.""" return ( base_instruction + "\n\nNote: Based on your understanding, solve the task all at once by yourself, don't ask for my opinions on anything." ) def _get_verification_command(self, task: BaseTask) -> List[str]: """Get the command to run task verification (default implementation).""" return [sys.executable, str(task.task_verification_path)] def _standardize_error_message(self, error_message: str) -> str: """Standardize error messages for consistent reporting.""" from src.errors import standardize_error_message return standardize_error_message(error_message, mcp_service=self.mcp_service) ================================================ FILE: src/config/__init__.py ================================================ ================================================ FILE: src/config/config_schema.py ================================================ #!/usr/bin/env python3 """ Centralized Configuration Schema for MCPMark ============================================= This module provides a unified configuration system with validation, type safety, and support for multiple configuration sources. 
""" import os from abc import ABC, abstractmethod from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, Optional import yaml from dotenv import load_dotenv from src.logger import get_logger logger = get_logger(__name__) # Lazy import to avoid circular dependencies def get_service_definition(service_name: str) -> dict: from src.services import get_service_definition as _get_service_def return _get_service_def(service_name) @dataclass class ConfigValue: """Represents a configuration value with metadata.""" key: str value: Any source: str # 'env', 'file', 'default' required: bool = True description: str = "" validator: Optional[callable] = None def validate(self) -> bool: """Validate the configuration value.""" if self.required and self.value is None: raise ValueError(f"Required configuration '{self.key}' is missing") if self.validator and self.value is not None: if not self.validator(self.value): raise ValueError(f"Invalid value for '{self.key}': {self.value}") return True class ConfigSchema(ABC): """Abstract base class for service configuration schemas.""" def __init__(self, service_name: str): self.service_name = service_name self._values: Dict[str, ConfigValue] = {} self._load_dotenv() self._define_schema() self._load_values() self._validate() @abstractmethod def _define_schema(self) -> None: """Define the configuration schema for this service.""" pass def _load_dotenv(self) -> None: """Load environment variables from .mcp_env file.""" load_dotenv(dotenv_path=".mcp_env", override=False) def _add_config( self, key: str, env_var: Optional[str] = None, default: Any = None, required: bool = True, description: str = "", validator: Optional[callable] = None, transform: Optional[callable] = None, ) -> None: """Add a configuration value to the schema.""" # Try to get value from environment first value = None source = "default" if env_var: env_value = os.getenv(env_var) if env_value is not None: value = transform(env_value) if transform else env_value source = "env" # Use default if no environment value if value is None and default is not None: value = default source = "default" self._values[key] = ConfigValue( key=key, value=value, source=source, required=required, description=description, validator=validator, ) def _load_values(self) -> None: """Load configuration values from file if available.""" config_file = Path(f"config/{self.service_name}.yaml") if config_file.exists(): with open(config_file) as f: file_config = yaml.safe_load(f) for key, value in file_config.items(): if key in self._values and self._values[key].value is None: self._values[key].value = value self._values[key].source = "file" def _validate(self) -> None: """Validate all configuration values.""" for config_value in self._values.values(): config_value.validate() def get(self, key: str, default: Any = None) -> Any: """Get a configuration value.""" if key in self._values: return self._values[key].value return default def get_all(self) -> Dict[str, Any]: """Get all configuration values as a dictionary.""" return {k: v.value for k, v in self._values.items()} def get_debug_info(self) -> Dict[str, Dict[str, Any]]: """Get detailed configuration information for debugging.""" return { k: { "value": v.value, "source": v.source, "required": v.required, "description": v.description, } for k, v in self._values.items() } class GenericConfigSchema(ConfigSchema): """Generic configuration schema that reads from service definitions.""" def __init__(self, service_name: str): # Get service definition before 
calling parent init self.service_definition = get_service_definition(service_name) super().__init__(service_name) def _define_schema(self) -> None: """Define schema from service definition.""" config_schema = self.service_definition.get("config_schema", {}) for key, config in config_schema.items(): # Handle transform strings transform = None transform_str = config.get("transform") if transform_str == "bool": transform = lambda x: x.lower() in ["true", "1", "yes"] elif transform_str == "int": transform = int elif transform_str == "path": transform = lambda x: Path(x) if x else None elif transform_str == "list": transform = lambda x: [t.strip() for t in x.split(",")] if x else [] # Handle validator strings validator = None validator_str = config.get("validator") if validator_str == "port": validator = lambda x: 1 <= x <= 65535 elif validator_str and validator_str.startswith("in:"): valid_values = validator_str[3:].split(",") validator = lambda x, values=valid_values: x in values self._add_config( key=key, env_var=config.get("env_var"), default=config.get("default"), required=config.get("required", True), description=config.get("description", ""), validator=validator, transform=transform, ) # Configuration Registry class ConfigRegistry: """Central registry for all service configurations.""" _instances: Dict[str, ConfigSchema] = {} @classmethod def get_config(cls, service_name: str) -> ConfigSchema: """Get or create configuration for a service.""" if service_name not in cls._instances: cls._instances[service_name] = GenericConfigSchema(service_name) return cls._instances[service_name] @classmethod def validate_all(cls) -> Dict[str, bool]: """Validate all registered configurations.""" from src.services import get_supported_mcp_services results = {} for service_name in get_supported_mcp_services(): try: cls.get_config(service_name) results[service_name] = True except Exception as e: logger.error(f"Configuration validation failed for {service_name}: {e}") results[service_name] = False return results @classmethod def export_template(cls, service_name: str, output_path: Path) -> None: """Export a configuration template for a service.""" config = cls.get_config(service_name) template = {"service": service_name, "configuration": {}} for key, config_value in config._values.items(): template["configuration"][key] = { "value": config_value.value if config_value.source == "default" else None, "description": config_value.description, "required": config_value.required, "env_var": f"${{{key.upper()}}}", } with open(output_path, "w") as f: yaml.dump(template, f, default_flow_style=False, sort_keys=False) # Utility Functions def get_service_config(service_name: str) -> Dict[str, Any]: """Get service configuration as a dictionary.""" return ConfigRegistry.get_config(service_name).get_all() ================================================ FILE: src/errors.py ================================================ #!/usr/bin/env python3 """ Simple Error Handling for MCPMark ================================== Provides basic error standardization and retry logic. """ from typing import Optional """Retryable error detection via minimal substring matching (lower-case).""" # Keep this list short and generic; aim to catch API/infrastructure issues only. 
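# Illustrative behaviour, assuming the patterns below stay as listed:
#   is_retryable_error("openai.RateLimitError: too many requests")    -> True
#   is_retryable_error("AssertionError: verification output mismatch") -> False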
RETRYABLE_PATTERNS = { "ratelimit", # e.g., RateLimitError, too many requests # "connection", # connection refused/reset/error "agent execution failed", "unavailable", # service unavailable # "execution timed out", # timeout "internal server error", # 500s "network error", # generic network issue "quota", # budget/quota exceeded # "llm provider not provided", # litellm error # pipeline infra signals "account balance", "mcp network error", "state duplication error", "thought_signature", "overloaded." } def is_retryable_error(error: str) -> bool: """Return True if the error string contains any retryable pattern.""" error_lower = str(error or "").lower() return any(pattern in error_lower for pattern in RETRYABLE_PATTERNS) def standardize_error_message(error: str, mcp_service: Optional[str] = None) -> str: """Standardize error messages for consistent reporting.""" error_str = str(error).strip() # Common standardizations if "timeout" in error_str.lower(): base_msg = "Operation timed out" elif ( "connection refused" in error_str.lower() or "econnrefused" in error_str.lower() ): base_msg = "Connection refused" elif "not found" in error_str.lower(): base_msg = "Resource not found" elif "already exists" in error_str.lower(): base_msg = "Resource already exists" else: # Return original message if no standardization applies return error_str # Add MCP service prefix if provided if mcp_service: return f"{mcp_service.title()} {base_msg}" return base_msg ================================================ FILE: src/evaluator.py ================================================ import time import json import shutil from datetime import datetime from pathlib import Path from typing import List, Optional from src.logger import get_logger from src.factory import MCPServiceFactory from src.model_config import ModelConfig from src.results_reporter import EvaluationReport, ResultsReporter, TaskResult from src.errors import is_retryable_error from src.agents import AGENT_REGISTRY # Initialize logger logger = get_logger(__name__) class MCPEvaluator: def __init__( self, mcp_service: str, model: str, timeout: int = 300, exp_name: str = "test-run", output_dir: Path = None, reasoning_effort: str = "default", agent_name: str = "mcpmark", task_suite: str = "standard", compaction_token: int = 0, ): # Main configuration self.mcp_service = mcp_service self.timeout = timeout self.agent_name = (agent_name or "mcpmark").lower() self.task_suite = (task_suite or "standard").lower() if self.agent_name not in AGENT_REGISTRY: raise ValueError(f"Unsupported agent '{agent_name}'. Available: {sorted(AGENT_REGISTRY)}") # Initialize model configuration self.reasoning_effort = reasoning_effort self.model_name = model model_config = ModelConfig(self.model_name) self.api_key = model_config.api_key self.base_url = model_config.base_url self.litellm_input_model_name = model_config.litellm_input_model_name # Track the actual model name from LiteLLM responses self.litellm_run_model_name = None # Initialize managers using the factory pattern (simplified) self.task_manager = MCPServiceFactory.create_task_manager( mcp_service, task_suite=self.task_suite ) self.state_manager = MCPServiceFactory.create_state_manager(mcp_service) # Obtain static service configuration from state manager (e.g., notion_key) self.service_config = self.state_manager.get_service_config_for_agent() # Initialize agent for LLM and MCP server management. 
The agent will # automatically refresh its service configuration from the state # manager before each execution, so per-task manual updates are no # longer needed. agent_cls = AGENT_REGISTRY[self.agent_name] self.agent = agent_cls( litellm_input_model_name=self.litellm_input_model_name, api_key=self.api_key, base_url=self.base_url, mcp_service=mcp_service, timeout=timeout, service_config=self.service_config, service_config_provider=self.state_manager.get_service_config_for_agent, reasoning_effort=self.reasoning_effort, compaction_token=compaction_token, ) # Initialize results reporter self.results_reporter = ResultsReporter() # Output directory handling if self.reasoning_effort != "default": model_slug = self.model_name.replace(".", "-") + "-" + self.reasoning_effort else: model_slug = self.model_name.replace(".", "-") service_for_dir = "playwright" if mcp_service == "playwright_webarena" else mcp_service suite_suffix = "" if self.task_suite in ("standard", "", None) else f"-{self.task_suite}" service_dir_name = f"{service_for_dir}{suite_suffix}" self.base_experiment_dir = output_dir / f"{model_slug}__{service_dir_name}" / exp_name self.base_experiment_dir.mkdir(parents=True, exist_ok=True) def _format_duration(self, seconds: float) -> str: """Format duration: <1s as ms, otherwise seconds.""" return f"{(seconds * 1000):.2f}ms" if seconds < 1 else f"{seconds:.2f}s" def _get_task_output_dir(self, task) -> Path: """Return the directory path for storing this task's reports using '__' separator.""" # Use category_id and task_id with '__' separator category_id = task.category_id if task.category_id else "uncategorized" task_id = str(task.task_id) return self.base_experiment_dir / f"{category_id}__{task_id}" # ------------------------------------------------------------------ # Resuming helpers # ------------------------------------------------------------------ def _load_latest_task_result(self, task) -> Optional[TaskResult]: """Return the most recent TaskResult for *task* if it has been run before.""" task_dir = self._get_task_output_dir(task) if not task_dir.exists(): return None meta_path = task_dir / "meta.json" if not meta_path.exists(): return None try: with meta_path.open("r", encoding="utf-8") as f: meta_data = json.load(f) return TaskResult( task_name=meta_data["task_name"], success=meta_data["execution_result"]["success"], error_message=meta_data["execution_result"].get("error_message"), verification_error=meta_data["execution_result"].get("verification_error"), verification_output=meta_data["execution_result"].get("verification_output"), category_id=task.category_id, task_id=task.task_id, model_output=None, token_usage=meta_data.get("token_usage", {}), turn_count=meta_data.get("turn_count"), agent_execution_time=meta_data.get("agent_execution_time", 0.0), task_execution_time=meta_data.get("task_execution_time", 0.0), ) except Exception as exc: logger.warning("Failed to load existing result for %s: %s", task.name, exc) return None def _gather_all_task_results(self) -> List[TaskResult]: """Scan *all* task sub-directories and collect the latest TaskResult from each.""" results: list[TaskResult] = [] if not self.base_experiment_dir.exists(): return results for task_dir in self.base_experiment_dir.iterdir(): if not task_dir.is_dir(): continue meta_path = task_dir / "meta.json" if not meta_path.exists(): continue try: with meta_path.open("r", encoding="utf-8") as f: meta_data = json.load(f) category_id, task_id = task_dir.name.split("__", 1) result = TaskResult( 
task_name=meta_data["task_name"], success=meta_data["execution_result"]["success"], error_message=meta_data["execution_result"].get("error_message"), verification_error=meta_data["execution_result"].get("verification_error"), verification_output=meta_data["execution_result"].get("verification_output"), category_id=category_id, task_id=task_id, model_output=None, token_usage=meta_data.get("token_usage", {}), turn_count=meta_data.get("turn_count"), agent_execution_time=meta_data.get("agent_execution_time", 0.0), task_execution_time=meta_data.get("task_execution_time", 0.0), ) results.append(result) except Exception as exc: logger.warning( "Failed to parse existing report in %s: %s", task_dir, exc ) return results def _run_single_task(self, task) -> TaskResult: """ Runs a single task, including setup, agent execution, verification, and cleanup. """ # Track overall task start time task_start_time = time.time() # ------------------------------------------------------------------ # Stage 1: Set up the initial state for the task # ------------------------------------------------------------------ setup_start_time = time.time() logger.info( "\n┌─ Stage 1: Setup ─────────────────────────────────────────────────────" ) setup_success = self.state_manager.set_up(task) setup_time = time.time() - setup_start_time if not setup_success: logger.error(f"| State setup failed for task: {task.name}") task_total_time = time.time() - task_start_time return TaskResult( task_name=task.name, success=False, error_message="State Duplication Error", verification_error=None, verification_output=None, category_id=task.category_id, task_id=task.task_id, agent_execution_time=0.0, task_execution_time=task_total_time, ) display_time = self._format_duration(setup_time) logger.info(f"└─ Completed in {display_time}\n") # ------------------------------------------------------------------ # Stage 2: Execute the task using the agent # ------------------------------------------------------------------ logger.info( "┌─ Stage 2: Execute ───────────────────────────────────────────────────" ) agent_execution_start_time = time.time() # Get task instruction from task manager task_instruction = self.task_manager.get_task_instruction(task) # Prepare task_output_dir and tool call log file task_output_dir = self._get_task_output_dir(task) task_output_dir.mkdir(parents=True, exist_ok=True) execution_log_path = task_output_dir / "execution.log" # Remove existing execution.log to ensure clean start if execution_log_path.exists(): execution_log_path.unlink() # Execute with agent agent_result = self.agent.execute_sync( task_instruction, str(execution_log_path) ) agent_execution_time = time.time() - agent_execution_start_time # Extract actual model name from LiteLLM response if agent_result.get("litellm_run_model_name"): self.litellm_run_model_name = agent_result["litellm_run_model_name"] # Write messages.json to task_output_dir messages_path = task_output_dir / "messages.json" self.results_reporter.save_messages_json( agent_result.get("output", []), messages_path ) # Set service-specific environment variables for verification scripts self.state_manager.set_verification_environment(str(messages_path)) logger.info(f"└─ Completed in {self._format_duration(agent_execution_time)}\n") # ------------------------------------------------------------------ # Stage 3: Verify # ------------------------------------------------------------------ logger.info( "┌─ Stage 3: Verify ────────────────────────────────────────────────────" ) verify_start_time = 
time.time()
        try:
            result = self.task_manager.execute_task(task, agent_result)
        finally:
            # Clean up environment variables
            import os
            os.environ.pop("MCP_MESSAGES", None)
            os.environ.pop("MCP_GITHUB_TOKEN", None)
        verify_time = time.time() - verify_start_time
        logger.info(f"└─ Completed in {self._format_duration(verify_time)}\n")

        # ------------------------------------------------------------------
        # Stage 4: Clean up
        # ------------------------------------------------------------------
        logger.info(
            "┌─ Stage 4: Cleanup ───────────────────────────────────────────────────"
        )
        cleanup_start_time = time.time()
        self.state_manager.clean_up(task)
        cleanup_time = time.time() - cleanup_start_time
        logger.info(f"└─ Completed in {self._format_duration(cleanup_time)}\n")

        # Calculate total task execution time
        task_total_time = time.time() - task_start_time

        # Add timing information to the result
        result.agent_execution_time = agent_execution_time
        result.task_execution_time = task_total_time
        return result

    def run_evaluation(self, task_filter: str) -> EvaluationReport:
        """
        Runs the full evaluation for the specified tasks.
        """
        tasks = self.task_manager.filter_tasks(task_filter)
        results = []
        for task in tasks:
            # --------------------------------------------------------------
            # Resume check
            # --------------------------------------------------------------
            existing_result = self._load_latest_task_result(task)

            # Decide whether to skip or retry this task
            retry_due_to_error = (
                existing_result is not None
                and not existing_result.success
                and is_retryable_error(existing_result.error_message)
            )

            if existing_result and not retry_due_to_error:
                # Existing result is either successful or failed with a non-retryable error – skip.
                logger.info(
                    "↩️ Skipping already-completed task (resume): %s", task.name
                )
                results.append(existing_result)
                continue

            if retry_due_to_error:
                # Clean previous artifacts so that new results fully replace them.
                task_output_dir = self._get_task_output_dir(task)
                if task_output_dir.exists():
                    shutil.rmtree(task_output_dir)
                logger.info(
                    "🔄 Retrying task due to pipeline error (%s): %s",
                    existing_result.error_message,
                    task.name,
                )

            # --------------------------------------------------------------
            # Execute new task
            # --------------------------------------------------------------
            task_start = time.time()
            task_result = self._run_single_task(task)
            task_end = time.time()
            results.append(task_result)

            # Prepare directory & save
            task_output_dir = self._get_task_output_dir(task)
            task_output_dir.mkdir(parents=True, exist_ok=True)

            # Save messages.json (conversation trajectory)
            messages_path = task_output_dir / "messages.json"
            if not messages_path.exists():  # skip if it was already written
                messages = (
                    task_result.model_output
                    if getattr(task_result, "model_output", None)
                    else []
                )
                self.results_reporter.save_messages_json(messages, messages_path)

            # Save meta.json (all other metadata)
            meta_path = task_output_dir / "meta.json"
            model_config = {
                "mcp_service": self.mcp_service,
                "model_name": self.model_name,
                "litellm_run_model_name": self.litellm_run_model_name,
                "reasoning_effort": self.reasoning_effort,
                "timeout": self.timeout,
                "agent_name": self.agent_name,
            }
            self.results_reporter.save_meta_json(
                task_result,
                model_config,
                datetime.fromtimestamp(task_start),
                datetime.fromtimestamp(task_end),
                meta_path,
            )

        # --------------------------------------------------------------
        # Aggregate results – combine current `results` with any previously
        # saved TaskResults that ALSO match the current task_filter.
# -------------------------------------------------------------- # Helper: determine if a TaskResult matches the filter string def _matches_filter(tr: TaskResult, flt: str) -> bool: if flt.lower() == "all": return True if "/" in flt: # specific task (category_id/task_id) category_id, task_id = flt.split("/", 1) return tr.category_id == category_id and str(tr.task_id) == task_id # category level return tr.category_id == flt # Pull existing reports from disk and merge existing_results = [ r for r in self._gather_all_task_results() if _matches_filter(r, task_filter) ] # Merge, giving preference to fresh `results` (avoids duplicates) merged: dict[str, TaskResult] = {r.task_name: r for r in existing_results} merged.update({r.task_name: r for r in results}) # overwrite with latest run final_results = list(merged.values()) aggregated_report = EvaluationReport( model_name=self.model_name, model_config={ "mcp_service": self.mcp_service, "model_name": self.model_name, "litellm_run_model_name": self.litellm_run_model_name, "reasoning_effort": self.reasoning_effort, "timeout": self.timeout, "agent_name": self.agent_name, }, total_tasks=len(final_results), successful_tasks=sum(1 for r in final_results if r.success), failed_tasks=sum(1 for r in final_results if not r.success), task_results=final_results, tasks_filter=task_filter, ) # Save model-level summary summary_path = self.base_experiment_dir / "summary.json" self.results_reporter.save_model_summary(aggregated_report, summary_path) logger.info( "\n============================================================" "\nResults Summary" "\n============================================================" ) logger.info( f"✓ Tasks passed: {aggregated_report.successful_tasks}/{aggregated_report.total_tasks} ({aggregated_report.success_rate:.1f}%)" ) logger.info(f"⏱ Total time: {aggregated_report.total_task_execution_time:.1f}s") return aggregated_report ================================================ FILE: src/factory.py ================================================ #!/usr/bin/env python3 """ MCP Service Factory for MCPMark ================================= This module provides a simplified factory pattern for creating service-specific managers with centralized configuration management. 
Features: - Dynamic service loading from definitions - Centralized configuration - Simplified service registration """ import importlib from dataclasses import dataclass from typing import Dict, Type from src.base.login_helper import BaseLoginHelper from src.base.state_manager import BaseStateManager from src.base.task_manager import BaseTaskManager from src.config.config_schema import ConfigRegistry from src.services import get_service_definition, get_supported_mcp_services @dataclass class ServiceComponents: """All components required for an MCP service.""" task_manager_class: Type[BaseTaskManager] state_manager_class: Type[BaseStateManager] login_helper_class: Type[BaseLoginHelper] config_mapping: Dict[str, Dict[str, str]] def import_class(module_path: str): """Dynamically import a class from module path string.""" if not module_path: return None module_name, class_name = module_path.rsplit(".", 1) module = importlib.import_module(module_name) return getattr(module, class_name) def apply_config_mapping(config: dict, mapping: dict) -> dict: """Apply config mapping to transform config keys to constructor params.""" if not mapping: return {} result = {} for param_name, config_key in mapping.items(): if config_key in config: result[param_name] = config[config_key] return result class ServiceRegistry: """Central registry that loads MCP services from definitions.""" # Cache for loaded components _components_cache: Dict[str, ServiceComponents] = {} @classmethod def get_components(cls, service_name: str) -> ServiceComponents: """Get MCP service components from definition.""" if service_name in cls._components_cache: return cls._components_cache[service_name] definition = get_service_definition(service_name) # Import classes dynamically components = ServiceComponents( task_manager_class=import_class(definition["components"]["task_manager"]), state_manager_class=import_class(definition["components"]["state_manager"]), login_helper_class=import_class(definition["components"]["login_helper"]), config_mapping=definition.get("config_mapping", {}), ) cls._components_cache[service_name] = components return components class GenericServiceFactory: """Generic factory that works with any MCP service.""" def __init__(self, components: ServiceComponents, service_name: str): self.components = components self.service_name = service_name def create_task_manager(self, **kwargs) -> BaseTaskManager: """Create task manager instance.""" return self.components.task_manager_class(**kwargs) def create_state_manager(self, config) -> BaseStateManager: """Create state manager with config mapping.""" mapping = self.components.config_mapping.get("state_manager", {}) # Handle both dict and config schema objects config_dict = config.get_all() if hasattr(config, "get_all") else config kwargs = apply_config_mapping(config_dict, mapping) return self.components.state_manager_class(**kwargs) def create_login_helper(self, config) -> BaseLoginHelper: """Create login helper with config mapping.""" mapping = self.components.config_mapping.get("login_helper", {}) # Handle both dict and config schema objects config_dict = config.get_all() if hasattr(config, "get_all") else config kwargs = apply_config_mapping(config_dict, mapping) # Special handling for GitHub login helper - it needs a single token if self.service_name == "github" and "token" in kwargs: tokens_list = kwargs["token"] if isinstance(tokens_list, list) and tokens_list: kwargs["token"] = tokens_list[0] # Use first token for login helper return 
self.components.login_helper_class(**kwargs) class MCPServiceFactory: """Main factory interface.""" @classmethod def create_service_config(cls, service_name: str): """Create MCP service configuration (backward compatible).""" config = ConfigRegistry.get_config(service_name) # Create a backward-compatible ServiceConfig-like object class ServiceConfigCompat: def __init__(self, service_name: str, config_dict: dict): self.service_name = service_name self.config = config_dict self.api_key = config_dict.get("api_key") return ServiceConfigCompat(service_name, config.get_all()) @classmethod def create_task_manager(cls, service_name: str, **kwargs) -> BaseTaskManager: """Create task manager for the specified MCP service.""" components = ServiceRegistry.get_components(service_name) return components.task_manager_class(**kwargs) @classmethod def create_state_manager(cls, service_name: str, **kwargs) -> BaseStateManager: """Create state manager for the specified MCP service.""" components = ServiceRegistry.get_components(service_name) config = ConfigRegistry.get_config(service_name).get_all() # Use provided kwargs or apply config mapping if not kwargs: mapping = components.config_mapping.get("state_manager", {}) kwargs = apply_config_mapping(config, mapping) return components.state_manager_class(**kwargs) @classmethod def create_login_helper(cls, service_name: str, **kwargs) -> BaseLoginHelper: """Create login helper for the specified MCP service.""" components = ServiceRegistry.get_components(service_name) config = ConfigRegistry.get_config(service_name).get_all() # Use provided kwargs or apply config mapping if not kwargs: mapping = components.config_mapping.get("login_helper", {}) kwargs = apply_config_mapping(config, mapping) # Special handling for GitHub login helper - it needs a single token if service_name == "github" and "token" in kwargs: tokens_list = kwargs["token"] if isinstance(tokens_list, list) and tokens_list: kwargs["token"] = tokens_list[0] # Use first token for login helper return components.login_helper_class(**kwargs) @classmethod def get_supported_mcp_services(cls) -> list: """Get list of supported MCP services.""" return get_supported_mcp_services() @classmethod def get_config_info(cls, service_name: str) -> dict: """Get detailed configuration information for debugging.""" config = ConfigRegistry.get_config(service_name) return config.get_debug_info() @classmethod def export_config_template(cls, service_name: str, output_path: str) -> None: """Export a configuration template for an MCP service.""" from pathlib import Path ConfigRegistry.export_template(service_name, Path(output_path)) ================================================ FILE: src/logger.py ================================================ #!/usr/bin/env python3 """Logger configuration for MCPMark.""" import logging import sys def get_logger(name: str) -> logging.Logger: """Get a configured logger instance.""" logger = logging.getLogger(name) if not logger.handlers: handler = logging.StreamHandler(sys.stdout) formatter = logging.Formatter("%(message)s") handler.setFormatter(formatter) logger.addHandler(handler) logger.setLevel(logging.INFO) return logger ================================================ FILE: src/mcp_services/filesystem/__init__.py ================================================ """ Filesystem MCP Service for MCPMark =================================== This module provides filesystem-specific MCP server integration for MCPMark evaluation. 
Uses the official filesystem MCP server for local file operations. """ from .filesystem_login_helper import FilesystemLoginHelper from .filesystem_state_manager import FilesystemStateManager from .filesystem_task_manager import FilesystemTaskManager, FilesystemTask __all__ = [ "FilesystemLoginHelper", "FilesystemStateManager", "FilesystemTaskManager", "FilesystemTask", ] ================================================ FILE: src/mcp_services/filesystem/filesystem_login_helper.py ================================================ """ Filesystem Login Helper for MCPMark ==================================== This module provides a minimal login helper for the filesystem MCP service. Since filesystem operations don't require authentication, this is a simple pass-through implementation that satisfies the interface requirements. """ from pathlib import Path from typing import Optional from src.base.login_helper import BaseLoginHelper from src.logger import get_logger logger = get_logger(__name__) class FilesystemLoginHelper(BaseLoginHelper): """ Login helper for filesystem MCP service. The filesystem MCP server doesn't require authentication, so this implementation simply returns success for all login operations. """ def __init__(self, state_path: Optional[Path] = None): """ Initialize the filesystem login helper. Args: state_path: Path to save state (not used for filesystem) """ super().__init__() self.state_path = ( state_path or Path.home() / ".mcpmark" / "filesystem_state.json" ) logger.info("Initialized FilesystemLoginHelper (no auth required)") def login(self, **kwargs) -> bool: """ Perform login operation. Since filesystem doesn't require authentication, this always returns True. Returns: bool: Always True for filesystem service """ logger.info("Filesystem service does not require authentication") return True def is_authenticated(self) -> bool: """ Check if authenticated. Returns: bool: Always True for filesystem service """ return True def get_credentials(self) -> dict: """ Get credentials for the service. Returns: dict: Empty dict as no credentials needed """ return {} ================================================ FILE: src/mcp_services/filesystem/filesystem_state_manager.py ================================================ """ Filesystem State Manager for MCPMark ===================================== This module handles filesystem state management for consistent task evaluation. It manages test directories, file creation/cleanup, and environment isolation. """ import os import shutil import tempfile from pathlib import Path from typing import Any, Dict, List, Optional from src.base.state_manager import BaseStateManager from src.base.task_manager import BaseTask from src.logger import get_logger logger = get_logger(__name__) class FilesystemStateManager(BaseStateManager): """ Manages filesystem state for task evaluation. This includes creating isolated test directories, tracking created resources, and cleaning up after task completion. """ def _get_project_root(self) -> Path: """Find project root by looking for marker files.""" current = Path(__file__).resolve() # Look for project root markers for parent in current.parents: if (parent / "pyproject.toml").exists() or (parent / "pipeline.py").exists(): return parent # Fallback to old method if markers not found return Path(__file__).parent / "../../../" def __init__(self, test_root: Optional[Path] = None, cleanup_on_exit: bool = False): """ Initialize filesystem state manager. 
Args: test_root: Root directory for test operations (from FILESYSTEM_TEST_ROOT env var) cleanup_on_exit: Whether to clean up test directories after tasks (default False for persistent environment) """ super().__init__(service_name="filesystem") # Use provided test root or default to persistent test environment if test_root: self.test_root = Path(test_root) else: # Default to persistent test environment project_root = self._get_project_root() self.test_root = (project_root / "test_environments/desktop").resolve() self.cleanup_on_exit = cleanup_on_exit self.current_task_dir: Optional[Path] = None self.created_resources: List[Path] = [] # Backup and restore functionality self.backup_dir: Optional[Path] = None self.backup_enabled = ( True # Enable backup/restore by default for task isolation ) logger.info( f"Initialized FilesystemStateManager with persistent test environment: {self.test_root}" ) def initialize(self, **kwargs) -> bool: """ Initialize the filesystem environment. Ensures the persistent test environment exists and is accessible. Returns: bool: True if initialization successful """ try: # Ensure test environment directory exists if not self.test_root.exists(): logger.error(f"Persistent test environment not found: {self.test_root}") logger.error( "Please ensure test_environments/desktop/ exists in the repository" ) return False logger.info(f"Using persistent test environment: {self.test_root}") # Verify we can write to the directory test_file = self.test_root / ".mcpbench_test" test_file.write_text("test") test_file.unlink() return True except Exception as e: logger.error(f"Failed to initialize filesystem environment: {e}") return False def set_up(self, task: BaseTask) -> bool: """ Set up filesystem environment for a specific task. Creates a backup of the current environment, then uses the backup as the working directory to keep the original unchanged. Args: task: The task for which to set up the state Returns: bool: True if setup successful """ try: # Dynamically set test root based on task category self._set_dynamic_test_root(task) # Create backup of current test environment before task execution if self.backup_enabled: if not self._create_backup(task): logger.error(f"Failed to create backup for task {task.name}") return False # Use the backup directory as the working directory instead of the original self.current_task_dir = ( self.backup_dir ) # Use backup directory for operations logger.info( f"| ✓ Using the backup environment for operations" ) # Store the test directory path in the task object for use by task manager if hasattr(task, "__dict__"): task.test_directory = str(self.current_task_dir) # Set environment variable for verification scripts and MCP server os.environ["FILESYSTEM_TEST_DIR"] = str(self.current_task_dir) return True except Exception as e: logger.error(f"Failed to set up filesystem state for {task.name}: {e}") return False def _set_dynamic_test_root(self, task: BaseTask) -> None: """ Dynamically set the test root directory based on the task category. 
Args: task: The task for which to set the test root """ # Get the base test environments directory from environment variable base_test_root = os.getenv("FILESYSTEM_TEST_ROOT") if not base_test_root: # Fallback to default path project_root = self._get_project_root() base_test_root = str(project_root / "test_environments") base_test_path = Path(base_test_root) # If task has a category_id, append it to the base path if task.category_id: self.test_root = base_test_path / task.category_id # Store the current task category for URL selection self._current_task_category = task.category_id logger.info(f"| ✓ Setting test root to category-specific directory: {self.test_root}") else: # Use the base test environments directory self.test_root = base_test_path # For base directory, use 'desktop' as default category self._current_task_category = 'desktop' logger.info(f"| Setting test root to base directory: {self.test_root}") # Ensure the directory exists by downloading and extracting if needed if not self.test_root.exists(): logger.warning(f"| Test directory does not exist: {self.test_root}") if not self._download_and_extract_test_environment(): logger.error(f"Failed to download and extract test environment for: {self.test_root}") raise RuntimeError(f"Test environment not available: {self.test_root}") logger.info(f"| Downloaded and extracted test environment: {self.test_root}") def clean_up(self, task: Optional[BaseTask] = None, **kwargs) -> bool: """ Clean up filesystem resources created during task execution. Since we operate on the backup directory, we just need to clean up the backup. Args: task: The task to clean up after (optional) **kwargs: Additional cleanup options Returns: bool: True if cleanup successful """ try: cleanup_success = True # Clean up the backup directory since we operated on it if self.backup_enabled and self.backup_dir and self.backup_dir.exists(): try: shutil.rmtree(self.backup_dir) logger.info( f"| ✓ Cleaned up backup directory for task {task.name if task else 'unknown'}" ) self.backup_dir = None except Exception as e: logger.error(f"Failed to clean up backup directory: {e}") cleanup_success = False else: logger.info("No backup directory to clean up") # Clear the resources list self.created_resources.clear() return cleanup_success except Exception as e: logger.error(f"Filesystem cleanup failed: {e}") return False def get_test_directory(self) -> Optional[Path]: """ Get the current test directory path. Returns: Path to the current test directory, or None if not set up """ return self.current_task_dir def get_service_config_for_agent(self) -> dict: """ Get service-specific configuration for agent execution. Returns: Dictionary containing configuration needed by the agent/MCP server """ service_config = {} # Add test directory if available if self.current_task_dir: service_config["test_directory"] = str(self.current_task_dir) return service_config def track_resource(self, resource_path: Path): """ Track a resource for cleanup. Args: resource_path: Path to the resource to track """ if resource_path not in self.created_resources: self.created_resources.append(resource_path) logger.debug(f"Tracking resource for cleanup: {resource_path}") def reset_test_environment(self) -> bool: """ Reset the test environment to its original state. This method can be used for development/debugging purposes. In normal operation, the persistent environment is maintained. 
Returns: bool: True if reset successful """ try: # Remove any sorting directories that might have been created sorting_dirs = ["has_test", "no_test", "organized", "backup"] for dir_name in sorting_dirs: dir_path = self.test_root / dir_name if dir_path.exists(): shutil.rmtree(dir_path) logger.info(f"Removed sorting directory: {dir_path}") # Remove any temporary files that might have been created temp_files = ["hello_world.txt", "new_file.txt", "temp.txt"] for file_name in temp_files: file_path = self.test_root / file_name if file_path.exists(): file_path.unlink() logger.info(f"Removed temporary file: {file_path}") logger.info("Test environment reset completed") return True except Exception as e: logger.error(f"Test environment reset failed: {e}") return False # ========================================================================= # Backup and Restore Methods for Task Isolation # ========================================================================= def _create_backup(self, task: BaseTask) -> bool: """ Create a complete backup of the test environment before task execution. Args: task: The task for which to create backup Returns: bool: True if backup successful """ try: # Create backup directory with task-specific name project_root = self._get_project_root() backup_root = (project_root / ".mcpmark_backups").resolve() backup_root.mkdir(exist_ok=True) task_id = f"{task.service}_{task.category_id}_{task.task_id}" self.backup_dir = backup_root / f"backup_{task_id}_{os.getpid()}" # Remove existing backup if it exists if self.backup_dir.exists(): shutil.rmtree(self.backup_dir) # Create fresh backup by copying entire test environment shutil.copytree(self.test_root, self.backup_dir) logger.info(f"| ✓ Created backup for task {task.name}: {self.backup_dir}") return True except Exception as e: logger.error(f"Failed to create backup for task {task.name}: {e}") return False def _restore_from_backup(self, task: Optional[BaseTask] = None) -> bool: """ Restore the test environment from backup. Args: task: The task to restore after (optional, for logging) Returns: bool: True if restore successful """ try: if not self.backup_dir or not self.backup_dir.exists(): logger.error("No backup directory available for restore") return False # Remove current test environment if self.test_root.exists(): shutil.rmtree(self.test_root) # Restore from backup shutil.copytree(self.backup_dir, self.test_root) # Clean up backup directory shutil.rmtree(self.backup_dir) self.backup_dir = None task_name = task.name if task else "unknown" logger.info( f"✅ Restored test environment from backup after task {task_name}" ) return True except Exception as e: task_name = task.name if task else "unknown" logger.error(f"Failed to restore from backup after task {task_name}: {e}") return False # ========================================================================= # Abstract Method Implementations Required by BaseStateManager # ========================================================================= def _create_initial_state(self, task: BaseTask) -> Optional[Dict[str, Any]]: """Create initial state for a task. For filesystem, this is handled in set_up() method by creating task directories. Returns the task directory path as state info. """ if self.current_task_dir and self.current_task_dir.exists(): return {"task_directory": str(self.current_task_dir)} return None def _store_initial_state_info( self, task: BaseTask, state_info: Dict[str, Any] ) -> None: """Store initial state information in the task object. 
For filesystem, we store the test directory path. """ if state_info and "task_directory" in state_info: if hasattr(task, "__dict__"): task.test_directory = state_info["task_directory"] def _cleanup_task_initial_state(self, task: BaseTask) -> bool: """Clean up initial state for a specific task. For filesystem, this means removing the task directory. """ if hasattr(task, "test_directory") and task.test_directory: task_dir = Path(task.test_directory) if task_dir.exists(): try: shutil.rmtree(task_dir) logger.info(f"Cleaned up task directory: {task_dir}") return True except Exception as e: logger.error(f"Failed to clean up task directory: {e}") return False return True def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool: """Clean up a single tracked resource. For filesystem, resources are paths to files/directories. """ if "path" in resource: resource_path = Path(resource["path"]) if resource_path.exists(): try: if resource_path.is_dir(): shutil.rmtree(resource_path) else: resource_path.unlink() logger.info(f"Cleaned up resource: {resource_path}") return True except Exception as e: logger.error(f"Failed to clean up {resource_path}: {e}") return False return True def _download_and_extract_test_environment(self) -> bool: """ Download and extract test environment using wget and unzip commands. This approach preserves original file timestamps and is simpler than Python zipfile. Returns: bool: True if download and extraction successful """ try: import subprocess import sys # Define URL mapping for different test environment categories url_mapping = { 'desktop': 'https://storage.mcpmark.ai/filesystem/desktop.zip', 'file_context': 'https://storage.mcpmark.ai/filesystem/file_context.zip', 'file_property': 'https://storage.mcpmark.ai/filesystem/file_property.zip', 'folder_structure': 'https://storage.mcpmark.ai/filesystem/folder_structure.zip', 'papers': 'https://storage.mcpmark.ai/filesystem/papers.zip', 'student_database': 'https://storage.mcpmark.ai/filesystem/student_database.zip', 'threestudio': 'https://storage.mcpmark.ai/filesystem/threestudio.zip', 'votenet': 'https://storage.mcpmark.ai/filesystem/votenet.zip', 'legal_document': 'https://storage.mcpmark.ai/filesystem/legal_document.zip', 'desktop_template': 'https://storage.mcpmark.ai/filesystem/desktop_template.zip' } # Get the category from the current task context category = getattr(self, '_current_task_category', None) if not category: logger.error("| No task category available for URL selection") return False # Select the appropriate URL based on category if category in url_mapping: test_env_url = url_mapping[category] logger.info(f"| ○ Selected URL for category '{category}': {test_env_url}") else: logger.error(f"| No URL mapping found for category: {category}") return False # Allow override via environment variable test_env_url = os.getenv('TEST_ENVIRONMENT_URL', test_env_url) logger.info(f"| ○ Downloading test environment from: {test_env_url}") # Create a temporary directory for the download with tempfile.TemporaryDirectory() as temp_dir: temp_path = Path(temp_dir) zip_path = temp_path / "test_environment.zip" # Step 1: Download using wget logger.info("| ○ Downloading test environment zip file...") try: # Use wget if available, otherwise fall back to curl if sys.platform == "win32": # Windows: try wget, fall back to curl try: result = subprocess.run( ["wget", "-O", str(zip_path), test_env_url], capture_output=True, text=True, check=True ) except (subprocess.CalledProcessError, FileNotFoundError): # Fall back to curl 
result = subprocess.run( ["curl", "-L", "-o", str(zip_path), test_env_url], capture_output=True, text=True, check=True ) else: # Unix-like systems: try wget, fall back to curl try: result = subprocess.run( ["wget", "-O", str(zip_path), test_env_url], capture_output=True, text=True, check=True ) except (subprocess.CalledProcessError, FileNotFoundError): # Fall back to curl result = subprocess.run( ["curl", "-L", "-o", str(zip_path), test_env_url], capture_output=True, text=True, check=True ) logger.info("| ✓ Download completed successfully") except Exception as e: logger.error(f"| Download failed: {e}") return False # Step 2: Extract using unzip logger.info("| ○ Extracting test environment...") try: # Extract to parent directory to maintain expected structure result = subprocess.run( ["unzip", "-o", str(zip_path), "-d", str(self.test_root.parent)], capture_output=True, text=True, check=True ) logger.info("| ✓ Extraction completed successfully") except Exception as e: logger.error(f"| Extraction failed: {e}") return False # Step 3: Remove __MACOSX folder if it exists logger.info("| ○ Cleaning up macOS metadata...") macosx_path = self.test_root.parent / "__MACOSX" if macosx_path.exists(): try: shutil.rmtree(macosx_path) logger.info("| ✓ Removed __MACOSX folder") except Exception as e: logger.warning(f"| Failed to remove __MACOSX folder: {e}") # Verify the extracted directory exists if not self.test_root.exists(): logger.error(f"| Extracted directory not found at expected path: {self.test_root}") return False logger.info(f"| ✓ Successfully downloaded and extracted test environment to: {self.test_root}") return True except Exception as e: logger.error(f"| Failed to download and extract test environment: {e}") return False ================================================ FILE: src/mcp_services/filesystem/filesystem_task_manager.py ================================================ """ Simplified Filesystem Task Manager using Enhanced Base Class ============================================================ This module shows how the filesystem task manager can be simplified using the enhanced base task manager. 
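Illustrative usage (a sketch only; ``file_property`` is one of the filesystem categories referenced elsewhere in this service, and the task name shown is hypothetical):

    manager = FilesystemTaskManager()
    category_tasks = manager.filter_tasks("file_property")
    single_task = manager.filter_tasks("file_property/some_task_name")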
""" import os import subprocess from dataclasses import dataclass from pathlib import Path from typing import List, Optional, Dict, Any from src.base.task_manager import BaseTask, BaseTaskManager from src.logger import get_logger logger = get_logger(__name__) @dataclass class FilesystemTask(BaseTask): """Filesystem-specific task with additional fields.""" test_directory: Optional[str] = None expected_files: Optional[List[str]] = None expected_directories: Optional[List[str]] = None class FilesystemTaskManager(BaseTaskManager): """Simplified filesystem task manager using enhanced base class.""" def __init__(self, tasks_root: Path = None, task_suite: str = "standard"): """Initialize filesystem task manager.""" if tasks_root is None: tasks_root = Path(__file__).resolve().parents[3] / "tasks" super().__init__( tasks_root, mcp_service="filesystem", task_class=FilesystemTask, task_organization="directory", task_suite=task_suite, ) # Override only what's needed for filesystem-specific behavior def _create_task_from_files( self, category_id: str, task_files_info: Dict[str, Any] ) -> BaseTask: """Instantiate a `BaseTask` from the dictionary returned by `_find_task_files`.""" import json # Support arbitrary task names, not just task_n format task_name = task_files_info["task_id"] # Use task_name as default task_id task_id = task_name # Check for meta.json meta_path = task_files_info["instruction_path"].parent / "meta.json" final_category_id = category_id if meta_path.exists(): try: with open(meta_path, 'r') as f: meta_data = json.load(f) # Use values from meta.json if available final_category_id = meta_data.get("category_id", category_id) task_id = meta_data.get("task_id", task_id) except Exception as e: logger.warning(f"Failed to load meta.json from {meta_path}: {e}") return self.task_class( task_instruction_path=task_files_info["instruction_path"], task_verification_path=task_files_info["verification_path"], service="filesystem", category_id=final_category_id, task_id=task_id, ) def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess: """Run verification with filesystem-specific environment.""" env = os.environ.copy() # Pass test directory to verification script # Priority: task.test_directory (set by state manager) > environment variable test_dir = None if hasattr(task, "test_directory") and task.test_directory: test_dir = task.test_directory else: test_dir = os.getenv("FILESYSTEM_TEST_DIR") if test_dir: env["FILESYSTEM_TEST_DIR"] = test_dir logger.debug(f"Setting FILESYSTEM_TEST_DIR to: {test_dir}") return subprocess.run( self._get_verification_command(task), capture_output=True, text=True, timeout=300, env=env, ) def filter_tasks(self, task_filter: str) -> List[BaseTask]: """Filter tasks based on category or specific task pattern with support for arbitrary task names.""" all_tasks = self.discover_all_tasks() if not task_filter or task_filter.lower() == "all": return all_tasks # Check if it's a category filter categories = self.get_categories() if task_filter in categories: return [task for task in all_tasks if task.category_id == task_filter] # Check for specific task pattern (category_id/task_X or category_id/arbitrary_name) if "/" in task_filter: try: category_id, task_id = task_filter.split("/", 1) # Direct string matching for task_id for task in all_tasks: if task.category_id == category_id and str(task.task_id) == task_id: return [task] except (ValueError, IndexError): pass # Fallback: check for partial matches in task names or categories filtered_tasks = [] for task in 
all_tasks: if ( task_filter in task.category_id or task_filter in task.name or task_filter == str(task.task_id) ): filtered_tasks.append(task) return filtered_tasks ================================================ FILE: src/mcp_services/github/__init__.py ================================================ """ GitHub MCP Service for MCPMark =============================== This module provides GitHub-specific MCP server integration for MCPMark evaluation. Uses GitHub's official remote MCP server for streamable HTTP/SSE communication. Updated to include initial state-based environment replication mechanism. """ from .github_login_helper import GitHubLoginHelper from .github_task_manager import GitHubTaskManager, GitHubTask from .github_state_manager import GitHubStateManager __all__ = ["GitHubLoginHelper", "GitHubTaskManager", "GitHubTask", "GitHubStateManager"] ================================================ FILE: src/mcp_services/github/github_login_helper.py ================================================ """ GitHub Login Helper for MCPMark ================================ This module provides GitHub token authentication and validation utilities. Unlike browser-based services, GitHub uses token-based authentication. """ import json import requests from pathlib import Path from typing import Optional, Dict, Any from src.base.login_helper import BaseLoginHelper from src.logger import get_logger logger = get_logger(__name__) class GitHubLoginHelper(BaseLoginHelper): """ Utility helper for GitHub token authentication and validation. """ def __init__( self, token: Optional[str] = None, state_path: Optional[Path] = None, ) -> None: """ Initialize the GitHub login helper. Args: token: GitHub Personal Access Token state_path: Path to save authentication state """ self.token = token self.state_path = state_path or Path.home() / ".mcpmark" / "github_auth.json" # Ensure state directory exists self.state_path.parent.mkdir(parents=True, exist_ok=True) def login_and_save_state(self, **kwargs) -> bool: """ Validate GitHub token and save authentication state. 
Returns: bool: True if authentication successful, False otherwise """ if not self.token: logger.error("No GitHub token provided") return False try: # Validate token by making an authenticated request session = requests.Session() session.headers.update( { "Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github.v3+json", "X-GitHub-Api-Version": "2022-11-28", "User-Agent": "MCPMark/1.0", } ) # Get user information response = session.get("https://api.github.com/user") if response.status_code != 200: logger.error( f"GitHub authentication failed: {response.status_code} {response.text}" ) return False user_info = response.json() logger.info( f"GitHub authentication successful for user: {user_info['login']}" ) # Get token scopes token_scopes = self._get_token_scopes(session) # Save authentication state auth_state = { "user": user_info, "token_scopes": token_scopes, "authenticated_at": self._get_current_timestamp(), } self._save_auth_state(auth_state) # Verify required permissions if not self._verify_required_permissions(token_scopes): logger.warning("GitHub token may not have all required permissions") return False return True except Exception as e: logger.error(f"GitHub authentication error: {e}") return False def _get_token_scopes(self, session: requests.Session) -> list: """Get the scopes available to the current token.""" try: response = session.get("https://api.github.com/user") scopes_header = response.headers.get("X-OAuth-Scopes", "") if scopes_header: return [ scope.strip() for scope in scopes_header.split(",") if scope.strip() ] return [] except Exception as e: logger.warning(f"Could not determine token scopes: {e}") return [] def _verify_required_permissions(self, scopes: list) -> bool: """ Verify that the token has the minimum required permissions. For MCPMark GitHub tasks, we typically need: - repo (for repository access) - read:user (for user information) """ required_scopes = ["repo"] # Minimum requirement recommended_scopes = ["repo", "read:user", "read:org"] has_required = all(scope in scopes for scope in required_scopes) if not has_required: logger.error( f"Token missing required scopes. Required: {required_scopes}, Available: {scopes}" ) return False has_recommended = all(scope in scopes for scope in recommended_scopes) if not has_recommended: logger.warning( f"Token missing some recommended scopes. 
Recommended: {recommended_scopes}, Available: {scopes}" ) return True def _save_auth_state(self, auth_state: Dict[str, Any]): """Save authentication state to local file.""" try: with open(self.state_path, "w") as f: json.dump(auth_state, f, indent=2, default=str) # Set restrictive permissions (user read/write only) self.state_path.chmod(0o600) logger.info(f"Authentication state saved to: {self.state_path}") except Exception as e: logger.error(f"Failed to save authentication state: {e}") def _get_current_timestamp(self) -> str: """Get current timestamp in ISO format.""" from datetime import datetime return datetime.utcnow().isoformat() + "Z" def get_saved_auth_state(self) -> Optional[Dict[str, Any]]: """Load and return saved authentication state.""" try: if self.state_path.exists(): with open(self.state_path, "r") as f: return json.load(f) except Exception as e: logger.error(f"Failed to load authentication state: {e}") return None def is_token_valid(self) -> bool: """Check if the current token is still valid.""" if not self.token: return False try: session = requests.Session() session.headers.update( { "Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github.v3+json", } ) response = session.get("https://api.github.com/user") return response.status_code == 200 except Exception: return False def get_rate_limit_info(self) -> Dict[str, Any]: """Get current rate limit information for the token.""" if not self.token: return {} try: session = requests.Session() session.headers.update( { "Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github.v3+json", } ) response = session.get("https://api.github.com/rate_limit") if response.status_code == 200: return response.json() except Exception as e: logger.warning(f"Failed to get rate limit info: {e}") return {} def test_repository_access(self, owner: str, repo: str) -> bool: """Test if the token has access to a specific repository.""" if not self.token: return False try: session = requests.Session() session.headers.update( { "Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github.v3+json", } ) response = session.get(f"https://api.github.com/repos/{owner}/{repo}") return response.status_code == 200 except Exception: return False ================================================ FILE: src/mcp_services/github/github_state_manager.py ================================================ """ GitHub State Manager for MCPMark ================================= This module handles GitHub repository state management for consistent task evaluation. Manages test repositories, branches, and cleanup after evaluation. """ import requests from typing import Optional, List, Union from pathlib import Path from src.base.state_manager import BaseStateManager, InitialStateInfo from src.base.task_manager import BaseTask from src.logger import get_logger from src.mcp_services.github.token_pool import GitHubTokenPool logger = get_logger(__name__) class GitHubStateManager(BaseStateManager): """ Manages GitHub repository state for task evaluation. """ def __init__( self, github_token: Union[str, List[str]], # Name of the evaluation organisation / user where temporary test repositories are created eval_org: str = "mcpmark-eval", # Local directory that stores *exported* repository templates (produced by repo_exporter.py) templates_root: str = "./github_state", ): """ Initialize GitHub state manager. Args: github_token: GitHub Personal Access Token(s). Can be a single token string or a list of tokens for round-robin usage. 
eval_org: Organisation / user used to host **ephemeral evaluation repositories**. """ super().__init__(service_name="github") # Track repos created via template import so we can delete them afterwards self._repos_to_cleanup: list[tuple[str, str]] = [] # (owner, repo_name) # Initialize token pool if isinstance(github_token, str): # Single token - create pool with one token self.token_pool = GitHubTokenPool([github_token]) self.github_token = github_token # Keep for backward compatibility else: # Multiple tokens - use token pool self.token_pool = GitHubTokenPool(github_token) self.github_token = ( self.token_pool.get_current_token() ) # For backward compatibility # Store evaluation context (consistent naming) self.eval_org = eval_org # evaluation organisation / user # Local path that contains exported repository templates self.templates_root = Path(templates_root).expanduser().resolve() # Set up HTTP session for GitHub API self.session = requests.Session() # Note: We'll update the Authorization header before each request self.session.headers.update( { "Accept": "application/vnd.github.v3+json", "X-GitHub-Api-Version": "2022-11-28", "User-Agent": "MCPMark/1.0", } ) # Validate GitHub configuration during initialization try: # Set initial token for validation self._update_session_token() response = self.session.get("https://api.github.com/user") if response.status_code != 200: raise ValueError( f"Invalid GitHub token: {response.status_code} {response.text}" ) user_info = response.json() logger.info(f"GitHub authenticated as: {user_info['login']}") logger.info(f"Using token pool with {self.token_pool.pool_size} token(s)") # Check if evaluation organisation exists (optional) if self.eval_org: org_response = self.session.get( f"https://api.github.com/orgs/{self.eval_org}" ) if org_response.status_code == 200: logger.info(f"Using evaluation organisation: {self.eval_org}") else: logger.warning( f"Evaluation organisation {self.eval_org} not accessible, using user account" ) # Fall back to user account self.eval_org = user_info["login"] logger.info("GitHub state manager initialized successfully") except Exception as e: raise RuntimeError(f"GitHub initialization failed: {e}") # Initial state mapping - categories to initial state repositories self.initial_state_mapping = { "build_your_own_x": "codecrafters-io-build-your-own-x", "missing-semester": "missing-semester-missing-semester", "mcpmark-cicd": "zjwu0522-mcpmark-cicd", "harmony": "openai-harmony", "claude-code": "anthropics-claude-code", "easyr1": "hiyouga-EasyR1", } # CDN URL mapping for downloading GitHub templates self.github_template_url_mapping = { "codecrafters-io-build-your-own-x": "https://storage.mcpmark.ai/github/codecrafters-io-build-your-own-x.zip", "missing-semester-missing-semester": "https://storage.mcpmark.ai/github/missing-semester-missing-semester.zip", "zjwu0522-mcpmark-cicd": "https://storage.mcpmark.ai/github/zjwu0522-mcpmark-cicd.zip", "openai-harmony": "https://storage.mcpmark.ai/github/openai-harmony.zip", "anthropics-claude-code": "https://storage.mcpmark.ai/github/anthropics-claude-code.zip", "hiyouga-EasyR1": "https://storage.mcpmark.ai/github/hiyouga-EasyR1.zip", } # ========================================================================= # Core Template Methods (Required by BaseStateManager) # ========================================================================= # --------------------------------------------------------------------- # Internal helper – template importer (replicates repo_importer logic) # 
--------------------------------------------------------------------- def _import_template_repo( self, template_dir: Path, owner: str, private: bool = True ) -> str: """Import repository from local template directory to GitHub (simplified).""" import json import subprocess import time # ------------------------------------------------------------------ # Helper functions (stripped-down versions of repo_importer utilities) # ------------------------------------------------------------------ def _list_refs(repo_dir: str) -> list[str]: result = subprocess.run( ["git", "-C", repo_dir, "for-each-ref", "--format=%(refname)"], check=True, capture_output=True, text=True, ) return result.stdout.strip().splitlines() def _push_repo( repo_path: Path, repo_owner: str, repo_name: str, required_refs: list[str] ): """Push repo to GitHub: try mirror, else per-ref.""" token = self.github_token dst_url = f"https://x-access-token:{token}@github.com/{repo_owner}/{repo_name}.git" try: subprocess.run( ["git", "-C", str(repo_path), "push", "--mirror", dst_url], check=True, capture_output=True, ) return except subprocess.CalledProcessError as err: logger.warning( "| [push] Mirror push failed – falling back: %s", err.stderr.decode(errors="ignore"), ) refs = required_refs or _list_refs(str(repo_path)) for ref in refs: for attempt in range(3): try: subprocess.run( [ "git", "-C", str(repo_path), "push", dst_url, f"{ref}:{ref}", ], check=True, capture_output=True, ) break except subprocess.CalledProcessError as ref_err: if attempt == 2: raise RuntimeError( f"Failed to push ref {ref}: {ref_err.stderr}" ) from ref_err time.sleep(2 * (attempt + 1)) # ------------------------------------------------------------------ # Phase 0 – read template metadata # ------------------------------------------------------------------ meta = json.loads((template_dir / "meta.json").read_text()) repo_name: str = meta["repo"] pr_head_refs = meta.get("pr_head_refs", []) default_branch = meta.get("default_branch", "main") pulls_data = json.loads((template_dir / "pulls.json").read_text()) fork_branches = [ pr["local_branch"] for pr in pulls_data if pr.get("is_from_fork") and "local_branch" in pr ] needed_refs = ( [f"refs/heads/{default_branch}"] + [f"refs/heads/{h}" for h in pr_head_refs] + [f"refs/heads/{b}" for b in fork_branches] ) # ------------------------------------------------------------------ # Phase 1 – create empty repo under owner # ------------------------------------------------------------------ create_payload = { "name": repo_name, "description": f"Restored template repo {repo_name}", "private": private, "auto_init": False, "has_issues": True, "has_projects": True, "has_wiki": False, "default_branch": default_branch, # Set the correct default branch } auth_user = self._get_authenticated_user() create_url = ( "https://api.github.com/user/repos" if owner == auth_user else f"https://api.github.com/orgs/{owner}/repos" ) resp = self._request_with_retry("POST", create_url, json=create_payload) if resp.status_code == 422 and "name already exists" in resp.text: # Attempt to delete and recreate self._delete_repository(owner, repo_name) resp = self._request_with_retry("POST", create_url, json=create_payload) if resp.status_code not in (200, 201): raise RuntimeError(f"Failed to create repo: {resp.status_code} {resp.text}") html_url = resp.json()["html_url"] logger.info("| [import] Target repository created: %s", html_url) # Safety check: Prevent importing to public repositories # Public repos would send @ mention notifications to real 
users, causing spam if not private: error_msg = ( "ERROR: Cannot import template to a public repository.\n\n" "Reason: The template contains @ mentions of real GitHub users from the original\n" "repository. Importing to a public repository would send notifications to these\n" "users, which is disruptive and inappropriate.\n\n" "Solution: Set private=True when calling _import_template_repo()." ) logger.error(error_msg) # Clean up the created repo before raising self._delete_repository(owner, repo_name) raise RuntimeError(error_msg) # Immediately disable GitHub Actions for ALL repositories to prevent any accidental triggers # We'll re-enable it later only for mcpmark-cicd logger.info( "| [import] Disabling GitHub Actions immediately after repo creation..." ) self._disable_github_actions(owner, repo_name) # ------------------------------------------------------------------ # Phase 2 – push git history # ------------------------------------------------------------------ repo_path = template_dir / "repo" logger.info("| [import] Pushing git history …") _push_repo(repo_path, owner, repo_name, needed_refs) # Remove .github directory after pushing with a new commit import shutil github_dir = repo_path / ".github" if github_dir.exists(): logger.info("| [import] Removing .github directory after push …") shutil.rmtree(github_dir) # Commit the deletion subprocess.run( ["git", "-C", str(repo_path), "add", "-A"], check=True, capture_output=True, ) subprocess.run( [ "git", "-C", str(repo_path), "commit", "-m", "Remove .github directory", ], capture_output=True, ) # Push the new commit token = self.github_token dst_url = ( f"https://x-access-token:{token}@github.com/{owner}/{repo_name}.git" ) subprocess.run( ["git", "-C", str(repo_path), "push", dst_url], check=True, capture_output=True, ) # ------------------------------------------------------------------ # Phase 3 – recreate issues & PRs # ------------------------------------------------------------------ def _create_comment(issue_number: int, body: str): self._request_with_retry( "POST", f"https://api.github.com/repos/{owner}/{repo_name}/issues/{issue_number}/comments", json={"body": body}, ) def _create_issue(item: dict) -> Optional[int]: data = { "title": item["title"], "body": self._obfuscate_mentions(item.get("body", "")), "labels": item.get("labels", []), } r = self._request_with_retry( "POST", f"https://api.github.com/repos/{owner}/{repo_name}/issues", json=data, ) if r.status_code not in (200, 201): return None new_no = r.json()["number"] if item.get("state") == "closed": self._request_with_retry( "PATCH", f"https://api.github.com/repos/{owner}/{repo_name}/issues/{new_no}", json={"state": "closed"}, ) return new_no def _create_pull(pr_itm: dict) -> Optional[int]: body = self._obfuscate_mentions(pr_itm.get("body", "")) if pr_itm.get("is_from_fork", False): fork_note = f"\n\n---\n_This PR was originally from a fork: **{pr_itm.get('fork_owner')}/{pr_itm.get('fork_repo')}** (branch: `{pr_itm['head']}`)_" body = body + fork_note if body else fork_note[2:] payload = { "title": pr_itm["title"], "body": body, "head": pr_itm.get("local_branch", pr_itm["head"]), "base": pr_itm["base"], } r = self._request_with_retry( "POST", f"https://api.github.com/repos/{owner}/{repo_name}/pulls", json=payload, ) if r.status_code not in (200, 201): return None return r.json()["number"] # Issues issues_data = json.loads((template_dir / "issues.json").read_text()) created_issues = 0 logger.info("| [phase] Re-creating issues …") for itm in issues_data: new_no = 
_create_issue(itm) if new_no: created_issues += 1 for c in itm.get("comments", []): _create_comment( new_no, self._obfuscate_mentions( f"*Original author: @{c['user']}*\n\n{c['body']}" ), ) logger.info( "| [phase] Created %d out of %d issues", created_issues, len(issues_data) ) # Pull requests logger.info("| [phase] Re-creating pull requests …") created_prs = 0 skipped_prs = 0 for pr in pulls_data: new_pr_no = _create_pull(pr) if new_pr_no: created_prs += 1 for c in pr.get("comments", []): _create_comment( new_pr_no, self._obfuscate_mentions( f"*Original author: @{c['user']}*\n\n{c['body']}" ), ) for rc in pr.get("review_comments", []): _create_comment( new_pr_no, self._obfuscate_mentions( f"*Original author: @{rc['user']}* (review)\n\n{rc['body']}" ), ) else: skipped_prs += 1 logger.info( "| [phase] Created %d PRs, skipped %d PRs", created_prs, skipped_prs ) # Re-enable GitHub Actions ONLY for mcpmark-cicd repository # All other repos remain disabled (as set immediately after creation) if "mcpmark-cicd" in template_dir.name: logger.info("| [import] Re-enabling GitHub Actions for CI/CD repository…") self._enable_github_actions(owner, repo_name) # Disable notifications to prevent email spam logger.info("| [import] Disabling repository notifications …") self._disable_repository_notifications(owner, repo_name) logger.info("| [import] Repository import complete: %s", html_url) return html_url # --------------------------------------------------------------------- # Public – create initial state using local template import # --------------------------------------------------------------------- def _create_initial_state(self, task: "BaseTask") -> Optional[InitialStateInfo]: """ Set up GitHub environment for a specific task. This may involve: 1. Creating/forking test repositories 2. Setting up branches 3. 
Creating issues or PRs if needed """ try: logger.info(f"| Setting up GitHub state for task: {task.name}") template_name = self.select_initial_state_for_task(task.category_id) if template_name is None: raise RuntimeError( f"No template configured for task category: {task.category_id}" ) template_dir = (self.templates_root / template_name).resolve() if not template_dir.exists(): logger.warning( "| Template directory %s not found locally, attempting to download from CDN", template_dir, ) if not self._download_and_extract_github_template(template_name): logger.error( "| Failed to download template %s from CDN", template_name ) return None logger.info("| Template %s downloaded successfully", template_name) logger.info(f"| Importing repository template from {template_dir} …") owner = self.eval_org if self.eval_org else self._get_authenticated_user() if "mcpmark-cicd" in template_name: repo_url = self._import_template_repo(template_dir, owner, False) else: repo_url = self._import_template_repo(template_dir, owner, True) # Record for cleanup later repo_name = repo_url.rstrip("/").split("/")[-1] self._repos_to_cleanup.append((owner, repo_name)) # Build InitialStateInfo return InitialStateInfo( state_id=f"{owner}/{repo_name}", state_url=repo_url, metadata={ "owner": owner, "repo_name": repo_name, "category": task.category_id, "task_id": task.task_id, }, ) except Exception as e: logger.error(f"| GitHub setup failed for {task.name}: {e}") return None # --------------------------------------------------------------------- # BaseStateManager required hooks # --------------------------------------------------------------------- def _store_initial_state_info(self, task, state_info: InitialStateInfo) -> None: # type: ignore[override] if hasattr(task, "repository_url"): task.repository_url = state_info.state_url def _cleanup_task_initial_state(self, task) -> bool: # type: ignore[override] """No-op – cleanup is handled by self.clean_up which deletes imported repos.""" return True def _cleanup_single_resource(self, resource) -> bool: # type: ignore[override] """No-op – we don't use BaseStateManager's tracked_resources anymore.""" return True # --------------------------------------------------------------------- def clean_up(self, task=None, **kwargs) -> bool: """Delete repositories that were imported for tasks.""" success = True for owner, repo_name in self._repos_to_cleanup: try: self._delete_repository(owner, repo_name) logger.info("| Deleted repository: %s/%s", owner, repo_name) except Exception as err: logger.error( "| Failed to delete repository %s/%s: %s", owner, repo_name, err ) success = False self._repos_to_cleanup.clear() return success # ========================================================================= # Repository Creation and Setup Operations # ========================================================================= def _delete_repository(self, owner: str, repo_name: str): """Delete a repository (use with caution).""" delete_url = f"https://api.github.com/repos/{owner}/{repo_name}" response = self.session.delete(delete_url) if response.status_code not in [200, 204]: logger.warning( f"| Failed to delete repository {owner}/{repo_name}: {response.text}" ) raise Exception( f"| Failed to delete repository {owner}/{repo_name}: {response.status_code} {response.text}" ) else: logger.info(f"| Successfully deleted repository {owner}/{repo_name}") def _obfuscate_mentions(self, text: str) -> str: """ Obfuscate @ mentions to prevent notifications to real users. 
Replaces @username with @username_XXXX (random suffix) to ensure the mentioned user does not exist on GitHub. This prevents notification spam when importing templates that contain @ mentions from original repositories. Args: text: The text content that may contain @ mentions Returns: Text with obfuscated @ mentions """ import re import random import string if not text: return text # Pattern matches @username (GitHub usernames: alphanumeric, hyphens, max 39 chars) # Negative lookbehind keeps e-mail-like text from matching; the regex below is an approximate reconstruction of the original pattern = r"(?<![\w@.])@([A-Za-z\d](?:[A-Za-z\d-]{0,37}[A-Za-z\d])?)" def _randomize(match): # append a short random suffix, e.g. @octocat -> @octocat_x7qk suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=4)) return f"@{match.group(1)}_{suffix}" return re.sub(pattern, _randomize, text) def _get_authenticated_user(self) -> str: """Return cached authenticated username or fetch once from GitHub.""" if hasattr(self, "_auth_user") and self._auth_user: return self._auth_user response = self.session.get("https://api.github.com/user") if response.status_code == 200: self._auth_user = response.json()["login"] return self._auth_user return None # --------------------------------------------------------------------- # Token management helpers # --------------------------------------------------------------------- def _update_session_token(self): """Update the session Authorization header with the current token.""" current_token = self.token_pool.get_current_token() self.session.headers.update({"Authorization": f"Bearer {current_token}"}) # Update backward compatibility attribute self.github_token = current_token def _rotate_token(self): """Rotate to the next token in the pool and update session.""" next_token = self.token_pool.get_next_token() self.session.headers.update({"Authorization": f"Bearer {next_token}"}) # Update backward compatibility attribute self.github_token = next_token logger.debug(f"| Rotated to next token in pool") # --------------------------------------------------------------------- # Generic request helper with rate-limit (403) retry handling # --------------------------------------------------------------------- def _request_with_retry( self, method: str, url: str, *, max_retries: int = 2, sleep_seconds: int = 120, **kwargs, ): """Send a GitHub API request with basic rate-limit handling and token rotation. If a request receives HTTP 403 (rate limit): 1. First try rotating to the next token in the pool 2. If still rate limited, sleep and retry 3. After max_retries are exhausted, raise RuntimeError """ import time # local import to avoid adding global dependency attempt = 0 tokens_tried = 0 while True: # Ensure we have the current token set self._update_session_token() resp = self.session.request(method, url, **kwargs) # Successful or non-rate-limited response – return immediately if resp.status_code != 403: return resp # 403 – very likely rate-limited # First, try rotating tokens if we have multiple if ( self.token_pool.pool_size > 1 and tokens_tried < self.token_pool.pool_size ): logger.warning( "| GitHub API rate limit encountered. Rotating to next token (tried %d/%d tokens)", tokens_tried + 1, self.token_pool.pool_size, ) self._rotate_token() tokens_tried += 1 continue # All tokens exhausted or single token, resort to sleep/retry if attempt >= max_retries: raise RuntimeError( f"GitHub API rate limited after {attempt + 1} attempts with {self.token_pool.pool_size} token(s): {resp.status_code} {resp.text}" ) logger.warning( "| All tokens rate limited (attempt %d/%d). 
Sleeping %d seconds before retrying …", attempt + 1, max_retries + 1, sleep_seconds, ) time.sleep(sleep_seconds) attempt += 1 tokens_tried = 0 # Reset token counter for next attempt # ========================================================================= # Initial State Selection and Repository Creation # ========================================================================= # Initial state for each task category is resolved via self.initial_state_mapping def select_initial_state_for_task(self, task_category: str) -> Optional[str]: """Resolve template name for a task category with light normalization.""" if not task_category: return None candidate_keys = [] candidate_keys.append(task_category) # Allow users to swap between hyphen/underscore naming conventions. hyphen_to_underscore = task_category.replace("-", "_") if hyphen_to_underscore not in candidate_keys: candidate_keys.append(hyphen_to_underscore) underscore_to_hyphen = task_category.replace("_", "-") if underscore_to_hyphen not in candidate_keys: candidate_keys.append(underscore_to_hyphen) for key in candidate_keys: template = self.initial_state_mapping.get(key) if template: if key != task_category: logger.debug( "| Resolved GitHub template for %s via alias %s -> %s", task_category, key, template, ) return template return None def extract_repo_info_from_url(self, repo_url: str) -> tuple[str, str]: """Extract owner and repo name from GitHub URL.""" try: from urllib.parse import urlparse # Support https://github.com/owner/repo format if "github.com" in repo_url: path = urlparse(repo_url).path.strip("/") parts = path.split("/") if len(parts) >= 2: return parts[0], parts[1] raise ValueError(f"Invalid GitHub URL format: {repo_url}") except Exception as e: logger.error(f"| Failed to extract repo info from URL {repo_url}: {e}") raise def get_service_config_for_agent(self) -> dict: """ Get service-specific configuration for agent execution. Rotates to the next token in the pool before returning config to distribute load across tokens. Returns: Dictionary containing configuration needed by the agent/MCP server """ service_config = {} # Add GitHub token if available if self.github_token: service_config["github_token"] = self.github_token return service_config def set_verification_environment(self, messages_path: str = None) -> None: """ Set GitHub-specific environment variables for verification scripts. This ensures verification scripts use the same token as the current agent execution, maintaining consistency across the evaluation flow. 
Args: messages_path: Optional path to messages.json file for verification """ import os # Set common MCP_MESSAGES if provided if messages_path: os.environ["MCP_MESSAGES"] = str(messages_path) # Set GitHub-specific token current_token = self.token_pool.get_current_token() os.environ["MCP_GITHUB_TOKEN"] = current_token logger.info("| Set MCP_GITHUB_TOKEN for verification scripts") def _enable_github_actions(self, owner: str, repo_name: str): """Enable GitHub Actions for the repository using REST API.""" try: # Enable GitHub Actions url = ( f"https://api.github.com/repos/{owner}/{repo_name}/actions/permissions" ) response = self.session.put( url, json={"enabled": True, "allowed_actions": "all"} ) if response.status_code in [200, 204]: logger.info( "| Successfully enabled GitHub Actions for %s/%s", owner, repo_name ) else: logger.warning( "| Failed to enable GitHub Actions: %s %s", response.status_code, response.text, ) except Exception as e: logger.error("| Failed to enable GitHub Actions: %s", e) def _disable_github_actions(self, owner: str, repo_name: str): """Disable GitHub Actions for the repository using REST API.""" try: # Disable GitHub Actions url = ( f"https://api.github.com/repos/{owner}/{repo_name}/actions/permissions" ) response = self.session.put(url, json={"enabled": False}) if response.status_code in [200, 204]: logger.info( "| Successfully disabled GitHub Actions for %s/%s", owner, repo_name ) else: logger.warning( "| Failed to disable GitHub Actions: %s %s", response.status_code, response.text, ) except Exception as e: logger.error("| Failed to disable GitHub Actions: %s", e) def _disable_repository_notifications(self, owner: str, repo_name: str): """Disable repository notifications to prevent email spam.""" try: # Set repository notification subscription to ignore url = f"https://api.github.com/repos/{owner}/{repo_name}/subscription" response = self.session.put( url, json={"subscribed": False, "ignored": True} ) if response.status_code in [200, 201]: logger.info( "| Successfully disabled notifications for %s/%s", owner, repo_name ) elif response.status_code == 403: # This is expected if the token doesn't have notifications scope logger.debug( "| Cannot disable notifications for %s/%s (token lacks notifications scope - this is OK)", owner, repo_name, ) else: logger.warning( "| Failed to disable repository notifications: %s %s", response.status_code, response.text, ) except Exception as e: logger.error("| Failed to disable repository notifications: %s", e) def _download_and_extract_github_template(self, template_name: str) -> bool: """ Download and extract GitHub template from CDN using wget and unzip commands. This approach preserves original file timestamps and is simpler than Python zipfile. 
Args: template_name: Name of the template to download (e.g., "anthropics-claude-code") Returns: bool: True if download and extraction successful """ try: import subprocess import sys import tempfile import shutil import os # Get the URL from mapping if template_name not in self.github_template_url_mapping: logger.error(f"| No URL mapping found for template: {template_name}") return False template_url = self.github_template_url_mapping[template_name] # Allow override via environment variable template_url = os.getenv("GITHUB_TEMPLATE_URL", template_url) logger.info(f"| ○ Downloading GitHub template from: {template_url}") # Create a temporary directory for the download with tempfile.TemporaryDirectory() as temp_dir: temp_path = Path(temp_dir) zip_path = temp_path / "github_template.zip" # Step 1: Download using wget/curl logger.info("| ○ Downloading GitHub template zip file...") try: # Use wget if available, otherwise fall back to curl if sys.platform == "win32": # Windows: try wget, fall back to curl try: result = subprocess.run( ["wget", "-O", str(zip_path), template_url], capture_output=True, text=True, check=True, ) except (subprocess.CalledProcessError, FileNotFoundError): # Fall back to curl result = subprocess.run( ["curl", "-L", "-o", str(zip_path), template_url], capture_output=True, text=True, check=True, ) else: # Unix-like systems: try wget, fall back to curl try: result = subprocess.run( ["wget", "-O", str(zip_path), template_url], capture_output=True, text=True, check=True, ) except (subprocess.CalledProcessError, FileNotFoundError): # Fall back to curl result = subprocess.run( ["curl", "-L", "-o", str(zip_path), template_url], capture_output=True, text=True, check=True, ) logger.info("| ✓ Download completed successfully") except Exception as e: logger.error(f"| Download failed: {e}") return False # Step 2: Extract using unzip logger.info("| ○ Extracting GitHub template...") try: # Extract to templates root directory result = subprocess.run( ["unzip", "-o", str(zip_path), "-d", str(self.templates_root)], capture_output=True, text=True, check=True, ) logger.info("| ✓ Extraction completed successfully") except Exception as e: logger.error(f"| Extraction failed: {e}") return False # Step 3: Remove __MACOSX folder if it exists macosx_path = self.templates_root / "__MACOSX" if macosx_path.exists(): logger.info("| ○ Cleaning up macOS metadata...") try: shutil.rmtree(macosx_path) logger.info("| ✓ Removed __MACOSX folder") except Exception as e: logger.warning(f"| Failed to remove __MACOSX folder: {e}") # Verify the extracted template directory exists template_path = self.templates_root / template_name if not template_path.exists(): logger.error( f"| Extracted template directory not found at expected path: {template_path}" ) return False logger.info( f"| ✓ Successfully downloaded and extracted GitHub template to: {template_path}" ) return True except Exception as e: logger.error(f"| Failed to download and extract GitHub template: {e}") return False ================================================ FILE: src/mcp_services/github/github_task_manager.py ================================================ """ GitHub Task Manager for MCPMark Evaluation Pipeline ==================================================== This module provides utilities for discovering, filtering, and managing GitHub-based evaluation tasks. 
The task manager is responsible for: - Task discovery and filtering - Task verification and result processing - Task-specific logic (NOT LLM execution) """ import sys from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional from src.base.task_manager import BaseTask, BaseTaskManager from src.logger import get_logger logger = get_logger(__name__) @dataclass class GitHubTask(BaseTask): """Represents a single evaluation task for GitHub service.""" # GitHub-specific fields repository_url: Optional[str] = None branch_name: Optional[str] = None pr_number: Optional[int] = None issue_number: Optional[int] = None expected_actions: Optional[List[str]] = None # Expected GitHub actions to verify # Directory-based task slug (e.g., "update_readme") task_name: str = "" # No need to override name property, inherited from BaseTask class GitHubTaskManager(BaseTaskManager): """Manages task discovery, filtering, and verification for GitHub-based MCPMark evaluation.""" def __init__(self, tasks_root: Path = None, task_suite: str = "standard"): """Initialize GitHub task manager. Args: tasks_root: Path to the tasks directory """ if tasks_root is None: tasks_root = Path(__file__).resolve().parents[3] / "tasks" # Call parent constructor super().__init__( tasks_root, mcp_service="github", task_class=GitHubTask, task_organization="file", task_suite=task_suite, ) # GitHub uses file-based tasks # ========================================================================= # Service-specific implementations # ========================================================================= # No custom task discovery methods needed; relying entirely on BaseTaskManager defaults. def _create_task_from_files( self, category_id: str, task_files_info: Dict[str, Any] ) -> Optional[GitHubTask]: """Instantiate a GitHubTask from the dictionary yielded by _find_task_files.""" import json # Check for meta.json meta_path = task_files_info["instruction_path"].parent / "meta.json" final_category_id = category_id task_id = task_files_info["task_id"] if meta_path.exists(): try: with open(meta_path, 'r') as f: meta_data = json.load(f) # Use values from meta.json if available final_category_id = meta_data.get("category_id", category_id) task_id = meta_data.get("task_id", task_id) except Exception as e: logger.warning(f"Failed to load meta.json from {meta_path}: {e}") return GitHubTask( task_instruction_path=task_files_info["instruction_path"], task_verification_path=task_files_info["verification_path"], service="github", category_id=final_category_id, task_id=task_id, task_name=task_files_info["task_id"], ) def _get_verification_command(self, task: GitHubTask) -> List[str]: """Get the verification command for GitHub tasks.""" return [sys.executable, str(task.task_verification_path)] def get_task_instruction(self, task: GitHubTask) -> str: """Return task instruction prefixed with repository context. Adds an English prefix to every GitHub task instruction so that the agent knows **exactly** which repository to operate on, following the pattern requested by the user: Please execute the following task in my repository {owner}/{repo_name}: If the repository URL has not yet been injected into the ``task`` (for example when the state manager has not run), we fall back to a more generic prefix without owner/repo placeholder. 
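        Illustrative example (the URL below is hypothetical)::

            task.repository_url = "https://github.com/mcpmark-eval/Hello-World.git"
            # prefix becomes:
            #   "Please execute the following task in my repository mcpmark-eval/Hello-World:"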
""" # Read the original task description first base_instruction = task.get_task_instruction() # Derive the owner/repo pair from the repository URL if available prefix: str if task.repository_url: # Example URL: https://github.com/owner/repo_name.git (or without .git) url_parts = task.repository_url.rstrip("/").replace(".git", "").split("/") if len(url_parts) >= 2: owner, repo_name = url_parts[-2], url_parts[-1] prefix = f"Please execute the following task in my repository {owner}/{repo_name}:" else: prefix = "Please execute the following task:" else: prefix = "Please execute the following task:" # Compose instruction with prefix instruction_with_prefix = f"{prefix}\n\n{base_instruction.strip()}" # Apply the common formatting suffix from base class return self._format_task_instruction(instruction_with_prefix) ================================================ FILE: src/mcp_services/github/repo_exporter.py ================================================ """ repo_exporter.py – Export public GitHub repository *and* open Issues/PRs ===================================================================== Workflow -------- 1. Mirror-clone the public repository to a local bare repo directory ``${out_dir}/${owner}-${repo}/repo.git``. 2. Fetch all *open* Issues & Pull-Requests via GitHub REST API (no auth needed for public repos, but a token can be provided to increase the rate limit) and serialise them as JSON under the same folder: • ``issues.json`` – list[Issue] • ``pulls.json`` – list[PullRequest] • ``meta.json`` – {"owner": owner, "repo": repo} Usage (CLI) ----------- $ python -m src.mcp_services.github.repo_exporter \ https://github.com/octocat/Hello-World \ --out-dir ./github_state Optionally ``--token`` can be supplied (or env GITHUB_TOKEN) to avoid the 60-req/h anonymous limit. """ from __future__ import annotations import json import logging import os from dotenv import load_dotenv import subprocess from pathlib import Path from tempfile import mkdtemp from typing import Optional from urllib.parse import urlparse import requests logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) _API_ROOT = "https://api.github.com" _DEFAULT_HEADERS = { "Accept": "application/vnd.github.v3+json", "User-Agent": "MCPMark/RepoExporter/1.0", } # --------------------------------------------------------------------------- # Helper utilities # --------------------------------------------------------------------------- def _make_session(token: Optional[str] = None) -> requests.Session: sess = requests.Session() sess.headers.update(_DEFAULT_HEADERS) if token: sess.headers["Authorization"] = f"Bearer {token}" return sess def _parse_repo(url: str) -> tuple[str, str]: parsed = urlparse(url) parts = parsed.path.strip("/").split("/") if len(parts) < 2: raise ValueError(f"Invalid GitHub repo URL: {url}") return parts[0], parts[1] # --------------------------------------------------------------------------- # Core export logic # --------------------------------------------------------------------------- def export_repository( source_repo_url: str, out_dir: str = "./github_state", github_token: str | None = None, max_issues: int | None = None, max_pulls: int | None = None, ) -> str: """Export repository code plus Issues/PRs to ``out_dir``. ``max_issues`` / ``max_pulls`` – when supplied, export **only** the most recently created *open* Issues or Pull Requests (respectively). Returns the absolute path of the export folder. 
""" owner, repo = _parse_repo(source_repo_url) export_root = Path(out_dir).expanduser().resolve() repo_dir = export_root / f"{owner}-{repo}" repo_dir.mkdir(parents=True, exist_ok=True) # ------------------------------------------------------------------ # 1. Clone repository – full or shallow *working* clone (no bare repo) # ------------------------------------------------------------------ repo_path = repo_dir / "repo" if repo_path.exists(): logger.info("[clone] Repository already exists, skipping clone: %s", repo_path) else: logger.info("[clone] Cloning %s/%s to %s", owner, repo, repo_path) env = { **os.environ, "GIT_TERMINAL_PROMPT": "0", "GIT_LFS_SKIP_SMUDGE": "1", } tmp_dir = mkdtemp(prefix="mcp_export_") try: # Always perform a full clone (no shallow depth limitation). clone_cmd = [ "git", "clone", "--no-single-branch", f"https://github.com/{owner}/{repo}.git", tmp_dir, ] subprocess.run(clone_cmd, check=True, capture_output=True, env=env) subprocess.run(["mv", tmp_dir, str(repo_path)], check=True) logger.info("[clone] Clone completed") finally: # tmp_dir moved if success; remove if left if os.path.isdir(tmp_dir): subprocess.run(["rm", "-rf", tmp_dir]) # ------------------------------------------------------------------ # 2. Dump Issues & Pull Requests # ------------------------------------------------------------------ sess = _make_session(github_token) def _paginate(url: str, state: str = "all", extra_params: dict | None = None): page = 1 while True: params = {"state": state, "per_page": 100, "page": page} if extra_params: params.update(extra_params) resp = sess.get(url, params=params) if resp.status_code != 200: logger.warning("Failed to list: %s – %s", url, resp.text) break items = resp.json() if not items: break yield from items page += 1 # -------------------------------------------------------------- # Helper: fetch all issue comments for a given issue / PR number # -------------------------------------------------------------- def _fetch_issue_comments(number: int) -> list[dict]: """Return a list of {user, body} comment dicts for the given issue/PR.""" comments = [] for c in _paginate( f"{_API_ROOT}/repos/{owner}/{repo}/issues/{number}/comments" ): comments.append( { "user": c.get("user", {}).get("login", "unknown"), "body": c.get("body", ""), } ) return comments # -------------------------------------------------------------- # Helper: fetch all *review* comments (code comments) for a PR # -------------------------------------------------------------- def _fetch_review_comments(number: int) -> list[dict]: """Return a list of {user, body} review comments for the given PR.""" comments = [] for c in _paginate(f"{_API_ROOT}/repos/{owner}/{repo}/pulls/{number}/comments"): comments.append( { "user": c.get("user", {}).get("login", "unknown"), "body": c.get("body", ""), } ) return comments # Issues (non-PR) issues = [] # If max_issues is 0, skip fetching issues entirely if max_issues == 0: logger.info("[export] Skipping issues (max_issues=0)") else: for itm in _paginate( f"{_API_ROOT}/repos/{owner}/{repo}/issues", extra_params={"sort": "created", "direction": "desc"}, ): if "pull_request" in itm: continue issues.append( { "title": itm.get("title"), "body": itm.get("body", ""), "labels": [lbl.get("name") for lbl in itm.get("labels", [])], "state": itm.get("state", "open"), # Store issue state "number": itm.get("number"), # Store issue number for reference "comments": _fetch_issue_comments(itm.get("number")), } ) if max_issues is not None and len(issues) >= max_issues: break (repo_dir 
/ "issues.json").write_text(json.dumps(issues, indent=2)) logger.info("[export] Saved %d issues", len(issues)) # Pull requests – include *all* PRs including those from forks pulls = [] pr_head_refs: set[str] = set() fork_pr_branches: dict[str, dict] = {} # Maps PR branch names to fork info # If max_pulls is 0, skip fetching pull requests entirely if max_pulls == 0: logger.info("[export] Skipping pull requests (max_pulls=0)") else: for pr in _paginate( f"{_API_ROOT}/repos/{owner}/{repo}/pulls", state="open", extra_params={"sort": "created", "direction": "desc"}, ): pr_number = pr.get("number") head = pr.get("head", {}) if head is None: logger.warning("PR #%s has no head (deleted fork), skipping", pr_number) continue # skip PRs with missing head (deleted fork) head_repo = head.get("repo") head_ref = head.get("ref") head_sha = head.get("sha") if head_repo is None: logger.warning("PR #%s source repo was deleted, skipping", pr_number) continue # skip PRs where source repo was deleted head_repo_full = head_repo.get("full_name") is_from_fork = head_repo_full != f"{owner}/{repo}" # Create PR data with fork information pr_data = { "number": pr_number, "title": pr.get("title"), "body": pr.get("body", ""), "head": head_ref, "base": pr.get("base", {}).get("ref"), "is_from_fork": is_from_fork, } if is_from_fork: # Store additional metadata for forked PRs pr_data["fork_owner"] = head_repo.get("owner", {}).get("login") pr_data["fork_repo"] = head_repo.get("name") pr_data["head_sha"] = head_sha # Create a unique branch name for this forked PR fork_branch_name = f"pr/{pr_number}-{pr_data['fork_owner']}-{head_ref}" pr_data["local_branch"] = fork_branch_name fork_pr_branches[fork_branch_name] = { "clone_url": head_repo.get("clone_url"), "ref": head_ref, "sha": head_sha, "pr_number": pr_number, } else: # For non-fork PRs, keep the original branch reference pr_head_refs.add(head_ref) # Attach comments pr_data["comments"] = _fetch_issue_comments(pr_number) pr_data["review_comments"] = _fetch_review_comments(pr_number) pulls.append(pr_data) if max_pulls is not None and len(pulls) >= max_pulls: break (repo_dir / "pulls.json").write_text(json.dumps(pulls, indent=2)) logger.info("[export] Saved %d pull requests", len(pulls)) # Get default branch info first (needed for fetching) sess = _make_session(github_token) try: repo_info = sess.get(f"{_API_ROOT}/repos/{owner}/{repo}") default_branch = repo_info.json().get("default_branch", "main") except Exception: default_branch = "main" # Fetch branches from non-fork PRs (branches from the same repository) non_fork_branches = list(pr_head_refs) # These are branches from the same repo # Always include the default branch in the branches to fetch if default_branch not in non_fork_branches: non_fork_branches.append(default_branch) pr_head_refs.add(default_branch) if non_fork_branches: logger.info( "[fetch] Fetching %d branches from same repository (including default branch '%s')", len(non_fork_branches), default_branch, ) try: # Fetch all remote branches to ensure we have the PR branches subprocess.run( ["git", "-C", str(repo_path), "fetch", "origin", "--no-tags"], check=True, capture_output=True, ) # Create local branches for each PR branch for branch in non_fork_branches: try: # Create local branch tracking the remote branch subprocess.run( [ "git", "-C", str(repo_path), "branch", "--track", branch, f"origin/{branch}", ], check=False, capture_output=True, ) # check=False because branch might already exist logger.info("[fetch] Created local branch %s", branch) except 
subprocess.CalledProcessError: # Branch might already exist, which is fine pass except subprocess.CalledProcessError as e: logger.warning( "[fetch] Failed to fetch branches from origin: %s", e.stderr.decode(errors="ignore") if e.stderr else str(e), ) # Fetch branches from forks for PRs if fork_pr_branches: logger.info( "[fetch] Fetching branches from %d forked PRs", len(fork_pr_branches) ) for branch_name, fork_info in fork_pr_branches.items(): try: logger.info( "[fetch] Fetching branch %s from fork %s", fork_info["ref"], fork_info["clone_url"], ) # Add fork as remote and fetch the specific branch remote_name = f"fork-pr-{fork_info['pr_number']}" # Add remote subprocess.run( [ "git", "-C", str(repo_path), "remote", "add", remote_name, fork_info["clone_url"], ], check=True, capture_output=True, ) # Fetch the specific branch from the fork subprocess.run( [ "git", "-C", str(repo_path), "fetch", remote_name, f"{fork_info['ref']}:refs/heads/{branch_name}", ], check=True, capture_output=True, ) # Remove the remote after fetching subprocess.run( ["git", "-C", str(repo_path), "remote", "remove", remote_name], check=True, capture_output=True, ) # Add the fork branch to pr_head_refs so it gets pushed pr_head_refs.add(branch_name) logger.info("[fetch] Successfully fetched branch %s", branch_name) except subprocess.CalledProcessError as e: logger.warning( "[fetch] Failed to fetch branch from fork PR #%s: %s", fork_info["pr_number"], e.stderr.decode(errors="ignore") if e.stderr else str(e), ) except Exception as e: logger.warning( "[fetch] Unexpected error fetching fork PR #%s: %s", fork_info["pr_number"], str(e), ) meta = { "owner": owner, "repo": repo, "default_branch": default_branch, "pr_head_refs": sorted(pr_head_refs), } (repo_dir / "meta.json").write_text(json.dumps(meta, indent=2)) logger.info("[done] Export finished – data stored at %s", repo_dir) return str(repo_dir) # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- if __name__ == "__main__": import argparse load_dotenv(".mcp_env") parser = argparse.ArgumentParser( description="Export public GitHub repository with Issues/PRs" ) parser.add_argument( "--source_repo_url", required=True, help="HTTPS URL of the public repository" ) parser.add_argument( "--out-dir", default="./github_state", help="Output directory root" ) parser.add_argument( "--max-issues", type=int, default=20, help="Export only the latest N issues (optional)", ) parser.add_argument( "--max-pulls", type=int, default=5, help="Export only the latest N pull requests (optional)", ) args = parser.parse_args() token = os.getenv("GITHUB_TOKEN") export_repository( args.source_repo_url, args.out_dir, token, args.max_issues, args.max_pulls ) ================================================ FILE: src/mcp_services/github/repo_importer.py ================================================ """ repo_importer.py – Restore previously exported GitHub repository into an org/user =============================================================================== Given a local export folder created by ``repo_exporter.py`` that contains ``repo.git`` (bare mirror) and JSON files for Issues/PRs, this script: 1. Creates an empty repository under the specified owner (user/org) using the provided GitHub token. 2. Pushes *all* Git history from the local bare repository to the target repo (fallback to per-ref push to avoid timeouts). 3. Re-creates the open Issues & Pull Requests from the JSON dump. 
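Programmatic usage (illustrative values only; a sketch of a typical call)
--------------------------------------------------------------------------
::

    import os

    html_url = import_repository(
        "./github_template_repo/octocat-Hello-World",
        github_token=os.environ["GITHUB_TOKEN"],
        target_owner="mcpmark-eval",
        private=True,
    )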
CLI usage --------- $ python -m src.mcp_services.github.repo_importer \ ./github_template_repo/octocat-Hello-World \ --token YOUR_GH_PAT \ --target-owner EvalOrg \ --private """ from __future__ import annotations import json import logging import os import subprocess import time from pathlib import Path from typing import Iterable import requests from dotenv import load_dotenv logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) _API_ROOT = "https://api.github.com" _HEADERS = { "Accept": "application/vnd.github.v3+json", "User-Agent": "MCPMark/RepoImporter/1.0", } # --------------------------------------------------------------------------- # Helper functions copied / simplified from repo_mirror (shallow clone logic removed) # --------------------------------------------------------------------------- def _make_session(token: str) -> requests.Session: sess = requests.Session() sess.headers.update(_HEADERS | {"Authorization": f"Bearer {token}"}) return sess def _create_target_repo( sess: requests.Session, owner: str, repo_name: str, description: str, private: bool ) -> str: data = { "name": repo_name, "description": description, "private": private, "auto_init": False, "has_issues": True, "has_projects": True, "has_wiki": False, } # Determine if owner == auth user auth_user = _get_authenticated_user(sess) create_url = ( f"{_API_ROOT}/user/repos" if owner == auth_user else f"{_API_ROOT}/orgs/{owner}/repos" ) resp = sess.post(create_url, json=data) if resp.status_code == 422 and "name already exists" in resp.text: logger.warning("Repository already exists; attempting to delete and recreate …") _delete_repo(sess, owner, repo_name) resp = sess.post(create_url, json=data) if resp.status_code not in (200, 201): raise RuntimeError(f"Failed to create repo: {resp.status_code} {resp.text}") html_url = resp.json()["html_url"] logger.info("[init] Target repository created: %s", html_url) return html_url def _get_authenticated_user(sess: requests.Session) -> str: resp = sess.get(f"{_API_ROOT}/user") resp.raise_for_status() return resp.json()["login"] def _delete_repo(sess: requests.Session, owner: str, repo: str): sess.delete(f"{_API_ROOT}/repos/{owner}/{repo}") def _list_refs(repo_dir: str) -> list[str]: result = subprocess.run( ["git", "-C", repo_dir, "for-each-ref", "--format=%(refname)"], check=True, capture_output=True, text=True, ) return result.stdout.strip().splitlines() def _push_repo( repo_path: Path, owner: str, repo_name: str, token: str, required_refs: Iterable[str] | None = None, ): """Push repository to GitHub. Strategy: 1. Attempt a full `git push --mirror`. 2. If that fails (e.g. due to large repo), fall back to pushing refs one-by-one. """ dst_url = f"https://x-access-token:{token}@github.com/{owner}/{repo_name}.git" # First try mirror push (fast path) try: subprocess.run( ["git", "-C", str(repo_path), "push", "--mirror", dst_url], check=True, capture_output=True, ) logger.info("[push] Mirror push succeeded") return except subprocess.CalledProcessError as err: logger.warning( "[push] Mirror push failed (%s). 
Falling back to per-ref", err.stderr.decode(errors="ignore"), ) # ------------------------------------------------------------------ # Fallback: push each ref individually (robust but slower) # ------------------------------------------------------------------ refs = required_refs or _list_refs(str(repo_path)) logger.info("[push] Pushing %d refs individually …", len(refs)) for ref in refs: for attempt in range(3): try: subprocess.run( ["git", "-C", str(repo_path), "push", dst_url, f"{ref}:{ref}"], check=True, capture_output=True, ) break except subprocess.CalledProcessError as ref_err: if attempt == 2: raise RuntimeError( f"Failed to push ref {ref}: {ref_err.stderr}" ) from ref_err time.sleep(2 * (attempt + 1)) def _create_comment( sess: requests.Session, owner: str, repo: str, issue_number: int, body: str ): """Create a comment on an Issue or Pull Request. Returns True on success.""" resp = sess.post( f"{_API_ROOT}/repos/{owner}/{repo}/issues/{issue_number}/comments", json={"body": body}, ) if resp.status_code not in (200, 201): logger.debug("Failed to create comment on #%s: %s", issue_number, resp.text) return False return True def _create_issue( sess: requests.Session, owner: str, repo: str, title: str, body: str, labels: list[str], state: str = "open", number: int = None, ): """Create a new Issue and return the *new* issue number (or None on failure).""" data = {"title": title, "body": body, "labels": labels} resp = sess.post(f"{_API_ROOT}/repos/{owner}/{repo}/issues", json=data) if resp.status_code not in (200, 201): logger.debug("Failed to create issue #%s: %s", number, resp.text) return None new_number = resp.json().get("number") # Close issue if original state was closed if state == "closed": close_resp = sess.patch( f"{_API_ROOT}/repos/{owner}/{repo}/issues/{new_number}", json={"state": "closed"}, ) if close_resp.status_code not in (200, 201): logger.debug("Failed to close issue #%s: %s", new_number, close_resp.text) return new_number def _create_pull( sess: requests.Session, owner: str, repo: str, title: str, body: str, head: str, base: str, pr_number: int = None, ): """Create a Pull Request and return the *new* PR number (or None on failure).""" data = {"title": title, "body": body, "head": head, "base": base} resp = sess.post(f"{_API_ROOT}/repos/{owner}/{repo}/pulls", json=data) if resp.status_code not in (200, 201): logger.warning( "Failed to create PR #%s (head: %s, base: %s): %s", pr_number, head, base, resp.text, ) return None return resp.json().get("number") def _enable_github_actions(sess: requests.Session, owner: str, repo_name: str): """Enable GitHub Actions for the repository using REST API.""" try: url = f"{_API_ROOT}/repos/{owner}/{repo_name}/actions/permissions" response = sess.put(url, json={"enabled": True, "allowed_actions": "all"}) if response.status_code in [200, 204]: logger.info( "Successfully enabled GitHub Actions for %s/%s", owner, repo_name ) else: logger.warning( "Failed to enable GitHub Actions: %s %s", response.status_code, response.text, ) except Exception as e: logger.error("Failed to enable GitHub Actions: %s", e) def _disable_repository_notifications( sess: requests.Session, owner: str, repo_name: str ): """Disable repository notifications to prevent email spam.""" try: url = f"{_API_ROOT}/repos/{owner}/{repo_name}/subscription" response = sess.put(url, json={"subscribed": False, "ignored": True}) if response.status_code in [200, 201]: logger.info( "Successfully disabled notifications for %s/%s", owner, repo_name ) elif response.status_code == 403: 
# This is expected if the token doesn't have notifications scope logger.debug( "Cannot disable notifications for %s/%s (token lacks notifications scope - this is OK)", owner, repo_name, ) else: logger.warning( "Failed to disable repository notifications: %s %s", response.status_code, response.text, ) except Exception as e: logger.error("Failed to disable repository notifications: %s", e) def _set_default_branch( sess: requests.Session, owner: str, repo_name: str, default_branch: str ): """Set the default branch for a repository.""" if default_branch != "main": # Only update if not already main logger.info("[import] Setting default branch to '%s'", default_branch) url = f"{_API_ROOT}/repos/{owner}/{repo_name}" data = {"default_branch": default_branch} resp = sess.patch(url, json=data) if resp.status_code in (200, 201): logger.info( "[import] Successfully set default branch to '%s'", default_branch ) else: logger.warning( "[import] Failed to set default branch: %s %s", resp.status_code, resp.text, ) def _remove_github_directory(repo_path: Path, owner: str, repo_name: str, token: str): """Remove .github directory after pushing and commit the deletion.""" import shutil github_dir = repo_path / ".github" if github_dir.exists(): logger.info("[import] Removing .github directory after push …") shutil.rmtree(github_dir) # Commit the deletion subprocess.run( ["git", "-C", str(repo_path), "add", "-A"], check=True, capture_output=True ) subprocess.run( ["git", "-C", str(repo_path), "commit", "-m", "Remove .github directory"], capture_output=True, ) # Push the new commit dst_url = f"https://x-access-token:{token}@github.com/{owner}/{repo_name}.git" subprocess.run( ["git", "-C", str(repo_path), "push", dst_url], check=True, capture_output=True, ) # --------------------------------------------------------------------------- # Main import logic # --------------------------------------------------------------------------- def import_repository( template_dir: str, github_token: str, target_owner: str, private: bool = False ) -> str: """Import repository from a local template directory to GitHub.""" # ------------------------------------------------------------------ # Ensure Git HTTP buffer large enough to avoid 400 errors on big pushes # ------------------------------------------------------------------ try: subprocess.run( [ "git", "config", "--global", "http.postBuffer", "157286400", # 150 MiB ], check=True, capture_output=True, ) logger.debug("[init] Set git http.postBuffer to 150MiB globally") except subprocess.CalledProcessError as cfg_err: logger.warning( "[init] Failed to set http.postBuffer – proceeding anyway: %s", cfg_err.stderr.decode(errors="ignore"), ) tdir = Path(template_dir).expanduser().resolve() meta = json.loads((tdir / "meta.json").read_text()) repo_name = meta["repo"] pr_head_refs = meta.get("pr_head_refs", []) default_branch = meta.get("default_branch", "main") # Also include fork PR branches that were fetched pulls = json.loads((tdir / "pulls.json").read_text()) fork_branches = [ pr["local_branch"] for pr in pulls if pr.get("is_from_fork", False) and "local_branch" in pr ] needed_refs = ( [f"refs/heads/{default_branch}"] + [f"refs/heads/{h}" for h in pr_head_refs] + [f"refs/heads/{b}" for b in fork_branches] ) sess = _make_session(github_token) # 1. Create target repo html_url = _create_target_repo( sess, target_owner, repo_name, f"Restored mirror of {repo_name}", private ) # 2. 
Push code repo_path = tdir / "repo" logger.info("[phase] Pushing git history …") _push_repo(repo_path, target_owner, repo_name, github_token, needed_refs) # Set the default branch if it's not 'main' _set_default_branch(sess, target_owner, repo_name, default_branch) # Remove .github directory right after pushing, before creating issues/PRs _remove_github_directory(repo_path, target_owner, repo_name, github_token) # 3. Re-create issues & PRs logger.info("[phase] Re-creating issues …") issues = json.loads((tdir / "issues.json").read_text()) created_issues = 0 for itm in issues: new_issue_no = _create_issue( sess, target_owner, repo_name, itm["title"], itm.get("body", ""), itm.get("labels", []), itm.get("state", "open"), itm.get("number"), ) if new_issue_no: created_issues += 1 for c in itm.get("comments", []): comment_body = f"*Original author: @{c['user']}*\n\n{c['body']}" _create_comment( sess, target_owner, repo_name, new_issue_no, comment_body ) logger.info("[phase] Created %d out of %d issues", created_issues, len(issues)) logger.info("[phase] Re-creating pull requests …") pulls = json.loads((tdir / "pulls.json").read_text()) created_prs = 0 skipped_prs = 0 for pr in pulls: # Use local_branch for forked PRs, otherwise use original head head_branch = pr.get("local_branch", pr["head"]) # Add note to PR body if it's from a fork body = pr.get("body", "") if pr.get("is_from_fork", False): fork_note = f"\n\n---\n_This PR was originally from a fork: **{pr.get('fork_owner')}/{pr.get('fork_repo')}** (branch: `{pr['head']}`)_" body = ( body + fork_note if body else fork_note[2:] ) # Remove leading newlines if body is empty new_pr_number = _create_pull( sess, target_owner, repo_name, pr["title"], body, head_branch, pr["base"], pr.get("number"), ) if new_pr_number: created_prs += 1 for c in pr.get("comments", []): comment_body = f"*Original author: @{c['user']}*\n\n{c['body']}" _create_comment( sess, target_owner, repo_name, new_pr_number, comment_body ) for rc in pr.get("review_comments", []): comment_body = ( f"*Original author: @{rc['user']}* (review)\n\n{rc['body']}" ) _create_comment( sess, target_owner, repo_name, new_pr_number, comment_body ) else: skipped_prs += 1 logger.info("[phase] Created %d PRs, skipped %d PRs", created_prs, skipped_prs) # Enable GitHub Actions after creating issues and PRs logger.info("[import] Enabling GitHub Actions …") _enable_github_actions(sess, target_owner, repo_name) # Disable notifications to prevent email spam logger.info("[import] Disabling repository notifications …") _disable_repository_notifications(sess, target_owner, repo_name) logger.info("[done] Import complete: %s", html_url) return html_url # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- if __name__ == "__main__": import argparse load_dotenv(".mcp_env") parser = argparse.ArgumentParser( description="Import repository from local template into GitHub" ) parser.add_argument("--template_dir", help="Path to exported template directory") parser.add_argument( "--target-owner", "-o", default="mcpmark-eval", help="User or organisation that will own the new repository", ) args = parser.parse_args() token = os.getenv("GITHUB_TOKEN") if not token: parser.error("GITHUB_TOKEN not set in environment or .mcp_env") # Always create the target repository as private import_repository(args.template_dir, token, args.target_owner, True) ================================================ FILE: 
src/mcp_services/github/token_pool.py ================================================ """ GitHub Token Pool Manager ========================= Simple round-robin token pool for distributing API requests across multiple tokens to avoid rate limit issues. """ from typing import List from src.logger import get_logger logger = get_logger(__name__) class GitHubTokenPool: """ Manages a pool of GitHub tokens with round-robin selection. """ def __init__(self, tokens: List[str]): """ Initialize token pool. Args: tokens: List of GitHub personal access tokens """ if not tokens: raise ValueError("Token pool must contain at least one token") self.tokens = tokens self.current_index = 0 logger.info(f"Initialized GitHub token pool with {len(tokens)} token(s)") def get_next_token(self) -> str: """ Get the next token in round-robin fashion. Returns: The next GitHub token to use """ token = self.tokens[self.current_index] self.current_index = (self.current_index + 1) % len(self.tokens) return token def get_current_token(self) -> str: """ Get the current token without advancing the index. Returns: The current GitHub token """ return self.tokens[self.current_index] @property def pool_size(self) -> int: """Get the number of tokens in the pool.""" return len(self.tokens) ================================================ FILE: src/mcp_services/insforge/__init__.py ================================================ """Insforge MCP Service Implementation for MCPMark.""" ================================================ FILE: src/mcp_services/insforge/insforge_login_helper.py ================================================ """ Insforge Login Helper for MCPMark ================================== Handles Insforge backend authentication and connection validation. """ import json import requests from pathlib import Path from typing import Optional, Dict, Any from src.base.login_helper import BaseLoginHelper from src.logger import get_logger logger = get_logger(__name__) class InsforgeLoginHelper(BaseLoginHelper): """Handles Insforge backend authentication and connection validation.""" def __init__( self, api_key: str, backend_url: str, state_path: Optional[Path] = None, ): """Initialize Insforge login helper. Args: api_key: Insforge backend API key for authentication backend_url: Insforge backend URL (e.g., https://your-app.insforge.app) state_path: Path to save connection state """ super().__init__() self.api_key = api_key self.backend_url = backend_url.rstrip('/') self.state_path = state_path or Path.home() / ".mcpbench" / "insforge_auth.json" # Ensure state directory exists self.state_path.parent.mkdir(parents=True, exist_ok=True) def login(self, **kwargs) -> bool: """Test Insforge backend connection and validate API key. 
Returns: bool: True if connection successful and API key valid """ try: # Test 1: Basic connectivity - try to get backend metadata logger.info(f"Testing connection to Insforge backend: {self.backend_url}") headers = { "Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json", } # Test with a simple API endpoint - get current user or backend info # Try the auth current session endpoint first test_url = f"{self.backend_url}/api/auth/sessions/current" response = requests.get( test_url, headers=headers, timeout=10, ) if response.status_code == 200: # API key is valid and can authenticate logger.info("✓ Insforge API key authentication successful") connection_info = { "backend_url": self.backend_url, "authenticated": True, "authenticated_at": self._get_current_timestamp(), } elif response.status_code == 401: # Invalid API key logger.error("✗ Invalid Insforge API key") return False else: # API key might be admin key, try a different endpoint # Try listing tables/backend metadata as a test logger.info("Testing with backend metadata endpoint...") # Simple connectivity test - just check if backend is reachable health_url = f"{self.backend_url}/api/health" try: health_response = requests.get(health_url, timeout=5) if health_response.status_code in [200, 404]: # 404 is ok, backend is reachable logger.info("✓ Insforge backend is reachable") connection_info = { "backend_url": self.backend_url, "api_key_type": "admin", "authenticated": True, "authenticated_at": self._get_current_timestamp(), } else: logger.warning(f"Unexpected response from backend: {health_response.status_code}") connection_info = { "backend_url": self.backend_url, "authenticated": True, "authenticated_at": self._get_current_timestamp(), } except Exception as e: logger.warning(f"Health check failed, but proceeding: {e}") # Still consider it successful if we have credentials connection_info = { "backend_url": self.backend_url, "authenticated": True, "authenticated_at": self._get_current_timestamp(), } # Save connection state self._save_connection_state(connection_info) logger.info(f"Insforge backend connection validated: {self.backend_url}") return True except requests.exceptions.Timeout: logger.error(f"Connection timeout to Insforge backend: {self.backend_url}") return False except requests.exceptions.ConnectionError: logger.error(f"Cannot connect to Insforge backend: {self.backend_url}") return False except Exception as e: logger.error(f"Unexpected error during Insforge authentication: {e}") return False def _save_connection_state(self, state: Dict[str, Any]): """Save connection state to file.""" try: # Don't save API key safe_state = {k: v for k, v in state.items() if k not in ["api_key", "access_token"]} with open(self.state_path, "w") as f: json.dump(safe_state, f, indent=2) # Set restrictive permissions self.state_path.chmod(0o600) logger.info(f"Connection state saved to: {self.state_path}") except Exception as e: logger.error(f"Failed to save connection state: {e}") def _get_current_timestamp(self) -> str: """Get current timestamp in ISO format.""" from datetime import datetime, timezone return datetime.now(timezone.utc).isoformat() def is_connected(self) -> bool: """Check if we can connect to Insforge backend.""" return self.login() def get_connection_params(self) -> Dict[str, Any]: """Get connection parameters (without API key).""" return { "backend_url": self.backend_url, } ================================================ FILE: src/mcp_services/insforge/insforge_state_manager.py 
================================================ """ Insforge State Manager for MCPMark =================================== Manages backend state for Insforge tasks including setup via prepare_environment.py and resource cleanup tracking. """ import os import sys import subprocess import requests from pathlib import Path from typing import Optional, Dict, Any, List from src.base.state_manager import BaseStateManager, InitialStateInfo from src.base.task_manager import BaseTask from src.logger import get_logger logger = get_logger(__name__) class InsforgeStateManager(BaseStateManager): """Manages Insforge backend state for task evaluation.""" def __init__( self, api_key: str, backend_url: str, ): """Initialize Insforge state manager. Args: api_key: Insforge backend API key for authentication backend_url: Insforge backend URL (e.g., https://your-app.insforge.app) """ super().__init__(service_name="insforge") self.api_key = api_key self.backend_url = backend_url.rstrip('/') # HTTP headers for API requests self.headers = { "Authorization": f"Bearer {self.api_key}", "Content-Type": "application/json", } # Track current task context for agent configuration self._current_task_context: Optional[Dict[str, Any]] = None # Validate connection on initialization try: self._test_connection() logger.info("Insforge state manager initialized successfully") except Exception as e: raise RuntimeError(f"Insforge initialization failed: {e}") # Store baseline tables (system tables that exist before any tasks run) self._baseline_tables = set( (t['schema'], t['name']) for t in self._get_all_tables() ) logger.debug(f"Stored baseline: {len(self._baseline_tables)} tables") def _test_connection(self): """Test backend connection.""" try: # Simple connectivity test - try any endpoint response = requests.get( f"{self.backend_url}/api/health", timeout=5, ) # Any response (even 404) means backend is reachable logger.debug(f"Insforge backend connectivity test: {response.status_code}") except requests.exceptions.RequestException: # Try with API key try: response = requests.get( f"{self.backend_url}/api/auth/sessions/current", headers=self.headers, timeout=5, ) logger.debug(f"Insforge backend auth test: {response.status_code}") except Exception as inner_e: raise RuntimeError(f"Cannot connect to Insforge backend: {inner_e}") def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: """Create initial backend state for a task. Restores from backup which may place tables in public or task-specific schema. 
Args: task: Task for which to create initial state Returns: InitialStateInfo object or None if creation failed """ try: # Generate unique state ID for this task run state_id = f"{task.category_id}_{task.task_id}_{self._get_timestamp()}" schema_name = task.category_id logger.info(f"| Creating initial state for Insforge task: {task.name}") # Drop schema first (cleanup from previous runs) self._drop_schema(schema_name) # Get list of existing tables before restore (to track what we create) tables_before = self._get_all_tables() logger.info(f"| Tables before restore: {len(tables_before)}") # Note: Don't create schema here - pg_restore will create it from the backup # Restore from backup if backup exists (may create tables in public or task schema) if self._restore_from_backup(schema_name): logger.info(f"| ✓ Restored '{schema_name}' from backup") else: logger.info(f"| ○ No backup found for '{schema_name}'") # Run prepare_environment.py if it exists task_prepared = self._run_prepare_environment(task) if not task_prepared: logger.debug(f"| No prepare_environment.py found for task {task.name}") # Get list of tables after restore (to track what we need to clean up) tables_after = self._get_all_tables() # Track ALL new tables created by the restore (compare before/after) tables_before_set = {(t['schema'], t['name']) for t in tables_before} created_tables = [ t for t in tables_after if (t['schema'], t['name']) not in tables_before_set ] logger.info(f"| Tracked {len(created_tables)} new tables for cleanup") for t in created_tables: logger.debug(f"| - {t['schema']}.{t['name']}") # Track the task context including created tables context = { "state_id": state_id, "category_id": task.category_id, "task_id": task.task_id, "task_name": task.name, "schema": schema_name, "created_tables": created_tables, # Track all created tables } return InitialStateInfo( state_id=state_id, state_url=self.backend_url, metadata=context, ) except Exception as e: logger.error(f"Failed to create initial state for {task.name}: {e}") return None def _store_initial_state_info( self, task: BaseTask, state_info: InitialStateInfo ) -> None: """Store backend info in task object for agent access.""" if hasattr(task, "__dict__"): task.backend_url = self.backend_url task.api_key = self.api_key task.state_id = state_info.state_id # Store current task context for agent configuration self._current_task_context = state_info.metadata def _cleanup_task_initial_state(self, task: BaseTask) -> bool: """Clean up task-specific resources. Drops ALL tables created during task (both setup and agent-created) by comparing against baseline. 
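        Sketch of the baseline comparison performed below (illustrative; equivalent
        set-difference form of the filtering used in the implementation)::

            current = {(t["schema"], t["name"]) for t in self._get_all_tables()}
            leftover = current - self._baseline_tables  # tables to drop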
Args: task: Task whose initial state should be cleaned up Returns: True if cleanup successful """ try: logger.info(f"| Cleaning up initial state for task: {task.name}") if self._current_task_context: schema_name = self._current_task_context.get("schema") # Get ALL current tables all_current_tables = self._get_all_tables() # Find tables to drop: anything not in baseline tables_to_drop = [ t for t in all_current_tables if (t['schema'], t['name']) not in self._baseline_tables ] logger.info(f"| Found {len(tables_to_drop)} tables to clean up (setup + agent-created)") # Drop individual tables for table_info in tables_to_drop: try: self._drop_table(table_info["schema"], table_info["name"]) logger.debug(f"| ✓ Dropped table: {table_info['schema']}.{table_info['name']}") except Exception as e: logger.warning(f"| Failed to drop table {table_info}: {e}") # Drop the task schema (may be empty if all tables were in public) if schema_name: try: self._drop_schema(schema_name) logger.info(f"| ✓ Dropped schema: {schema_name}") except Exception as e: logger.warning(f"| Failed to drop schema {schema_name}: {e}") # Clear task context if self._current_task_context.get("task_name") == task.name: self._current_task_context = None logger.info(f"| ✓ Initial state cleanup completed for {task.name}") return True except Exception as e: logger.error(f"Failed to cleanup task initial state for {task.name}: {e}") return False def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool: """Clean up a single tracked resource. This is a placeholder for resource-specific cleanup logic. Tasks should handle their own cleanup via cleanup scripts. Args: resource: Resource dictionary with type, id, and metadata Returns: True if cleanup successful """ resource_type = resource["type"] resource_id = resource["id"] logger.debug(f"| Cleanup for {resource_type} {resource_id} (handled by task scripts)") return True def _run_prepare_environment(self, task: BaseTask) -> bool: """Run prepare_environment.py script if it exists in the task directory. The script should use Insforge MCP tools or HTTP API to set up required state. 
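        A minimal prepare_environment.py might look like this (illustrative sketch;
        the endpoint path and payload are hypothetical, the environment variables
        are the ones exported below)::

            import os
            import requests

            backend = os.environ["INSFORGE_BACKEND_URL"]
            headers = {"Authorization": f"Bearer {os.environ['INSFORGE_API_KEY']}"}
            # Seed whatever state the task needs before the agent runs, e.g.:
            resp = requests.post(
                f"{backend}/api/example-setup",  # hypothetical endpoint
                headers=headers,
                json={"example": True},
            )
            resp.raise_for_status()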
Args: task: Task for which to prepare environment Returns: True if script ran successfully, False if script doesn't exist """ task_dir = task.task_instruction_path.parent prepare_script = task_dir / "prepare_environment.py" if not prepare_script.exists(): logger.debug(f"No prepare_environment.py found for task {task.name}") return False logger.info(f"| Running prepare_environment.py for task {task.name}") # Set up environment variables for the script env = os.environ.copy() env.update({ "INSFORGE_BACKEND_URL": self.backend_url, "INSFORGE_API_KEY": self.api_key, }) try: # Run the prepare_environment.py script result = subprocess.run( [sys.executable, str(prepare_script)], cwd=str(task_dir), # Run from task directory env=env, capture_output=True, text=True, timeout=300, # 5 minute timeout ) if result.returncode == 0: logger.info(f"| ✓ Environment preparation completed for {task.name}") if result.stdout.strip(): logger.debug(f"| prepare_environment.py output: {result.stdout}") return True else: logger.error(f"| ✗ Environment preparation failed for {task.name}") logger.error(f"| Error output: {result.stderr}") raise RuntimeError(f"prepare_environment.py failed with exit code {result.returncode}") except subprocess.TimeoutExpired: logger.error(f"✗ Environment preparation timed out for {task.name}") raise RuntimeError("prepare_environment.py execution timed out") except Exception as e: logger.error(f"✗ Failed to run prepare_environment.py for {task.name}: {e}") raise def _get_timestamp(self) -> str: """Get timestamp for unique naming.""" from datetime import datetime return datetime.now().strftime("%Y%m%d%H%M%S") def _drop_schema(self, schema_name: str) -> None: """Drop schema and all its contents.""" import psycopg2 from psycopg2 import sql conn_params = { "host": "localhost", "port": 5432, "user": "postgres", "password": "postgres", "database": "insforge", } conn = psycopg2.connect(**conn_params) conn.autocommit = True try: with conn.cursor() as cur: cur.execute( sql.SQL("DROP SCHEMA IF EXISTS {} CASCADE").format( sql.Identifier(schema_name) ) ) logger.debug(f"| Dropped schema: {schema_name}") finally: conn.close() def _create_schema(self, schema_name: str) -> None: """Create empty schema.""" import psycopg2 from psycopg2 import sql conn_params = { "host": "localhost", "port": 5432, "user": "postgres", "password": "postgres", "database": "insforge", } conn = psycopg2.connect(**conn_params) conn.autocommit = True try: with conn.cursor() as cur: cur.execute( sql.SQL("CREATE SCHEMA {}").format(sql.Identifier(schema_name)) ) logger.debug(f"| Created schema: {schema_name}") finally: conn.close() def _get_all_tables(self) -> List[Dict[str, str]]: """Get list of all user tables. 
Returns: List of dicts with 'schema' and 'name' keys """ import psycopg2 conn_params = { "host": "localhost", "port": 5432, "user": "postgres", "password": "postgres", "database": "insforge", } conn = psycopg2.connect(**conn_params) try: with conn.cursor() as cur: cur.execute(""" SELECT table_schema, table_name FROM information_schema.tables WHERE table_type = 'BASE TABLE' AND table_schema NOT IN ('information_schema', 'pg_catalog') AND table_schema NOT LIKE 'pg_%' AND table_name NOT LIKE '\\_%' ORDER BY table_schema, table_name """) rows = cur.fetchall() return [{"schema": row[0], "name": row[1]} for row in rows] finally: conn.close() def _drop_table(self, schema_name: str, table_name: str) -> None: """Drop a specific table or materialized view.""" import psycopg2 from psycopg2 import sql conn_params = { "host": "localhost", "port": 5432, "user": "postgres", "password": "postgres", "database": "insforge", } conn = psycopg2.connect(**conn_params) conn.autocommit = True try: with conn.cursor() as cur: # Try dropping as table first cur.execute( sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format( sql.Identifier(schema_name), sql.Identifier(table_name) ) ) # Also try dropping as materialized view (in case agent created one) cur.execute( sql.SQL("DROP MATERIALIZED VIEW IF EXISTS {}.{} CASCADE").format( sql.Identifier(schema_name), sql.Identifier(table_name) ) ) logger.debug(f"| Dropped table/view: {schema_name}.{table_name}") finally: conn.close() def _restore_from_backup(self, category_name: str) -> bool: """Restore from backup file. Tables may be restored into public schema or category-specific schema depending on how the backup was created. Args: category_name: Name of category (e.g., 'employees', 'chinook', 'lego') Returns: True if backup was restored, False if no backup exists """ # Path to backup file backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state" backup_file = backup_dir / f"{category_name}.backup" logger.debug(f"| Looking for backup at: {backup_file}") logger.debug(f"| Backup exists: {backup_file.exists()}") if not backup_file.exists(): logger.info(f"| ○ No backup file found: {backup_file}") return False logger.info(f"| Restoring {category_name} from backup...") # Set up environment for pg_restore env = os.environ.copy() env["PGPASSWORD"] = "postgres" try: # Restore backup without schema filter (tables go to whatever schema they're in) result = subprocess.run( [ "pg_restore", "-h", "localhost", "-p", "5432", "-U", "postgres", "-d", "insforge", "-v", str(backup_file), ], env=env, capture_output=True, text=True, timeout=120, # 2 minute timeout ) if result.returncode != 0 and "ERROR" in result.stderr: logger.warning(f"| pg_restore had errors for {category_name}: {result.stderr}") return False logger.info(f"| ✓ {category_name} restored successfully") return True except subprocess.TimeoutExpired: logger.error(f"| ✗ Restore timed out for {category_name}") return False except Exception as e: logger.error(f"| ✗ Failed to restore {category_name}: {e}") return False def get_service_config_for_agent(self) -> dict: """Get configuration for agent execution. This configuration is passed to the agent/MCP server so it can connect to the Insforge backend. 
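        Example return value (illustrative; the API key is redacted here)::

            {
                "backend_url": "https://your-app.insforge.app",
                "api_key": "<api-key>",
                "task_context": {...},  # only present while a task is being evaluated
            }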
Returns: Dictionary containing backend URL and API key """ config = { "backend_url": self.backend_url, "api_key": self.api_key, } # Include current task context if available if self._current_task_context: config["task_context"] = self._current_task_context return config def set_verification_environment(self, messages_path: str = None) -> None: """Set environment variables needed for verification scripts. Args: messages_path: Optional path to messages.json file for verification """ os.environ["INSFORGE_BACKEND_URL"] = self.backend_url os.environ["INSFORGE_API_KEY"] = self.api_key # Set PostgreSQL connection details for direct database verification # (Insforge exposes its internal postgres database for verification) os.environ["POSTGRES_HOST"] = "localhost" os.environ["POSTGRES_PORT"] = "5432" os.environ["POSTGRES_DATABASE"] = "insforge" os.environ["POSTGRES_USERNAME"] = "postgres" os.environ["POSTGRES_PASSWORD"] = "postgres" if messages_path: os.environ["MCP_MESSAGES"] = str(messages_path) logger.debug("Verification environment variables set for Insforge (including direct postgres access)") ================================================ FILE: src/mcp_services/insforge/insforge_task_manager.py ================================================ """ Insforge Task Manager for MCPMark =================================== Manages Insforge task discovery, execution, and verification. """ import os import subprocess import sys from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional from src.base.task_manager import BaseTask, BaseTaskManager from src.logger import get_logger logger = get_logger(__name__) @dataclass class InsforgeTask(BaseTask): """Insforge-specific task with backend information.""" task_name: str = "" backend_url: Optional[str] = None api_key: Optional[str] = None class InsforgeTaskManager(BaseTaskManager): """Manages Insforge tasks for MCPMark evaluation.""" def __init__(self, tasks_root: Path = None): """Initialize Insforge task manager. 
Args: tasks_root: Path to tasks directory """ if tasks_root is None: tasks_root = Path(__file__).resolve().parents[3] / "tasks" super().__init__( tasks_root, mcp_service="insforge", task_class=InsforgeTask, task_organization="file", # Insforge uses file-based tasks ) def _create_task_from_files( self, category_id: str, task_files_info: Dict[str, Any] ) -> Optional[InsforgeTask]: """Instantiate an `InsforgeTask` from the dictionary returned by `_find_task_files`.""" import json # Check for meta.json meta_path = task_files_info["instruction_path"].parent / "meta.json" final_category_id = category_id task_id = task_files_info["task_id"] if meta_path.exists(): try: with open(meta_path, 'r') as f: meta_data = json.load(f) # Use values from meta.json if available final_category_id = meta_data.get("category_id", category_id) task_id = meta_data.get("task_id", task_id) except Exception as e: logger.warning(f"Failed to load meta.json from {meta_path}: {e}") return InsforgeTask( task_instruction_path=task_files_info["instruction_path"], task_verification_path=task_files_info["verification_path"], service="insforge", category_id=final_category_id, task_id=task_id, task_name=task_files_info["task_id"], ) def _get_verification_command(self, task: InsforgeTask) -> List[str]: """Get verification command with Insforge backend info.""" cmd = [sys.executable, str(task.task_verification_path)] return cmd def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess: """Run verification with Insforge environment.""" env = os.environ.copy() # Pass Insforge connection info to verification script if hasattr(task, "backend_url") and task.backend_url: env["INSFORGE_BACKEND_URL"] = task.backend_url if hasattr(task, "api_key") and task.api_key: env["INSFORGE_API_KEY"] = task.api_key return subprocess.run( self._get_verification_command(task), capture_output=True, text=True, timeout=300, env=env, ) def _format_task_instruction(self, base_instruction: str) -> str: """Add Insforge-specific instructions.""" return ( base_instruction + "\n\nNote: Use Insforge MCP tools to complete this task. The backend connection is already configured." ) ================================================ FILE: src/mcp_services/notion/__init__.py ================================================ """ Notion-specific modules for MCPMark. """ from .notion_task_manager import NotionTaskManager, NotionTask from .notion_state_manager import NotionStateManager __all__ = ["NotionTaskManager", "NotionTask", "NotionStateManager"] ================================================ FILE: src/mcp_services/notion/notion_login_helper.py ================================================ """ Notion Login Helper for MCPMark ================================= This module provides a utility class and CLI script for logging into Notion using Playwright. It saves the authenticated session state to a file, which can be used for subsequent automated tasks. """ import argparse from pathlib import Path from typing import Optional from playwright.sync_api import ( BrowserContext, Page, TimeoutError as PlaywrightTimeoutError, sync_playwright, ) from src.base.login_helper import BaseLoginHelper from src.logger import get_logger # Initialize logger logger = get_logger(__name__) class NotionLoginHelper(BaseLoginHelper): """ Utility helper for logging into Notion using Playwright. 
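    Example (interactive login in a visible browser; a sketch of typical use)::

        with NotionLoginHelper(headless=False, browser="firefox") as helper:
            # login() runs on entry and saves the session state to notion_state.json
            pass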
""" SUPPORTED_BROWSERS = {"chromium", "firefox"} def __init__( self, *, url: Optional[str] = None, headless: bool = True, state_path: Optional[str | Path] = None, browser: str = "firefox", ) -> None: """ Initializes the Notion login helper. Args: url: The Notion URL to open after launching the browser. headless: Whether to run Playwright in headless mode. state_path: The path to save the authenticated session state. browser: The browser engine to use ('chromium' or 'firefox'). """ super().__init__() if browser not in self.SUPPORTED_BROWSERS: raise ValueError( f"Unsupported browser '{browser}'. Supported browsers are: {', '.join(self.SUPPORTED_BROWSERS)}" ) self.url = url or "https://www.notion.so/login" self.headless = headless self.browser_name = browser self.state_path = ( Path(state_path or Path.cwd() / "notion_state.json").expanduser().resolve() ) self._browser_context: Optional[BrowserContext] = None self._playwright = None self._browser = None def login(self) -> BrowserContext: """ Launches a browser, performs login, and saves the session state. """ if self.state_path.exists(): try: self.state_path.unlink() except OSError as e: logger.warning("Unable to remove existing state file: %s", e) if self._playwright is None: self._playwright = sync_playwright().start() browser_type = getattr(self._playwright, self.browser_name) self._browser = browser_type.launch(headless=self.headless) context = self._browser.new_context() page = context.new_page() logger.info("Navigating to Notion URL: %s", self.url) page.goto(self.url, wait_until="load") if self.headless: self._handle_headless_login(context) else: logger.info( "A browser window has been opened. Please complete the Notion login." ) logger.info( "After you see your workspace, return to this terminal and press ." ) initial_url = page.url input() try: page.wait_for_url(lambda u: u != initial_url, timeout=10_000) except PlaywrightTimeoutError: pass # It's okay if the URL doesn't change try: page.wait_for_load_state("domcontentloaded", timeout=5_000) except PlaywrightTimeoutError: pass context.storage_state(path=str(self.state_path)) logger.info("✅ Login successful! Session state saved to %s", self.state_path) self._browser_context = context return context def close(self) -> None: """Closes the underlying browser and Playwright instance.""" if self._browser_context: try: self._browser_context.close() finally: self._browser_context = None if self._browser: try: self._browser.close() finally: self._browser = None if self._playwright: self._playwright.stop() self._playwright = None def _handle_headless_login(self, context: BrowserContext) -> None: """ Guides the user through the login process in headless mode. 
""" page: Page = context.pages[0] login_url = "https://www.notion.so/login" page.goto(login_url, wait_until="domcontentloaded") email = input("Enter your Notion email address: ").strip() try: email_input = page.locator( 'input[placeholder="Enter your email address..."]' ) email_input.wait_for(state="visible", timeout=120_000) email_input.fill(email) email_input.press("Enter") except PlaywrightTimeoutError: raise RuntimeError("Timed out waiting for the email input field.") except Exception: page.get_by_role("button", name="Continue", exact=True).click() try: code_input = page.locator('input[placeholder="Enter code"]') code_input.wait_for(state="visible", timeout=120_000) code = input("Enter the verification code from your email: ").strip() code_input.fill(code) code_input.press("Enter") except PlaywrightTimeoutError: raise RuntimeError("Timed out waiting for the verification code input.") except Exception: page.get_by_role("button", name="Continue", exact=True).click() try: page.wait_for_url(lambda url: url != login_url, timeout=180_000) except PlaywrightTimeoutError: logger.warning("Login redirect timed out, but proceeding to save state.") if self.url and self.url != login_url: page.goto(self.url, wait_until="domcontentloaded") def __enter__(self) -> "NotionLoginHelper": self.login() return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() def main(): """Main entry point for the Notion login CLI script.""" parser = argparse.ArgumentParser( description="Authenticate to Notion and generate a session state file.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( "--headless", action="store_true", help="Run the login flow in headless mode (prompts for credentials).", ) parser.add_argument( "--browser", default="firefox", choices=["chromium", "firefox"], help="The browser engine to use for Playwright.", ) args = parser.parse_args() helper = NotionLoginHelper(headless=args.headless, browser=args.browser) with helper: logger.info("Login process completed.") if __name__ == "__main__": main() ================================================ FILE: src/mcp_services/notion/notion_state_manager.py ================================================ """ Notion State Manager for MCPMark ================================= This module handles the duplication and management of Notion initial states Pages for consistent task evaluation using Playwright automation. 
""" import time from pathlib import Path from typing import Optional, Tuple, Dict, Any, Set from notion_client import Client from playwright.sync_api import ( Browser, BrowserContext, Page, Playwright, TimeoutError as PlaywrightTimeoutError, sync_playwright, ) from src.base.state_manager import BaseStateManager, InitialStateInfo from src.base.task_manager import BaseTask from src.logger import get_logger from src.mcp_services.notion.notion_task_manager import NotionTask import re # Initialize logger logger = get_logger(__name__) # Pattern to match orphan pages with "(n)" suffix, e.g., "Title (1)", "Title (2)" ORPHAN_PAGE_PATTERN = re.compile(r".+\s+\(\d+\)$") # Selectors for Notion UI elements PAGE_MENU_BUTTON_SELECTOR = '[data-testid="more-button"], div.notion-topbar-more-button, [aria-label="More"], button[aria-label="More"]' DUPLICATE_MENU_ITEM_SELECTOR = 'text="Duplicate"' DUPLICATE_WITH_CONTENT_SELECTOR = 'text="Duplicate with content"' MOVE_TO_MENU_ITEM_SELECTOR = 'text="Move to"' MOVE_TO_SEARCH_INPUT_SELECTOR = ( 'input[placeholder*="Move page to"], textarea[placeholder*="Move page to"]' ) class NotionStateManager(BaseStateManager): """ Manages the state of Notion initial states using Playwright and the Notion API. """ def __init__( self, source_notion_key: str, eval_notion_key: str, headless: bool = True, browser: str = "firefox", eval_parent_page_title: str = "MCPMark Eval Hub", source_parent_page_title: str = "MCPMark Source Hub", ): """ Initializes the Notion state manager. Args: source_notion_key: The Notion API key for source workspace. eval_notion_key: The Notion API key for evaluation workspace. headless: Whether to run Playwright in headless mode. browser: The browser engine to use ('chromium' or 'firefox'). eval_parent_page_title: Parent page title for evaluation workspace. """ super().__init__(service_name="notion") supported_browsers = {"chromium", "firefox"} if browser not in supported_browsers: raise ValueError( f"Unsupported browser '{browser}'. Supported browsers are: {', '.join(supported_browsers)}" ) self.browser_name = browser # Initialize separate Notion clients with provided keys if not source_notion_key or not eval_notion_key: raise ValueError( "Both source_notion_key and eval_notion_key must be provided to NotionStateManager." ) self.source_notion_client = Client(auth=source_notion_key) self.eval_notion_client = Client(auth=eval_notion_key) self.headless = headless self.state_file = Path("notion_state.json") # Parent page under which duplicated pages should be moved for evaluation self.eval_parent_page_title = eval_parent_page_title # Source hub page that contains all initial-state templates self.source_parent_page_title = source_parent_page_title # Cache resolved parent page IDs to avoid repeated workspace-wide searches self._eval_parent_page_id: Optional[str] = None self._source_hub_page_id: Optional[str] = None # Browser instance management for reuse within session self._playwright: Optional[Playwright] = None self._browser: Optional[Browser] = None self._context: Optional[BrowserContext] = None # Validate initialization if not self.source_notion_client or not self.eval_notion_client: raise ValueError( "Both source_notion_key and eval_notion_key must be provided and valid" ) if not self.state_file.exists(): raise FileNotFoundError( "Authentication state 'notion_state.json' not found. Run the Notion login helper first." 
) logger.info("Notion state manager initialized successfully") # ========================================================================= # Core Template Methods (Required by BaseStateManager) # ========================================================================= def _cleanup_eval_hub_orphans(self) -> None: """Clean up all pages in MCPMark Eval Hub before creating new task state.""" try: parent_page_id = self._ensure_eval_parent_page_id() if not parent_page_id: logger.debug( "| ✗ Parent page '%s' not found in eval workspace, skipping cleanup", self.eval_parent_page_title, ) return # Get all child pages and archive them children = self.eval_notion_client.blocks.children.list( block_id=parent_page_id ) orphan_count = 0 for child in children.get("results", []): if child.get("type") == "child_page": try: self.eval_notion_client.pages.update( page_id=child["id"], archived=True ) orphan_count += 1 logger.debug("| ✓ Archived orphan page: %s", child["id"]) except Exception as e: logger.warning( "| ✗ Failed to archive orphan page %s: %s", child["id"], e ) if orphan_count > 0: logger.info( "| ✓ Cleaned up %d orphan page(s) from MCPMark Eval Hub", orphan_count ) except Exception as e: logger.warning("Orphan cleanup failed (non-critical, continuing): %s", e) # Don't raise exception - allow execution to continue def _cleanup_source_hub_orphans(self, exclude_page_ids: Optional[Set[str]] = None) -> int: """Clean up all orphan pages in source hub matching 'xxx (n)' pattern. Args: exclude_page_ids: Page IDs to exclude from cleanup (e.g., pages currently being operated on) Returns: Number of pages archived """ exclude_page_ids = exclude_page_ids or set() source_hub_id = self._ensure_source_hub_page_id() if not source_hub_id: return 0 orphan_count = 0 next_cursor = None try: while True: kwargs: Dict[str, Any] = {"block_id": source_hub_id} if next_cursor: kwargs["start_cursor"] = next_cursor children = self.source_notion_client.blocks.children.list(**kwargs) for child in children.get("results", []): if child.get("type") != "child_page": continue child_id = child.get("id") if child_id in exclude_page_ids: continue child_title = (child.get("child_page", {}) or {}).get("title", "").strip() # Match "xxx (n)" pattern where n is any digit(s) if ORPHAN_PAGE_PATTERN.match(child_title): try: self.source_notion_client.pages.update( page_id=child_id, archived=True ) orphan_count += 1 logger.info("| ✓ Archived source hub orphan: %s (%s)", child_title, child_id) except Exception as e: logger.warning("| ✗ Failed to archive orphan %s: %s", child_id, e) if not children.get("has_more"): break next_cursor = children.get("next_cursor") if orphan_count > 0: logger.info("| ✓ Cleaned up %d orphan page(s) from source hub", orphan_count) except Exception as e: logger.warning("Source hub orphan cleanup failed (non-critical, continuing): %s", e) return orphan_count def _ensure_eval_parent_page_id(self) -> Optional[str]: """Resolve and cache the evaluation hub parent page ID.""" if self._eval_parent_page_id: return self._eval_parent_page_id try: response = self.eval_notion_client.search( query=self.eval_parent_page_title, filter={"property": "object", "value": "page"}, ) for result in response.get("results", []): props = result.get("properties", {}) title_prop = props.get("title", {}).get("title") or props.get( "Name", {} ).get("title") if not title_prop: continue title = "".join(t.get("plain_text", "") for t in title_prop).strip() if title == self.eval_parent_page_title: self._eval_parent_page_id = result.get("id") break if not 
self._eval_parent_page_id: logger.debug( "| ✗ Eval parent page '%s' not found via search", self.eval_parent_page_title, ) except Exception as e: logger.error( "| ✗ Failed to resolve eval parent page '%s': %s", self.eval_parent_page_title, e, ) return self._eval_parent_page_id def _ensure_source_hub_page_id(self) -> Optional[str]: """Resolve and cache the source hub parent page ID used for initial states.""" if self._source_hub_page_id: return self._source_hub_page_id try: hub_search = self.source_notion_client.search( query=self.source_parent_page_title, filter={"property": "object", "value": "page"}, ) for result in hub_search.get("results", []): props = result.get("properties", {}) title_prop = props.get("title", {}).get("title") or props.get( "Name", {} ).get("title") current_title = "".join( t.get("plain_text", "") for t in (title_prop or []) ).strip() if current_title == self.source_parent_page_title: self._source_hub_page_id = result.get("id") break if not self._source_hub_page_id: logger.error( "| ✗ Source hub page '%s' not found.", self.source_parent_page_title, ) except Exception as e: logger.error( "| ✗ Failed to resolve source hub page '%s': %s", self.source_parent_page_title, e, ) return self._source_hub_page_id def _wait_for_database_ready( self, page_id: str, max_retries: int = 10, retry_delay: int = 2 ) -> bool: """ Wait for the database backend to be ready by checking page accessibility. Args: page_id: The ID of the page to check max_retries: Maximum number of retry attempts retry_delay: Delay between retries in seconds Returns: True if the database is ready, False if timeout """ logger.info("| ○ Starting heartbeat detection for page %s", page_id) for attempt in range(max_retries): try: # Try to retrieve the page from the evaluation workspace result = self.eval_notion_client.pages.retrieve(page_id=page_id) # Check if we got a valid response if result and isinstance(result, dict): # Additional check: try to get page properties if "properties" in result: logger.info( "| ✓ Database backend is ready (attempt %d/%d)", attempt + 1, max_retries ) return True except Exception as e: logger.debug( "| ✗ Database not ready yet (attempt %d/%d): %s", attempt + 1, max_retries, str(e) ) # Wait before next retry if attempt < max_retries - 1: time.sleep(retry_delay) logger.error( "| ✗ Database backend failed to become ready after %d attempts", max_retries ) return False def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: """Create initial state by duplicating Notion page.""" if not isinstance(task, NotionTask): logger.error("Task must be NotionTask for Notion state manager") return None # Clean up any orphan pages in eval hub before creating new state self._cleanup_eval_hub_orphans() # Clean up orphan pages in source hub before duplication self._cleanup_source_hub_orphans() try: initial_state_title = self._category_to_initial_state_title(task.category_id) initial_state_info = self._find_initial_state_by_title(initial_state_title) if not initial_state_info: logger.error( "| ✗ Initial state not found for category '%s' (title: '%s')", task.category_id, initial_state_title, ) return None _, initial_state_url = initial_state_info duplicated_url, duplicated_id = self._duplicate_initial_state_for_task( initial_state_url, task.category_id, task.name ) # Wait for database backend to be ready logger.info("| ○ Checking database backend accessibility for duplicated page...") if not self._wait_for_database_ready(duplicated_id): logger.error( "| ✗ Database backend is not 
accessible after duplication for task %s", task.name ) # Clean up the duplicated page if database is not ready try: self.eval_notion_client.pages.update( page_id=duplicated_id, archived=True ) logger.info("| ✓ Cleaned up inaccessible duplicated page: %s", duplicated_id) except Exception as cleanup_error: logger.error("| ✗ Failed to clean up duplicated page: %s", cleanup_error) raise RuntimeError( f"| ✗ Database backend failed to become ready for duplicated page {duplicated_id}" ) time.sleep(5) # allow the page to fully load return InitialStateInfo( state_id=duplicated_id, state_url=duplicated_url, metadata={ "original_url": initial_state_url, "category": task.category_id, "task_name": task.name, }, ) except Exception as e: logger.error(f"| ✗ Failed to create initial state for {task.name}: {e}") return None def _store_initial_state_info( self, task: BaseTask, state_info: InitialStateInfo ) -> None: """Store initial state information in NotionTask object.""" if isinstance(task, NotionTask): task.duplicated_initial_state_id = state_info.state_id task.duplicated_initial_state_url = state_info.state_url task.original_initial_state_url = state_info.metadata.get("original_url") # Track the duplicated page for cleanup self.track_resource("page", state_info.state_id, state_info.metadata) def _cleanup_task_initial_state(self, task: BaseTask) -> bool: """Clean up initial state for a specific Notion task.""" if not isinstance(task, NotionTask): return True # Nothing to clean up for non-Notion tasks initial_state_id = task.duplicated_initial_state_id if not initial_state_id: logger.warning( "| ✗ No duplicated initial state ID found for task %s, skipping cleanup.", task.name, ) return False try: # Archive the duplicated page self.eval_notion_client.pages.update( page_id=initial_state_id, archived=True ) logger.info("| ✓ Archived page initial state: %s", initial_state_id) # Remove from tracked resources to avoid duplicate cleanup self.tracked_resources = [ r for r in self.tracked_resources if not (r["type"] == "page" and r["id"] == initial_state_id) ] return True except Exception as e: logger.error("| ✗ Failed to archive initial state %s: %s", initial_state_id, e) return False def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool: """Clean up a single Notion resource.""" if resource["type"] == "page": try: self.eval_notion_client.pages.update( page_id=resource["id"], archived=True ) logger.info(f"| ✓ Archived Notion page: {resource['id']}") return True except Exception as e: logger.error(f"| ✗ Failed to archive Notion page {resource['id']}: {e}") return False logger.warning(f"| ? Unknown resource type for cleanup: {resource['type']}") return False # ========================================================================= # Notion API Operations # ========================================================================= def _rename_initial_state_via_api( self, initial_state_id: str, new_title: str ) -> None: """Renames a Notion page using the API.""" try: self.eval_notion_client.pages.update( page_id=initial_state_id, properties={"title": {"title": [{"text": {"content": new_title}}]}}, ) except Exception as e: logger.error("| ✗ Failed to rename page via API: %s", e) # ------------------------------------------------------------------ # Playwright helpers # ------------------------------------------------------------------ def _ensure_browser(self) -> Tuple[Browser, BrowserContext]: """Ensure browser instance is available, reusing existing or creating new. 
Returns: Tuple of (Browser, BrowserContext) """ if self._playwright is None: self._playwright = sync_playwright().start() if self._browser is None: browser_type = getattr(self._playwright, self.browser_name) self._browser = browser_type.launch(headless=self.headless) if self._context is None: self._context = self._browser.new_context( storage_state=str(self.state_file), locale="en-US", ) return self._browser, self._context def close(self) -> None: """Clean up browser resources. Should be called when session ends.""" if self._context: try: # Save storage state before closing self._context.storage_state(path=str(self.state_file)) self._context.close() except Exception: pass self._context = None if self._browser: try: self._browser.close() except Exception: pass self._browser = None if self._playwright: try: self._playwright.stop() except Exception: pass self._playwright = None def _recover_duplicate_via_ui( self, page: Page, original_title: str, *, timeout: int = 30_000, ) -> Optional[str]: """Recover duplicate page URL by navigating via UI when API-based recovery fails. This method navigates to the source hub and locates the duplicate page (e.g., "Title (1)") in the Notion sidebar, then clicks on it to obtain the URL directly from the browser. Args: page: The Playwright page instance original_title: The original page title (without suffix) timeout: Timeout for UI operations in milliseconds Returns: The URL of the duplicate page if found, None otherwise """ try: source_hub_id = self._ensure_source_hub_page_id() if not source_hub_id: logger.warning("| ✗ Cannot resolve source hub for UI-based recovery") return None # Build URL to navigate to source hub # Format: https://www.notion.so/ clean_hub_id = source_hub_id.replace("-", "") source_hub_url = f"https://www.notion.so/{clean_hub_id}" logger.info("| ○ Navigating to source hub for UI-based recovery...") page.goto(source_hub_url, wait_until="domcontentloaded", timeout=60_000) time.sleep(3) # Allow page to settle # Look for page title with "(n)" suffix pattern in sidebar or page content # The duplicate will be named "Original Title (1)" or similar duplicate_pattern = re.compile(rf"^{re.escape(original_title)}\s*\(\d+\)$") # Try to find the duplicate page in the page list/sidebar # Notion uses different selectors for page links, try common patterns page_link_selectors = [ f'a:has-text("{original_title} (1)")', f'div[data-block-id]:has-text("{original_title} (1)")', f'[role="treeitem"]:has-text("{original_title} (1)")', ] for selector in page_link_selectors: try: locator = page.locator(selector).first if locator.is_visible(timeout=5000): logger.info("| ○ Found duplicate page in UI, clicking...") locator.click() page.wait_for_load_state("domcontentloaded", timeout=timeout) time.sleep(3) recovered_url = page.url logger.info("| ✓ Recovered duplicate URL via UI: %s", recovered_url) return recovered_url except Exception: continue # If specific selectors didn't work, try a broader search try: # Look for any visible text matching the pattern and click it all_text_elements = page.locator(f'text="{original_title} ("') count = all_text_elements.count() if count > 0: for i in range(count): element = all_text_elements.nth(i) text_content = element.text_content() or "" if duplicate_pattern.match(text_content.strip()): logger.info("| ○ Found duplicate via text search, clicking...") element.click() page.wait_for_load_state("domcontentloaded", timeout=timeout) time.sleep(3) recovered_url = page.url logger.info("| ✓ Recovered duplicate URL via UI text search: %s", 
recovered_url) return recovered_url except Exception as e: logger.debug("| ✗ Broad text search failed: %s", e) logger.warning("| ✗ Could not locate duplicate '%s (n)' in UI", original_title) return None except Exception as e: logger.warning("| ✗ UI-based recovery failed: %s", e) return None # ========================================================================= # Playwright Automation Methods # ========================================================================= def _move_current_page_to_env( self, page: Page, *, wait_timeout: int = 60_000 ) -> None: """Moves the currently open page into the designated evaluation parent page. This operation is done via Playwright UI automation because the Notion API does not yet expose a direct "move" endpoint for pages. It relies on the following sequence: 1. Open the page action menu (same selector as duplication). 2. Choose the "Move to" menu item. 3. In the search field that appears (placeholder starts with "Move page to"), type the target parent page title. 4. Click the matching search result to complete the move. """ logger.info( "| ○ Moving duplicated page to evaluation parent '%s'...", self.eval_parent_page_title, ) try: # Step 1: Open the page menu page.wait_for_selector( PAGE_MENU_BUTTON_SELECTOR, state="visible", timeout=30_000 ) page.click(PAGE_MENU_BUTTON_SELECTOR) # Step 2: Select "Move to" page.hover(MOVE_TO_MENU_ITEM_SELECTOR) page.click(MOVE_TO_MENU_ITEM_SELECTOR) # Step 3: Fill the destination title page.wait_for_selector( MOVE_TO_SEARCH_INPUT_SELECTOR, state="visible", timeout=15_000 ) # Ensure focus then type the destination title – using type() triggers # key events Notion relies on for search filtering. search_input = page.locator(MOVE_TO_SEARCH_INPUT_SELECTOR).first search_input.click() search_input.fill("") # Clear any residual text (safety) search_input.type(self.eval_parent_page_title, delay=50) # Step 4: Wait for the search result matching the page title, then click it # Selector for the menu item row – ensure we click the outer container, not a nested
result_selector = ( f'div[role="menuitem"]:has-text("{self.eval_parent_page_title}")' ) page.wait_for_selector( result_selector, state="visible", timeout=wait_timeout ) page.locator(result_selector).first.click(force=True) # Wait for the dialog to disappear – indicates move finished page.wait_for_selector( MOVE_TO_SEARCH_INPUT_SELECTOR, state="detached", timeout=wait_timeout ) # Give Notion a brief moment to process the move time.sleep(3) except PlaywrightTimeoutError as e: logger.error( "| ✗ Playwright timed out while moving page to evaluation parent – move may have failed." ) raise RuntimeError("Playwright timeout during move-to operation") from e except Exception as exc: logger.error("| ✗ Unexpected error during move-to operation: %s", exc) # Propagate the error to allow retry logic at higher level if necessary raise def _category_to_initial_state_title(self, category: str) -> str: """Converts a category name to a capitalized initial state title.""" return " ".join(word.capitalize() for word in category.split("_")) def _extract_initial_state_id_from_url(self, url: str) -> str: """Extracts the initial state ID from a Notion URL.""" slug = url.split("?")[0].split("#")[0].rstrip("/").split("/")[-1] compact = "".join(c for c in slug if c.isalnum()) if len(compact) < 32: raise ValueError(f"Could not parse initial state ID from URL: {url}") compact = compact[-32:] return f"{compact[:8]}-{compact[8:12]}-{compact[12:16]}-{compact[16:20]}-{compact[20:]}" # ========================================================================= # URL and State Utilities # ========================================================================= def _get_slug_base(self, url: str) -> str: """Returns the slug part without its trailing 32-char ID (hyphen separated).""" slug = url.split("?", 1)[0].split("#", 1)[0].rstrip("/").split("/")[-1] match = re.match(r"^(.*)-([0-9a-fA-F]{32})$", slug) if match: return match.group(1) return slug def _is_valid_duplicate_url(self, original_url: str, duplicated_url: str) -> bool: """Checks whether duplicated_url looks like a Notion duplicate (original slug + '-N').""" orig_base = self._get_slug_base(original_url) dup_base = self._get_slug_base(duplicated_url) if not dup_base.startswith(orig_base + "-"): return False suffix = dup_base[len(orig_base) + 1 :] return suffix.isdigit() def _find_initial_state_by_title(self, title: str) -> Optional[Tuple[str, str]]: """Find a child page under the source hub by exact title. Strategy: - Locate the source hub page ("MCPBench Source Hub") via search to get its ID. - List its first-level children via `blocks.children.list`. - Find a `child_page` whose title exactly matches `title`. - Return the page ID and URL (retrieved via `pages.retrieve`). 
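
        The listing step uses the standard notion-client pagination idiom,
        roughly as follows (`client` and `hub_id` are placeholders):

            next_cursor = None
            while True:
                kwargs = {"block_id": hub_id}
                if next_cursor:
                    kwargs["start_cursor"] = next_cursor
                batch = client.blocks.children.list(**kwargs)
                # inspect batch["results"] here
                if not batch.get("has_more"):
                    break
                next_cursor = batch.get("next_cursor")
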
""" try: # 1) Resolve the source hub page once and reuse its ID source_hub_id = self._ensure_source_hub_page_id() if not source_hub_id: return None # 2) List first-level children of the hub page and find exact title match matched_child_id: Optional[str] = None next_cursor = None while True: kwargs = {"block_id": source_hub_id} if next_cursor: kwargs["start_cursor"] = next_cursor children = self.source_notion_client.blocks.children.list(**kwargs) for child in children.get("results", []): if child.get("type") != "child_page": continue # Only consider child pages child_title = (child.get("child_page", {}) or {}).get("title", "").strip() if child_title == title: matched_child_id = child.get("id") break if matched_child_id or not children.get("has_more"): break next_cursor = children.get("next_cursor") if not matched_child_id: logger.debug("| ✗ No child page titled '%s' under '%s'", title, self.source_parent_page_title) return None # 3) Retrieve the page to get its canonical URL try: page_obj = self.source_notion_client.pages.retrieve(page_id=matched_child_id) page_url = page_obj.get("url") except Exception as e: logger.warning("| ✗ Failed to retrieve page URL for '%s' (%s): %s", title, matched_child_id, e) page_url = None if not page_url: # Fall back to returning just the ID if URL couldn't be retrieved logger.debug("| ○ Returning page ID without URL for '%s'", title) return matched_child_id, "" return matched_child_id, page_url except Exception as e: logger.error("| ✗ Error locating initial state '%s' via children listing: %s", title, e) return None # ========================================================================= # Duplication and State Management # ========================================================================= # NOTE: Initial state type detection logic has been removed because all initial states are pages. def _duplicate_current_initial_state( self, page: Page, new_title: Optional[str] = None, *, original_initial_state_id: str, original_initial_state_title: str, wait_timeout: int = 180_000, ) -> str: """Duplicates the currently open Notion initial state using Playwright.""" try: logger.info("| ○ Opening page menu...") page.wait_for_selector( PAGE_MENU_BUTTON_SELECTOR, state="visible", timeout=30_000 ) page.click(PAGE_MENU_BUTTON_SELECTOR) logger.info("| ○ Clicking 'Duplicate'...") page.hover(DUPLICATE_MENU_ITEM_SELECTOR) page.click(DUPLICATE_MENU_ITEM_SELECTOR) original_url = page.url logger.info( "| ○ Waiting for duplicated initial state to load (up to %.1f s)...", wait_timeout / 1000, ) page.wait_for_url(lambda url: url != original_url, timeout=wait_timeout) # wait for the page to fully load time.sleep(5) duplicated_url = page.url # Validate that the resulting URL is a genuine duplicate of the original template. if not self._is_valid_duplicate_url(original_url, duplicated_url): # Sometimes duplication succeeds but UI navigates to parent instead of the new page. # In that case, try to find the most recently created page named exactly " (1)". logger.warning( "| ✗ Duplicate URL pattern mismatch. 
Attempting recovery by searching for latest '%s (1)' page...", original_initial_state_title, ) target_title = f"{original_initial_state_title} (1)" try: # Wait 5 seconds before the first search to allow Notion to index the new page time.sleep(5) attempts = 3 source_hub_id = self._ensure_source_hub_page_id() if not source_hub_id: logger.error( "| ✗ Cannot resolve source hub ID while locating '%s' duplicate.", target_title, ) else: for retry_idx in range(attempts): candidates = [] next_cursor = None while True: kwargs: Dict[str, Any] = {"block_id": source_hub_id} if next_cursor: kwargs["start_cursor"] = next_cursor children = self.source_notion_client.blocks.children.list(**kwargs) for child in children.get("results", []): if child.get("type") != "child_page": continue child_id = child.get("id") if child_id == original_initial_state_id: continue child_title = ( (child.get("child_page", {}) or {}) .get("title", "") .strip() ) if child_title != target_title: continue created_time = child.get("created_time") or child.get( "last_edited_time" ) candidates.append((created_time or "", child_id)) if not children.get("has_more"): break next_cursor = children.get("next_cursor") if candidates: latest_child_id = max(candidates, key=lambda x: x[0])[1] fallback_url = None try: page_obj = self.source_notion_client.pages.retrieve( page_id=latest_child_id ) fallback_url = page_obj.get("url") except Exception as retrieve_error: logger.warning( "| ✗ Failed to resolve URL for duplicate '%s': %s", latest_child_id, retrieve_error, ) if fallback_url: logger.info( "| ○ Navigating directly to latest '%s' duplicate via children list...", target_title, ) page.goto(fallback_url, wait_until="domcontentloaded", timeout=120_000) time.sleep(5) duplicated_url = page.url break if retry_idx < attempts - 1: logger.debug( "| ○ '%s' not visible yet via children listing. Waiting 5s before retry %d/%d...", target_title, retry_idx + 1, attempts - 1, ) time.sleep(5) # Re-validate after attempted recovery if not self._is_valid_duplicate_url(original_url, duplicated_url): # API-based recovery failed, try UI-based recovery as last resort logger.warning( "| ✗ API-based recovery failed. Trying UI-based recovery..." ) ui_recovered_url = self._recover_duplicate_via_ui( page, original_initial_state_title, timeout=wait_timeout, ) if ui_recovered_url and self._is_valid_duplicate_url(original_url, ui_recovered_url): duplicated_url = ui_recovered_url logger.info("| ✓ UI-based recovery successful") else: logger.error( "| ✗ Could not locate a valid '%s' duplicate after all recovery attempts.\n| Original: %s\n| Observed: %s", target_title, original_url, duplicated_url, ) # Attempt to clean up stray duplicate before propagating error. self._cleanup_orphan_duplicate( original_initial_state_id, original_initial_state_title ) raise RuntimeError( "Duplicate URL pattern mismatch – duplication likely failed" ) except Exception as search_exc: logger.error( "| ✗ Failed during recovery search for '%s': %s", target_title, search_exc, ) # Attempt to clean up stray duplicate before propagating error. 
self._cleanup_orphan_duplicate( original_initial_state_id, original_initial_state_title ) raise RuntimeError( "Duplicate URL pattern mismatch – duplication likely failed" ) from search_exc duplicated_initial_state_id = self._extract_initial_state_id_from_url( duplicated_url ) # Always move to evaluation parent self._move_current_page_to_env(page, wait_timeout=wait_timeout) # Rename if new title is provided if new_title: self._rename_initial_state_via_api( duplicated_initial_state_id, new_title ) # verify whether the page is moved to the evaluation parent page try: result = self.eval_notion_client.pages.retrieve( page_id=duplicated_initial_state_id ) if not result or not isinstance(result, dict): logger.error( "| ✗ Playwright move to error: Notion API did not return a valid page dict after move." ) raise RuntimeError( "Playwright move to error: Notion API did not return a valid page dict after move." ) logger.info( "| ✓ Page moved to '%s' successfully.", self.eval_parent_page_title ) except Exception as move_exc: logger.error(f"Playwright move to error: {move_exc}") raise RuntimeError( "Playwright move to error: Notion client failed to retrieve page after move." ) from move_exc return duplicated_initial_state_id except PlaywrightTimeoutError as e: logger.error("Playwright timed out while duplicating initial state.") raise RuntimeError("Playwright timeout during duplication") from e # ========================================================================= # Cleanup and Maintenance # ========================================================================= def _cleanup_orphan_duplicate( self, original_initial_state_id: str, initial_state_title: str, ) -> bool: """Finds and archives a stray duplicate ("orphan") that matches pattern 'Title (n)'. Returns True if at least one orphan duplicate was archived. 
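
        The match rule is the numbered-duplicate pattern; for example, using
        the module-level ORPHAN_PAGE_PATTERN with an illustrative title:

            >>> bool(ORPHAN_PAGE_PATTERN.match("Sprint Board (3)"))
            True
            >>> bool(ORPHAN_PAGE_PATTERN.match("Sprint Board"))
            False
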
""" try: source_hub_id = self._ensure_source_hub_page_id() if not source_hub_id: logger.error( "| ✗ Cannot resolve source hub while cleaning up duplicates for '%s'", initial_state_title, ) return False # Match any numbered duplicate "Title (n)" where n is any digit(s) title_regex = re.compile(rf"^{re.escape(initial_state_title)}\s*\(\d+\)$") archived_any = False next_cursor = None while True: kwargs: Dict[str, Any] = {"block_id": source_hub_id} if next_cursor: kwargs["start_cursor"] = next_cursor children = self.source_notion_client.blocks.children.list(**kwargs) for child in children.get("results", []): if child.get("type") != "child_page": continue dup_id = child.get("id") if dup_id == original_initial_state_id: continue title_plain = ( (child.get("child_page", {}) or {}).get("title", "") ).strip() if not title_regex.match(title_plain): continue # not a numbered duplicate try: self.source_notion_client.pages.update( page_id=dup_id, archived=True ) logger.info("| ✓ Archived orphan duplicate (%s): %s", "page", dup_id) archived_any = True except Exception as exc: logger.warning("| ✗ Failed to archive orphan page %s: %s", dup_id, exc) if not children.get("has_more"): break next_cursor = children.get("next_cursor") return archived_any except Exception as exc: logger.warning( "Error while attempting to cleanup orphan duplicate: %s", exc ) return False def _duplicate_initial_state_for_task( self, initial_state_url: str, category: str, task_name: str, *, max_retries: int = 2, initial_wait_ms: int = 180_000, ) -> Tuple[str, str]: """Duplicates an initial state for a task, with retries for reliability.""" if not self.state_file.exists(): raise FileNotFoundError( "Authentication state 'notion_state.json' not found. " "Run the Notion login helper first." ) last_exc = None for attempt in range(max_retries + 1): wait_timeout = initial_wait_ms * (attempt + 1) page = None try: # Reuse browser instance within session _, context = self._ensure_browser() page = context.new_page() logger.info("| ○ Navigating to initial state for %s...", category) # Start timing from the moment we begin navigating to the initial state page. start_time = time.time() page.goto(initial_state_url, wait_until="domcontentloaded", timeout=120_000) context.storage_state(path=str(self.state_file)) initial_state_id = self._extract_initial_state_id_from_url( initial_state_url ) initial_state_title = self._category_to_initial_state_title( category ) duplicated_id = self._duplicate_current_initial_state( page, new_title=initial_state_title, # Use original initial state name without (1) suffix original_initial_state_id=initial_state_id, original_initial_state_title=initial_state_title, wait_timeout=wait_timeout, ) duplicated_url = page.url # Validate URL pattern again at this higher level (should already be validated inside). context.storage_state(path=str(self.state_file)) # Log how long the whole duplication (navigate → duplicate) took. elapsed = time.time() - start_time logger.info( "| ✓ Initial state duplicated successfully in %.2f seconds (task: %s).", elapsed, task_name, ) return duplicated_url, duplicated_id except Exception as e: # No additional cleanup here—handled inside _duplicate_current_template. last_exc = e if attempt < max_retries: logger.warning( "| ✗ Duplication attempt %d failed: %s. 
Retrying...", attempt + 1, e, ) time.sleep(120 * attempt + 120) finally: # Close the page to prevent accumulation within reused context if page: try: page.close() except Exception: pass raise RuntimeError( f"Initial state duplication failed for task '{task_name}' after {max_retries + 1} attempts: {last_exc}" ) def get_service_config_for_agent(self) -> dict: """ Get service-specific configuration for agent execution. Returns: Dictionary containing configuration needed by the agent/MCP server """ from src.config.config_schema import ConfigRegistry # Get the eval_api_key from config registry config = ConfigRegistry.get_config("notion").get_all() service_config = {} if "eval_api_key" in config: service_config["notion_key"] = config["eval_api_key"] return service_config ================================================ FILE: src/mcp_services/notion/notion_task_manager.py ================================================ """ Notion Task Manager for MCPMark Evaluation Pipeline ==================================================== This module provides utilities for discovering, filtering, and managing evaluation tasks within the MCPMark project structure for Notion service. The task manager is responsible for: - Task discovery and filtering - Task verification and result processing - Task-specific logic (NOT LLM execution) """ import sys from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional from src.base.task_manager import BaseTask, BaseTaskManager from src.logger import get_logger logger = get_logger(__name__) @dataclass class NotionTask(BaseTask): """Represents a single evaluation task for Notion service.""" # Additional Notion-specific fields # A human-readable slug for the task directory (e.g. "employee_onboarding") task_name: str = "" original_initial_state_url: Optional[str] = None duplicated_initial_state_url: Optional[str] = None duplicated_initial_state_id: Optional[str] = None def __post_init__(self): # Ensure base class fields are set if not provided if ( not hasattr(self, "task_instruction_path") or self.task_instruction_path is None ): self.task_instruction_path = self.description_path if ( not hasattr(self, "task_verification_path") or self.task_verification_path is None ): self.task_verification_path = self.verify_path @property def description_path(self) -> Path: """Alias for task_instruction_path.""" return self.task_instruction_path @property def verify_path(self) -> Path: """Alias for task_verification_path.""" return self.task_verification_path def get_description(self) -> str: """Read and return the task description.""" if self.description_path.exists(): return self.description_path.read_text(encoding="utf-8") return "" class NotionTaskManager(BaseTaskManager): """Manages task discovery, filtering, and verification for Notion-based MCPMark evaluation.""" def __init__(self, tasks_root: Path = None, task_suite: str = "standard"): """Initialize with the tasks directory path. 
Args: tasks_root: Path to the tasks directory task_suite: Logical task suite (e.g., 'standard', 'easy') """ if tasks_root is None: tasks_root = Path(__file__).resolve().parents[3] / "tasks" # Call parent constructor super().__init__(tasks_root, mcp_service="notion", task_suite=task_suite) # ========================================================================= # Service-specific implementations for template methods # ========================================================================= # No custom task discovery methods needed; relying entirely on BaseTaskManager defaults. def _get_service_directory_name(self) -> str: """Return the service directory name for Notion.""" return "notion" def _create_task_from_files( self, category_id: str, task_files_info: Dict[str, Any] ) -> Optional[NotionTask]: """Instantiate a `NotionTask` from the dictionary returned by `_find_task_files`.""" import json # Check for meta.json meta_path = task_files_info["instruction_path"].parent / "meta.json" final_category_id = category_id task_id = task_files_info["task_id"] if meta_path.exists(): try: with open(meta_path, 'r') as f: meta_data = json.load(f) # Use values from meta.json if available final_category_id = meta_data.get("category_id", category_id) task_id = meta_data.get("task_id", task_id) except Exception as e: logger.warning(f"Failed to load meta.json from {meta_path}: {e}") return NotionTask( task_instruction_path=task_files_info["instruction_path"], task_verification_path=task_files_info["verification_path"], service="notion", category_id=final_category_id, task_id=task_id, task_name=task_files_info["task_id"], ) def _get_verification_command(self, task: NotionTask) -> List[str]: """Get the verification command for Notion tasks. Notion verification requires the duplicated template ID. """ return [ sys.executable, str(task.task_verification_path), task.duplicated_initial_state_id or "", ] ================================================ FILE: src/mcp_services/playwright/__init__.py ================================================ #!/usr/bin/env python3 """ Playwright MCP Service for MCPMark ================================== This package provides Playwright MCP integration for web automation tasks. """ ================================================ FILE: src/mcp_services/playwright/playwright_login_helper.py ================================================ """ Playwright Login Helper for MCPMark ==================================== This module provides browser session management and authentication utilities for Playwright-based web automation tasks. Handles browser context setup, session persistence, and state management. """ from pathlib import Path from typing import Optional from playwright.sync_api import ( BrowserContext, sync_playwright, ) from src.base.login_helper import BaseLoginHelper from src.logger import get_logger logger = get_logger(__name__) class PlaywrightLoginHelper(BaseLoginHelper): """ Login helper for Playwright web automation tasks. Manages browser contexts, session persistence, and authentication state for web automation scenarios. """ SUPPORTED_BROWSERS = {"chromium", "firefox"} def __init__( self, *, browser: str = "chromium", headless: bool = True, state_path: Optional[str | Path] = None, ) -> None: """ Initialize the Playwright login helper. 
Args: browser: Browser engine to use ('chromium' or 'firefox') headless: Whether to run browser in headless mode state_path: Path to save browser session state """ super().__init__() if browser not in self.SUPPORTED_BROWSERS: raise ValueError( f"Unsupported browser '{browser}'. Supported: {', '.join(self.SUPPORTED_BROWSERS)}" ) self.browser_name = browser self.headless = headless self.state_path = ( Path(state_path or Path.cwd() / "playwright_state.json") .expanduser() .resolve() ) # Browser management self._playwright = None self._browser = None self._browser_context: Optional[BrowserContext] = None logger.info(f"Initialized PlaywrightLoginHelper with {browser} browser") def login(self, **kwargs) -> bool: """ Set up browser context and session state. For most Playwright tasks, this creates a clean browser context that can be used for web automation. More complex authentication can be handled in specific implementations. Returns: bool: True if browser setup successful """ try: # Clean up any existing browser instances self.close() # Start Playwright self._playwright = sync_playwright().start() browser_type = getattr(self._playwright, self.browser_name) self._browser = browser_type.launch(headless=self.headless) # Create browser context context_options = {} # Load existing state if available if self.state_path.exists(): try: context_options["storage_state"] = str(self.state_path) logger.info(f"Loaded browser state from {self.state_path}") except Exception as e: logger.warning(f"Failed to load browser state: {e}") self._browser_context = self._browser.new_context(**context_options) # Save current state self._save_browser_state() logger.info("✅ Browser context setup successful") return True except Exception as e: logger.error(f"Browser setup failed: {e}") self.close() return False def get_browser_context(self) -> Optional[BrowserContext]: """ Get the current browser context. Returns: BrowserContext or None if not initialized """ return self._browser_context def is_authenticated(self) -> bool: """ Check if browser context is ready for use. Returns: bool: True if browser context is available """ return self._browser_context is not None def get_credentials(self) -> dict: """ Get browser configuration for MCP integration. 
Returns: dict: Browser configuration parameters """ return { "browser": self.browser_name, "headless": self.headless, "state_path": str(self.state_path), } def _save_browser_state(self) -> None: """Save current browser state to file.""" if self._browser_context: try: self._browser_context.storage_state(path=str(self.state_path)) logger.debug(f"Browser state saved to {self.state_path}") except Exception as e: logger.warning(f"Failed to save browser state: {e}") def close(self) -> None: """Clean up browser resources.""" if self._browser_context: try: # Save state before closing self._save_browser_state() self._browser_context.close() except Exception as e: logger.warning(f"Error closing browser context: {e}") finally: self._browser_context = None if self._browser: try: self._browser.close() except Exception as e: logger.warning(f"Error closing browser: {e}") finally: self._browser = None if self._playwright: try: self._playwright.stop() except Exception as e: logger.warning(f"Error stopping Playwright: {e}") finally: self._playwright = None ================================================ FILE: src/mcp_services/playwright/playwright_state_manager.py ================================================ """ Playwright State Manager for MCPMark ====================================== This module manages browser contexts and test environments for Playwright-based web automation tasks. Handles browser isolation, test page setup, and cleanup. """ import time from pathlib import Path from typing import Optional, Dict, Any, List from playwright.sync_api import ( BrowserContext, Page, TimeoutError as PlaywrightTimeoutError, ) from src.base.state_manager import BaseStateManager, InitialStateInfo from src.base.task_manager import BaseTask from src.logger import get_logger logger = get_logger(__name__) class PlaywrightStateManager(BaseStateManager): """ Manages browser state and test environments for Playwright tasks. Provides browser context isolation, test page setup, and resource cleanup for web automation evaluation. """ def __init__( self, browser: str = "chromium", headless: bool = True, state_path: Optional[Path] = None, network_origins: str = "*", user_profile: str = "isolated", viewport_width: int = 1280, viewport_height: int = 720, ): """ Initialize Playwright state manager. 
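
        A typical construction for local runs (values are illustrative):

            manager = PlaywrightStateManager(
                browser="chromium",
                headless=True,
                viewport_width=1280,
                viewport_height=720,
            )
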
Args: browser: Browser engine to use ('chromium' or 'firefox') headless: Whether to run browser in headless mode state_path: Path to browser state file network_origins: Allowed network origins (comma-separated or *) user_profile: User profile type (isolated or persistent) viewport_width: Browser viewport width viewport_height: Browser viewport height """ super().__init__(service_name="playwright") self.browser_name = browser self.headless = headless # self.headless = False self.state_path = state_path or Path.cwd() / "playwright_state.json" self.network_origins = network_origins self.user_profile = user_profile self.viewport_width = viewport_width self.viewport_height = viewport_height # Browser management self._playwright = None self._browser = None self._current_context: Optional[BrowserContext] = None # Task-specific tracking self._current_task_pages: List[Page] = [] # Test environment URLs for different task categories self.test_environments = { "element_extraction": "https://mcp-eval-website.vercel.app/extraction", "form_interaction": "https://mcp-eval-website.vercel.app/forms/", "web_navigation": "https://mcp-eval-website.vercel.app/navigation", "authentication": "https://mcp-eval-website.vercel.app/auth/turnstile", } logger.info("Playwright state manager initialized") def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: """ Create isolated browser context for task execution. Args: task: Task for which to create browser state Returns: InitialStateInfo with browser context details """ try: logger.info( "| Skipping Playwright browser launch – no initial browser state " "needed for task: %s", task.name, ) # Generate a lightweight identifier to allow resource tracking even # though no real browser context is created. context_id = f"noop_{task.category_id}_{task.task_id}_{int(time.time())}" # We still expose the canonical test URL (if any) because some # consumers add it to the task metadata. test_url = self.test_environments.get(task.category_id) # Record a dummy resource so cleanup logic remains symmetrical. 
self.track_resource( "browser_context", context_id, { "task_name": task.name, "task_category": task.category_id, "test_url": test_url, }, ) return InitialStateInfo( state_id=context_id, state_url=test_url, metadata={ "browser": self.browser_name, "headless": self.headless, "test_url": test_url, "task_category": task.category_id, }, ) except Exception as e: logger.error(f"Failed to create stub initial state for {task.name}: {e}") return None def _store_initial_state_info( self, task: BaseTask, state_info: InitialStateInfo ) -> None: """Store browser context information in task object.""" if hasattr(task, "__dict__"): task.browser_context_id = state_info.state_id task.test_url = state_info.state_url task.browser_config = state_info.metadata def _cleanup_task_initial_state(self, task: BaseTask) -> bool: """Clean up browser context for specific task.""" try: success = True # Close any open pages if self._current_task_pages: for page in self._current_task_pages: try: page.close() except Exception as e: logger.warning(f"Failed to close page: {e}") success = False self._current_task_pages.clear() # Close browser context if self._current_context: try: self._current_context.close() logger.info("Closed browser context") except Exception as e: logger.error(f"Failed to close browser context: {e}") success = False finally: self._current_context = None return success except Exception as e: logger.error(f"Error during browser cleanup for {task.name}: {e}") return False def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool: """Clean up a single browser resource.""" try: if resource["type"] == "browser_context": # Context cleanup is handled in _cleanup_task_initial_state logger.debug(f"Browser context {resource['id']} marked for cleanup") return True logger.warning(f"Unknown resource type for cleanup: {resource['type']}") return False except Exception as e: logger.error(f"Failed to cleanup resource {resource}: {e}") return False def _get_context_options(self, task: BaseTask) -> Dict[str, Any]: """Get browser context options based on task requirements.""" options = { "viewport": {"width": self.viewport_width, "height": self.viewport_height} } # Load browser state if available if self.state_path.exists(): try: options["storage_state"] = str(self.state_path) except Exception as e: logger.warning(f"Failed to load browser state: {e}") # Task-specific context options if task.category_id == "form_interaction": # Enable form interactions options["permissions"] = ["geolocation"] elif task.category_id == "web_navigation": # Allow navigation between pages options["accept_downloads"] = False return options def _setup_test_environment(self, task: BaseTask) -> Optional[str]: """Set up test environment for task category.""" try: test_url = self.test_environments.get(task.category_id) if not test_url: logger.warning( f"No test environment defined for category: {task.category_id}" ) return None # Create a page and navigate to test environment if self._current_context: page = self._current_context.new_page() # Navigate to test URL to ensure it's accessible page.goto(test_url, wait_until="networkidle", timeout=30000) logger.info(f"Test environment ready: {test_url}") # Track the page for cleanup self._current_task_pages.append(page) # Verify page loaded correctly title = page.title() if title: logger.debug(f"Page loaded with title: {title}") return test_url except PlaywrightTimeoutError: logger.error(f"Timeout loading test environment: {test_url}") except Exception as e: logger.error(f"Failed to setup test 
environment: {e}") return None def get_current_context(self) -> Optional[BrowserContext]: """Get the current browser context.""" return self._current_context def get_test_page(self) -> Optional[Page]: """Get a page for testing (creates new one if needed).""" if self._current_context: try: page = self._current_context.new_page() self._current_task_pages.append(page) return page except Exception as e: logger.error(f"Failed to create test page: {e}") return None def navigate_to_test_url(self, task: BaseTask) -> Optional[Page]: """Navigate to the test URL for a specific task.""" test_url = self.test_environments.get(task.category_id) if not test_url: logger.error(f"No test URL defined for category: {task.category_id}") return None page = self.get_test_page() if page: try: page.goto(test_url, wait_until="networkidle", timeout=30000) logger.info(f"Navigated to test URL: {test_url}") return page except Exception as e: logger.error(f"Failed to navigate to {test_url}: {e}") return None def get_service_config_for_agent(self) -> dict: """ Get service-specific configuration for agent execution. Returns: Dictionary containing browser configuration for MCP server """ config = { "browser": self.browser_name, "headless": self.headless, } # Add browser state file if it exists if self.state_path.exists(): config["browser_state"] = str(self.state_path) # Add test environment URLs config["test_environments"] = self.test_environments return config def close_all(self) -> None: """Close all browser resources.""" try: # Close all pages for page in self._current_task_pages: try: page.close() except Exception: pass self._current_task_pages.clear() # Close context if self._current_context: self._current_context.close() self._current_context = None # Close browser if self._browser: self._browser.close() self._browser = None # Stop Playwright if self._playwright: self._playwright.stop() self._playwright = None logger.info("All browser resources closed") except Exception as e: logger.error(f"Error closing browser resources: {e}") def set_verification_environment(self, messages_path: str = None) -> None: """ Set Playwright-specific environment variables for verification scripts. Args: messages_path: Optional path to messages.json file for verification """ import os # Set common MCP_MESSAGES if provided if messages_path: os.environ["MCP_MESSAGES"] = str(messages_path) # Also set PLAYWRIGHT_WORK_DIR to the directory containing messages.json work_dir = str(Path(messages_path).parent) os.environ["PLAYWRIGHT_WORK_DIR"] = work_dir logger.info(f"| Set PLAYWRIGHT_WORK_DIR to: {work_dir}") logger.info(f"| Set MCP_MESSAGES to: {messages_path}") def __del__(self): """Ensure cleanup on deletion.""" self.close_all() ================================================ FILE: src/mcp_services/playwright/playwright_task_manager.py ================================================ """ Playwright Task Manager for MCPMark ==================================== Simple task manager for Playwright MCP tasks. Follows anti-over-engineering principles: keep it simple, do what's needed. 
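
Verification is a plain subprocess call; roughly what `run_verification`
does for each task (paths are illustrative):

    import os
    import subprocess
    import sys

    env = os.environ.copy()
    env["MCP_MESSAGES"] = "./results/task_1/messages.json"
    env["PLAYWRIGHT_WORK_DIR"] = "./results/task_1"
    subprocess.run(
        [sys.executable, "verify.py"],
        capture_output=True, text=True, timeout=90, env=env,
    )
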
""" import sys import os import subprocess from pathlib import Path from typing import List, Dict, Any from src.base.task_manager import BaseTask, BaseTaskManager from src.logger import get_logger logger = get_logger(__name__) class PlaywrightTask(BaseTask): """Playwright-specific task that uses directory name as task name.""" class PlaywrightTaskManager(BaseTaskManager): """Simple task manager for Playwright MCP tasks.""" def __init__(self, tasks_root: Path = None, task_suite: str = "standard"): """Initialize with tasks directory.""" if tasks_root is None: tasks_root = Path(__file__).resolve().parents[3] / "tasks" super().__init__( tasks_root, mcp_service="playwright", task_class=PlaywrightTask, task_organization="directory", task_suite=task_suite, ) def _create_task_from_files( self, category_id: str, task_files_info: Dict[str, Any] ) -> PlaywrightTask: """Instantiate a `PlaywrightTask` from the dictionary returned by `_find_task_files`.""" import json # Check for meta.json meta_path = task_files_info["instruction_path"].parent / "meta.json" final_category_id = category_id task_id = task_files_info["task_id"] if meta_path.exists(): try: with open(meta_path, 'r') as f: meta_data = json.load(f) # Use values from meta.json if available final_category_id = meta_data.get("category_id", category_id) task_id = meta_data.get("task_id", task_id) except Exception as e: logger.warning(f"Failed to load meta.json from {meta_path}: {e}") return PlaywrightTask( task_instruction_path=task_files_info["instruction_path"], task_verification_path=task_files_info["verification_path"], service="playwright", category_id=final_category_id, task_id=task_id, ) def _get_verification_command(self, task: BaseTask) -> List[str]: """Get verification command - just run the verify.py script.""" return [sys.executable, str(task.task_verification_path)] def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess: """Run verification with Playwright-specific environment.""" env = os.environ.copy() # Pass messages.json path and working directory to verification script messages_path = os.getenv("MCP_MESSAGES") work_dir = os.getenv("PLAYWRIGHT_WORK_DIR") if messages_path: env["MCP_MESSAGES"] = messages_path logger.debug(f"Setting MCP_MESSAGES to: {messages_path}") if work_dir: env["PLAYWRIGHT_WORK_DIR"] = work_dir logger.debug(f"Setting PLAYWRIGHT_WORK_DIR to: {work_dir}") return subprocess.run( self._get_verification_command(task), capture_output=True, text=True, timeout=90, env=env, ) def _format_task_instruction(self, base_instruction: str) -> str: """Add Playwright-specific note to instructions.""" return ( base_instruction + "\n\nUse Playwright MCP tools to complete this web automation task." ) ================================================ FILE: src/mcp_services/playwright_webarena/playwright_login_helper.py ================================================ """ WebArena (Docker) Login Helper for MCPMark ========================================== This helper exposes basic browser configuration for agents. Authentication is not required for the public WebArena environment; isolation is handled via Docker containerization in the state manager. """ from __future__ import annotations from pathlib import Path from typing import Optional from src.base.login_helper import BaseLoginHelper from src.logger import get_logger logger = get_logger(__name__) class PlaywrightLoginHelper(BaseLoginHelper): """ Minimal login helper. It does not launch browsers; that is handled by the Playwright MCP client. 
It simply exposes configuration parameters such as headless mode and an optional storage state file path. """ def __init__( self, *, browser: str = "chromium", headless: bool = True, state_path: Optional[str | Path] = None, base_url: Optional[str] = None, ) -> None: super().__init__() self.browser_name = browser self.headless = headless self.state_path = ( Path(state_path or Path.cwd() / "playwright_state.json") .expanduser() .resolve() ) self.base_url = base_url logger.info( "Initialized WebArenaLoginHelper (browser=%s, headless=%s)", browser, headless, ) def login(self, **kwargs) -> bool: """ No-op login. For WebArena we don't need credentials; we only provide configuration for the MCP to open a browser. """ logger.info("WebArenaLoginHelper login: no-op") return True def is_authenticated(self) -> bool: return True def get_credentials(self) -> dict: return { "browser": self.browser_name, "headless": self.headless, "state_path": str(self.state_path), "base_url": self.base_url, } def close(self) -> None: # No resources to release pass ================================================ FILE: src/mcp_services/playwright_webarena/playwright_state_manager.py ================================================ """ WebArena (Docker) State Manager for MCPMark =========================================== This module manages a WebArena environment that runs inside a Docker container. It is responsible for starting the container in the initial state phase and stopping/removing it during cleanup. It exposes the target URL (e.g. http://localhost:9999) for Playwright MCP-based automation. """ from __future__ import annotations import socket import subprocess import time from dataclasses import dataclass from pathlib import Path from typing import Optional, Dict, Any from urllib.parse import urlparse import requests from src.base.state_manager import BaseStateManager, InitialStateInfo from src.base.task_manager import BaseTask from src.logger import get_logger logger = get_logger(__name__) @dataclass class DockerConfig: image_name: str = "shopping_admin_final_0719" image_tar_path: Optional[Path] = None container_name: str = "shopping_admin" host_port: int = 7780 container_port: int = 80 readiness_path: str = "/admin" readiness_timeout_seconds: int = 600 readiness_poll_interval_seconds: float = 2.0 @property def base_url(self) -> str: return f"http://localhost:{self.host_port}" class PlaywrightStateManager(BaseStateManager): """ Manage Docker lifecycle for WebArena-backed tasks. - Initial state: ensure image is present (optionally load from tar), then run container and wait until HTTP endpoint is ready. - Cleanup: stop and remove the container. 
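
    The lifecycle roughly mirrors this command sequence (illustrative, using the
    default shopping_admin configuration):

        docker load --input <image>.tar          # only if the image is not present locally
        docker run --name shopping_admin -p 7780:80 -d shopping_admin_final_0719
        # poll http://localhost:7780/admin until it responds, then hand that URL to the agent
        docker stop shopping_admin && docker rm shopping_admin   # cleanup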
""" # Category-specific Docker configurations CATEGORY_CONFIGS = { "reddit": { "image_name": "postmill-populated-exposed-withimg", "container_name": "forum", "host_port": 9999, "readiness_path": "/" }, "shopping": { "image_name": "shopping_final_0712", "container_name": "shopping", "host_port": 7770, "readiness_path": "/" }, "shopping_admin": { "image_name": "shopping_admin_final_0719", "container_name": "shopping_admin", "host_port": 7780, "readiness_path": "/admin" } } def __init__( self, *, docker_image_name: str = "shopping_admin_final_0719", docker_container_name: str = "shopping_admin", host_port: int = 7780, container_port: int = 80, image_tar_path: Optional[str | Path] = None, readiness_path: str = "/admin", readiness_timeout_seconds: int = 600, readiness_poll_interval_seconds: float = 2.0, # Playwright browser config params (ignored by this state manager) browser: Optional[str] = None, headless: Optional[bool] = None, network_origins: Optional[str] = None, user_profile: Optional[str] = None, viewport_width: Optional[int] = None, viewport_height: Optional[int] = None, # Debug mode - skip container cleanup skip_cleanup: bool = False, ) -> None: super().__init__(service_name="playwright_webarena") self.config = DockerConfig( image_name=docker_image_name, image_tar_path=Path(image_tar_path).expanduser().resolve() if image_tar_path else None, container_name=docker_container_name, host_port=host_port, container_port=container_port, readiness_path=readiness_path, readiness_timeout_seconds=readiness_timeout_seconds, readiness_poll_interval_seconds=readiness_poll_interval_seconds, ) self.skip_cleanup = skip_cleanup logger.info( "Initialized WebArenaStateManager (image=%s, container=%s, port=%s, skip_cleanup=%s)", self.config.image_name, self.config.container_name, self.config.host_port, self.skip_cleanup, ) # ---- Helpers --------------------------------------------------------- def _run_cmd( self, args: list[str], *, check: bool = False ) -> subprocess.CompletedProcess: logger.debug("| Running command: %s", " ".join(args)) return subprocess.run( args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=check ) def _image_exists(self, image: str) -> bool: result = self._run_cmd( ["docker", "images", "--format", "{{.Repository}}:{{.Tag}}"] ) lines = [line.strip() for line in result.stdout.splitlines() if line.strip()] # Parse target image (allow optional tag; default latest) if ":" in image: target_repo, target_tag = image.split(":", 1) else: target_repo, target_tag = image, "latest" for repo_tag in lines: if ":" in repo_tag: repo, tag = repo_tag.split(":", 1) else: repo, tag = repo_tag, "latest" if repo == target_repo and tag == target_tag: logger.debug("| Found Docker image %s:%s", repo, tag) return True logger.debug("| Docker image not found: %s:%s", target_repo, target_tag) return False def _load_image_from_tar_if_needed(self) -> None: if self.config.image_tar_path and not self._image_exists( self.config.image_name ): logger.info("| Loading Docker image from tar: %s", self.config.image_tar_path) result = self._run_cmd( ["docker", "load", "--input", str(self.config.image_tar_path)] ) if result.returncode != 0: logger.error("| Failed to load Docker image: %s", result.stderr.strip()) raise RuntimeError(f"docker load failed: {result.stderr}") logger.info("| Docker image loaded") def _stop_and_remove_container(self, name: str) -> None: # Stop (ignore errors if not running) self._run_cmd(["docker", "stop", name]) # Remove (ignore errors if not exists) self._run_cmd(["docker", 
"rm", name]) def _container_is_running(self, name: str) -> bool: result = self._run_cmd( ["docker", "ps", "--filter", f"name=^{name}$", "--format", "{{.Names}}"] ) running = any(line.strip() == name for line in result.stdout.splitlines()) logger.debug("| Container '%s' running: %s", name, running) return running def _port_open(self, host: str, port: int) -> bool: try: with socket.create_connection((host, port), timeout=1.0): return True except OSError: return False def _http_ready(self, url: str) -> bool: try: resp = requests.get(url, timeout=3) return resp.status_code < 500 except Exception: return False def _get_entry_url(self) -> str: base = self.config.base_url.rstrip("/") path = self.config.readiness_path if not path or path == "/": return base return f"{base}{path}" def _wait_until_ready(self) -> bool: deadline = time.time() + self.config.readiness_timeout_seconds base_url = self.config.base_url.rstrip("/") url = self._get_entry_url() # Determine host and port from URL for port checks parsed = urlparse(base_url) host = parsed.hostname or "localhost" port = parsed.port or self.config.host_port # First wait for port to open to avoid long HTTP errors while time.time() < deadline: if self._port_open(host, port): break time.sleep(self.config.readiness_poll_interval_seconds) while time.time() < deadline: if self._http_ready(url): logger.info("| WebArena HTTP endpoint ready: %s", url) return True time.sleep(self.config.readiness_poll_interval_seconds) logger.error("| Timed out waiting for WebArena at %s", url) return False def _wait_for_mysql_ready(self, max_wait_seconds: int = 120) -> bool: """Wait for MySQL to be ready in the container.""" deadline = time.time() + max_wait_seconds while time.time() < deadline: result = self._run_cmd([ "docker", "exec", self.config.container_name, "mysql", "-u", "magentouser", "-pMyPassword", "magentodb", "-e", "SELECT 1;" ]) if result.returncode == 0: logger.info("| MySQL is ready in container %s", self.config.container_name) return True time.sleep(2) logger.warning("| MySQL not ready after %d seconds", max_wait_seconds) return False def _wait_for_magento_ready(self, max_wait_seconds: int = 180) -> bool: """Wait for Magento to be fully initialized.""" deadline = time.time() + max_wait_seconds while time.time() < deadline: # Check if Magento's setup is complete by trying to access config result = self._run_cmd([ "docker", "exec", self.config.container_name, "/var/www/magento2/bin/magento", "config:show", "web/unsecure/base_url" ]) if result.returncode == 0: logger.info("| Magento is ready in container %s", self.config.container_name) return True time.sleep(5) logger.warning("| Magento not ready after %d seconds", max_wait_seconds) return False def _configure_shopping_post_start(self) -> None: """Run Magento-specific steps for shopping container. Waits for services to be ready before configuring. 
""" logger.info("| Running shopping post-start setup") # Wait for MySQL to be ready first if not self._wait_for_mysql_ready(): logger.warning("| MySQL not ready, attempting configuration anyway") # Wait for Magento to be ready if not self._wait_for_magento_ready(): logger.warning("| Magento not ready, attempting configuration anyway") base_url = f"http://localhost:{self.config.host_port}" cmds = [ [ "docker", "exec", self.config.container_name, "/var/www/magento2/bin/magento", "setup:store-config:set", f"--base-url={base_url}", ], [ "docker", "exec", self.config.container_name, "mysql", "-u", "magentouser", "-pMyPassword", "magentodb", "-e", f"UPDATE core_config_data SET value='{base_url}/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');", ], [ "docker", "exec", self.config.container_name, "/var/www/magento2/bin/magento", "cache:flush", ], ] for cmd in cmds: result = self._run_cmd(cmd) if result.returncode != 0: logger.warning( "| Shopping setup step failed (%s): %s", " ".join(cmd), result.stderr.strip(), ) else: logger.debug( "| Shopping setup step ok (%s): %s", " ".join(cmd), result.stdout.strip(), ) def _configure_shopping_admin_post_start(self) -> None: """Run Magento-specific steps for shopping_admin container. Waits for services to be ready before configuring. """ logger.info("| Running shopping_admin post-start setup") # Wait for MySQL to be ready first if not self._wait_for_mysql_ready(): logger.warning("| MySQL not ready, attempting configuration anyway") # Wait for Magento to be ready if not self._wait_for_magento_ready(): logger.warning("| Magento not ready, attempting configuration anyway") base_url = f"http://localhost:{self.config.host_port}" cmds = [ [ "docker", "exec", self.config.container_name, "/var/www/magento2/bin/magento", "setup:store-config:set", f"--base-url={base_url}", ], [ "docker", "exec", self.config.container_name, "mysql", "-u", "magentouser", "-pMyPassword", "magentodb", "-e", f"UPDATE core_config_data SET value='{base_url}/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');", ], [ "docker", "exec", self.config.container_name, "/var/www/magento2/bin/magento", "config:set", "admin/security/password_is_forced", "0", ], [ "docker", "exec", self.config.container_name, "/var/www/magento2/bin/magento", "config:set", "admin/security/password_lifetime", "0", ], [ "docker", "exec", self.config.container_name, "/var/www/magento2/bin/magento", "cache:flush", ], ] for cmd in cmds: result = self._run_cmd(cmd) if result.returncode != 0: logger.warning( "| Shopping_admin setup step failed (%s): %s", " ".join(cmd), result.stderr.strip(), ) else: logger.debug( "| Shopping_admin setup step ok (%s): %s", " ".join(cmd), result.stdout.strip(), ) # ---- BaseStateManager hooks ----------------------------------------- def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: try: # Dynamically update config based on task category if hasattr(task, 'category_id') and task.category_id in self.CATEGORY_CONFIGS: category_config = self.CATEGORY_CONFIGS[task.category_id] logger.info(f"| Using category-specific config for '{task.category_id}': {category_config}") # Update the config with category-specific values self.config.image_name = category_config["image_name"] self.config.container_name = category_config["container_name"] self.config.host_port = category_config["host_port"] self.config.readiness_path = category_config["readiness_path"] # Ensure image exists (load from tar if configured) self._load_image_from_tar_if_needed() # 
Ensure any stale container is gone self._stop_and_remove_container(self.config.container_name) # Run container run_cmd = [ "docker", "run", "--name", self.config.container_name, "-p", f"{self.config.host_port}:{self.config.container_port}", "-d", self.config.image_name, ] print("| Docker run command: ", run_cmd) result = self._run_cmd(run_cmd) if result.returncode != 0: logger.error("| Failed to start container: %s", result.stderr.strip()) return None container_id = result.stdout.strip() logger.info( "| Started container %s (%s)", self.config.container_name, container_id ) # Special handling for shopping and shopping_admin if self.config.container_name == "shopping": self._configure_shopping_post_start() if self.config.container_name == "shopping_admin": self._configure_shopping_admin_post_start() # Wait for readiness if not self._wait_until_ready(): # Cleanup on failure self._stop_and_remove_container(self.config.container_name) return None entry_url = self._get_entry_url() # Track resource for cleanup self.track_resource( "docker_container", self.config.container_name, { "image": self.config.image_name, "host_port": self.config.host_port, "container_port": self.config.container_port, "base_url": entry_url, }, ) # Provide initial state info return InitialStateInfo( state_id=self.config.container_name, state_url=entry_url, metadata={ "docker_image": self.config.image_name, "container_name": self.config.container_name, "host_port": self.config.host_port, "container_port": self.config.container_port, "base_url": entry_url, "category": task.category_id, }, ) except Exception as exc: logger.error("| Failed to create WebArena initial state: %s", exc) return None def _store_initial_state_info( self, task: BaseTask, state_info: InitialStateInfo ) -> None: if hasattr(task, "__dict__"): task.docker_container_name = state_info.state_id task.base_url = state_info.state_url task.docker_metadata = state_info.metadata def _cleanup_task_initial_state(self, task: BaseTask) -> bool: if self.skip_cleanup: logger.info("| Skipping container cleanup (skip_cleanup=True)") logger.info("| Container is still running at: %s", self._get_entry_url()) logger.info( "| To manually stop: docker stop %s && docker rm %s", self.config.container_name, self.config.container_name, ) return True try: self._stop_and_remove_container(self.config.container_name) return True except Exception as exc: logger.error("| Failed to cleanup container for %s: %s", task.name, exc) return False def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool: if self.skip_cleanup: logger.info( "| Skipping resource cleanup for %s (skip_cleanup=True)", resource.get("id"), ) return True try: if resource.get("type") == "docker_container": self._stop_and_remove_container(resource["id"]) return True logger.warning( "| Unknown resource type for cleanup: %s", resource.get("type") ) return False except Exception as exc: logger.error("| Resource cleanup failed: %s", exc) return False def get_service_config_for_agent(self) -> dict: """ Provide configuration to the agent. The key piece is the base URL that agents should navigate to when starting tasks. 
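
        Example return value (illustrative, using the shopping_admin defaults):

            {
                "environment": "webarena-docker",
                "base_url": "http://localhost:7780/admin",
                "docker": {
                    "image": "shopping_admin_final_0719",
                    "container": "shopping_admin",
                    "host_port": 7780,
                    "container_port": 80,
                },
            }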
""" return { "environment": "webarena-docker", "base_url": self._get_entry_url(), "docker": { "image": self.config.image_name, "container": self.config.container_name, "host_port": self.config.host_port, "container_port": self.config.container_port, }, } def close_all(self) -> None: if self.skip_cleanup: logger.info("| Skipping container cleanup in close_all (skip_cleanup=True)") return try: self._stop_and_remove_container(self.config.container_name) except Exception: # Best effort pass def __del__(self) -> None: if not self.skip_cleanup: self.close_all() ================================================ FILE: src/mcp_services/playwright_webarena/playwright_task_manager.py ================================================ """ WebArena Playwright Task Manager for MCPMark ============================================ Simple task manager for WebArena-backed Playwright MCP tasks. """ from __future__ import annotations import sys import os import subprocess from pathlib import Path from typing import List, Dict, Any from src.logger import get_logger from src.base.task_manager import BaseTask, BaseTaskManager logger = get_logger(__name__) class PlaywrightTaskManager(BaseTaskManager): """Task manager for Playwright tasks against a WebArena environment.""" def __init__( self, tasks_root: Path | None = None, base_url: str | None = None, task_suite: str = "standard", ): if tasks_root is None: tasks_root = Path(__file__).resolve().parents[3] / "tasks" super().__init__( tasks_root, mcp_service="playwright_webarena", task_class=BaseTask, task_organization="directory", task_suite=task_suite, ) def _create_task_from_files( self, category_id: str, task_files_info: Dict[str, Any] ) -> BaseTask: import json # Check for meta.json meta_path = task_files_info["instruction_path"].parent / "meta.json" final_category_id = category_id task_id = task_files_info["task_id"] if meta_path.exists(): try: with open(meta_path, 'r') as f: meta_data = json.load(f) # Use values from meta.json if available final_category_id = meta_data.get("category_id", category_id) task_id = meta_data.get("task_id", task_id) except Exception as e: logger.warning(f"Failed to load meta.json from {meta_path}: {e}") task = BaseTask( task_instruction_path=task_files_info["instruction_path"], task_verification_path=task_files_info["verification_path"], service="playwright_webarena", category_id=final_category_id, task_id=task_id, ) return task # NEW: 注入统一前缀(基于 state manager 注入的 task.base_url) def get_task_instruction(self, task: BaseTask) -> str: base_instruction = task.get_task_instruction().strip() base_url = getattr(task, "base_url", None) prefix = f"Navigate to {base_url.rstrip('/')} and complete the following task." # 前缀 + 原始任务说明 return self._format_task_instruction(f"{prefix}\n\n{base_instruction}") def _get_verification_command(self, task: BaseTask) -> List[str]: return [sys.executable, str(task.task_verification_path)] # 将 base_url 通过环境变量传给 verify.py def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess: env = os.environ.copy() base_url = getattr(task, "base_url", None) if base_url: env["WEBARENA_BASE_URL"] = base_url.rstrip("/") return subprocess.run( self._get_verification_command(task), capture_output=True, text=True, timeout=300, env=env, ) def _format_task_instruction(self, base_instruction: str) -> str: note = "Use Playwright MCP tools to complete this task." 
        return (base_instruction + "\n\n" + note + "\n\nNote: Based on your understanding, solve the task all at once by yourself, don't ask for my opinions on anything.")


================================================
FILE: src/mcp_services/playwright_webarena/reddit_env_setup.md
================================================
# WebArena Reddit Environment Setup Guide

This guide describes how to set up the WebArena Reddit environment for Playwright MCP automation testing.

## System Requirements

- Ubuntu 22.04+ or another Linux distribution
- Docker
- At least 50 GB of free disk space
- At least 4 GB of RAM

## Quick Setup Steps

### 1. Download the Reddit Docker Image

WebArena provides 3 image sources; pick whichever is fastest on your network:

```bash
# Option 1: Google Drive (usually fastest)
pip install gdown
gdown 17Qpp1iu_mPqzgO_73Z9BnFjHrzmX9DGf

# Option 2: Archive.org
wget https://archive.org/download/webarena-env-forum-image/postmill-populated-exposed-withimg.tar

# Option 3: CMU server
wget http://metis.lti.cs.cmu.edu/webarena-images/postmill-populated-exposed-withimg.tar
```

### 2. Install Docker (if not already installed)

```bash
sudo apt update
sudo apt install docker.io -y
sudo systemctl start docker
sudo systemctl enable docker
sudo usermod -aG docker $USER
newgrp docker
```

### 3. Start the Reddit Environment

```bash
# Load the Docker image (about 50 GB; this takes a few minutes)
docker load --input postmill-populated-exposed-withimg.tar

# Start the container
docker run --name forum -p 9999:80 -d postmill-populated-exposed-withimg

# Wait for the service to start (about 1-2 minutes)
sleep 120

# Verify service status
docker logs forum | tail -10
curl -I http://localhost:9999
```

### 4. Verify the Environment

Visiting `http://localhost:9999` should show the Postmill forum homepage, including:

- Navigation bar (Forums, Wiki)
- Search box
- Log in / Sign up links
- Forum list (AskReddit, technology, gaming, etc.)

## Port Exposure Strategies

Choose a port exposure strategy that matches your scenario:

### Strategy 1: GCP Firewall Rule (recommended for production)

**Use case**: long-term use, team collaboration, stable public access

```bash
# Install the gcloud CLI (if not already installed)
curl https://sdk.cloud.google.com | bash
exec -l $SHELL

# Authenticate
gcloud auth login

# Create the firewall rule
gcloud compute firewall-rules create allow-reddit-9999 \
  --allow tcp:9999 \
  --source-ranges 0.0.0.0/0 \
  --description "Allow access to WebArena Reddit on port 9999"

# Get the external IP
gcloud compute instances list
```

**Pros**: permanently effective, stable, no extra dependencies
**Cons**: requires GCP permissions; the port is fully open to the public internet

### Strategy 2: ngrok Tunnel (quick sharing)

**Use case**: temporary demos, quick tests, no GCP permissions required

```bash
# Install ngrok
wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz
tar xvzf ngrok-v3-stable-linux-amd64.tgz
sudo mv ngrok /usr/local/bin

# Create the tunnel
ngrok http 9999
```

**Pros**: works immediately, HTTPS support, no server configuration
**Cons**: temporary URL, must keep the process running, free-tier limits

### Strategy 3: Cloudflared Tunnel (free and persistent)

**Use case**: long-term free use, no GCP, stable access required

```bash
# Install cloudflared
wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
sudo mv cloudflared-linux-amd64 /usr/local/bin/cloudflared
sudo chmod +x /usr/local/bin/cloudflared

# Create a temporary tunnel
cloudflared tunnel --url http://localhost:9999

# Or create a permanent tunnel (requires a Cloudflare account)
cloudflared tunnel login
cloudflared tunnel create webarena-reddit
cloudflared tunnel route dns webarena-reddit reddit.yourdomain.com
```

**Pros**: free, persistent, custom domain
**Cons**: requires a Cloudflare account; setup is slightly more involved

### Strategy 4: SSH Port Forwarding (development and debugging)

**Use case**: local development, strict security requirements, internal team access

```bash
# Run on your local machine
ssh -L 8080:localhost:9999 user@your-server-ip

# Then visit http://localhost:8080
```

**Pros**: most secure; no public port needs to be opened
**Cons**: requires SSH access; local use only

## Playwright MCP Testing

Once the environment is up, you can run automated tests with Playwright MCP:

```javascript
// Basic connectivity test
await page.goto('http://your-reddit-url:9999');

// Navigation test
await page.click('text=Forums');
await page.click('text=AskReddit');

// Form interaction test
await page.click('text=Log in');
await page.fill('[placeholder="Username"]', 'testuser');
await page.fill('[placeholder="Password"]', 'testpass');
```

## Troubleshooting

### Container fails to start

```bash
# Check container status
docker ps -a

# Inspect detailed logs
docker logs forum

# Restart the container
docker restart forum
```

### Service not ready

```bash
# Check whether PostgreSQL has fully started
docker logs forum | grep "database system is ready"

# Wait longer (database recovery takes time)
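# (Optional, illustrative) instead of a fixed sleep, poll until the forum responds,
# assuming curl is installed and the forum is mapped to localhost:9999 as above:
#   until curl -sf http://localhost:9999 > /dev/null; do echo "waiting for forum..."; sleep 10; done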
sleep 300
```

### Port already in use

```bash
# Check what is using the port
netstat -tlnp | grep 9999

# Use a different port
docker run --name forum -p 8888:80 -d postmill-populated-exposed-withimg
```

## Environment Reset

Reset the environment after testing:

```bash
# Stop and remove the container
docker stop forum
docker rm forum

# Start it again
docker run --name forum -p 9999:80 -d postmill-populated-exposed-withimg
```

## Advanced Configuration

### Environment Variables (WebArena standard)

```bash
export REDDIT="your-server-hostname:9999"
export REDDIT_URL="http://your-server-hostname:9999"
```

### Batch Task Testing

```bash
# Prepare the WebArena test configuration
mkdir -p ~/.webarena
echo "REDDIT=your-server-hostname:9999" >> ~/.webarena/config
```

---

**Note**: This Reddit environment contains a large amount of pre-populated data and closely simulates real Reddit usage, which makes it well suited for testing complex web automation tasks.


================================================
FILE: src/mcp_services/postgres/__init__.py
================================================
"""
PostgreSQL MCP Service for MCPMark
===================================

This module provides PostgreSQL database integration for MCPMark evaluation.
"""

from .postgres_login_helper import PostgresLoginHelper
from .postgres_state_manager import PostgresStateManager
from .postgres_task_manager import PostgresTaskManager, PostgresTask

__all__ = [
    "PostgresLoginHelper",
    "PostgresStateManager",
    "PostgresTaskManager",
    "PostgresTask",
]


================================================
FILE: src/mcp_services/postgres/postgres_login_helper.py
================================================
"""
PostgreSQL Login Helper for MCPMark
====================================

Handles PostgreSQL authentication and connection validation.
"""

import json
import psycopg2
from pathlib import Path
from typing import Optional, Dict, Any

from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger

logger = get_logger(__name__)


class PostgresLoginHelper(BaseLoginHelper):
    """Handles PostgreSQL authentication and connection validation."""

    def __init__(
        self,
        host: str = "localhost",
        port: int = 5432,
        database: str = "postgres",
        username: str = "postgres",
        password: str = None,
        state_path: Optional[Path] = None,
    ):
        """Initialize PostgreSQL login helper.

        Args:
            host: Database host
            port: Database port
            database: Database name
            username: Database username
            password: Database password
            state_path: Path to save connection state
        """
        super().__init__()
        self.host = host
        self.port = port
        self.database = database
        self.username = username
        self.password = password
        self.state_path = state_path or Path.home() / ".mcpbench" / "postgres_auth.json"

        # Ensure state directory exists
        self.state_path.parent.mkdir(parents=True, exist_ok=True)

    def login(self, **kwargs) -> bool:
        """Test PostgreSQL connection and save state.
Returns: bool: True if connection successful """ try: # Test connection conn = psycopg2.connect( host=self.host, port=self.port, database=self.database, user=self.username, password=self.password, connect_timeout=10, ) # Execute test query with conn.cursor() as cur: cur.execute("SELECT version()") version = cur.fetchone()[0] logger.info(f"PostgreSQL connection successful: {version}") # Check permissions cur.execute( """ SELECT has_database_privilege(%s, 'CREATE') """, (self.database,), ) can_create = cur.fetchone()[0] if not can_create: logger.warning("User does not have CREATE privilege on database") conn.close() # Save connection state self._save_connection_state( { "host": self.host, "port": self.port, "database": self.database, "username": self.username, "version": version, "can_create": can_create, "authenticated_at": self._get_current_timestamp(), } ) return True except psycopg2.Error as e: logger.error(f"PostgreSQL connection failed: {e}") return False except Exception as e: logger.error(f"Unexpected error during PostgreSQL login: {e}") return False def _save_connection_state(self, state: Dict[str, Any]): """Save connection state to file.""" try: # Don't save password safe_state = {k: v for k, v in state.items() if k != "password"} with open(self.state_path, "w") as f: json.dump(safe_state, f, indent=2) # Set restrictive permissions self.state_path.chmod(0o600) logger.info(f"Connection state saved to: {self.state_path}") except Exception as e: logger.error(f"Failed to save connection state: {e}") def _get_current_timestamp(self) -> str: """Get current timestamp in ISO format.""" from datetime import datetime, timezone return datetime.now(timezone.utc).isoformat() def is_connected(self) -> bool: """Check if we can connect to PostgreSQL.""" return self.login() def get_connection_params(self) -> Dict[str, Any]: """Get connection parameters (without password).""" return { "host": self.host, "port": self.port, "database": self.database, "user": self.username, } ================================================ FILE: src/mcp_services/postgres/postgres_state_manager.py ================================================ """ PostgreSQL State Manager for MCPMark ===================================== Manages database state for PostgreSQL tasks including schema setup, test data creation, and cleanup. """ import os import subprocess import sys import psycopg2 from psycopg2 import sql from pathlib import Path from typing import Optional, Dict, Any, List from src.base.state_manager import BaseStateManager, InitialStateInfo from src.base.task_manager import BaseTask from src.logger import get_logger logger = get_logger(__name__) class PostgresStateManager(BaseStateManager): """Manages PostgreSQL database state for task evaluation.""" def __init__( self, host: str = "localhost", port: int = 5432, database: str = "postgres", username: str = "postgres", password: str = None, ): """Initialize PostgreSQL state manager. 
        Args:
            host: Database host
            port: Database port
            database: Main database name
            username: Database username
            password: Database password
        """
        super().__init__(service_name="postgres")
        self.host = host
        self.port = port
        self.database = database
        self.username = username
        self.password = password

        # Connection parameters
        self.conn_params = {
            "host": host,
            "port": port,
            "user": username,
            "password": password,
        }

        # Track created databases for cleanup
        self.created_databases: List[str] = []

        # Track current task database for agent configuration
        self._current_task_database: Optional[str] = None

        # Validate connection on initialization
        try:
            self._test_connection()
            logger.info("PostgreSQL state manager initialized successfully")
            self._setup_database()
        except Exception as e:
            raise RuntimeError(f"PostgreSQL initialization failed: {e}")

    def _test_connection(self):
        """Test database connection."""
        conn = psycopg2.connect(**self.conn_params, database="postgres")
        conn.close()

    def _setup_database(self):
        """Setup all required databases by downloading and restoring from backup."""
        databases = ['employees', 'chinook', 'dvdrental', 'sports', 'lego']

        for db_name in databases:
            if not self._database_exists(db_name):
                logger.info(f"Setting up {db_name} database...")

                # Path to backup file
                backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state"
                backup_file = backup_dir / f"{db_name}.backup"

                # Download backup if not exists
                if not backup_file.exists():
                    backup_dir.mkdir(parents=True, exist_ok=True)
                    logger.info(f"Downloading {db_name} backup...")
                    try:
                        import urllib.request
                        urllib.request.urlretrieve(
                            f'https://storage.mcpmark.ai/postgres/{db_name}.backup',
                            str(backup_file)
                        )
                        logger.info(f"{db_name} backup downloaded")
                    except Exception as e:
                        logger.warning(f"Failed to download {db_name} backup: {e}")
                        continue

                # Create database
                try:
                    self._create_empty_database(db_name)
logger.info(f"Created {db_name} database") except Exception as e: logger.warning(f"Failed to create {db_name} database: {e}") continue # Restore from backup env = os.environ.copy() env['PGPASSWORD'] = self.password try: result = subprocess.run([ 'pg_restore', '-h', str(self.host), '-p', str(self.port), '-U', self.username, '-d', db_name, '-v', str(backup_file) ], env=env, capture_output=True, text=True) if result.returncode != 0 and "ERROR" in result.stderr: logger.warning(f"pg_restore had errors for {db_name}: {result.stderr}") else: logger.info(f"{db_name} database restored successfully") except Exception as e: logger.warning(f"Failed to restore {db_name} database: {e}") else: logger.debug(f"{db_name} database already exists") def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: """Create initial database state for a task.""" try: # Generate unique database name db_name = f"mcpmark_{task.category_id}_{task.task_id}_{self._get_timestamp()}" # Create database from template if exists, otherwise empty if self._database_exists(task.category_id): self._create_database_from_template(db_name, task.category_id) logger.info( f"| Created database '{db_name}' from template '{task.category_id}'" ) else: self._create_empty_database(db_name) logger.info(f"| Created empty database '{db_name}'") # Run prepare_environment.py if it exists self._run_prepare_environment(db_name, task) logger.info(f"| Prepared environment for database '{db_name}'") # Track for cleanup self.created_databases.append(db_name) self.track_resource("database", db_name, {"task": task.name}) return InitialStateInfo( state_id=db_name, state_url=f"postgresql://{self.username}@{self.host}:{self.port}/{db_name}", metadata={ "database": db_name, "category": task.category_id, "task_id": task.task_id, }, ) except Exception as e: logger.error(f"Failed to create initial state for {task.name}: {e}") return None def _store_initial_state_info( self, task: BaseTask, state_info: InitialStateInfo ) -> None: """Store database info in task object.""" if hasattr(task, "__dict__"): task.database_name = state_info.state_id task.database_url = state_info.state_url # Store current task database for agent configuration self._current_task_database = state_info.state_id def _cleanup_task_initial_state(self, task: BaseTask) -> bool: """Clean up task database.""" if hasattr(task, "database_name") and task.database_name: try: self._drop_database(task.database_name) logger.info(f"| Dropped database: {task.database_name}") # Remove from tracking self.created_databases = [ db for db in self.created_databases if db != task.database_name ] # Clear current task database if self._current_task_database == task.database_name: self._current_task_database = None return True except Exception as e: logger.error(f"Failed to drop database {task.database_name}: {e}") return False return True def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool: """Clean up a single PostgreSQL resource.""" if resource["type"] == "database": try: self._drop_database(resource["id"]) logger.info(f"| Dropped database: {resource['id']}") return True except Exception as e: logger.error(f"| Failed to drop database {resource['id']}: {e}") return False return False def _database_exists(self, db_name: str) -> bool: """Check if database exists.""" conn = psycopg2.connect(**self.conn_params, database="postgres") try: with conn.cursor() as cur: cur.execute("SELECT 1 FROM pg_database WHERE datname = %s", (db_name,)) return cur.fetchone() is not None finally: 
conn.close() def _create_database_from_template(self, new_db: str, template_db: str): """Create database from template.""" conn = psycopg2.connect(**self.conn_params, database="postgres") conn.autocommit = True try: with conn.cursor() as cur: cur.execute( sql.SQL(""" SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = %s AND pid <> pg_backend_pid() """), (template_db,), ) cur.execute( sql.SQL("CREATE DATABASE {} WITH TEMPLATE {}").format( sql.Identifier(new_db), sql.Identifier(template_db) ) ) finally: conn.close() def _create_empty_database(self, db_name: str): """Create empty database.""" conn = psycopg2.connect(**self.conn_params, database="postgres") conn.autocommit = True try: with conn.cursor() as cur: cur.execute( sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name)) ) finally: conn.close() def _drop_database(self, db_name: str): """Drop database.""" conn = psycopg2.connect(**self.conn_params, database="postgres") conn.autocommit = True try: with conn.cursor() as cur: # Terminate connections cur.execute( sql.SQL(""" SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = %s AND pid <> pg_backend_pid() """), (db_name,), ) # Drop database cur.execute( sql.SQL("DROP DATABASE IF EXISTS {}").format( sql.Identifier(db_name) ) ) finally: conn.close() def _run_prepare_environment(self, db_name: str, task: BaseTask): """Run prepare_environment.py script if it exists in the task directory.""" # Find the task directory containing prepare_environment.py task_dir = task.task_instruction_path.parent prepare_script = task_dir / "prepare_environment.py" if not prepare_script.exists(): logger.debug(f"No prepare_environment.py found for task {task.name}") return logger.info(f"| Running prepare_environment.py for task {task.name}") # Set up environment variables for the script env = os.environ.copy() env.update({ "POSTGRES_HOST": str(self.host), "POSTGRES_PORT": str(self.port), "POSTGRES_DATABASE": db_name, "POSTGRES_USERNAME": self.username, "POSTGRES_PASSWORD": self.password or "", }) try: # Run the prepare_environment.py script result = subprocess.run( [sys.executable, str(prepare_script)], cwd=str(task_dir), # Run from task directory to access data/ folder env=env, capture_output=True, text=True, timeout=300, # 5 minute timeout ) if result.returncode == 0: logger.info(f"| ✓ Environment preparation completed for {task.name}") if result.stdout.strip(): logger.debug(f"| prepare_environment.py output: {result.stdout}") else: logger.error(f"| ❌ Environment preparation failed for {task.name}") logger.error(f"| Error output: {result.stderr}") raise RuntimeError(f"prepare_environment.py failed with exit code {result.returncode}") except subprocess.TimeoutExpired: logger.error(f"❌ Environment preparation timed out for {task.name}") raise RuntimeError("prepare_environment.py execution timed out") except Exception as e: logger.error(f"❌ Failed to run prepare_environment.py for {task.name}: {e}") raise def _setup_task_specific_data(self, db_name: str, task: BaseTask): """Set up task-specific schema and data.""" conn = psycopg2.connect(**self.conn_params, database=db_name) try: with conn.cursor() as cur: if task.category_id == "basic_queries": self._setup_basic_queries_data(cur) elif task.category_id == "data_manipulation": self._setup_data_manipulation_data(cur) elif task.category_id == "table_operations": self._setup_table_operations_data(cur) # Add more categories as needed conn.commit() except Exception as e: conn.rollback() logger.error(f"Failed to setup task 
data: {e}") raise finally: conn.close() def _setup_basic_queries_data(self, cursor): """Set up data for basic query tasks.""" cursor.execute(""" CREATE TABLE employees ( id SERIAL PRIMARY KEY, name VARCHAR(100) NOT NULL, department VARCHAR(50), salary DECIMAL(10, 2), hire_date DATE ); INSERT INTO employees (name, department, salary, hire_date) VALUES ('John Doe', 'Engineering', 75000.00, '2020-01-15'), ('Jane Smith', 'Marketing', 65000.00, '2019-03-22'), ('Bob Johnson', 'Engineering', 80000.00, '2018-07-01'), ('Alice Brown', 'HR', 55000.00, '2021-02-10'); """) def _setup_data_manipulation_data(self, cursor): """Set up data for data manipulation tasks.""" cursor.execute(""" CREATE TABLE products ( id SERIAL PRIMARY KEY, name VARCHAR(100) NOT NULL, category VARCHAR(50), price DECIMAL(10, 2), stock INTEGER DEFAULT 0 ); CREATE TABLE orders ( id SERIAL PRIMARY KEY, product_id INTEGER REFERENCES products(id), quantity INTEGER NOT NULL, order_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); """) def _setup_table_operations_data(self, cursor): """Set up for table operation tasks.""" # Start with minimal schema that tasks will modify cursor.execute(""" CREATE TABLE test_table ( id SERIAL PRIMARY KEY, data VARCHAR(255) ); """) def _get_timestamp(self) -> str: """Get timestamp for unique naming.""" from datetime import datetime return datetime.now().strftime("%Y%m%d%H%M%S") def get_service_config_for_agent(self) -> dict: """Get configuration for agent execution.""" config = { "host": self.host, "port": self.port, "username": self.username, "password": self.password, } # If there's a current task database, include it if hasattr(self, "_current_task_database") and self._current_task_database: config["current_database"] = self._current_task_database config["database_url"] = ( f"postgresql://{self.username}:{self.password}@{self.host}:{self.port}/{self._current_task_database}" ) else: # Fallback to default database config["database"] = self.database config["database_url"] = ( f"postgresql://{self.username}:{self.password}@{self.host}:{self.port}/{self.database}" ) return config ================================================ FILE: src/mcp_services/postgres/postgres_task_manager.py ================================================ """ PostgreSQL Task Manager for MCPMark ==================================== Manages PostgreSQL task discovery, execution, and verification. """ import os import subprocess import sys from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional from src.base.task_manager import BaseTask, BaseTaskManager from src.logger import get_logger logger = get_logger(__name__) @dataclass class PostgresTask(BaseTask): """PostgreSQL-specific task with database information.""" task_name: str = "" database_name: Optional[str] = None database_url: Optional[str] = None expected_queries: Optional[List[str]] = None expected_tables: Optional[List[str]] = None class PostgresTaskManager(BaseTaskManager): """Manages PostgreSQL tasks for MCPMark evaluation.""" def __init__(self, tasks_root: Path = None, task_suite: str = "standard"): """Initialize PostgreSQL task manager. 
Args: tasks_root: Path to tasks directory task_suite: Logical task suite (e.g., 'standard', 'easy') """ if tasks_root is None: tasks_root = Path(__file__).resolve().parents[3] / "tasks" super().__init__( tasks_root, mcp_service="postgres", task_class=PostgresTask, task_organization="file", # PostgreSQL uses file-based tasks task_suite=task_suite, ) def _create_task_from_files( self, category_id: str, task_files_info: Dict[str, Any] ) -> Optional[PostgresTask]: """Instantiate a `PostgresTask` from the dictionary returned by `_find_task_files`.""" import json # Check for meta.json meta_path = task_files_info["instruction_path"].parent / "meta.json" final_category_id = category_id task_id = task_files_info["task_id"] if meta_path.exists(): try: with open(meta_path, 'r') as f: meta_data = json.load(f) # Use values from meta.json if available final_category_id = meta_data.get("category_id", category_id) task_id = meta_data.get("task_id", task_id) except Exception as e: logger.warning(f"Failed to load meta.json from {meta_path}: {e}") return PostgresTask( task_instruction_path=task_files_info["instruction_path"], task_verification_path=task_files_info["verification_path"], service="postgres", category_id=final_category_id, task_id=task_id, task_name=task_files_info["task_id"], ) def _get_verification_command(self, task: PostgresTask) -> List[str]: """Get verification command with database info.""" cmd = [sys.executable, str(task.task_verification_path)] # Pass database name as argument if available if task.database_name: cmd.append(task.database_name) return cmd def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess: """Run verification with PostgreSQL environment.""" env = os.environ.copy() # Pass database connection info to verification script if hasattr(task, "database_name") and task.database_name: env["POSTGRES_DATABASE"] = task.database_name if hasattr(task, "database_url") and task.database_url: env["DATABASE_URL"] = task.database_url return subprocess.run( self._get_verification_command(task), capture_output=True, text=True, timeout=300, env=env, ) def _format_task_instruction(self, base_instruction: str) -> str: """Add PostgreSQL-specific instructions.""" return ( base_instruction + "\n\nNote: Use PostgreSQL MCP tools to complete this task. The database connection is already configured." ) ================================================ FILE: src/mcp_services/supabase/__init__.py ================================================ """Supabase MCP service integration for MCPMark.""" from .supabase_login_helper import SupabaseLoginHelper from .supabase_state_manager import SupabaseStateManager from .supabase_task_manager import SupabaseTaskManager __all__ = [ "SupabaseLoginHelper", "SupabaseStateManager", "SupabaseTaskManager", ] ================================================ FILE: src/mcp_services/supabase/supabase_login_helper.py ================================================ """ Supabase Login Helper for MCPMark =================================== Handles configuration and validation for Supabase MCP service. """ import os from typing import Dict, Any, Optional from src.base.login_helper import BaseLoginHelper from src.logger import get_logger logger = get_logger(__name__) class SupabaseLoginHelper(BaseLoginHelper): """Login helper for Supabase MCP service. Validates PostgREST API URL and API key configuration. 
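
    Typical environment configuration (illustrative values matching the Supabase CLI
    defaults read by prepare_credentials below; set these in .mcp_env or the shell):

        SUPABASE_API_URL=http://localhost:54321
        SUPABASE_API_KEY=<anon or service_role key from `supabase status`>
        SUPABASE_DB_HOST=localhost
        SUPABASE_DB_PORT=54322
        SUPABASE_DB_USER=postgres
        SUPABASE_DB_PASSWORD=postgres
        SUPABASE_DB_NAME=postgres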
""" def __init__(self): super().__init__("supabase") def prepare_credentials(self) -> Dict[str, Any]: """Prepare credentials for Supabase/PostgREST connection. Returns: Dictionary containing api_url, api_key, and postgres connection details """ # Get PostgREST API configuration (from Supabase CLI) api_url = os.getenv("SUPABASE_API_URL", "http://localhost:54321") api_key = os.getenv("SUPABASE_API_KEY") # Get PostgreSQL connection details (Supabase CLI defaults) postgres_host = os.getenv("SUPABASE_DB_HOST", "localhost") postgres_port = int(os.getenv("SUPABASE_DB_PORT", "54322")) postgres_user = os.getenv("SUPABASE_DB_USER", "postgres") postgres_password = os.getenv("SUPABASE_DB_PASSWORD", "postgres") postgres_database = os.getenv("SUPABASE_DB_NAME", "postgres") if not api_key: logger.warning( "SUPABASE_API_KEY not set.\n" "Run 'supabase status' to get your anon or service_role key.\n" "Set SUPABASE_API_KEY in your .mcp_env file." ) # Try to get it from supabase status api_key = self._get_key_from_supabase_status() return { "api_url": api_url, "api_key": api_key or "", "postgres_host": postgres_host, "postgres_port": postgres_port, "postgres_user": postgres_user, "postgres_password": postgres_password, "postgres_database": postgres_database, } def _get_key_from_supabase_status(self) -> Optional[str]: """Try to get anon key from supabase status command. Returns: Anon key if found, None otherwise """ import subprocess try: result = subprocess.run( ["supabase", "status"], capture_output=True, text=True, timeout=10, ) if result.returncode == 0: # Parse output for anon key for line in result.stdout.split('\n'): if 'anon key:' in line.lower(): # Extract the key after the colon key = line.split(':', 1)[1].strip() logger.info("Found anon key from 'supabase status'") return key except (subprocess.SubprocessError, FileNotFoundError): logger.debug("Could not run 'supabase status' to get anon key") return None def test_credentials(self, credentials: Dict[str, Any]) -> bool: """Test if Supabase credentials are valid. Args: credentials: Dictionary with api_url, api_key, and postgres connection details Returns: True if credentials are valid """ import requests import psycopg2 api_url = credentials["api_url"] api_key = credentials.get("api_key", "") # Test PostgreSQL connection try: conn_params = { "host": credentials["postgres_host"], "port": credentials["postgres_port"], "user": credentials["postgres_user"], "password": credentials["postgres_password"], "database": credentials["postgres_database"], } conn = psycopg2.connect(**conn_params) conn.close() logger.info("✓ PostgreSQL connection successful") except Exception as e: logger.error(f"✗ PostgreSQL connection failed: {e}") return False # Test PostgREST API connection (optional - may not be running yet) try: headers = {} if api_key: headers["apikey"] = api_key headers["Authorization"] = f"Bearer {api_key}" response = requests.get(api_url, headers=headers, timeout=5) # Any response (including 404, 401) means the API is reachable logger.info(f"✓ PostgREST API reachable at {api_url} (status: {response.status_code})") return True except requests.exceptions.ConnectionError: logger.warning( f"⚠ PostgREST API not reachable at {api_url}.\n" "Make sure PostgREST is running (e.g., docker run -p 3000:3000 postgrest/postgrest)\n" "or use a cloud Supabase instance URL." 
) # Still return True as PostgreSQL connection works return True except Exception as e: logger.warning(f"⚠ PostgREST API test failed: {e}") # Still return True as PostgreSQL connection works return True def format_credentials_info(self, credentials: Dict[str, Any]) -> str: """Format credentials info for display. Args: credentials: Dictionary with connection details Returns: Formatted string describing the credentials """ api_url = credentials["api_url"] has_api_key = bool(credentials.get("api_key")) postgres_host = credentials["postgres_host"] postgres_db = credentials["postgres_database"] return ( f"Supabase Configuration:\n" f" API URL: {api_url}\n" f" API Key: {'✓ Configured' if has_api_key else '✗ Not set'}\n" f" PostgreSQL: {postgres_host}/{postgres_db}" ) ================================================ FILE: src/mcp_services/supabase/supabase_state_manager.py ================================================ """ Supabase State Manager for MCPMark ==================================== Manages database state for Supabase tasks using the same PostgreSQL backend as Insforge, but accessed via PostgREST/Supabase MCP server. """ import os import sys import subprocess import psycopg2 from psycopg2 import sql from pathlib import Path from typing import Optional, Dict, Any, List from src.base.state_manager import BaseStateManager, InitialStateInfo from src.base.task_manager import BaseTask from src.logger import get_logger logger = get_logger(__name__) class SupabaseStateManager(BaseStateManager): """Manages Supabase/PostgREST database state for task evaluation. Uses the same PostgreSQL database as Insforge but exposes it via PostgREST API for the Supabase MCP server to access. """ def __init__( self, api_url: str, api_key: str, postgres_host: str = "localhost", postgres_port: int = 54322, # Supabase CLI default port postgres_user: str = "postgres", postgres_password: str = "postgres", postgres_database: str = "postgres", # Supabase CLI default database ): """Initialize Supabase state manager. 
Args: api_url: PostgREST API URL from Supabase CLI (default: http://localhost:54321) api_key: API key from Supabase CLI (anon or service_role key) postgres_host: PostgreSQL host for direct database operations postgres_port: PostgreSQL port (Supabase CLI uses 54322) postgres_user: PostgreSQL username postgres_password: PostgreSQL password postgres_database: Main PostgreSQL database name """ super().__init__(service_name="supabase") self.api_url = api_url.rstrip('/') self.api_key = api_key # PostgreSQL connection for state management (Supabase CLI instance) self.postgres_host = postgres_host self.postgres_port = postgres_port self.postgres_user = postgres_user self.postgres_password = postgres_password self.postgres_database = postgres_database # Track current task context for agent configuration self._current_task_context: Optional[Dict[str, Any]] = None # Validate connection on initialization try: self._test_connection() logger.info("Supabase state manager initialized successfully") except Exception as e: raise RuntimeError(f"Supabase initialization failed: {e}") # Store baseline tables (system tables that exist before any tasks run) self._baseline_tables = set( (t['schema'], t['name']) for t in self._get_all_tables() ) logger.debug(f"Stored baseline: {len(self._baseline_tables)} tables") def _test_connection(self): """Test PostgreSQL connection.""" try: conn_params = { "host": self.postgres_host, "port": self.postgres_port, "user": self.postgres_user, "password": self.postgres_password, "database": self.postgres_database, } conn = psycopg2.connect(**conn_params) conn.close() logger.debug("PostgreSQL connection test successful") except Exception as e: raise RuntimeError(f"Cannot connect to PostgreSQL: {e}") def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]: """Create initial backend state for a task. Restores from backup which may place tables in public or task-specific schema. 
Args: task: Task for which to create initial state Returns: InitialStateInfo object or None if creation failed """ try: # Generate unique state ID for this task run state_id = f"{task.category_id}_{task.task_id}_{self._get_timestamp()}" schema_name = task.category_id logger.info(f"| Creating initial state for Supabase task: {task.name}") # Drop schema first (cleanup from previous runs) self._drop_schema(schema_name) # Get list of existing tables before restore (to track what we create) tables_before = self._get_all_tables() logger.info(f"| Tables before restore: {len(tables_before)}") # Note: Don't create schema here - pg_restore will create it from the backup # Restore from backup if backup exists (may create tables in public or task schema) if self._restore_from_backup(schema_name): logger.info(f"| ✓ Restored '{schema_name}' from backup") else: logger.info(f"| ○ No backup found for '{schema_name}'") # Run prepare_environment.py if it exists task_prepared = self._run_prepare_environment(task) if not task_prepared: logger.debug(f"| No prepare_environment.py found for task {task.name}") # Get list of tables after restore (to track what we need to clean up) tables_after = self._get_all_tables() # Track ALL new tables created by the restore (compare before/after) tables_before_set = {(t['schema'], t['name']) for t in tables_before} created_tables = [ t for t in tables_after if (t['schema'], t['name']) not in tables_before_set ] logger.info(f"| Tracked {len(created_tables)} new tables for cleanup") for t in created_tables: logger.debug(f"| - {t['schema']}.{t['name']}") # Track the task context including created tables context = { "state_id": state_id, "category_id": task.category_id, "task_id": task.task_id, "task_name": task.name, "schema": schema_name, "created_tables": created_tables, } return InitialStateInfo( state_id=state_id, state_url=self.api_url, metadata=context, ) except Exception as e: logger.error(f"Failed to create initial state for {task.name}: {e}") return None def _store_initial_state_info( self, task: BaseTask, state_info: InitialStateInfo ) -> None: """Store backend info in task object for agent access.""" if hasattr(task, "__dict__"): task.api_url = self.api_url task.api_key = self.api_key task.state_id = state_info.state_id # Store current task context for agent configuration self._current_task_context = state_info.metadata def _cleanup_task_initial_state(self, task: BaseTask) -> bool: """Clean up task-specific resources. Drops ALL tables created during task (both setup and agent-created) by comparing against baseline. 
Args: task: Task whose initial state should be cleaned up Returns: True if cleanup successful """ try: logger.info(f"| Cleaning up initial state for task: {task.name}") if self._current_task_context: schema_name = self._current_task_context.get("schema") # Get ALL current tables all_current_tables = self._get_all_tables() # Find tables to drop: anything not in baseline tables_to_drop = [ t for t in all_current_tables if (t['schema'], t['name']) not in self._baseline_tables ] logger.info(f"| Found {len(tables_to_drop)} tables to clean up (setup + agent-created)") # Drop individual tables for table_info in tables_to_drop: try: self._drop_table(table_info["schema"], table_info["name"]) logger.debug(f"| ✓ Dropped table: {table_info['schema']}.{table_info['name']}") except Exception as e: logger.warning(f"| Failed to drop table {table_info}: {e}") # Drop the task schema (may be empty if all tables were in public) if schema_name: try: self._drop_schema(schema_name) logger.info(f"| ✓ Dropped schema: {schema_name}") except Exception as e: logger.warning(f"| Failed to drop schema {schema_name}: {e}") # Clear task context if self._current_task_context.get("task_name") == task.name: self._current_task_context = None logger.info(f"| ✓ Initial state cleanup completed for {task.name}") return True except Exception as e: logger.error(f"Failed to cleanup task initial state for {task.name}: {e}") return False def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool: """Clean up a single tracked resource. Args: resource: Resource dictionary with type, id, and metadata Returns: True if cleanup successful """ resource_type = resource["type"] resource_id = resource["id"] logger.debug(f"| Cleanup for {resource_type} {resource_id} (handled by task scripts)") return True def _run_prepare_environment(self, task: BaseTask) -> bool: """Run prepare_environment.py script if it exists in the task directory. The script should use database operations to set up required state. 
Args: task: Task for which to prepare environment Returns: True if script ran successfully, False if script doesn't exist """ task_dir = task.task_instruction_path.parent prepare_script = task_dir / "prepare_environment.py" if not prepare_script.exists(): logger.debug(f"No prepare_environment.py found for task {task.name}") return False logger.info(f"| Running prepare_environment.py for task {task.name}") # Set up environment variables for the script env = os.environ.copy() env.update({ "SUPABASE_API_URL": self.api_url, "SUPABASE_API_KEY": self.api_key, "POSTGRES_HOST": self.postgres_host, "POSTGRES_PORT": str(self.postgres_port), "POSTGRES_DATABASE": self.postgres_database, "POSTGRES_USERNAME": self.postgres_user, "POSTGRES_PASSWORD": self.postgres_password, }) try: # Run the prepare_environment.py script result = subprocess.run( [sys.executable, str(prepare_script)], cwd=str(task_dir), # Run from task directory env=env, capture_output=True, text=True, timeout=300, # 5 minute timeout ) if result.returncode == 0: logger.info(f"| ✓ Environment preparation completed for {task.name}") if result.stdout.strip(): logger.debug(f"| prepare_environment.py output: {result.stdout}") return True else: logger.error(f"| ✗ Environment preparation failed for {task.name}") logger.error(f"| Error output: {result.stderr}") raise RuntimeError(f"prepare_environment.py failed with exit code {result.returncode}") except subprocess.TimeoutExpired: logger.error(f"✗ Environment preparation timed out for {task.name}") raise RuntimeError("prepare_environment.py execution timed out") except Exception as e: logger.error(f"✗ Failed to run prepare_environment.py for {task.name}: {e}") raise def _get_timestamp(self) -> str: """Get timestamp for unique naming.""" from datetime import datetime return datetime.now().strftime("%Y%m%d%H%M%S") def _drop_schema(self, schema_name: str) -> None: """Drop schema and all its contents.""" conn_params = { "host": self.postgres_host, "port": self.postgres_port, "user": self.postgres_user, "password": self.postgres_password, "database": self.postgres_database, } conn = psycopg2.connect(**conn_params) conn.autocommit = True try: with conn.cursor() as cur: cur.execute( sql.SQL("DROP SCHEMA IF EXISTS {} CASCADE").format( sql.Identifier(schema_name) ) ) logger.debug(f"| Dropped schema: {schema_name}") finally: conn.close() def _create_schema(self, schema_name: str) -> None: """Create empty schema.""" conn_params = { "host": self.postgres_host, "port": self.postgres_port, "user": self.postgres_user, "password": self.postgres_password, "database": self.postgres_database, } conn = psycopg2.connect(**conn_params) conn.autocommit = True try: with conn.cursor() as cur: cur.execute( sql.SQL("CREATE SCHEMA {}").format(sql.Identifier(schema_name)) ) logger.debug(f"| Created schema: {schema_name}") finally: conn.close() def _get_all_tables(self) -> List[Dict[str, str]]: """Get list of all user tables. 
Returns: List of dicts with 'schema' and 'name' keys """ conn_params = { "host": self.postgres_host, "port": self.postgres_port, "user": self.postgres_user, "password": self.postgres_password, "database": self.postgres_database, } conn = psycopg2.connect(**conn_params) try: with conn.cursor() as cur: cur.execute(""" SELECT table_schema, table_name FROM information_schema.tables WHERE table_type = 'BASE TABLE' AND table_schema NOT IN ('information_schema', 'pg_catalog') AND table_schema NOT LIKE 'pg_%' AND table_name NOT LIKE '\\_%' ORDER BY table_schema, table_name """) rows = cur.fetchall() return [{"schema": row[0], "name": row[1]} for row in rows] finally: conn.close() def _drop_table(self, schema_name: str, table_name: str) -> None: """Drop a specific table or materialized view.""" conn_params = { "host": self.postgres_host, "port": self.postgres_port, "user": self.postgres_user, "password": self.postgres_password, "database": self.postgres_database, } conn = psycopg2.connect(**conn_params) conn.autocommit = True try: with conn.cursor() as cur: # Try dropping as table first cur.execute( sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format( sql.Identifier(schema_name), sql.Identifier(table_name) ) ) # Also try dropping as materialized view (in case agent created one) cur.execute( sql.SQL("DROP MATERIALIZED VIEW IF EXISTS {}.{} CASCADE").format( sql.Identifier(schema_name), sql.Identifier(table_name) ) ) logger.debug(f"| Dropped table/view: {schema_name}.{table_name}") finally: conn.close() def _restore_from_backup(self, category_name: str) -> bool: """Restore from backup file. Tables may be restored into public schema or category-specific schema depending on how the backup was created. Args: category_name: Name of category (e.g., 'employees', 'chinook', 'lego') Returns: True if backup was restored, False if no backup exists """ # Path to backup file (same as used by Insforge/Postgres) backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state" backup_file = backup_dir / f"{category_name}.backup" logger.debug(f"| Looking for backup at: {backup_file}") if not backup_file.exists(): logger.info(f"| ○ No backup file found: {backup_file}") return False logger.info(f"| Restoring {category_name} from backup...") # Set up environment for pg_restore env = os.environ.copy() env["PGPASSWORD"] = self.postgres_password try: # Restore backup result = subprocess.run( [ "pg_restore", "-h", self.postgres_host, "-p", str(self.postgres_port), "-U", self.postgres_user, "-d", self.postgres_database, "-v", str(backup_file), ], env=env, capture_output=True, text=True, timeout=120, # 2 minute timeout ) if result.returncode != 0 and "ERROR" in result.stderr: logger.warning(f"| pg_restore had errors for {category_name}: {result.stderr}") return False logger.info(f"| ✓ {category_name} restored successfully") return True except subprocess.TimeoutExpired: logger.error(f"| ✗ Restore timed out for {category_name}") return False except Exception as e: logger.error(f"| ✗ Failed to restore {category_name}: {e}") return False def get_service_config_for_agent(self) -> dict: """Get configuration for agent execution. This configuration is passed to the agent/MCP server so it can connect to the Supabase/PostgREST endpoint. 
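Example (illustrative, assuming the default local Supabase CLI settings documented in src/services.py; the key value is a placeholder)::

    {
        "api_url": "http://localhost:54321",
        "api_key": "<anon-or-service-role-key>",
        "schema": "public",  # replaced by the task's schema when the task context sets one
    }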
Returns: Dictionary containing API URL and API key """ config = { "api_url": self.api_url, "api_key": self.api_key, "schema": "public", # Default schema for PostgREST } # Include current task context if available if self._current_task_context: config["task_context"] = self._current_task_context # If task uses a specific schema, include it if self._current_task_context.get("schema"): config["schema"] = self._current_task_context["schema"] return config def set_verification_environment(self, messages_path: str = None) -> None: """Set environment variables needed for verification scripts. Args: messages_path: Optional path to messages.json file for verification """ os.environ["SUPABASE_API_URL"] = self.api_url os.environ["SUPABASE_API_KEY"] = self.api_key # Set PostgreSQL connection details for direct database verification os.environ["POSTGRES_HOST"] = self.postgres_host os.environ["POSTGRES_PORT"] = str(self.postgres_port) os.environ["POSTGRES_DATABASE"] = self.postgres_database os.environ["POSTGRES_USERNAME"] = self.postgres_user os.environ["POSTGRES_PASSWORD"] = self.postgres_password if messages_path: os.environ["MCP_MESSAGES"] = str(messages_path) logger.debug("Verification environment variables set for Supabase (including direct postgres access)") ================================================ FILE: src/mcp_services/supabase/supabase_task_manager.py ================================================ """ Supabase Task Manager for MCPMark =================================== Manages Supabase task discovery, execution, and verification. Reuses Postgres tasks but accesses them via PostgREST/Supabase MCP. """ import os import subprocess import sys from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional from src.base.task_manager import BaseTask, BaseTaskManager from src.logger import get_logger logger = get_logger(__name__) @dataclass class SupabaseTask(BaseTask): """Supabase-specific task with API information.""" task_name: str = "" api_url: Optional[str] = None api_key: Optional[str] = None class SupabaseTaskManager(BaseTaskManager): """Manages Supabase tasks for MCPMark evaluation. Uses the same task structure as Postgres tasks but accessed via PostgREST/Supabase MCP server. """ def __init__(self, tasks_root: Path = None): """Initialize Supabase task manager. 
Args: tasks_root: Path to tasks directory """ if tasks_root is None: tasks_root = Path(__file__).resolve().parents[3] / "tasks" super().__init__( tasks_root, mcp_service="supabase", task_class=SupabaseTask, task_organization="file", # Supabase uses file-based tasks (like Postgres) ) def _create_task_from_files( self, category_id: str, task_files_info: Dict[str, Any] ) -> Optional[SupabaseTask]: """Instantiate a `SupabaseTask` from the dictionary returned by `_find_task_files`.""" import json # Check for meta.json meta_path = task_files_info["instruction_path"].parent / "meta.json" final_category_id = category_id task_id = task_files_info["task_id"] if meta_path.exists(): try: with open(meta_path, 'r') as f: meta_data = json.load(f) # Use values from meta.json if available final_category_id = meta_data.get("category_id", category_id) task_id = meta_data.get("task_id", task_id) except Exception as e: logger.warning(f"Failed to load meta.json from {meta_path}: {e}") return SupabaseTask( task_instruction_path=task_files_info["instruction_path"], task_verification_path=task_files_info["verification_path"], service="supabase", category_id=final_category_id, task_id=task_id, task_name=task_files_info["task_id"], ) def _get_verification_command(self, task: SupabaseTask) -> List[str]: """Get verification command with Supabase API info.""" cmd = [sys.executable, str(task.task_verification_path)] return cmd def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess: """Run verification with Supabase environment.""" env = os.environ.copy() # Pass Supabase connection info to verification script if hasattr(task, "api_url") and task.api_url: env["SUPABASE_API_URL"] = task.api_url if hasattr(task, "api_key") and task.api_key: env["SUPABASE_API_KEY"] = task.api_key return subprocess.run( self._get_verification_command(task), capture_output=True, text=True, timeout=300, env=env, ) def _format_task_instruction(self, base_instruction: str) -> str: """Add Supabase-specific instructions.""" return ( base_instruction + "\n\nNote: Use Supabase MCP tools (PostgREST) to complete this task. The API connection is already configured." ) ================================================ FILE: src/model_config.py ================================================ #!/usr/bin/env python3 """ Model Configuration for MCPMark ================================ This module provides configuration management for different LLM models, automatically detecting the required API keys and base URLs based on the model name. """ import os from typing import Dict, List from src.logger import get_logger # Initialize logger logger = get_logger(__name__) class ModelConfig: """ Configuration container for a specific model. It loads the necessary API key and base URL from environment variables. 
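Example (illustrative usage; assumes OPENAI_API_KEY is set in the environment):

    >>> cfg = ModelConfig("gpt-4o")
    >>> cfg.litellm_input_model_name
    'openai/gpt-4o'

Model names that are not listed in MODEL_CONFIGS fall back to the default OpenAI configuration and only log a warning.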
""" # Model configuration mapping MODEL_CONFIGS = { # OpenAI models "gpt-4o": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": "openai/gpt-4o", }, "gpt-4.1": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": "openai/gpt-4.1", }, "gpt-4.1-mini": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": "openai/gpt-4.1-mini", }, "gpt-4.1-nano": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": "openai/gpt-4.1-nano", }, "gpt-5.2": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": "openai/gpt-5.2", }, "gpt-5": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": "openai/gpt-5", }, "gpt-5-mini": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": "openai/gpt-5-mini", }, "gpt-5-nano": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": "openai/gpt-5-nano", }, "o3": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": "openai/o3", }, "o4-mini": { "provider": "openai", "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": "openai/o4-mini", }, "gpt-oss-120b": { "provider": "openai", "api_key_var": "OPENROUTER_API_KEY", "litellm_input_model_name": "openrouter/openai/gpt-oss-120b", }, # DeepSeek models "deepseek-v3.2-instruct": { "provider": "deepseek", "api_key_var": "DEEPSEEK_API_KEY", "litellm_input_model_name": "deepseek/deepseek-chat", }, "deepseek-v3.2-thinking": { "provider": "deepseek", "api_key_var": "DEEPSEEK_API_KEY", "litellm_input_model_name": "deepseek/deepseek-reasoner", }, # Anthropic models "claude-3.7-sonnet": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", "litellm_input_model_name": "anthropic/claude-3-7-sonnet-20250219", }, "claude-sonnet-4": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", "litellm_input_model_name": "anthropic/claude-sonnet-4-20250514", }, "claude-sonnet-4.5": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", "litellm_input_model_name": "anthropic/claude-sonnet-4-5-20250929", }, "claude-opus-4": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", "litellm_input_model_name": "anthropic/claude-opus-4-20250514", }, "claude-opus-4.1": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", "litellm_input_model_name": "anthropic/claude-opus-4-1-20250805", }, "claude-opus-4.5": { "provider": "anthropic", "api_key_var": "ANTHROPIC_API_KEY", "litellm_input_model_name": "anthropic/claude-opus-4-5-20251101", }, # Google models "gemini-2.5-pro": { "provider": "google", "api_key_var": "GEMINI_API_KEY", "litellm_input_model_name": "gemini/gemini-2.5-pro", }, "gemini-2.5-flash": { "provider": "google", "api_key_var": "GEMINI_API_KEY", "litellm_input_model_name": "gemini/gemini-2.5-flash", }, "gemini-3-pro": { "provider": "google", "api_key_var": "GEMINI_API_KEY", "litellm_input_model_name": "gemini/gemini-3-pro-preview", }, # Moonshot models "kimi-k2-0711": { "provider": "moonshot", "api_key_var": "MOONSHOT_API_KEY", "litellm_input_model_name": "moonshot/kimi-k2-0711-preview", }, "kimi-k2-0905": { "provider": "moonshot", "api_key_var": "MOONSHOT_API_KEY", "litellm_input_model_name": "moonshot/kimi-k2-0905-preview", }, "kimi-k2-thinking": { "provider": "moonshot", "api_key_var": "OPENROUTER_API_KEY", "litellm_input_model_name": "openrouter/moonshotai/kimi-k2-thinking", }, # Grok models "grok-4": { "provider": "xai", 
"api_key_var": "GROK_API_KEY", "litellm_input_model_name": "xai/grok-4-0709", }, "grok-code-fast-1": { "provider": "xai", "api_key_var": "GROK_API_KEY", "litellm_input_model_name": "xai/grok-code-fast-1", }, # Qwen models "qwen-3-coder-plus": { "provider": "qwen", "api_key_var": "DASHSCOPE_API_KEY", "litellm_input_model_name": "dashscope/qwen3-coder-plus", }, "qwen-3-max": { "provider": "qwen", "api_key_var": "DASHSCOPE_API_KEY", "litellm_input_model_name": "dashscope/qwen3-max-preview", }, # Zhipu "glm-4.5": { "provider": "zhipu", "api_key_var": "OPENROUTER_API_KEY", "litellm_input_model_name": "openrouter/z-ai/glm-4.5", } } def __init__(self, model_name: str): """ Initializes the model configuration. Args: model_name: The name of the model (e.g., 'gpt-4o', 'deepseek-chat'). Raises: ValueError: If the model is not supported or environment variables are missing. """ self.short_model_name = model_name model_info = self._get_model_info(model_name) # Load API key, base URL and LiteLLM model name from environment variables if "base_url_var" in model_info: self.base_url = os.getenv(model_info["base_url_var"]) else: self.base_url = None self.api_key = os.getenv(model_info["api_key_var"]) if not self.api_key: raise ValueError( f"Missing required environment variable: {model_info['api_key_var']}" ) self.litellm_input_model_name = model_info.get("litellm_input_model_name", model_name) def _get_model_info(self, model_name: str) -> Dict[str, str]: """ Retrieves the configuration details for a given model name. For unsupported models, defaults to using OPENAI_BASE_URL and OPENAI_API_KEY. """ if model_name not in self.MODEL_CONFIGS: logger.warning( f"Model '{model_name}' not in supported list. Using default OpenAI configuration." ) # Return default configuration for unsupported models return { "provider": "openai", "api_key_var": "OPENAI_API_KEY", "litellm_input_model_name": model_name, } return self.MODEL_CONFIGS[model_name] @classmethod def get_supported_models(cls) -> List[str]: """Returns a list of all supported model names.""" return list(cls.MODEL_CONFIGS.keys()) def main(): """Example usage of the ModelConfig class.""" logger.info("Supported models: %s", ModelConfig.get_supported_models()) try: # Example: Create a model config for DeepSeek model_config = ModelConfig("deepseek-chat") logger.info("✅ DeepSeek model config created successfully.") logger.info("Short model name: %s", model_config.short_model_name) logger.info("API key loaded: %s", bool(model_config.api_key)) except ValueError as e: logger.error("⚠️ Configuration error: %s", e) if __name__ == "__main__": main() ================================================ FILE: src/results_reporter.py ================================================ #!/usr/bin/env python3 """ Results Reporter for MCPMark Evaluation Pipeline ================================================ This module provides utilities for saving evaluation results in a structured format. """ import json from dataclasses import dataclass from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional from src.logger import get_logger # Initialize logger logger = get_logger(__name__) @dataclass class TaskResult: """ Represents the result of a single task evaluation. Attributes: task_name: The full name of the task (e.g., "category_id__task_id"). success: Whether the task completed successfully. category_id: The task category ID. task_id: The task identifier (number or slug). error_message: Error message from agent execution if it failed. 
verification_error: Error message from verification if it failed. verification_output: Captured stdout from verification script. model_output: Agent conversation trajectory (messages). token_usage: Token usage statistics. turn_count: Number of turns taken during task execution. agent_execution_time: Time for Step 2 (agent execution) in seconds. task_execution_time: Total time for Steps 1-4 in seconds. """ task_name: str success: bool category_id: Optional[str] = None task_id: Optional[str] = None error_message: Optional[str] = None # Agent execution error verification_error: Optional[str] = None # Verification error (separate from agent error) verification_output: Optional[str] = None # Verification stdout/stderr model_output: Optional[Any] = None # Agent conversation trajectory token_usage: Optional[Dict[str, int]] = None # Token usage statistics turn_count: Optional[int] = None # Number of turns taken during task execution agent_execution_time: float = 0.0 # Time for Step 2 (agent execution) in seconds task_execution_time: float = 0.0 # Total time for Steps 1-4 in seconds @property def status(self) -> str: """Returns the status of the task as 'PASS' or 'FAIL'.""" return "PASS" if self.success else "FAIL" @dataclass class EvaluationReport: """Represents a complete evaluation report for a model.""" model_name: str model_config: Dict[str, Any] total_tasks: int successful_tasks: int failed_tasks: int task_results: List[TaskResult] tasks_filter: Optional[str] = None @property def success_rate(self) -> float: """Calculates the overall success rate as a percentage.""" if self.total_tasks == 0: return 0.0 return (self.successful_tasks / self.total_tasks) * 100 @property def total_input_tokens(self) -> int: """Calculate total input tokens across all tasks.""" total = 0 for result in self.task_results: if result.token_usage: total += (result.token_usage.get("input_tokens") or 0) return total @property def total_output_tokens(self) -> int: """Calculate total output tokens across all tasks.""" total = 0 for result in self.task_results: if result.token_usage: total += (result.token_usage.get("output_tokens") or 0) return total @property def total_tokens(self) -> int: """Calculate total tokens across all tasks.""" total = 0 for result in self.task_results: if result.token_usage: total += (result.token_usage.get("total_tokens") or 0) return total @property def total_reasoning_tokens(self) -> int: """Calculate total reasoning tokens across all tasks.""" total = 0 for result in self.task_results: if result.token_usage: total += (result.token_usage.get("reasoning_tokens") or 0) return total @property def avg_input_tokens(self) -> float: """Calculate average input tokens per task.""" if self.total_tasks == 0: return 0.0 return self.total_input_tokens / self.total_tasks @property def avg_output_tokens(self) -> float: """Calculate average output tokens per task.""" if self.total_tasks == 0: return 0.0 return self.total_output_tokens / self.total_tasks @property def avg_total_tokens(self) -> float: """Calculate average total tokens per task.""" if self.total_tasks == 0: return 0.0 return self.total_tokens / self.total_tasks @property def avg_reasoning_tokens(self) -> float: """Calculate average reasoning tokens per task.""" if self.total_tasks == 0: return 0.0 return self.total_reasoning_tokens / self.total_tasks @property def total_task_execution_time(self) -> float: """Calculates the total task execution time from sum of all task execution times.""" # Use sum of individual task execution times instead of 
pipeline wall clock time # This ensures resume functionality shows correct total time return sum(task.task_execution_time for task in self.task_results) @property def total_agent_execution_time(self) -> float: """Calculates the total agent execution time (Step 2) across all tasks.""" return sum(task.agent_execution_time for task in self.task_results) def get_category_stats(self) -> Dict[str, Dict[str, Any]]: """ Calculates and returns success statistics grouped by task category. """ category_stats = {} for result in self.task_results: category = result.category_id or "Uncategorized" if category not in category_stats: category_stats[category] = { "total": 0, "successful": 0, "failed": 0, "success_rate": 0.0, "avg_execution_time": 0.0, "avg_agent_execution_time": 0.0, "total_input_tokens": 0, "total_output_tokens": 0, "total_tokens": 0, "total_reasoning_tokens": 0, "avg_input_tokens": 0.0, "avg_output_tokens": 0.0, "avg_total_tokens": 0.0, "avg_reasoning_tokens": 0.0, "total_turns": 0, "avg_turns": 0.0, } category_stats[category]["total"] += 1 if result.success: category_stats[category]["successful"] += 1 else: category_stats[category]["failed"] += 1 # Add token and turn usage if result.token_usage: category_stats[category]["total_input_tokens"] += ( result.token_usage.get("input_tokens") or 0 ) category_stats[category]["total_output_tokens"] += ( result.token_usage.get("output_tokens") or 0 ) category_stats[category]["total_tokens"] += ( result.token_usage.get("total_tokens") or 0 ) category_stats[category]["total_reasoning_tokens"] += result.token_usage.get( "reasoning_tokens", 0 ) or 0 # Accumulate turns if result.turn_count is not None: category_stats[category]["total_turns"] += result.turn_count # Calculate derived metrics like success rate and average time for category, stats in category_stats.items(): if stats["total"] > 0: stats["success_rate"] = (stats["successful"] / stats["total"]) * 100 category_results = [ r for r in self.task_results if (r.category_id or "Uncategorized") == category ] total_time = sum(r.task_execution_time for r in category_results) stats["avg_execution_time"] = total_time / len(category_results) # Add agent execution time stats total_agent_time = sum(r.agent_execution_time for r in category_results) stats["avg_agent_execution_time"] = total_agent_time / len(category_results) # Calculate average tokens and turns stats["avg_input_tokens"] = stats["total_input_tokens"] / stats["total"] stats["avg_output_tokens"] = ( stats["total_output_tokens"] / stats["total"] ) stats["avg_total_tokens"] = stats["total_tokens"] / stats["total"] stats["avg_reasoning_tokens"] = stats["total_reasoning_tokens"] / stats["total"] stats["avg_turns"] = ( stats["total_turns"] / stats["total"] if stats["total"] > 0 else 0 ) return category_stats class ResultsReporter: """Handles saving evaluation results in structured formats.""" def __init__(self): """Initialize the results reporter.""" pass def save_messages_json(self, messages: Any, output_path: Path) -> Path: """Saves the conversation messages/trajectory as messages.json.""" output_path.parent.mkdir(parents=True, exist_ok=True) with output_path.open("w", encoding="utf-8") as f: json.dump(messages, f, indent=2, ensure_ascii=False) return output_path def save_meta_json( self, task_result: TaskResult, model_config: Dict[str, Any], start_time: datetime, end_time: datetime, output_path: Path, ) -> Path: """Saves task metadata (excluding messages) as meta.json.""" output_path.parent.mkdir(parents=True, exist_ok=True) meta_data = { 
"task_name": task_result.task_name, "model_name": model_config.get("model_name", "unknown"), "litellm_run_model_name": model_config.get("litellm_run_model_name"), "reasoning_effort": model_config.get("reasoning_effort"), "mcp": model_config.get("mcp_service", "unknown"), "timeout": model_config.get("timeout", 300), "time": {"start": start_time.isoformat(), "end": end_time.isoformat()}, "agent_execution_time": task_result.agent_execution_time, "task_execution_time": task_result.task_execution_time, "execution_result": { "success": task_result.success, "error_message": task_result.error_message, "verification_error": task_result.verification_error, "verification_output": task_result.verification_output, }, "token_usage": task_result.token_usage or {}, "turn_count": task_result.turn_count, } with output_path.open("w", encoding="utf-8") as f: json.dump(meta_data, f, indent=2, ensure_ascii=False) return output_path def save_model_summary(self, report: EvaluationReport, output_path: Path) -> Path: """Saves a concise model-level summary.""" output_path.parent.mkdir(parents=True, exist_ok=True) category_stats = report.get_category_stats() # Aggregate turn counts using category_stats total_turns = sum(stats["total_turns"] for stats in category_stats.values()) avg_turns = total_turns / report.total_tasks if report.total_tasks > 0 else 0 summary = { "model_name": report.model_name, "model_config": report.model_config, "total_tasks": report.total_tasks, "successful_tasks": report.successful_tasks, "failed_tasks": report.failed_tasks, "success_rate": round(report.success_rate, 2), "total_task_execution_time": report.total_task_execution_time, "average_task_execution_time": report.total_task_execution_time / report.total_tasks if report.total_tasks > 0 else 0, "total_agent_execution_time": report.total_agent_execution_time, "average_agent_execution_time": report.total_agent_execution_time / report.total_tasks if report.total_tasks > 0 else 0, "token_usage": { "total_input_tokens": report.total_input_tokens, "total_output_tokens": report.total_output_tokens, "total_tokens": report.total_tokens, "total_reasoning_tokens": report.total_reasoning_tokens, "avg_input_tokens": round(report.avg_input_tokens, 2), "avg_output_tokens": round(report.avg_output_tokens, 2), "avg_total_tokens": round(report.avg_total_tokens, 2), "avg_reasoning_tokens": round(report.avg_reasoning_tokens, 2), }, "turn_usage": { "total_turns": total_turns, "avg_turns": round(avg_turns, 2), }, "category_breakdown": { category: { "total": stats["total"], "success_rate": round(stats["success_rate"], 2), "avg_time": round(stats["avg_execution_time"], 2), "token_usage": { "total_input": stats["total_input_tokens"], "total_output": stats["total_output_tokens"], "total": stats["total_tokens"], "total_reasoning": stats["total_reasoning_tokens"], "avg_input": round(stats["avg_input_tokens"], 2), "avg_output": round(stats["avg_output_tokens"], 2), "avg_total": round(stats["avg_total_tokens"], 2), "avg_reasoning": round(stats["avg_reasoning_tokens"], 2), }, "turn_usage": { "total_turns": stats["total_turns"], "avg_turns": round(stats["avg_turns"], 2), }, } for category, stats in category_stats.items() }, } with output_path.open("w", encoding="utf-8") as f: json.dump(summary, f, indent=2, ensure_ascii=False) return output_path ================================================ FILE: src/services.py ================================================ """ Service Definitions for MCPMark ================================ Single source of truth for all MCP 
service configurations. Adding a new service only requires modifying this file. Note: Environment variables are already loaded from .mcp_env when the app starts, so we can reference them directly via the config system. MCP server creation is now handled entirely within src.agent.MCPAgent; therefore, the legacy "mcp_server" and "eval_config" entries in each service definition are deprecated and set to None for backward-compatibility. """ # Service definitions SERVICES = { "notion": { "config_schema": { "source_api_key": { "env_var": "SOURCE_NOTION_API_KEY", "required": True, "description": "Notion API key for source hub with templates", }, "eval_api_key": { "env_var": "EVAL_NOTION_API_KEY", "required": True, "description": "Notion API key for evaluation hub", }, "source_parent_page_title": { "env_var": "SOURCE_PARENT_PAGE_TITLE", "default": "MCPMark Source Hub", "required": False, "description": "Title of the source hub page that contains all initial states", }, "eval_parent_page_title": { "env_var": "EVAL_PARENT_PAGE_TITLE", "required": True, "description": "Title of the parent page in evaluation workspace", }, "playwright_headless": { "env_var": "PLAYWRIGHT_HEADLESS", "default": True, "required": False, "description": "Run browser in headless mode", "transform": "bool", # Will be handled by GenericConfigSchema }, "playwright_browser": { "env_var": "PLAYWRIGHT_BROWSER", "default": "firefox", "required": False, "description": "Browser to use for Playwright", "validator": "in:chromium,firefox,webkit", # Simple validator syntax }, }, "components": { "task_manager": "src.mcp_services.notion.notion_task_manager.NotionTaskManager", "state_manager": "src.mcp_services.notion.notion_state_manager.NotionStateManager", "login_helper": "src.mcp_services.notion.notion_login_helper.NotionLoginHelper", }, "config_mapping": { # Maps config schema keys to class constructor parameters "state_manager": { "source_notion_key": "source_api_key", "eval_notion_key": "eval_api_key", "headless": "playwright_headless", "browser": "playwright_browser", "source_parent_page_title": "source_parent_page_title", "eval_parent_page_title": "eval_parent_page_title", }, "login_helper": { "headless": "playwright_headless", "browser": "playwright_browser", }, }, # MCP server is now instantiated dynamically in MCPAgent; kept for backward # compatibility but set to None to indicate deprecation. 
"mcp_server": None, "eval_config": None, }, "github": { "config_schema": { "github_tokens": { "env_var": "GITHUB_TOKENS", "required": True, "description": "GitHub personal access token(s) - comma-separated for round-robin", "transform": "list", # Will split by comma }, # Evaluation organisation / user that hosts ephemeral test repositories "eval_org": { "env_var": "GITHUB_EVAL_ORG", "default": "mcpleague-eval", "required": False, "description": "Evaluation organisation or user for creating temporary test repositories", }, # (source_org removed – template repos now imported from local files) }, "components": { "task_manager": "src.mcp_services.github.github_task_manager.GitHubTaskManager", "state_manager": "src.mcp_services.github.github_state_manager.GitHubStateManager", "login_helper": "src.mcp_services.github.github_login_helper.GitHubLoginHelper", }, "config_mapping": { "state_manager": { "github_token": "github_tokens", "eval_org": "eval_org", }, "login_helper": { # Login helper needs a single token, we'll use the first one "token": "github_tokens", }, }, "mcp_server": None, "eval_config": None, }, "filesystem": { "config_schema": { "test_root": { "env_var": "FILESYSTEM_TEST_ROOT", "default": None, "required": False, "description": "Root directory for filesystem tests", "transform": "path", # Convert to Path object }, "cleanup_on_exit": { "env_var": "FILESYSTEM_CLEANUP", "default": True, "required": False, "description": "Clean up test directories after tasks", "transform": "bool", }, }, "components": { "task_manager": "src.mcp_services.filesystem.filesystem_task_manager.FilesystemTaskManager", "state_manager": "src.mcp_services.filesystem.filesystem_state_manager.FilesystemStateManager", "login_helper": "src.mcp_services.filesystem.filesystem_login_helper.FilesystemLoginHelper", }, "config_mapping": { "state_manager": { "test_root": "test_root", "cleanup_on_exit": "cleanup_on_exit", } }, "mcp_server": None, "eval_config": None, }, "playwright": { "config_schema": { "browser": { "env_var": "PLAYWRIGHT_BROWSER", "default": "chromium", "required": False, "description": "Browser to use (chromium, firefox, webkit)", "validator": "in:chromium,firefox,webkit", }, "headless": { "env_var": "PLAYWRIGHT_HEADLESS", "default": True, "required": False, "description": "Run browser in headless mode", "transform": "bool", }, "network_origins": { "env_var": "PLAYWRIGHT_NETWORK_ORIGINS", "default": "*", "required": False, "description": "Allowed network origins (comma-separated or *)", }, "user_profile": { "env_var": "PLAYWRIGHT_USER_PROFILE", "default": "isolated", "required": False, "description": "User profile type (isolated or persistent)", "validator": "in:isolated,persistent", }, "viewport_width": { "env_var": "PLAYWRIGHT_VIEWPORT_WIDTH", "default": 1280, "required": False, "description": "Browser viewport width", "transform": "int", }, "viewport_height": { "env_var": "PLAYWRIGHT_VIEWPORT_HEIGHT", "default": 720, "required": False, "description": "Browser viewport height", "transform": "int", }, }, "components": { "task_manager": "src.mcp_services.playwright.playwright_task_manager.PlaywrightTaskManager", "state_manager": "src.mcp_services.playwright.playwright_state_manager.PlaywrightStateManager", "login_helper": "src.mcp_services.playwright.playwright_login_helper.PlaywrightLoginHelper", }, "config_mapping": { "state_manager": { "browser": "browser", "headless": "headless", "network_origins": "network_origins", "user_profile": "user_profile", "viewport_width": "viewport_width", 
"viewport_height": "viewport_height", }, "login_helper": { "browser": "browser", "headless": "headless", }, }, "mcp_server": None, "eval_config": None, }, "postgres": { "config_schema": { "host": { "env_var": "POSTGRES_HOST", "default": "localhost", "required": False, "description": "PostgreSQL server host", }, "port": { "env_var": "POSTGRES_PORT", "default": 5432, "required": False, "description": "PostgreSQL server port", "transform": "int", "validator": "port", # Validates port range 1-65535 }, "database": { "env_var": "POSTGRES_DATABASE", "default": "postgres", "required": False, "description": "PostgreSQL database name", }, "username": { "env_var": "POSTGRES_USERNAME", "default": "postgres", "required": False, "description": "PostgreSQL username", }, "password": { "env_var": "POSTGRES_PASSWORD", "default": "password", "required": False, "description": "PostgreSQL password", }, }, "components": { "task_manager": "src.mcp_services.postgres.postgres_task_manager.PostgresTaskManager", "state_manager": "src.mcp_services.postgres.postgres_state_manager.PostgresStateManager", "login_helper": "src.mcp_services.postgres.postgres_login_helper.PostgresLoginHelper", }, "config_mapping": { "state_manager": { "host": "host", "port": "port", "database": "database", "username": "username", "password": "password", }, "login_helper": { "host": "host", "port": "port", "database": "database", "username": "username", "password": "password", }, }, "mcp_server": None, "eval_config": None, }, "insforge": { "config_schema": { "api_key": { "env_var": "INSFORGE_API_KEY", "required": True, "description": "Insforge backend API key for authentication", }, "backend_url": { "env_var": "INSFORGE_BACKEND_URL", "required": True, "description": "Insforge backend URL (e.g., https://your-app.insforge.app)", }, }, "components": { "task_manager": "src.mcp_services.insforge.insforge_task_manager.InsforgeTaskManager", "state_manager": "src.mcp_services.insforge.insforge_state_manager.InsforgeStateManager", "login_helper": "src.mcp_services.insforge.insforge_login_helper.InsforgeLoginHelper", }, "config_mapping": { "state_manager": { "api_key": "api_key", "backend_url": "backend_url", }, "login_helper": { "api_key": "api_key", "backend_url": "backend_url", }, }, "mcp_server": None, "eval_config": None, }, "supabase": { "config_schema": { "api_url": { "env_var": "SUPABASE_API_URL", "required": False, "description": "Supabase PostgREST API URL (default: http://localhost:54321 from CLI)", "default": "http://localhost:54321", }, "api_key": { "env_var": "SUPABASE_API_KEY", "required": False, "description": "Supabase API key (anon or service_role key from 'supabase status')", }, "postgres_host": { "env_var": "SUPABASE_DB_HOST", "required": False, "description": "PostgreSQL host for Supabase CLI instance", "default": "localhost", }, "postgres_port": { "env_var": "SUPABASE_DB_PORT", "required": False, "description": "PostgreSQL port for Supabase CLI instance (default: 54322)", "default": 54322, }, "postgres_user": { "env_var": "SUPABASE_DB_USER", "required": False, "description": "PostgreSQL username", "default": "postgres", }, "postgres_password": { "env_var": "SUPABASE_DB_PASSWORD", "required": False, "description": "PostgreSQL password", "default": "postgres", }, "postgres_database": { "env_var": "SUPABASE_DB_NAME", "required": False, "description": "PostgreSQL database name", "default": "postgres", }, }, "components": { "task_manager": "src.mcp_services.supabase.supabase_task_manager.SupabaseTaskManager", "state_manager": 
"src.mcp_services.supabase.supabase_state_manager.SupabaseStateManager", "login_helper": "src.mcp_services.supabase.supabase_login_helper.SupabaseLoginHelper", }, "config_mapping": { "state_manager": { "api_url": "api_url", "api_key": "api_key", "postgres_host": "postgres_host", "postgres_port": "postgres_port", "postgres_user": "postgres_user", "postgres_password": "postgres_password", "postgres_database": "postgres_database", }, "login_helper": {}, }, "mcp_server": None, "eval_config": None, }, "playwright_webarena": { "config_schema": { "browser": { "env_var": "PLAYWRIGHT_BROWSER", "default": "chromium", "required": False, "description": "Browser to use (chromium, firefox, webkit)", "validator": "in:chromium,firefox,webkit", }, "headless": { "env_var": "PLAYWRIGHT_HEADLESS", "default": True, "required": False, "description": "Run browser in headless mode", "transform": "bool", }, "network_origins": { "env_var": "PLAYWRIGHT_NETWORK_ORIGINS", "default": "*", "required": False, "description": "Allowed network origins (comma-separated or *)", }, "user_profile": { "env_var": "PLAYWRIGHT_USER_PROFILE", "default": "isolated", "required": False, "description": "User profile type (isolated or persistent)", "validator": "in:isolated,persistent", }, "viewport_width": { "env_var": "PLAYWRIGHT_VIEWPORT_WIDTH", "default": 1280, "required": False, "description": "Browser viewport width", "transform": "int", }, "viewport_height": { "env_var": "PLAYWRIGHT_VIEWPORT_HEIGHT", "default": 720, "required": False, "description": "Browser viewport height", "transform": "int", }, "skip_cleanup": { "env_var": "PLAYWRIGHT_WEBARENA_SKIP_CLEANUP", "default": False, "required": False, "description": "Skip Docker container cleanup for debugging", "transform": "bool", }, }, "components": { "task_manager": "src.mcp_services.playwright_webarena.playwright_task_manager.PlaywrightTaskManager", "state_manager": "src.mcp_services.playwright_webarena.playwright_state_manager.PlaywrightStateManager", "login_helper": "src.mcp_services.playwright_webarena.playwright_login_helper.PlaywrightLoginHelper", }, "config_mapping": { "state_manager": { "browser": "browser", "headless": "headless", "network_origins": "network_origins", "user_profile": "user_profile", "viewport_width": "viewport_width", "viewport_height": "viewport_height", "skip_cleanup": "skip_cleanup", }, "login_helper": { "browser": "browser", "headless": "headless", }, "task_manager": {}, }, "mcp_server": None, "eval_config": None, }, } def get_service_definition(service_name: str) -> dict: """Get MCP service definition by name.""" if service_name not in SERVICES: raise ValueError(f"Unknown MCP service: {service_name}") return SERVICES[service_name] def get_supported_mcp_services() -> list: """Get list of implemented MCP services.""" return [ name for name, config in SERVICES.items() if config["components"]["task_manager"] is not None ] ================================================ FILE: tasks/__init__.py ================================================ ================================================ FILE: tasks/filesystem/easy/.gitkeep ================================================ ================================================ FILE: tasks/filesystem/easy/file_context/file_splitting/description.md ================================================ # File Splitting Task ## 📋 Task Description You need to split a large text file into multiple smaller files with equal character counts. 
The task involves creating a new directory and splitting the content into exactly 3 files. ## 🎯 Task Objectives 1. **Create a new directory** named `split` in the test directory 2. **Split the file** `large_file.txt` into exactly 3 files with **similar** character counts (maximum character difference of 100 between any two files) 3. **Name the files** as `split_01.txt`, `split_02.txt`, `split_03.txt` in the `split` directory ================================================ FILE: tasks/filesystem/easy/file_context/file_splitting/meta.json ================================================ { "task_id": "file_splitting", "task_name": "File Splitting", "category_id": "file_context", "category_name": "File Context", "description": "Split large_file.txt into three nearly equal chunks stored as split_01.txt-split_03.txt inside a new split directory.", "author": "Lingjun Chen", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "content transformation", "file automation" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "file_context/\n \u251c\u2500\u2500 file_01.txt\n \u251c\u2500\u2500 file_02.txt\n \u251c\u2500\u2500 file_03.txt\n \u251c\u2500\u2500 file_04.txt\n \u251c\u2500\u2500 file_05.txt\n \u251c\u2500\u2500 file_06.txt\n \u251c\u2500\u2500 file_07.txt\n \u251c\u2500\u2500 file_08.txt\n \u251c\u2500\u2500 file_09.txt\n \u251c\u2500\u2500 file_10.txt\n \u251c\u2500\u2500 file_11.txt\n \u251c\u2500\u2500 file_12.txt\n \u251c\u2500\u2500 file_13.txt\n \u251c\u2500\u2500 file_14.txt\n \u251c\u2500\u2500 file_15.txt\n \u251c\u2500\u2500 file_16.txt\n \u251c\u2500\u2500 file_17.txt\n \u251c\u2500\u2500 file_18.txt\n \u251c\u2500\u2500 file_19.txt\n \u251c\u2500\u2500 file_20.txt\n \u2514\u2500\u2500 large_file.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/easy/file_context/file_splitting/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for File Splitting Task """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_split_directory_exists(test_dir: Path) -> bool: """Verify that the split directory exists.""" split_dir = test_dir / "split" if not split_dir.exists(): print("❌ Directory 'split' not found") return False if not split_dir.is_dir(): print("❌ 'split' exists but is not a directory") return False print("✅ Split directory found") return True def verify_all_split_files_exist(test_dir: Path) -> bool: """Verify that all 3 split files exist with correct names.""" split_dir = test_dir / "split" expected_files = [f"split_{i:02d}.txt" for i in range(1, 4)] missing_files = [] for filename in expected_files: file_path = split_dir / filename if not file_path.exists(): missing_files.append(filename) if missing_files: print(f"❌ Missing files: {missing_files}") return False print("✅ All 3 split files exist with correct names") return True def verify_similar_file_lengths(test_dir: Path) -> bool: """Verify that all split files have similar character counts (within 30 characters difference).""" split_dir = test_dir / "split" file_lengths = [] for i in range(1, 4): filename = f"split_{i:02d}.txt" file_path = split_dir / 
filename try: content = file_path.read_text() file_lengths.append(len(content)) except Exception as e: print(f"❌ Error reading {filename}: {e}") return False # Check if all lengths are within 100 characters of each other min_length = min(file_lengths) max_length = max(file_lengths) length_difference = max_length - min_length if length_difference > 100: print(f"❌ File lengths differ by more than 100 characters: {length_difference}") print(f" Min length: {min_length}, Max length: {max_length}") print(f" All lengths: {file_lengths}") return False print(f"✅ All files have similar lengths (difference: {length_difference} characters)") print(f" Min: {min_length}, Max: {max_length}") return True
def verify_content_integrity(test_dir: Path) -> bool: """Verify that concatenated split files equal the original file.""" split_dir = test_dir / "split" original_file = test_dir / "large_file.txt" # Read original content try: original_content = original_file.read_text() except Exception as e: print(f"❌ Error reading original file: {e}") return False # Concatenate all split files concatenated_content = "" for i in range(1, 4): filename = f"split_{i:02d}.txt" file_path = split_dir / filename try: content = file_path.read_text() concatenated_content += content except Exception as e: print(f"❌ Error reading {filename}: {e}") return False # Compare content if concatenated_content != original_content: print("❌ Concatenated content does not match original file") print(f" Original length: {len(original_content)}") print(f" Concatenated length: {len(concatenated_content)}") return False print("✅ Concatenated content matches original file exactly") return True
def verify_no_extra_files(test_dir: Path) -> bool: """Verify that no extra files exist in the split directory.""" split_dir = test_dir / "split" expected_files = {f"split_{i:02d}.txt" for i in range(1, 4)} actual_files = {f.name for f in split_dir.iterdir() if f.is_file()} extra_files = actual_files - expected_files if extra_files: print(f"❌ Extra files found in split directory: {extra_files}") return False print("✅ No extra files in split directory") return True
def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying File Splitting Task...") # Define verification steps verification_steps = [ ("Split Directory Exists", verify_split_directory_exists), ("All Split Files Exist", verify_all_split_files_exist), ("Similar File Lengths", verify_similar_file_lengths), ("Content Integrity", verify_content_integrity), ("No Extra Files", verify_no_extra_files), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ File splitting task completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main()
================================================
FILE: tasks/filesystem/easy/file_context/pattern_matching/description.md
================================================
# File Filtering Task: Find Files with Common Substring
## 📋 Task Description
Your task is to find all files that contain a substring of 30 or more characters that also appears in `large_file.txt`. **You are not allowed to use python code.**
## 🎯 Task Objectives
1. **Read the reference file** `large_file.txt` to understand its content
2. **Examine each file** from file_01.txt to file_20.txt
3.
**Find files** that contain a substring of 30 or more characters that matches a substring in `large_file.txt` 4. **Create a file `answer.txt`** and write the results to it with the following format: - One line per matching file - Format: `filename.txt` - Do not add any things else other than `filename.txt.` ================================================ FILE: tasks/filesystem/easy/file_context/pattern_matching/meta.json ================================================ { "task_id": "pattern_matching", "task_name": "Pattern Matching", "category_id": "file_context", "category_name": "File Context", "description": "Scan file_01.txt through file_20.txt for any 30+ character substring that also appears in large_file.txt and list each matching filename in answer.txt.", "author": "Lingjun Chen", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "pattern analysis", "search and filtering" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "file_context/\n \u251c\u2500\u2500 file_01.txt\n \u251c\u2500\u2500 file_02.txt\n \u251c\u2500\u2500 file_03.txt\n \u251c\u2500\u2500 file_04.txt\n \u251c\u2500\u2500 file_05.txt\n \u251c\u2500\u2500 file_06.txt\n \u251c\u2500\u2500 file_07.txt\n \u251c\u2500\u2500 file_08.txt\n \u251c\u2500\u2500 file_09.txt\n \u251c\u2500\u2500 file_10.txt\n \u251c\u2500\u2500 file_11.txt\n \u251c\u2500\u2500 file_12.txt\n \u251c\u2500\u2500 file_13.txt\n \u251c\u2500\u2500 file_14.txt\n \u251c\u2500\u2500 file_15.txt\n \u251c\u2500\u2500 file_16.txt\n \u251c\u2500\u2500 file_17.txt\n \u251c\u2500\u2500 file_18.txt\n \u251c\u2500\u2500 file_19.txt\n \u251c\u2500\u2500 file_20.txt\n \u2514\u2500\u2500 large_file.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/easy/file_context/pattern_matching/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for File Filtering Task: Find Files with Common Substring """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_answer_file_exists(test_dir: Path) -> bool: """Verify that the answer.txt file exists.""" answer_file = test_dir / "answer.txt" if not answer_file.exists(): print("❌ File 'answer.txt' not found") return False print("✅ Answer file found") return True def verify_answer_format(test_dir: Path) -> bool: """Verify that the answer file has the correct format.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() # If file is empty, that's acceptable (no matches found) if not content: print("✅ Answer file is empty (no matches found)") return True lines = content.split('\n') for i, line in enumerate(lines, 1): line = line.strip() if not line: continue # Check format: just filename.txt if not line.endswith('.txt') or not line.startswith('file_'): print(f"❌ Line {i} has incorrect format: {line}") print(" Expected format: filename.txt") return False print("✅ Answer format is correct") return True except Exception as e: print(f"❌ Error reading answer file: {e}") return False def find_30_plus_char_matches(test_dir: Path) -> set: """Find all files that have 30+ character substring matches with large_file.txt.""" large_file = 
test_dir / "large_file.txt" if not large_file.exists(): print("❌ large_file.txt not found") return set() large_content = large_file.read_text() matching_files = set() # Check each file from file_01.txt to file_20.txt for i in range(1, 21): filename = f"file_{i:02d}.txt" file_path = test_dir / filename if not file_path.exists(): continue file_content = file_path.read_text() # Check if there's a substring of 30+ characters that matches has_match = False for start_pos in range(len(file_content)): for end_pos in range(start_pos + 30, len(file_content) + 1): substring = file_content[start_pos:end_pos] if substring in large_content: has_match = True break if has_match: break if has_match: matching_files.add(filename) return matching_files def verify_matches_are_correct(test_dir: Path) -> bool: """Verify that the files listed in answer.txt actually have 30+ character matches.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() # If no content, check if there should actually be no matches if not content: expected_matches = find_30_plus_char_matches(test_dir) if expected_matches: print("❌ Answer file is empty but matches should exist") for filename in expected_matches: print(f" Expected: {filename}") return False else: print("✅ No matches found (correct)") return True # Parse answer file answer_files = set() lines = content.split('\n') for line in lines: line = line.strip() if not line: continue answer_files.add(line) # Get expected matches expected_matches = find_30_plus_char_matches(test_dir) # Check if all answer files actually have matches for filename in answer_files: if filename not in expected_matches: print(f"❌ File {filename} listed in answer but has no valid 30+ character match") return False # Check if all expected matches are in answer for filename in expected_matches: if filename not in answer_files: print(f"❌ Missing match for {filename} in answer file") return False print("✅ All matches are correct") return True except Exception as e: print(f"❌ Error verifying matches: {e}") return False def verify_files_exist(test_dir: Path) -> bool: """Verify that all files mentioned in answer.txt actually exist.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() if not content: return True # No files to verify lines = content.split('\n') for line in lines: line = line.strip() if not line: continue file_path = test_dir / line if not file_path.exists(): print(f"❌ File mentioned in answer does not exist: {line}") return False print("✅ All files mentioned in answer exist") return True except Exception as e: print(f"❌ Error verifying file existence: {e}") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying File Filtering Task: Find Files with Common Substring...") # Define verification steps verification_steps = [ ("Answer File Exists", verify_answer_file_exists), ("Answer Format", verify_answer_format), ("Files Exist", verify_files_exist), ("Matches are Correct", verify_matches_are_correct), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ File filtering task completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: 
tasks/filesystem/easy/file_context/uppercase/description.md ================================================ # File Context Task: Convert Files to Uppercase ## 📋 Task Description You need to process 5 text files (file_01.txt to file_05.txt) and convert their content to uppercase format. ## 🎯 Task Objectives 1. **Create an uppercase directory** in the test environment root 2. **Convert each file** from file_01.txt to file_05.txt to uppercase 3. **Save converted files** in the uppercase/ directory with the same names ================================================ FILE: tasks/filesystem/easy/file_context/uppercase/meta.json ================================================ { "task_id": "uppercase", "task_name": "Uppercase", "category_id": "file_context", "category_name": "File Context", "description": "Copy file_01.txt-file_05.txt into an uppercase/ folder and convert the contents of every file to uppercase text.", "author": "Lingjun Chen", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "content transformation", "batch processing" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "file_context/\n \u251c\u2500\u2500 file_01.txt\n \u251c\u2500\u2500 file_02.txt\n \u251c\u2500\u2500 file_03.txt\n \u251c\u2500\u2500 file_04.txt\n \u251c\u2500\u2500 file_05.txt\n \u251c\u2500\u2500 file_06.txt\n \u251c\u2500\u2500 file_07.txt\n \u251c\u2500\u2500 file_08.txt\n \u251c\u2500\u2500 file_09.txt\n \u251c\u2500\u2500 file_10.txt\n \u251c\u2500\u2500 file_11.txt\n \u251c\u2500\u2500 file_12.txt\n \u251c\u2500\u2500 file_13.txt\n \u251c\u2500\u2500 file_14.txt\n \u251c\u2500\u2500 file_15.txt\n \u251c\u2500\u2500 file_16.txt\n \u251c\u2500\u2500 file_17.txt\n \u251c\u2500\u2500 file_18.txt\n \u251c\u2500\u2500 file_19.txt\n \u251c\u2500\u2500 file_20.txt\n \u2514\u2500\u2500 large_file.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/easy/file_context/uppercase/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for File Context Task: Convert Files to Uppercase """ import sys from pathlib import Path import os import re def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_uppercase_directory_exists(test_dir: Path) -> bool: """Verify that the uppercase directory exists.""" uppercase_dir = test_dir / "uppercase" if not uppercase_dir.exists(): print("❌ Directory 'uppercase' not found") return False if not uppercase_dir.is_dir(): print("❌ 'uppercase' exists but is not a directory") return False print("✅ Uppercase directory found") return True def verify_uppercase_files_exist(test_dir: Path) -> bool: """Verify that all 5 uppercase files exist.""" uppercase_dir = test_dir / "uppercase" for i in range(1, 6): filename = f"file_{i:02d}.txt" file_path = uppercase_dir / filename if not file_path.exists(): print(f"❌ File '{filename}' not found in uppercase directory") return False print("✅ All 5 uppercase files found") return True def verify_uppercase_content(test_dir: Path) -> bool: """Verify that uppercase files contain the correct uppercase content.""" uppercase_dir = test_dir / "uppercase" for i in range(1, 6): filename = f"file_{i:02d}.txt" original_file = test_dir / filename 
uppercase_file = uppercase_dir / filename if not original_file.exists(): print(f"❌ Original file '{filename}' not found") return False try: original_content = original_file.read_text() uppercase_content = uppercase_file.read_text() # Check if uppercase content is the uppercase version of original expected_uppercase = original_content.upper() if uppercase_content != expected_uppercase: print(f"❌ File '{filename}' content is not properly converted to uppercase") return False except Exception as e: print(f"❌ Error reading file '{filename}': {e}") return False print("✅ All uppercase files contain correct uppercase content") return True def verify_answer_file_exists(test_dir: Path) -> bool: """Verify that the answer.txt file exists in the uppercase directory.""" uppercase_dir = test_dir / "uppercase" answer_file = uppercase_dir / "answer.txt" if not answer_file.exists(): print("❌ File 'answer.txt' not found in uppercase directory") return False print("✅ Answer file found in uppercase directory") return True def verify_answer_format(test_dir: Path) -> bool: """Verify that the answer file has the correct format.""" uppercase_dir = test_dir / "uppercase" answer_file = uppercase_dir / "answer.txt" try: content = answer_file.read_text().strip() if not content: print("❌ Answer file is empty") return False lines = content.split('\n') # Check if we have exactly 10 lines if len(lines) != 10: print(f"❌ Answer file has {len(lines)} lines, expected 10") return False for i, line in enumerate(lines, 1): line = line.strip() if not line: print(f"❌ Line {i} is empty") return False # Check format: filename:word_count if ':' not in line: print(f"❌ Line {i} has incorrect format: {line}") print(" Expected format: filename:word_count") return False parts = line.split(':', 1) if len(parts) != 2: print(f"❌ Line {i} has incorrect format: {line}") print(" Expected format: filename:word_count") return False filename, word_count_str = parts # Check filename format if not filename.endswith('.txt') or not filename.startswith('file_'): print(f"❌ Line {i} has invalid filename: {filename}") return False # Check word count format (should be integer) try: word_count = int(word_count_str) if word_count <= 0: print(f"❌ Line {i} has invalid word count: {word_count_str}") return False except ValueError: print(f"❌ Line {i} has non-integer word count: {word_count_str}") return False print("✅ Answer format is correct") return True except Exception as e: print(f"❌ Error reading answer file: {e}") return False def count_words_in_file(file_path: Path) -> int: """Count words in a file.""" try: content = file_path.read_text() # Split by whitespace and filter out empty strings words = [word for word in content.split() if word.strip()] return len(words) except Exception as e: print(f"❌ Error reading file {file_path}: {e}") return 0 def verify_word_counts_are_correct(test_dir: Path) -> bool: """Verify that the word counts in answer.txt are correct.""" uppercase_dir = test_dir / "uppercase" answer_file = uppercase_dir / "answer.txt" try: content = answer_file.read_text().strip() lines = content.split('\n') # Expected word counts based on answer.md expected_counts = [22, 22, 22, 22, 18, 22, 22, 22, 18, 20] # Create a set of expected file entries for easier checking expected_entries = set() for i in range(1, 11): filename = f"file_{i:02d}.txt" expected_count = expected_counts[i - 1] if i == 6: # Special case for file_06.txt: can be 21 or 22 expected_entries.add(f"{filename}:21") expected_entries.add(f"{filename}:22") else: 
expected_entries.add(f"{filename}:{expected_count}") # Check each line in the answer file found_entries = set() for line in lines: line = line.strip() if line in expected_entries: found_entries.add(line) else: print(f"❌ Invalid entry: {line}") return False # Check if we found all expected entries if len(found_entries) != 10: print(f"❌ Found {len(found_entries)} entries, expected 10") missing = expected_entries - found_entries if missing: print(f" Missing entries: {missing}") return False print("✅ All word counts are correct") return True except Exception as e: print(f"❌ Error verifying word counts: {e}") return False def verify_all_files_are_included(test_dir: Path) -> bool: """Verify that all 10 files are included in the answer.""" uppercase_dir = test_dir / "uppercase" answer_file = uppercase_dir / "answer.txt" try: content = answer_file.read_text().strip() lines = content.split('\n') # Check that all 10 files are present found_files = set() for line in lines: parts = line.split(':', 1) filename = parts[0] found_files.add(filename) expected_files = {f"file_{i:02d}.txt" for i in range(1, 11)} if found_files != expected_files: missing = expected_files - found_files extra = found_files - expected_files if missing: print(f"❌ Missing files in answer: {missing}") if extra: print(f"❌ Extra files in answer: {extra}") return False print("✅ All 10 files are included in answer") return True except Exception as e: print(f"❌ Error verifying file inclusion: {e}") return False def main(): """Main verification function.""" try: test_dir = get_test_directory() print(f"🔍 Verifying Uppercase in: {test_dir}") print() # Run all verification checks checks = [ ("Uppercase directory exists", verify_uppercase_directory_exists), ("Uppercase files exist", verify_uppercase_files_exist), ("Uppercase content is correct", verify_uppercase_content), ] all_passed = True for check_name, check_func in checks: print(f"📋 {check_name}...") if not check_func(test_dir): all_passed = False print() if all_passed: print("🎉 All verification checks passed!") sys.exit(0) else: print("❌ Some verification checks failed!") sys.exit(1) except Exception as e: print(f"❌ Verification failed with error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/easy/file_property/largest_rename/description.md ================================================ # Largest File Rename Task ## 📋 Task Description Rename the largest `.jpg` file in the test directory to `largest.jpg` based on file size. ## 🎯 Task Objectives 1. **Find all `.jpg` files** in the test directory 2. **Determine which `.jpg` file is the largest** by file size 3. 
**Rename the largest `.jpg` file to `largest.jpg`** ================================================ FILE: tasks/filesystem/easy/file_property/largest_rename/meta.json ================================================ { "task_id": "largest_rename", "task_name": "Largest File Rename", "category_id": "file_property", "category_name": "File Property", "description": "Identify the largest .jpg in the workspace and rename it to largest.jpg while leaving the other files untouched.", "author": "Lingjun Chen", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "file organization", "attribute inspection" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "file_property/\n \u251c\u2500\u2500 bear.jpg\n \u251c\u2500\u2500 bridge.jpg\n \u251c\u2500\u2500 bus.MOV\n \u251c\u2500\u2500 random_file_1.txt\n \u251c\u2500\u2500 random_file_2.txt\n \u251c\u2500\u2500 random_file_3.txt\n \u251c\u2500\u2500 road.MOV\n \u2514\u2500\u2500 sg.jpg", "stateUrl": "https://storage.mcpmark.ai/filesystem/file_property.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/easy/file_property/largest_rename/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Largest File Rename Task """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_sg_jpg_not_exists(test_dir: Path) -> bool: """Verify that sg.jpg does not exist.""" sg_file = test_dir / "sg.jpg" if sg_file.exists(): print("❌ sg.jpg still exists (should be renamed)") return False print("✅ sg.jpg does not exist") return True def verify_largest_jpg_exists(test_dir: Path) -> bool: """Verify that largest.jpg exists.""" largest_file = test_dir / "largest.jpg" if not largest_file.exists(): print("❌ largest.jpg does not exist") return False print("✅ largest.jpg exists") return True def main(): """Main verification function.""" try: test_dir = get_test_directory() print(f"🔍 Verifying largest file rename in: {test_dir}") # Run all verification checks checks = [ ("sg.jpg does not exist", verify_sg_jpg_not_exists), ("largest.jpg exists", verify_largest_jpg_exists) ] all_passed = True for check_name, check_func in checks: print(f"\n📋 Checking: {check_name}") if not check_func(test_dir): all_passed = False if all_passed: print("\n🎉 All verification checks passed!") sys.exit(0) else: print("\n❌ Some verification checks failed!") sys.exit(1) except Exception as e: print(f"❌ Verification failed with error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/easy/file_property/txt_merging/description.md ================================================ # Text File Merging Task ## 📋 Task Description Merge all `.txt` files in the test directory into a single file called `merge.txt`. The merged file should contain the content from all `.txt` files. ## 🎯 Task Objectives 1. **Read all `.txt` files** in the test directory 2. **Create a new file** called `merge.txt` in the test directory 3. **Write the content** from all `.txt` files into `merge.txt` 4. 
**The order** of content doesn't matter - as long as all content from all `.txt` files is present in `merge.txt` ================================================ FILE: tasks/filesystem/easy/file_property/txt_merging/meta.json ================================================ { "task_id": "txt_merging", "task_name": "Text File Merging", "category_id": "file_property", "category_name": "File Property", "description": "Combine the contents of every .txt file into a single merge.txt file so the archive has one consolidated view.", "author": "Lingjun Chen", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "content consolidation", "file automation" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "file_property/\n \u251c\u2500\u2500 bear.jpg\n \u251c\u2500\u2500 bridge.jpg\n \u251c\u2500\u2500 bus.MOV\n \u251c\u2500\u2500 random_file_1.txt\n \u251c\u2500\u2500 random_file_2.txt\n \u251c\u2500\u2500 random_file_3.txt\n \u251c\u2500\u2500 road.MOV\n \u2514\u2500\u2500 sg.jpg", "stateUrl": "https://storage.mcpmark.ai/filesystem/file_property.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/easy/file_property/txt_merging/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Text File Merging Task """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def get_expected_contents(): """Return the expected content from each .txt file.""" return [ "O rErmZ4tDgzMNoxn1oNfQhT1TRpy9w0tQPGTcrsaoMFrrgt9bY5mgBxO6q8c8lZywXxEEBWW4i6Jh9NbAtYtRKvkzB4bshGIMzn2G1 rDTpKJj", "DmRrDFFaIl1mPubzSJJaN4aMeZyBHqVxZe5tpztHQ9zSe6b69Hnl7coqeNJXHXU2EnaDnyhYxZSWHPn3IWLsLGWrx7py8d37Z8blMnh7VDUH7hAMamhLRO8lfUVV1roM8a0njnW9evXRq5AoNTt8Tv7kQ5LmLe6Z66MZwtjckRAXmOB4x3AYbbxLULYZAxitW1KNG1yTaDOYZQhtKdZkX1XqytzBl9dRXI4gk91ZlVHLOiujwUa89EVsdjayKeCc21gCJMXvbhDSOGAs6dXZEHuaHQnnBdM19X3TwPgfDONyhlc pjwoQ45D56UQVWxwNIJUTgwS1vctYOx4XFpMgf3PRQ7zZdfhIuPBFdQwnQvYUeQbWa5gnyMO9FVSU0vm9uccbJQvkcEAJzMkEh9i7z6EEixtbwVedlTGWL2XBwjenRdf2qsOgvJo8Dyuvf35ieCFMG7wR7200rs GJZ5bRdx4R2gGOWVMi3MOBrqcw3KhbcpJtdQoKMALEjBMrY7VYKtAZNI6LoXX OOTJZ3x3usHRJY0gMtKhh6OJ 37aknvBwNYJ0IRWYWaeJ8LBwJyO6ZV3ZJ0palISQvGaHEZ0olHnK2iNCTxqxvF8J7EdIdIPYssl5f0XgPl6", "aFCzXJbJq02zlCKnyarJnPUiwVIuUrQci3fZvGD53F5fUsKDUlEwO5 ANJ2VgBnJ5cuBJzjILcM9AxTvyNZ5NPIHjSCo5O20K" ] def verify_merge_file_exists(test_dir: Path) -> bool: """Verify that merge.txt exists in the test directory.""" merge_file = test_dir / "merge.txt" if not merge_file.exists(): print("❌ merge.txt not found") return False if not merge_file.is_file(): print("❌ merge.txt exists but is not a file") return False print("✅ merge.txt exists") return True def verify_merge_file_contents(test_dir: Path) -> bool: """Verify that merge.txt contains all expected content strings.""" merge_file = test_dir / "merge.txt" expected_contents = get_expected_contents() try: with open(merge_file, 'r', encoding='utf-8') as f: merge_content = f.read() except Exception as e: print(f"❌ Failed to read merge.txt: {e}") return False # Check that each expected content string is present in the merged file missing_contents = [] for content in expected_contents: if content not in merge_content: missing_contents.append(content[:50] + "..." 
if len(content) > 50 else content) if missing_contents: print(f"❌ Missing content in merge.txt:") for content in missing_contents: print(f" - {content}") return False print("✅ merge.txt contains all expected content") return True def main(): """Main verification function.""" try: test_dir = get_test_directory() print(f"🔍 Verifying text file merging in: {test_dir}") # Run all verification checks checks = [ ("Merge file existence", verify_merge_file_exists), ("Merge file contents", verify_merge_file_contents) ] all_passed = True for check_name, check_func in checks: print(f"\n📋 Checking: {check_name}") if not check_func(test_dir): all_passed = False if all_passed: print("\n🎉 All verification checks passed!") sys.exit(0) else: print("\n❌ Some verification checks failed!") sys.exit(1) except Exception as e: print(f"❌ Verification failed with error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/easy/folder_structure/structure_analysis/description.md ================================================ # Directory Structure Analysis Task You need to recursively traverse the entire folder structure under the main directory and count the total number of `.py` files in the entire directory (including all subdirectories). Write the answer (just a single number) in a file named `structure_analysis.txt` in the main directory (at the same level as the `complex_structure` folder). You should not change or delete any existed files. Do not try to use python code. ================================================ FILE: tasks/filesystem/easy/folder_structure/structure_analysis/meta.json ================================================ { "task_id": "structure_analysis", "task_name": "Structure Analysis", "category_id": "folder_structure", "category_name": "Folder Structure", "description": "Recursively inspect the complex_structure tree, count all .py files, and save the total as the only line of structure_analysis.txt.", "author": "Lingjun Chen", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "data extraction", "filesystem traversal" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "folder_structure/\n \u2514\u2500\u2500 complex_structure/\n \u251c\u2500\u2500 deeply/\n \u2502 \u2514\u2500\u2500 nested/\n \u2502 \u2514\u2500\u2500 folder/\n \u2502 \u2514\u2500\u2500 structure/\n \u251c\u2500\u2500 empty_folder/\n \u251c\u2500\u2500 folder_lxkHt_0_1/\n \u2502 \u2514\u2500\u2500 file_PeLzC_0.txt\n \u251c\u2500\u2500 folder_QdTAj_0_2/\n \u2502 \u251c\u2500\u2500 folder_eXccj_1_0/\n \u2502 \u2502 \u251c\u2500\u2500 folder_Mqlwh_2_1/\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_cKxcP_3_3/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_BPTMK_4_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_RHtBP_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_QNqjq_4_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_gRwPE_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_jVlpp_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_vJuHz_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_XdXYJ_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_KvkKi_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_gGxLG_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_Hzkxo_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_XRjeh_1.txt\n \u2502 
\u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_vIBIt_4_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_kRDNS_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_wFSjJ_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_NyBSO_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_EOCNf_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_gmrXA_0.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_NcruA_3_1/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_bLWDj_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_WAftR_0.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_qCDFI_3_2/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_eSMOJ_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_oxADy_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_RTbbc_1.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_QVHUU_3_0/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_FEPTK_4_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_GHoMC_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_rAMYd_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_iBDUY_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_IJCaw_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_VRXgp_5_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_hkUmS_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_nqLAf_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_XflmA_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_FlPoK_4_3/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_hSVNm_5_3/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_klnbn_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_iZuEl_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_LqAmy_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_LcURj_5_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_RgwOS_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_ZHnYb_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_tuZQJ_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_LHuIx_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_asJnB_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_EzLdu_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_ndhsJ_4_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_CUSXK_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_DpiuM_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_pSqeG_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_pstmE_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_YwdJt_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_StlsP_5_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_kriBJ_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_XCEdm_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_ToDjh_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_xbIVx_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_PJBok_4_4/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_mzxaf_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 
file_ILBzj_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_MTGMm_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_zBDqz_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_sULMj_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_BHziw_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_sIjiu_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_VqNkB_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_vypSi_5_3/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_kZbIm_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_sOBtE_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_ZLGHy_5_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_azaFF_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_nAFRe_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_mIkQU_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_sGPxd_1.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_VTbEG_4_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_HtYLg_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_JXjMd_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_tPccB_2.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_BuOSw_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_TpoqE_0.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_wTvun_3_4/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_GyhyE_1.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_POsla_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_tSsvk_0.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_irNju_0.txt\n \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_jYBRm_1.txt\n \u2502 \u2502 \u251c\u2500\u2500 folder_YlJLI_2_0/\n \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_FpFSL_0.txt\n \u2502 \u2502 \u251c\u2500\u2500 file_cFgBr_2.txt\n \u2502 \u2502 \u251c\u2500\u2500 file_lKEWN_1.txt\n \u2502 \u2502 \u2514\u2500\u2500 file_ZEWFP_0.txt\n \u2502 \u2514\u2500\u2500 file_ayUCH_0.txt\n \u251c\u2500\u2500 folder_xtgyi_0_0/\n \u2502 \u2514\u2500\u2500 file_BvSOB_0.txt\n \u251c\u2500\u2500 mixed_content/\n \u2502 \u2514\u2500\u2500 images_and_text/\n \u2502 \u2514\u2500\u2500 notes.txt\n \u251c\u2500\u2500 project/\n \u2502 \u251c\u2500\u2500 docs/\n \u2502 \u2502 \u2514\u2500\u2500 archive/\n \u2502 \u2502 \u2514\u2500\u2500 2023/\n \u2502 \u2502 \u2514\u2500\u2500 reports/\n \u2502 \u2502 \u251c\u2500\u2500 report_0.txt\n \u2502 \u2502 \u251c\u2500\u2500 report_1.txt\n \u2502 \u2502 \u2514\u2500\u2500 report_2.txt\n \u2502 \u2514\u2500\u2500 src/\n \u2502 \u2514\u2500\u2500 main/\n \u2502 \u2514\u2500\u2500 resources/\n \u2514\u2500\u2500 m.py", "stateUrl": "https://storage.mcpmark.ai/filesystem/folder_structure.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/easy/folder_structure/structure_analysis/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Directory Structure Analysis Task """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return 
Path(test_root) def verify_structure_analysis_file_exists(test_dir: Path) -> bool: """Verify that the structure_analysis.txt file exists.""" analysis_file = test_dir / "structure_analysis.txt" if not analysis_file.exists(): print("❌ File 'structure_analysis.txt' not found") return False print("✅ structure_analysis.txt file found") return True def verify_structure_analysis_content(test_dir: Path) -> bool: """Verify that the structure_analysis.txt file contains the correct count.""" analysis_file = test_dir / "structure_analysis.txt" try: content = analysis_file.read_text().strip() if not content: print("❌ structure_analysis.txt file is empty") return False # The expected answer is 1 expected_count = 1 # Check if content is exactly "1" if content != str(expected_count): print(f"❌ Expected '{expected_count}', but found: '{content}'") return False print(f"✅ Python file count is correct: {content}") return True except Exception as e: print(f"❌ Error reading structure_analysis.txt file: {e}") return False def main(): """Main verification function.""" try: test_dir = get_test_directory() print(f"🔍 Verifying Directory Structure Analysis Task in: {test_dir}") print() # Define verification steps verification_steps = [ ("Structure Analysis File Exists", verify_structure_analysis_file_exists), ("Python File Count is Correct", verify_structure_analysis_content), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"📋 {step_name}...") if not verify_func(test_dir): all_passed = False print() # Final result if all_passed: print("🎉 All verification checks passed!") sys.exit(0) else: print("❌ Some verification checks failed!") sys.exit(1) except Exception as e: print(f"❌ Verification failed with error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/easy/legal_document/file_reorganize/description.md ================================================ # Legal Document File Reorganization Task **Overview** The folder "legal_files/" contains multiple versions of the Stock Purchase Agreement (Preferred_Stock_Purchase_Agreement_v0.txt through Preferred_Stock_Purchase_Agreement_v10.txt). ## Task Your task is to: 1. Identify the final version of the document among the different versions 2. Create a folder named `final_version` inside the `legal_files/` directory 3. Create an **empty file** with the same name as the final version in the newly created `final_version/` folder 4. Keep the original file in its original location Note: Due to the large file size, you only need to create an empty file (not copy the content). The filename should remain unchanged in the `final_version/` folder. 
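For orientation, a minimal sketch of the required end state, shown in Python purely as an illustration (the task itself is meant to be carried out through the filesystem MCP tools, and `create_final_placeholder` is a hypothetical helper, not part of this repository):

```python
# Hypothetical illustration only: once the final version has been identified,
# the only change required is an empty placeholder file with the same name
# inside legal_files/final_version/; the original file is left untouched.
from pathlib import Path

def create_final_placeholder(test_root: Path, final_name: str) -> Path:
    final_dir = test_root / "legal_files" / "final_version"
    final_dir.mkdir(parents=True, exist_ok=True)  # create the folder if it is missing
    placeholder = final_dir / final_name
    placeholder.touch()                           # zero-byte file; content is not copied
    return placeholder
```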
================================================ FILE: tasks/filesystem/easy/legal_document/file_reorganize/meta.json ================================================ { "task_id": "file_reorganize", "task_name": "File Reorganize", "category_id": "legal_document", "category_name": "Legal Document", "description": "Determine the final Stock Purchase Agreement version and create an empty copy of that filename inside legal_files/final_version/.", "author": "Lingjun Chen", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "file organization", "version management" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "legal_document/\n \u2514\u2500\u2500 legal_files/\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v0.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v1.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v2.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v3.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v4.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v5.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v6.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v7.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v8.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v9.txt\n \u2514\u2500\u2500 Preferred_Stock_Purchase_Agreement_v10.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/legal_document.zip", "stateOriginalUrl": "https://www.cooleygo.com/documents/nvca-financing-documents" } } ================================================ FILE: tasks/filesystem/easy/legal_document/file_reorganize/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Legal Document File Reorganization Task """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_final_version_folder_exists(test_dir: Path) -> bool: """Verify that the final_version folder exists in legal_files.""" final_version_dir = test_dir / "legal_files" / "final_version" if not final_version_dir.exists(): print("❌ Folder 'legal_files/final_version' not found") return False if not final_version_dir.is_dir(): print("❌ 'legal_files/final_version' exists but is not a directory") return False print("✅ Folder 'legal_files/final_version' found") return True def verify_target_file_exists(test_dir: Path) -> bool: """Verify that Preferred_Stock_Purchase_Agreement_v10.txt exists in final_version folder.""" target_file = test_dir / "legal_files" / "final_version" / "Preferred_Stock_Purchase_Agreement_v10.txt" if not target_file.exists(): print("❌ File 'legal_files/final_version/Preferred_Stock_Purchase_Agreement_v10.txt' not found") return False if not target_file.is_file(): print("❌ 'Preferred_Stock_Purchase_Agreement_v10.txt' exists but is not a file") return False print("✅ Target file 'Preferred_Stock_Purchase_Agreement_v10.txt' found in final_version folder") return True def verify_original_file_preserved(test_dir: Path) -> bool: """Verify that the original v10 file is still in place.""" original_file = test_dir / "legal_files" / "Preferred_Stock_Purchase_Agreement_v10.txt" if not original_file.exists(): print("❌ Original file 'Preferred_Stock_Purchase_Agreement_v10.txt' was removed") 
return False print("✅ Original file 'Preferred_Stock_Purchase_Agreement_v10.txt' preserved") return True def verify_only_v10_in_final_version(test_dir: Path) -> bool: """Verify that final_version folder contains only v10 file.""" final_version_dir = test_dir / "legal_files" / "final_version" # Get all files in final_version folder files = list(final_version_dir.iterdir()) # Filter out directories, keep only files files_only = [f for f in files if f.is_file()] if len(files_only) != 1: print(f"❌ final_version folder should contain exactly 1 file, but found {len(files_only)}") for f in files_only: print(f" - {f.name}") return False # Check if the only file is v10 if files_only[0].name != "Preferred_Stock_Purchase_Agreement_v10.txt": print(f"❌ final_version folder contains wrong file: {files_only[0].name}") print(" Expected: Preferred_Stock_Purchase_Agreement_v10.txt") return False print("✅ final_version folder contains only Preferred_Stock_Purchase_Agreement_v10.txt") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Legal Document File Reorganization Task...") # Define verification steps verification_steps = [ ("Final Version Folder Exists", verify_final_version_folder_exists), ("Target File Exists", verify_target_file_exists), ("Only V10 in Final Version", verify_only_v10_in_final_version), ("Original File Preserved", verify_original_file_preserved), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Legal document file reorganization completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/easy/papers/papers_counting/description.md ================================================ # File Context Task: Count HTML Files ## 📋 Task Description You need to count the number of HTML files in the given directory and write the count to a file. ## 🎯 Task Objectives 1. **Count HTML files** in the given directory 2. **Create a file** named `count.txt` in the same directory 3. 
**Write the count** (just the number) to `count.txt` ## 📝 Expected Output - File `count.txt` containing only the number of HTML files found ================================================ FILE: tasks/filesystem/easy/papers/papers_counting/meta.json ================================================ { "task_id": "papers_counting", "task_name": "Papers Counting", "category_id": "papers", "category_name": "Papers", "description": "Count how many .html papers live in the directory and write just that number into count.txt.", "author": "Lingjun Chen", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "data extraction", "reporting" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "papers/\n \u251c\u2500\u2500 1707.06347.html\n \u251c\u2500\u2500 2105.04165.html\n \u251c\u2500\u2500 2201.11903.html\n \u251c\u2500\u2500 2303.08774.html\n \u251c\u2500\u2500 2306.08640.html\n \u251c\u2500\u2500 2310.02255.html\n \u251c\u2500\u2500 2310.08446.html\n \u251c\u2500\u2500 2312.00849.html\n \u251c\u2500\u2500 2312.07533.html\n \u251c\u2500\u2500 2312.11805.html\n \u251c\u2500\u2500 2402.00253.html\n \u251c\u2500\u2500 2402.03300.html\n \u251c\u2500\u2500 2403.05530.html\n \u251c\u2500\u2500 2404.13046.html\n \u251c\u2500\u2500 2404.14367.html\n \u251c\u2500\u2500 2404.14396.html\n \u251c\u2500\u2500 2405.09818.html\n \u251c\u2500\u2500 2405.13911.html\n \u251c\u2500\u2500 2405.16473.html\n \u251c\u2500\u2500 2405.16640.html\n \u251c\u2500\u2500 2406.08478.html\n \u251c\u2500\u2500 2406.16852.html\n \u251c\u2500\u2500 2406.17294.html\n \u251c\u2500\u2500 2407.01284.html\n \u251c\u2500\u2500 2407.01509.html\n \u251c\u2500\u2500 2407.21783.html\n \u251c\u2500\u2500 2408.03326.html\n \u251c\u2500\u2500 2408.12528.html\n \u251c\u2500\u2500 2409.19256.html\n \u251c\u2500\u2500 2410.05993.html\n \u251c\u2500\u2500 2410.06166.html\n \u251c\u2500\u2500 2410.10563.html\n \u251c\u2500\u2500 2410.13848.html\n \u251c\u2500\u2500 2410.17885.html\n \u251c\u2500\u2500 2410.21276.html\n \u251c\u2500\u2500 2411.07975.html\n \u251c\u2500\u2500 2411.10442.html\n \u251c\u2500\u2500 2411.11930.html\n \u251c\u2500\u2500 2411.14432.html\n \u251c\u2500\u2500 2412.05271.html\n \u251c\u2500\u2500 2412.08443.html\n \u251c\u2500\u2500 2412.10302.html\n \u251c\u2500\u2500 2412.15115.html\n \u251c\u2500\u2500 2412.16720.html\n \u251c\u2500\u2500 2412.17256.html\n \u251c\u2500\u2500 2412.18319.html\n \u251c\u2500\u2500 2412.20631.html\n \u251c\u2500\u2500 2501.04686.html\n \u251c\u2500\u2500 2501.06186.html\n \u251c\u2500\u2500 2501.12599.html\n \u251c\u2500\u2500 2501.12948.html\n \u251c\u2500\u2500 2501.17811.html\n \u251c\u2500\u2500 2502.01456.html\n \u251c\u2500\u2500 2502.09621.html\n \u251c\u2500\u2500 2502.10391.html\n \u251c\u2500\u2500 2502.13923.html\n \u251c\u2500\u2500 2503.01785.html\n \u251c\u2500\u2500 2503.06520.html\n \u251c\u2500\u2500 2503.06749.html\n \u251c\u2500\u2500 2503.07065.html\n \u251c\u2500\u2500 2503.07365.html\n \u251c\u2500\u2500 2503.07536.html\n \u251c\u2500\u2500 2503.10291.html\n \u251c\u2500\u2500 2503.10615.html\n \u251c\u2500\u2500 2503.12937.html\n \u251c\u2500\u2500 2503.13939.html\n \u251c\u2500\u2500 2503.14476.html\n \u251c\u2500\u2500 2503.17352.html\n \u251c\u2500\u2500 2503.18892.html\n \u251c\u2500\u2500 2503.19786.html\n \u251c\u2500\u2500 2503.20783.html\n \u251c\u2500\u2500 2503.21620.html\n \u251c\u2500\u2500 2503.21776.html\n \u251c\u2500\u2500 2503.22679.html\n \u251c\u2500\u2500 2504.02587.html\n \u251c\u2500\u2500 2504.05599.html\n 
\u251c\u2500\u2500 2504.07491.html\n \u251c\u2500\u2500 2504.07934.html\n \u251c\u2500\u2500 2504.07954.html\n \u251c\u2500\u2500 2504.11455.html\n \u251c\u2500\u2500 2504.14945.html\n \u251c\u2500\u2500 2504.16656.html\n \u251c\u2500\u2500 2505.00703.html\n \u2514\u2500\u2500 arxiv_2025.bib", "stateUrl": "https://storage.mcpmark.ai/filesystem/papers.zip", "stateOriginalUrl": null } }
================================================ FILE: tasks/filesystem/easy/papers/papers_counting/verify.py ================================================
#!/usr/bin/env python3
"""
Verification script for Paper Counting Task: Count HTML Files
"""
import sys
from pathlib import Path
import os

def get_test_directory() -> Path:
    """Get the test directory from FILESYSTEM_TEST_DIR env var."""
    test_root = os.environ.get("FILESYSTEM_TEST_DIR")
    if not test_root:
        raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
    return Path(test_root)

def verify_count_file_exists(test_dir: Path) -> bool:
    """Verify that the count.txt file exists."""
    count_file = test_dir / "count.txt"
    if not count_file.exists():
        print("❌ File 'count.txt' not found")
        return False
    print("✅ count.txt file found")
    return True

def verify_count_content(test_dir: Path) -> bool:
    """Verify that count.txt contains the correct number (83)."""
    count_file = test_dir / "count.txt"
    try:
        content = count_file.read_text().strip()
        # Check if content is exactly "83"
        if content == "83":
            print("✅ count.txt contains the correct number: 83")
            return True
        else:
            print(f"❌ count.txt contains '{content}' but expected '83'")
            return False
    except Exception as e:
        print(f"❌ Error reading count.txt: {e}")
        return False

def verify_actual_html_count(test_dir: Path) -> bool:
    """Verify that there are actually 83 HTML files in the directory."""
    html_files = list(test_dir.glob("*.html"))
    count = len(html_files)
    if count == 83:
        print(f"✅ Verified: There are exactly {count} HTML files in the directory")
        return True
    else:
        print(f"⚠️ Found {count} HTML files in the directory (expected 83)")
        return False

def main():
    """Main verification function."""
    try:
        test_dir = get_test_directory()
        print(f"🔍 Verifying HTML file count in: {test_dir}")

        # Define verification steps
        verification_steps = [
            ("Count File Exists", verify_count_file_exists),
            ("Count Content", verify_count_content),
            ("Actual HTML Count", verify_actual_html_count),
        ]

        # Run all verification steps
        all_passed = True
        for step_name, verify_func in verification_steps:
            print(f"\n--- {step_name} ---")
            if not verify_func(test_dir):
                all_passed = False

        # Final result
        print("\n" + "="*50)
        if all_passed:
            print("✅ HTML file count is correct!")
            print("🎉 Task verification: PASS")
            sys.exit(0)
        else:
            print("❌ Task verification: FAIL")
            sys.exit(1)
    except Exception as e:
        print(f"❌ Verification failed with error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()

================================================ FILE: tasks/filesystem/easy/student_database/duplicate_name/description.md ================================================
Please help me identify any duplicate name from the list of all the 150 students. Do not use python code. You only need to find **any one** duplicate name.

Then generate a `namesake.txt` file to record the result in the following format, with only three lines. Note: when recording the name, replace underscores with spaces.

name: xxx
count: xxx
ids: xxx, xxx, ...
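For reference, a minimal sketch of the grouping that produces `namesake.txt`, shown in Python purely to make the expected three-line output concrete (the task itself must be completed without Python code, and the `root` path below is a hypothetical stand-in for wherever the student folders live):

```python
# Illustration only: group the "<id>_<First>_<Last>" folder names by the name
# part and print any one namesake group in the required three-line format.
from collections import defaultdict
from pathlib import Path

root = Path(".")  # hypothetical location of the 150 student folders
groups = defaultdict(list)
for folder in sorted(p.name for p in root.iterdir() if p.is_dir()):
    student_id, _, raw_name = folder.partition("_")
    groups[raw_name.replace("_", " ")].append(student_id)

for name, ids in groups.items():
    if len(ids) > 1:  # any one duplicate name is enough
        print(f"name: {name}")
        print(f"count: {len(ids)}")
        print(f"ids: {', '.join(ids)}")
        break
```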
================================================ FILE: tasks/filesystem/easy/student_database/duplicate_name/meta.json ================================================ { "task_id": "duplicate_name", "task_name": "Duplicate Name", "category_id": "student_database", "category_name": "Student Database", "description": "Search the 150 student folders for any repeated full name and document the name, count, and ids in namesake.txt.", "author": "Lingjun Chen", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "pattern analysis", "data validation" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "student_database/\n \u251c\u2500\u2500 20101250_Patricia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20101701_Isabella_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20102572_Michael_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104233_Robert_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104498_Sarah_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104653_Sophia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104675_Michael_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104846_Christopher_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20107487_Mia_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20108742_Sarah_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20109144_Emma_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20109803_Oliver_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20111634_Isabella_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20112439_Christopher_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20113368_William_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20113603_Robert_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20114397_Isabella_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20114869_Ethan_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115252_Mason_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115632_Elizabeth_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 
20115753_Charlotte_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115924_Michael_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20116232_Olivia_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20119528_Thomas_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20122427_Karen_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20122977_Evelyn_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20123376_Joseph_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20125451_Barbara_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126203_Barbara_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126394_Olivia_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126471_Ethan_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20127423_John_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20128249_Oliver_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20128879_Christopher_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20129898_Jessica_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20131271_Olivia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20131518_Sophia_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132026_Isabella_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132370_James_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132669_Noah_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20133527_Mason_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20133697_Isabella_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20135821_Thomas_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20136681_Benjamin_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20136890_Benjamin_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n 
\u251c\u2500\u2500 20137514_Lucas_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139234_Harper_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139637_Noah_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139647_Patricia_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20141421_Linda_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20142085_William_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20142383_Amelia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20143406_Susan_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20143830_James_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146035_Christopher_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146277_William_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146279_Christopher_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20147301_James_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20147789_James_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20148681_John_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20148778_Susan_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20149712_Jessica_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20151012_Harper_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153174_Benjamin_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153412_Charlotte_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153606_James_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153687_Richard_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20154518_John_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20154710_Benjamin_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156469_Jennifer_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 
\u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156522_Jennifer_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156851_Noah_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20157943_Harper_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158266_Sophia_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158294_Sophia_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158819_Sarah_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20159113_John_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20159695_James_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20161279_William_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20162253_Mason_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20162542_Mia_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20163356_Ava_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20164515_Patricia_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20164801_Noah_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20165511_Mary_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166436_Christopher_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166487_Barbara_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166564_Ava_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166998_Ava_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20168311_Lucas_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20168491_Karen_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20169515_Thomas_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171050_Christopher_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171406_Mary_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171613_Ethan_Moore/\n \u2502 \u251c\u2500\u2500 
basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20172106_Isabella_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173259_Michael_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173492_Richard_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173501_Mary_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173517_Susan_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20174207_Richard_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20174369_Mary_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20175314_William_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20176169_Lucas_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20176947_Noah_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20177389_James_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20178687_Isabella_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20179461_William_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20179690_Linda_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20181056_Sarah_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20182020_Patricia_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20182390_Ethan_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20183149_David_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20183219_Charlotte_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20184489_Jessica_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20186154_Charlotte_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20186510_James_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187107_David_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187144_Mary_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187892_Christopher_Taylor/\n \u2502 
\u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187921_Mary_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187967_Sarah_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20188937_James_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189123_Mary_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189192_Olivia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189268_Emma_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189854_William_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20191265_Joseph_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20192725_Robert_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194054_Michael_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194160_Benjamin_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194164_Sarah_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194525_John_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20195164_Jennifer_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20195982_David_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196776_William_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196896_Olivia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196961_Joseph_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196998_Ethan_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20198548_Evelyn_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199036_Benjamin_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199583_Mary_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199735_Mason_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199872_Sophia_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199980_James_Rodriguez/\n \u2502 
\u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20201385_John_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20201800_John_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20202548_Robert_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20203855_Mia_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u2514\u2500\u2500 20204611_Sarah_Wilson/\n \u251c\u2500\u2500 basic_info.txt\n \u2514\u2500\u2500 recommendation_letter.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/easy/student_database/duplicate_name/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Student Database Task: Find Duplicate Names Simplified version that only checks against expected results without folder validation """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_namesake_file_exists(test_dir: Path) -> bool: """Verify that the namesake.txt file exists.""" namesake_file = test_dir / "namesake.txt" if not namesake_file.exists(): print("❌ File 'namesake.txt' not found") return False print("✅ Namesake file found") return True def parse_namesake_file(test_dir: Path) -> dict: """Parse the namesake.txt file and return structured data.""" namesake_file = test_dir / "namesake.txt" try: content = namesake_file.read_text() lines = content.strip().split('\n') namesakes = {} current_line = 0 while current_line < len(lines): # Skip blank lines if not lines[current_line].strip(): current_line += 1 continue # Check if we have enough lines for a complete group if current_line + 2 >= len(lines): print(f"❌ Incomplete group at line {current_line + 1}") return {} # Parse group name_line = lines[current_line].strip() count_line = lines[current_line + 1].strip() ids_line = lines[current_line + 2].strip() # Extract name if not name_line.startswith("name: "): print(f"❌ Invalid name line format at line {current_line + 1}: {name_line}") return {} name = name_line.replace("name: ", "").strip() # Extract count if not count_line.startswith("count: "): print(f"❌ Invalid count line format at line {current_line + 2}: {count_line}") return {} count_str = count_line.replace("count: ", "").strip() try: count = int(count_str) except ValueError: print(f"❌ Invalid count format: {count_str}") return {} # Extract IDs if not ids_line.startswith("ids: "): print(f"❌ Invalid ids line format at line {current_line + 3}: {ids_line}") return {} ids_str = ids_line.replace("ids: ", "").strip() ids = [id.strip() for id in ids_str.split(",")] namesakes[name] = { 'count': count, 'ids': ids } current_line += 4 # Skip to next group (after blank line) return namesakes except Exception as e: print(f"❌ Error parsing namesake file: {e}") return {} def verify_against_expected_results(namesakes: dict) -> bool: """Verify that exactly 1 duplicate name is found and it is correct.""" # 
Expected duplicate names from answer.md (hardcoded) expected_duplicates = { 'Isabella Smith': ['20132026', '20133697'], 'Ava Lopez': ['20166564', '20166998'], 'James Moore': ['20159695', '20188937'], 'William Taylor': ['20175314', '20189854'], 'Ethan Wilson': ['20182390', '20196998'], 'Christopher Taylor': ['20128879', '20187892'], 'William Anderson': ['20142085', '20146277'], 'James Anderson': ['20147789', '20153606'], 'Olivia Jones': ['20189192', '20196896'], 'Mason Johnson': ['20115252', '20199735'], 'Benjamin Jackson': ['20153174', '20194160'], 'John Taylor': ['20194525', '20201385'], 'Susan Anderson': ['20148778', '20173517'], 'Christopher Moore': ['20112439', '20146279'], 'Sarah Wilson': ['20158819', '20204611'], 'Sarah Brown': ['20104498', '20108742'] } # Check if exactly 1 duplicate name is found if len(namesakes) != 1: print(f"❌ Expected exactly 1 duplicate name, but found {len(namesakes)}") return False print(f"✅ Found exactly 1 duplicate name (as required)") # Check if the namesake in the file is actually a correct duplicate for name, data in namesakes.items(): if name not in expected_duplicates: print(f"❌ '{name}' is not a duplicate name (not in expected list)") return False expected_ids = set(expected_duplicates[name]) stated_ids = set(data['ids']) if expected_ids != stated_ids: print(f"❌ ID mismatch for '{name}':") print(f" Expected: {sorted(expected_ids)}") print(f" Stated: {sorted(stated_ids)}") return False # Verify count matches if data['count'] != 2: print(f"❌ Count mismatch for '{name}': expected 2, got {data['count']}") return False print("✅ The identified duplicate name is correct") print("✅ All student IDs match expected results") print("✅ Count is correct (2 for the duplicate name)") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Student Database Task: Find Duplicate Names...") # Check if namesake file exists print("\n--- File Existence Check ---") if not verify_namesake_file_exists(test_dir): print("\n❌ Basic verification failed, cannot proceed with content verification") sys.exit(1) # Parse the file and run content verification print("\n--- Content Verification ---") namesakes = parse_namesake_file(test_dir) if not namesakes: print("❌ Failed to parse namesake file") sys.exit(1) # Verify against expected results print("\n--- Results Verification ---") if not verify_against_expected_results(namesakes): print("\n❌ Task verification: FAIL") sys.exit(1) # Final result print("\n" + "="*50) print("✅ Namesake identification completed correctly!") print(f"🎉 Found 1 duplicate name (exactly 1 required)") print("🎉 Task verification: PASS") sys.exit(0) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/easy/student_database/recommender_name/description.md ================================================ Please find the recommendation letter for Patricia Jones and identify who wrote it. Generate a `recommender.txt` file with only the author's name. 
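A minimal illustrative sketch of one way the recommender_name task could be done with plain Python file access (not part of the repository; it assumes the student folder matches `*_Patricia_Jones` and that the letter ends with the recommender's signature, neither of which is guaranteed by the task files):

```python
#!/usr/bin/env python3
"""Hypothetical solution sketch for the recommender_name task (illustrative only)."""
from pathlib import Path

root = Path("student_database")

# Student folders follow the pattern <student_id>_<First>_<Last>/.
folder = next(root.glob("*_Patricia_Jones"))
letter = (folder / "recommendation_letter.txt").read_text(encoding="utf-8")

# Assumption: the letter closes with the author's name on its last non-empty line.
author = [line.strip() for line in letter.splitlines() if line.strip()][-1]

# verify.py only checks that the recommender's name appears in recommender.txt.
Path("recommender.txt").write_text(author + "\n", encoding="utf-8")
```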
================================================ FILE: tasks/filesystem/easy/student_database/recommender_name/meta.json ================================================ { "task_id": "recommender_name", "task_name": "Recommender Name", "category_id": "student_database", "category_name": "Student Database", "description": "Read Patricia Jones's recommendation letter to capture who signed it and store only that name in recommender.txt.", "author": "Lingjun Chen", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "data extraction", "document search" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "student_database/\n \u251c\u2500\u2500 20101250_Patricia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20101701_Isabella_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20102572_Michael_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104233_Robert_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104498_Sarah_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104653_Sophia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104675_Michael_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104846_Christopher_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20107487_Mia_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20108742_Sarah_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20109144_Emma_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20109803_Oliver_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20111634_Isabella_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20112439_Christopher_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20113368_William_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20113603_Robert_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20114397_Isabella_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20114869_Ethan_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115252_Mason_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115632_Elizabeth_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 
20115753_Charlotte_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115924_Michael_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20116232_Olivia_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20119528_Thomas_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20122427_Karen_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20122977_Evelyn_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20123376_Joseph_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20125451_Barbara_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126203_Barbara_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126394_Olivia_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126471_Ethan_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20127423_John_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20128249_Oliver_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20128879_Christopher_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20129898_Jessica_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20131271_Olivia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20131518_Sophia_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132026_Isabella_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132370_James_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132669_Noah_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20133527_Mason_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20133697_Isabella_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20135821_Thomas_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20136681_Benjamin_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20136890_Benjamin_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n 
\u251c\u2500\u2500 20137514_Lucas_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139234_Harper_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139637_Noah_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139647_Patricia_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20141421_Linda_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20142085_William_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20142383_Amelia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20143406_Susan_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20143830_James_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146035_Christopher_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146277_William_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146279_Christopher_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20147301_James_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20147789_James_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20148681_John_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20148778_Susan_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20149712_Jessica_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20151012_Harper_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153174_Benjamin_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153412_Charlotte_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153606_James_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153687_Richard_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20154518_John_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20154710_Benjamin_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156469_Jennifer_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 
\u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156522_Jennifer_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156851_Noah_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20157943_Harper_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158266_Sophia_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158294_Sophia_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158819_Sarah_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20159113_John_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20159695_James_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20161279_William_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20162253_Mason_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20162542_Mia_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20163356_Ava_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20164515_Patricia_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20164801_Noah_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20165511_Mary_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166436_Christopher_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166487_Barbara_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166564_Ava_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166998_Ava_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20168311_Lucas_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20168491_Karen_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20169515_Thomas_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171050_Christopher_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171406_Mary_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171613_Ethan_Moore/\n \u2502 \u251c\u2500\u2500 
basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20172106_Isabella_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173259_Michael_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173492_Richard_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173501_Mary_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173517_Susan_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20174207_Richard_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20174369_Mary_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20175314_William_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20176169_Lucas_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20176947_Noah_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20177389_James_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20178687_Isabella_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20179461_William_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20179690_Linda_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20181056_Sarah_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20182020_Patricia_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20182390_Ethan_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20183149_David_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20183219_Charlotte_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20184489_Jessica_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20186154_Charlotte_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20186510_James_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187107_David_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187144_Mary_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187892_Christopher_Taylor/\n \u2502 
\u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187921_Mary_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187967_Sarah_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20188937_James_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189123_Mary_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189192_Olivia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189268_Emma_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189854_William_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20191265_Joseph_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20192725_Robert_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194054_Michael_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194160_Benjamin_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194164_Sarah_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194525_John_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20195164_Jennifer_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20195982_David_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196776_William_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196896_Olivia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196961_Joseph_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196998_Ethan_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20198548_Evelyn_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199036_Benjamin_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199583_Mary_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199735_Mason_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199872_Sophia_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199980_James_Rodriguez/\n \u2502 
\u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20201385_John_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20201800_John_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20202548_Robert_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20203855_Mia_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u2514\u2500\u2500 20204611_Sarah_Wilson/\n \u251c\u2500\u2500 basic_info.txt\n \u2514\u2500\u2500 recommendation_letter.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/easy/student_database/recommender_name/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Student Database Task: Find Recommender Name """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_recommender_file_exists(test_dir: Path) -> bool: """Verify that the recommender.txt file exists.""" recommender_file = test_dir / "recommender.txt" if not recommender_file.exists(): print("❌ File 'recommender.txt' not found") return False print("✅ Recommender file found") return True def verify_recommender_content(test_dir: Path) -> bool: """Verify that the recommender.txt file contains 'Brown'.""" recommender_file = test_dir / "recommender.txt" try: content = recommender_file.read_text() if "Brown" in content: print("✅ Recommender name 'Brown' found in file") return True else: print("❌ Recommender name 'Brown' not found in file") print(f" File content: {content.strip()}") return False except Exception as e: print(f"❌ Error reading recommender file: {e}") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Student Database Task: Find Recommender Name...") # Check if recommender file exists print("\n--- File Existence Check ---") if not verify_recommender_file_exists(test_dir): print("\n❌ Basic verification failed, cannot proceed with content verification") sys.exit(1) # Verify content print("\n--- Content Verification ---") if not verify_recommender_content(test_dir): print("\n❌ Task verification: FAIL") sys.exit(1) # Final result print("\n" + "="*50) print("✅ Recommender identification completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/desktop/music_report/description.md ================================================ Please use FileSystem tools to finish the following task: ### 1. Data Loading - Read and extract song information from `jay_chou/` - Read and extract song information from `jj_lin/` ### 2. 
Popularity Score Calculation For each songs, calculate popularity scores using this formula (keep 3 decimal places): ``` popularity_score = (rating × 0.4) + (play_count_normalized × 0.4) + (year_factor × 0.2) Where: - rating: song rating (1-5 scale) - play_count_normalized: play_count / 250 (0-1 scale) - year_factor: (2025 - release_year) / 25 (recency bonus) ``` ### 3. Generate Analysis Report Create a file named `music_analysis_report.txt` in the `music/` folder with the following exact format: **Lines 1-20**: Each line contains one song in format `songname:popularity_score` - Sort songs by popularity_score in descending order (highest first) - Use exact song names as they appear in the source files - Include all 20 songs from both artists **Lines 21-25**: Top 5 song names only (one per line) - List the top 5 songs by popularity_score - No scores, just song names - One song name per line **Important**: The file must contain exactly 25 lines with no additional content, headers, or formatting. ================================================ FILE: tasks/filesystem/standard/desktop/music_report/meta.json ================================================ { "task_id": "music_report", "task_name": "Music Report", "category_id": "desktop", "category_name": "Desktop", "description": "Search and analyze desktop music files to generate a scored recommendation list using specified computation rules and criteria.", "author": "Lingjun Chen", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "data extraction", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "desktop/\n ├── exp_logs/\n │ ├── aug/\n │ │ └── augmentation_log.txt\n │ ├── project_1/\n │ │ ├── data.csv\n │ │ ├── model.py\n │ │ └── README.md\n │ ├── project_2/\n │ │ ├── analysis_report.md\n │ │ └── data_analysis.py\n │ ├── sep/\n │ │ └── september_summary.csv\n │ ├── exp_record.md\n │ ├── experiment_summary.md\n │ └── results_record.csv\n ├── learning/\n │ ├── 2024/\n │ │ └── learning_progress.csv\n │ ├── 2025/\n │ │ └── learning_roadmap.md\n │ ├── activities/\n │ │ └── study_notes.py\n │ ├── research/\n │ │ └── research_topics.md\n │ ├── schedule/\n │ │ └── weekly_schedule.csv\n │ └── learning_goals.md\n ├── music/\n │ ├── beni/\n │ │ └── playlist_manager.py\n │ ├── jay_chou/\n │ │ └── favorite_songs.csv\n │ ├── jj_lin/\n │ │ └── top_songs.txt\n │ └── music_collection.md\n ├── old_homebrew/\n │ ├── 2023-09-23_22/\n │ │ ├── opt/\n │ │ └── Users/\n │ └── 2023-09-23_23/\n │ ├── opt/\n │ └── Users/\n ├── play/\n │ ├── game_plan/\n │ │ └── gaming_schedule.md\n │ ├── hongkong_tour/\n │ │ └── travel_itinerary.csv\n │ ├── kit&shoes_collection/\n │ │ └── inventory.py\n │ └── others/\n │ └── entertainment_planner.md\n └── travel_plan/\n ├── travel_bucket_list.md\n └── travel_calculator.py\n", "stateUrl": "https://storage.mcpmark.ai/filesystem/desktop.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/desktop/music_report/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Desktop 2 Music Report Task: Music Collection Analysis """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) # Hardcoded expected data from answer.json 
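# For reference, each hardcoded score below follows the formula in description.md:
#     popularity_score = rating*0.4 + (play_count/250)*0.4 + ((2025 - release_year)/25)*0.2
# kept to 3 decimal places. A hypothetical helper (illustrative only, not used by this script):
#     def popularity_score(rating, play_count, release_year):
#         return round(rating * 0.4 + play_count / 250 * 0.4 + (2025 - release_year) / 25 * 0.2, 3)
# with per-song inputs taken from music/jay_chou/favorite_songs.csv and music/jj_lin/top_songs.txt.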
EXPECTED_SONGS = [ {"song_name": "晴天", "popularity_score": 2.576}, {"song_name": "七里香", "popularity_score": 2.488}, {"song_name": "江南", "popularity_score": 2.488}, {"song_name": "夜曲", "popularity_score": 2.448}, {"song_name": "一千年以后", "popularity_score": 2.44}, {"song_name": "稻香", "popularity_score": 2.376}, {"song_name": "青花瓷", "popularity_score": 2.336}, {"song_name": "不为谁而作的歌", "popularity_score": 2.32}, {"song_name": "学不会", "popularity_score": 2.304}, {"song_name": "小酒窝", "popularity_score": 2.264}, {"song_name": "可惜没如果", "popularity_score": 2.248}, {"song_name": "修炼爱情", "popularity_score": 2.24}, {"song_name": "背对背拥抱", "popularity_score": 2.24}, {"song_name": "爱笑的眼睛", "popularity_score": 2.232}, {"song_name": "她说", "popularity_score": 2.216}, {"song_name": "简单爱", "popularity_score": 1.952}, {"song_name": "龙卷风", "popularity_score": 1.936}, {"song_name": "双截棍", "popularity_score": 1.92}, {"song_name": "可爱女人", "popularity_score": 1.912}, {"song_name": "星晴", "popularity_score": 1.896} ] EXPECTED_TOP_5 = ["晴天", "七里香", "江南", "夜曲", "一千年以后"] def verify_report_file_exists(test_dir: Path) -> bool: """Verify that the music_analysis_report.txt file exists.""" report_file = test_dir / "music" / "music_analysis_report.txt" if not report_file.exists(): print("❌ 'music_analysis_report.txt' file not found in music/ folder") return False if not report_file.is_file(): print("❌ 'music_analysis_report.txt' exists but is not a file") return False print("✅ 'music_analysis_report.txt' file exists") return True def verify_file_content_structure(test_dir: Path) -> bool: """Verify that the file has exactly 25 lines.""" report_file = test_dir / "music" / "music_analysis_report.txt" try: content = report_file.read_text(encoding='utf-8') lines = content.strip().split('\n') if len(lines) != 25: print(f"❌ File should have exactly 25 lines, but has {len(lines)}") return False print("✅ File has exactly 25 lines") return True except Exception as e: print(f"❌ Error reading file content: {e}") return False def verify_song_ranking_format(test_dir: Path) -> bool: """Verify that lines 1-20 contain songs with scores in correct format.""" report_file = test_dir / "music" / "music_analysis_report.txt" try: content = report_file.read_text(encoding='utf-8') lines = content.strip().split('\n') # Check lines 1-20 (index 0-19) for i in range(20): line = lines[i].strip() if not line: print(f"❌ Line {i+1} is empty") return False # Check format: songname:popularity_score if ':' not in line: print(f"❌ Line {i+1} missing colon separator: '{line}'") return False parts = line.split(':', 1) if len(parts) != 2: print(f"❌ Line {i+1} has incorrect format: '{line}'") return False song_name, score_str = parts if not song_name.strip(): print(f"❌ Line {i+1} has empty song name: '{line}'") return False try: score = float(score_str.strip()) if score < 0 or score > 5: print(f"❌ Line {i+1} has invalid score range: {score}") return False except ValueError: print(f"❌ Line {i+1} has invalid score format: '{score_str}'") return False print("✅ Lines 1-20 have correct song:score format") return True except Exception as e: print(f"❌ Error checking song ranking format: {e}") return False def verify_song_ranking_order_with_tolerance(test_dir: Path) -> bool: """Verify that songs are ranked by popularity score in descending order, allowing equal scores to be swapped.""" report_file = test_dir / "music" / "music_analysis_report.txt" try: content = report_file.read_text(encoding='utf-8') lines = content.strip().split('\n') scores = [] for i in range(20): line = 
lines[i].strip() parts = line.split(':', 1) score = float(parts[1].strip()) scores.append(score) # Check if scores are in descending order, allowing equal scores to be adjacent for i in range(1, len(scores)): if scores[i] > scores[i-1]: print(f"❌ Scores not in descending order: {scores[i-1]} < {scores[i]} at line {i+1}") return False print("✅ Songs are ranked by popularity score in descending order (allowing equal scores)") return True except Exception as e: print(f"❌ Error checking song ranking order: {e}") return False def verify_song_names_match_expected(test_dir: Path) -> bool: """Verify that all expected song names are present in the ranking.""" report_file = test_dir / "music" / "music_analysis_report.txt" try: content = report_file.read_text(encoding='utf-8') lines = content.strip().split('\n') found_songs = [] for i in range(20): line = lines[i].strip() song_name = line.split(':', 1)[0].strip() found_songs.append(song_name) # Check if all expected songs are present missing_songs = [] for expected_song in EXPECTED_SONGS: if expected_song["song_name"] not in found_songs: missing_songs.append(expected_song["song_name"]) if missing_songs: print(f"❌ Missing expected songs: {missing_songs}") return False print("✅ All expected song names are present") return True except Exception as e: print(f"❌ Error checking song names: {e}") return False def verify_popularity_scores_match_expected(test_dir: Path) -> bool: """Verify that popularity scores match the expected values.""" report_file = test_dir / "music" / "music_analysis_report.txt" try: content = report_file.read_text(encoding='utf-8') lines = content.strip().split('\n') score_errors = [] for i in range(20): line = lines[i].strip() parts = line.split(':', 1) song_name = parts[0].strip() actual_score = float(parts[1].strip()) # Find expected score for this song expected_score = None for expected_song in EXPECTED_SONGS: if expected_song["song_name"] == song_name: expected_score = expected_song["popularity_score"] break if expected_score is not None: # Allow small floating point precision differences if abs(actual_score - expected_score) > 0.001: score_errors.append(f"{song_name}: expected {expected_score}, got {actual_score}") if score_errors: print(f"❌ Score mismatches: {score_errors}") return False print("✅ All popularity scores match expected values") return True except Exception as e: print(f"❌ Error checking popularity scores: {e}") return False def verify_top_5_songs(test_dir: Path) -> bool: """Verify that lines 21-25 contain the top 5 song names, allowing equal scores to be in different order.""" report_file = test_dir / "music" / "music_analysis_report.txt" try: content = report_file.read_text(encoding='utf-8') lines = content.strip().split('\n') # Check lines 21-25 (index 20-24) found_top_5 = [] for i in range(5): line_num = i + 21 line = lines[i + 20].strip() # Index 20-24 for lines 21-25 if not line: print(f"❌ Line {line_num} is empty") return False if ':' in line: print(f"❌ Line {line_num} should not contain colon: '{line}'") return False found_top_5.append(line) # Check if all expected top 5 songs are present (order doesn't matter for equal scores) missing_songs = [] for expected_song in EXPECTED_TOP_5: if expected_song not in found_top_5: missing_songs.append(expected_song) if missing_songs: print(f"❌ Missing expected top 5 songs: {missing_songs}") return False # Check if the order is valid (allowing equal scores to be swapped) # Since 七里香 and 江南 both have score 2.488, they can be in either order valid_orders = [ ["晴天", "七里香", 
"江南", "夜曲", "一千年以后"], # Original order ["晴天", "江南", "七里香", "夜曲", "一千年以后"], # Swapped 七里香 and 江南 ] order_valid = False for valid_order in valid_orders: if found_top_5 == valid_order: order_valid = True break if not order_valid: print(f"❌ Top 5 songs order is invalid. Found: {found_top_5}") print(f"Expected one of: {valid_orders}") return False print("✅ Lines 21-25 contain correct top 5 song names in valid order") return True except Exception as e: print(f"❌ Error checking top 5 songs: {e}") return False def verify_no_extra_content(test_dir: Path) -> bool: """Verify that the file contains no extra content beyond the 25 lines.""" report_file = test_dir / "music" / "music_analysis_report.txt" try: content = report_file.read_text(encoding='utf-8') lines = content.strip().split('\n') if len(lines) != 25: print(f"❌ File should have exactly 25 lines, but has {len(lines)}") return False print("✅ File contains exactly 25 lines with no extra content") return True except Exception as e: print(f"❌ Error checking for extra content: {e}") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Desktop 2 Music Report Task: Music Collection Analysis...") # Define verification steps verification_steps = [ ("Report File Exists", verify_report_file_exists), ("File Content Structure", verify_file_content_structure), ("Song Ranking Format", verify_song_ranking_format), ("Song Ranking Order", verify_song_ranking_order_with_tolerance), ("Song Names Match Expected", verify_song_names_match_expected), ("Popularity Scores Match Expected", verify_popularity_scores_match_expected), ("Top 5 Songs", verify_top_5_songs), ("No Extra Content", verify_no_extra_content), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Music collection analysis completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/desktop/project_management/description.md ================================================ Please use FileSystem tools to finish the following task: 1. **Create the main directory structure** in `desktop_2`: - Create a new directory in main directory called `organized_projects` - Inside `organized_projects`, create 3 main subdirectories: `experiments`, `learning`, and `personal` - Inside `experiments`, create 2 subdirectories: `ml_projects` and `data_analysis` - Inside `learning`, create 2 subdirectories: `progress_tracking` and `resources` - Inside `personal`, create 2 subdirectories: `entertainment` and `collections` 2. **Move all the Python files** to `experiments/ml_projects/`: 3. **Move all the CSV files** to `experiments/data_analysis/`: 4. **Only Move learning-related markdown files** to `learning/resources/`: 5. **Only Move entertainment planning-related markdown files** to `personal/entertainment/`: 6. **Only Move music collection-related markdown files** to `personal/collections/`: 7. **step 4.5.6 should move all the markdown files.** 8. 
**Create a project structure documentation file**: - Create `project_structure.md` in the `organized_projects` directory - Document the new organization with exact file counts for each subdirectory - Include a summary of what types of files are in each directory ================================================ FILE: tasks/filesystem/standard/desktop/project_management/meta.json ================================================ { "task_id": "project_management", "task_name": "Project Management", "category_id": "desktop", "category_name": "Desktop", "description": "Reorganize scattered desktop files into a structured project directory system based on content type, purpose, and file format analysis.", "author": "Lingjun Chen", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "file organization" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "desktop/\n ├── exp_logs/\n │ ├── aug/\n │ │ └── augmentation_log.txt\n │ ├── project_1/\n │ │ ├── data.csv\n │ │ ├── model.py\n │ │ └── README.md\n │ ├── project_2/\n │ │ ├── analysis_report.md\n │ │ └── data_analysis.py\n │ ├── sep/\n │ │ └── september_summary.csv\n │ ├── exp_record.md\n │ ├── experiment_summary.md\n │ └── results_record.csv\n ├── learning/\n │ ├── 2024/\n │ │ └── learning_progress.csv\n │ ├── 2025/\n │ │ └── learning_roadmap.md\n │ ├── activities/\n │ │ └── study_notes.py\n │ ├── research/\n │ │ └── research_topics.md\n │ ├── schedule/\n │ │ └── weekly_schedule.csv\n │ └── learning_goals.md\n ├── music/\n │ ├── beni/\n │ │ └── playlist_manager.py\n │ ├── jay_chou/\n │ │ └── favorite_songs.csv\n │ ├── jj_lin/\n │ │ └── top_songs.txt\n │ └── music_collection.md\n ├── old_homebrew/\n │ ├── 2023-09-23_22/\n │ │ ├── opt/\n │ │ └── Users/\n │ └── 2023-09-23_23/\n │ ├── opt/\n │ └── Users/\n ├── play/\n │ ├── game_plan/\n │ │ └── gaming_schedule.md\n │ ├── hongkong_tour/\n │ │ └── travel_itinerary.csv\n │ ├── kit&shoes_collection/\n │ │ └── inventory.py\n │ └── others/\n │ └── entertainment_planner.md\n └── travel_plan/\n ├── travel_bucket_list.md\n └── travel_calculator.py\n", "stateUrl": "https://storage.mcpmark.ai/filesystem/desktop.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/desktop/project_management/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Desktop 2 Project Management Task: File Reorganization """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_organized_projects_directory_exists(test_dir: Path) -> bool: """Verify that the organized_projects directory exists.""" organized_dir = test_dir / "organized_projects" if not organized_dir.exists(): print("❌ 'organized_projects' directory not found") return False if not organized_dir.is_dir(): print("❌ 'organized_projects' exists but is not a directory") return False print("✅ 'organized_projects' directory exists") return True def verify_directory_structure(test_dir: Path) -> bool: """Verify that all required subdirectories exist.""" organized_dir = test_dir / "organized_projects" required_dirs = [ "experiments", "experiments/ml_projects", "experiments/data_analysis", "learning", "learning/progress_tracking", "learning/resources", "personal", 
"personal/entertainment", "personal/collections" ] missing_dirs = [] for dir_path in required_dirs: full_path = organized_dir / dir_path if not full_path.exists(): missing_dirs.append(dir_path) elif not full_path.is_dir(): missing_dirs.append(f"{dir_path} (not a directory)") if missing_dirs: print(f"❌ Missing or invalid directories: {missing_dirs}") return False print("✅ All required directory structure created correctly") return True def verify_python_files_in_ml_projects(test_dir: Path) -> bool: """Verify that all Python files are moved to experiments/ml_projects.""" organized_dir = test_dir / "organized_projects" ml_projects_dir = organized_dir / "experiments" / "ml_projects" expected_python_files = [ "study_notes.py", "model.py", "data_analysis.py", "travel_calculator.py", "inventory.py", "playlist_manager.py" ] missing_files = [] for filename in expected_python_files: file_path = ml_projects_dir / filename if not file_path.exists(): missing_files.append(filename) if missing_files: print(f"❌ Missing Python files in ml_projects: {missing_files}") return False print("✅ All Python files moved to experiments/ml_projects") return True def verify_csv_files_in_data_analysis(test_dir: Path) -> bool: """Verify that all CSV files are moved to experiments/data_analysis.""" organized_dir = test_dir / "organized_projects" data_analysis_dir = organized_dir / "experiments" / "data_analysis" expected_csv_files = [ "learning_progress.csv", "weekly_schedule.csv", "results_record.csv", "september_summary.csv", "data.csv", "favorite_songs.csv", "travel_itinerary.csv" ] missing_files = [] for filename in expected_csv_files: file_path = data_analysis_dir / filename if not file_path.exists(): missing_files.append(filename) if missing_files: print(f"❌ Missing CSV files in data_analysis: {missing_files}") return False print("✅ All CSV files moved to experiments/data_analysis") return True def verify_learning_md_files_in_resources(test_dir: Path) -> bool: """Verify that learning-related markdown files are moved to learning/resources.""" organized_dir = test_dir / "organized_projects" resources_dir = organized_dir / "learning" / "resources" expected_learning_files = [ "learning_roadmap.md", "research_topics.md", "experiment_summary.md", "exp_record.md", "README.md", "analysis_report.md", "learning_goals.md" ] missing_files = [] for filename in expected_learning_files: file_path = resources_dir / filename if not file_path.exists(): missing_files.append(filename) if missing_files: print(f"❌ Missing learning markdown files in resources: {missing_files}") return False print("✅ All learning markdown files moved to learning/resources") return True def verify_entertainment_md_files_in_entertainment(test_dir: Path) -> bool: """Verify that entertainment planning markdown files are moved to personal/entertainment.""" organized_dir = test_dir / "organized_projects" entertainment_dir = organized_dir / "personal" / "entertainment" expected_entertainment_files = [ "gaming_schedule.md", "entertainment_planner.md", "travel_bucket_list.md" ] missing_files = [] for filename in expected_entertainment_files: file_path = entertainment_dir / filename if not file_path.exists(): missing_files.append(filename) if missing_files: print(f"❌ Missing entertainment markdown files in entertainment: {missing_files}") return False print("✅ All entertainment markdown files moved to personal/entertainment") return True def verify_music_md_files_in_collections(test_dir: Path) -> bool: """Verify that music collection markdown files are moved to 
personal/collections.""" organized_dir = test_dir / "organized_projects" collections_dir = organized_dir / "personal" / "collections" expected_music_files = [ "music_collection.md" ] missing_files = [] for filename in expected_music_files: file_path = collections_dir / filename if not file_path.exists(): missing_files.append(filename) if missing_files: print(f"❌ Missing music collection markdown files in collections: {filename}") return False print("✅ All music collection markdown files moved to personal/collections") return True def verify_progress_tracking_empty(test_dir: Path) -> bool: """Verify that progress_tracking directory is empty.""" organized_dir = test_dir / "organized_projects" progress_dir = organized_dir / "learning" / "progress_tracking" files_in_progress = list(progress_dir.iterdir()) if files_in_progress: print(f"❌ progress_tracking directory should be empty, but contains: {[f.name for f in files_in_progress]}") return False print("✅ progress_tracking directory is correctly empty") return True def verify_project_structure_file_exists(test_dir: Path) -> bool: """Verify that project_structure.md file exists.""" organized_dir = test_dir / "organized_projects" structure_file = organized_dir / "project_structure.md" if not structure_file.exists(): print("❌ 'project_structure.md' file not found") return False if not structure_file.is_file(): print("❌ 'project_structure.md' exists but is not a file") return False print("✅ 'project_structure.md' file exists") return True def verify_file_counts(test_dir: Path) -> bool: """Verify that each directory has the correct number of files.""" organized_dir = test_dir / "organized_projects" expected_counts = { "experiments/ml_projects": 6, # 6 Python files "experiments/data_analysis": 7, # 7 CSV files "learning/resources": 7, # 7 learning markdown files "learning/progress_tracking": 0, # 0 files (empty) "personal/entertainment": 3, # 3 entertainment markdown files "personal/collections": 1 # 1 music collection markdown file } incorrect_counts = [] for dir_path, expected_count in expected_counts.items(): full_path = organized_dir / dir_path actual_count = len([f for f in full_path.iterdir() if f.is_file()]) if actual_count != expected_count: incorrect_counts.append(f"{dir_path}: expected {expected_count}, got {actual_count}") if incorrect_counts: print(f"❌ Incorrect file counts: {incorrect_counts}") return False print("✅ All directories have correct file counts") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Desktop 2 Project Management Task: File Reorganization...") # Define verification steps verification_steps = [ ("Organized Projects Directory Exists", verify_organized_projects_directory_exists), ("Directory Structure", verify_directory_structure), ("Python Files in ML Projects", verify_python_files_in_ml_projects), ("CSV Files in Data Analysis", verify_csv_files_in_data_analysis), ("Learning Markdown Files in Resources", verify_learning_md_files_in_resources), ("Entertainment Markdown Files in Entertainment", verify_entertainment_md_files_in_entertainment), ("Music Collection Files in Collections", verify_music_md_files_in_collections), ("Progress Tracking Empty", verify_progress_tracking_empty), ("Project Structure File Exists", verify_project_structure_file_exists), ("File Counts", verify_file_counts), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = 
False # Final result print("\n" + "="*50) if all_passed: print("✅ Desktop 2 project reorganization completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/desktop/timeline_extraction/description.md ================================================ Please use FileSystem tools to finish the following task: Read all the files under current path, extract every time/plan information that clearly indicates 2024, and integrate them into a list and create a file in main directory called `timeline.txt`. Write the timeline in the file in the following format. ### Rules - If a task only shows month without day, use the 1st day of that month - If a task only shows year without month and day, skip it. - If a file shows multiple tasks on the same date, count only once per date ### Output Format - Each line format: `file_path:time` - `file_path`: The file path where this time information appears (**relative to the current path**) - `time`: Specific time, if it's a time period, write the start time (YYYY-MM-DD) ### Sorting Requirements - Sort by chronological order ================================================ FILE: tasks/filesystem/standard/desktop/timeline_extraction/meta.json ================================================ { "task_id": "timeline_extraction", "task_name": "Timeline Extraction", "category_id": "desktop", "category_name": "Desktop", "description": "Extract temporal event information from various desktop files and compile a comprehensive chronological timeline of activities and milestones.", "author": "Lingjun Chen", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "data extraction", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "desktop/\n ├── exp_logs/\n │ ├── aug/\n │ │ └── augmentation_log.txt\n │ ├── project_1/\n │ │ ├── data.csv\n │ │ ├── model.py\n │ │ └── README.md\n │ ├── project_2/\n │ │ ├── analysis_report.md\n │ │ └── data_analysis.py\n │ ├── sep/\n │ │ └── september_summary.csv\n │ ├── exp_record.md\n │ ├── experiment_summary.md\n │ └── results_record.csv\n ├── learning/\n │ ├── 2024/\n │ │ └── learning_progress.csv\n │ ├── 2025/\n │ │ └── learning_roadmap.md\n │ ├── activities/\n │ │ └── study_notes.py\n │ ├── research/\n │ │ └── research_topics.md\n │ ├── schedule/\n │ │ └── weekly_schedule.csv\n │ └── learning_goals.md\n ├── music/\n │ ├── beni/\n │ │ └── playlist_manager.py\n │ ├── jay_chou/\n │ │ └── favorite_songs.csv\n │ ├── jj_lin/\n │ │ └── top_songs.txt\n │ └── music_collection.md\n ├── old_homebrew/\n │ ├── 2023-09-23_22/\n │ │ ├── opt/\n │ │ └── Users/\n │ └── 2023-09-23_23/\n │ ├── opt/\n │ └── Users/\n ├── play/\n │ ├── game_plan/\n │ │ └── gaming_schedule.md\n │ ├── hongkong_tour/\n │ │ └── travel_itinerary.csv\n │ ├── kit&shoes_collection/\n │ │ └── inventory.py\n │ └── others/\n │ └── entertainment_planner.md\n └── travel_plan/\n ├── travel_bucket_list.md\n └── travel_calculator.py\n", "stateUrl": "https://storage.mcpmark.ai/filesystem/desktop.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/desktop/timeline_extraction/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Desktop 2 Timeline Extraction Task """ import sys from pathlib import Path import os import re from datetime import datetime 
from typing import List, Tuple, Set def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_timeline_file_exists(test_dir: Path) -> bool: """Verify that the timeline.txt file exists in the main directory.""" timeline_file = test_dir / "timeline.txt" if not timeline_file.exists(): print("❌ 'timeline.txt' file not found in main directory") return False if not timeline_file.is_file(): print("❌ 'timeline.txt' exists but is not a file") return False print("✅ 'timeline.txt' file exists in main directory") return True def verify_timeline_file_readable(test_dir: Path) -> bool: """Verify that the timeline.txt file is readable.""" timeline_file = test_dir / "timeline.txt" try: content = timeline_file.read_text(encoding='utf-8') if not content.strip(): print("❌ 'timeline.txt' file is empty") return False print("✅ 'timeline.txt' file is readable") return True except Exception as e: print(f"❌ Error reading 'timeline.txt' file: {e}") return False def verify_line_count(test_dir: Path) -> bool: """Verify that the timeline.txt file has exactly 43 lines.""" timeline_file = test_dir / "timeline.txt" try: content = timeline_file.read_text(encoding='utf-8') lines = [line.strip() for line in content.split('\n') if line.strip()] if len(lines) != 43: print(f"❌ Expected 43 lines, but found {len(lines)} lines") return False print(f"✅ File contains exactly {len(lines)} lines") return True except Exception as e: print(f"❌ Error checking line count: {e}") return False def verify_line_format(test_dir: Path) -> bool: """Verify that each line contains both file path and date time information.""" timeline_file = test_dir / "timeline.txt" try: content = timeline_file.read_text(encoding='utf-8') lines = [line.strip() for line in content.split('\n') if line.strip()] # More flexible pattern: just check if line contains both path-like content and date-like content date_pattern = r'\d{4}-\d{2}-\d{2}' # YYYY-MM-DD format invalid_lines = [] for i, line in enumerate(lines, 1): # Check if line contains a date if not re.search(date_pattern, line): invalid_lines.append(f"Line {i}: '{line}' (no valid date found)") continue # Check if line contains path-like content (contains '/' or '.' and not just a date) # More flexible: look for path anywhere in the line, not just at the beginning path_found = False # Split line into words and look for path-like content words = line.split() for word in words: # Check if word looks like a file path (contains '/' or '.' and not just a date) if ('/' in word or '.' in word) and not re.match(r'^\d{4}-\d{2}-\d{2}$', word.strip()): path_found = True break # Also check if line contains path-like content with colon separator if ':' in line: parts = line.split(':') for part in parts: if ('/' in part or '.' 
in part) and not re.match(r'^\d{4}-\d{2}-\d{2}$', part.strip()): path_found = True break if not path_found: invalid_lines.append(f"Line {i}: '{line}' (no valid path found)") continue if invalid_lines: print(f"❌ Invalid line format found: {invalid_lines[:5]}...") return False print("✅ All lines contain both file path and date time information") return True except Exception as e: print(f"❌ Error checking line format: {e}") return False def verify_date_format(test_dir: Path) -> bool: """Verify that all dates are in valid YYYY-MM-DD format.""" timeline_file = test_dir / "timeline.txt" try: content = timeline_file.read_text(encoding='utf-8') lines = [line.strip() for line in content.split('\n') if line.strip()] invalid_dates = [] for i, line in enumerate(lines, 1): try: # Find date pattern in the line (more flexible) date_match = re.search(r'\d{4}-\d{2}-\d{2}', line) if not date_match: invalid_dates.append(f"Line {i}: '{line}' (no date found)") continue date_part = date_match.group() datetime.strptime(date_part, '%Y-%m-%d') except (IndexError, ValueError) as e: invalid_dates.append(f"Line {i}: '{line}' (invalid date: {e})") if invalid_dates: print(f"❌ Invalid date format found: {invalid_dates[:5]}...") return False print("✅ All dates are in valid YYYY-MM-DD format") return True except Exception as e: print(f"❌ Error checking date format: {e}") return False def verify_chronological_order(test_dir: Path) -> bool: """Verify that dates are in chronological order.""" timeline_file = test_dir / "timeline.txt" try: content = timeline_file.read_text(encoding='utf-8') lines = [line.strip() for line in content.split('\n') if line.strip()] dates = [] for line in lines: # Find date pattern in the line (more flexible) date_match = re.search(r'\d{4}-\d{2}-\d{2}', line) if date_match: date_obj = datetime.strptime(date_match.group(), '%Y-%m-%d') dates.append(date_obj) # Check if dates are in ascending order for i in range(1, len(dates)): if dates[i] < dates[i-1]: print(f"❌ Date order violation: {dates[i-1].strftime('%Y-%m-%d')} comes after {dates[i].strftime('%Y-%m-%d')}") return False print("✅ All dates are in chronological order") return True except Exception as e: print(f"❌ Error checking chronological order: {e}") return False def verify_expected_entries(test_dir: Path) -> bool: """Verify that all expected entries from answer.txt are present.""" timeline_file = test_dir / "timeline.txt" try: content = timeline_file.read_text(encoding='utf-8') actual_lines = [line.strip() for line in content.split('\n') if line.strip()] # Expected entries from answer.txt expected_entries = { "exp_logs/project_2/analysis_report.md:2024-01-01", "learning/2024/learning_progress.csv:2024-01-01", "exp_logs/experiment_summary.md:2024-01-05", "play/kit&shoes_collection/inventory.py:2024-01-05", "exp_logs/experiment_summary.md:2024-01-10", "play/kit&shoes_collection/inventory.py:2024-01-10", "exp_logs/aug/augmentation_log.txt:2024-01-15", "exp_logs/experiment_summary.md:2024-01-15", "play/kit&shoes_collection/inventory.py:2024-01-15", "learning/2024/learning_progress.csv:2024-02-01", "learning/2024/learning_progress.csv:2024-03-01", "play/hongkong_tour/travel_itinerary.csv:2024-03-15", "travel_plan/travel_calculator.py:2024-03-15", "play/hongkong_tour/travel_itinerary.csv:2024-03-16", "play/hongkong_tour/travel_itinerary.csv:2024-03-17", "play/hongkong_tour/travel_itinerary.csv:2024-03-18", "play/hongkong_tour/travel_itinerary.csv:2024-03-19", "play/hongkong_tour/travel_itinerary.csv:2024-03-20", 
"travel_plan/travel_bucket_list.md:2024-04-01", "learning/2024/learning_progress.csv:2024-04-01", "learning/2024/learning_progress.csv:2024-05-01", "travel_plan/travel_bucket_list.md:2024-06-01", "learning/2024/learning_progress.csv:2024-06-01", "learning/2024/learning_progress.csv:2024-07-01", "exp_logs/exp_record.md:2024-08-01", "exp_logs/results_record.csv:2024-08-01", "travel_plan/travel_bucket_list.md:2024-08-01", "learning/2024/learning_progress.csv:2024-08-01", "exp_logs/results_record.csv:2024-08-02", "exp_logs/results_record.csv:2024-08-03", "exp_logs/results_record.csv:2024-08-04", "exp_logs/exp_record.md:2024-09-01", "exp_logs/sep/september_summary.csv:2024-09-01", "learning/2024/learning_progress.csv:2024-09-01", "exp_logs/sep/september_summary.csv:2024-09-05", "exp_logs/sep/september_summary.csv:2024-09-10", "exp_logs/sep/september_summary.csv:2024-09-15", "exp_logs/sep/september_summary.csv:2024-09-20", "exp_logs/sep/september_summary.csv:2024-09-25", "exp_logs/sep/september_summary.csv:2024-09-30", "learning/2024/learning_progress.csv:2024-10-01", "learning/2024/learning_progress.csv:2024-11-01", "learning/2024/learning_progress.csv:2024-12-01" } # Check if each expected entry is found in actual lines (more flexible matching) missing_entries = [] for expected in expected_entries: expected_path, expected_date = expected.split(':') found = False for actual_line in actual_lines: # Check if line contains both the expected path and date # More flexible: path can be anywhere in the line, not just at the beginning if expected_path in actual_line and expected_date in actual_line: found = True break if not found: missing_entries.append(expected) # Check for extra entries (lines that don't match any expected pattern) extra_entries = [] for actual_line in actual_lines: # Extract date from actual line date_match = re.search(r'\d{4}-\d{2}-\d{2}', actual_line) if not date_match: continue actual_date = date_match.group() # Try to extract file path from the line actual_path = None words = actual_line.split() for word in words: if ('/' in word or '.' 
in word) and not re.match(r'^\d{4}-\d{2}-\d{2}$', word.strip()): actual_path = word break if not actual_path: continue # Find if this line matches any expected entry found_expected = False for expected in expected_entries: expected_path, expected_date = expected.split(':') if expected_path in actual_path and expected_date == actual_date: found_expected = True break if not found_expected: extra_entries.append(actual_line) if missing_entries: print(f"❌ Missing {len(missing_entries)} expected entries") print(f" Examples: {missing_entries[:3]}") return False if extra_entries: print(f"❌ Found {len(extra_entries)} unexpected entries") print(f" Examples: {extra_entries[:3]}") return False print("✅ All expected entries are present, no extra entries") return True except Exception as e: print(f"❌ Error checking expected entries: {e}") return False def verify_no_duplicates(test_dir: Path) -> bool: """Verify that there are no duplicate entries.""" timeline_file = test_dir / "timeline.txt" try: content = timeline_file.read_text(encoding='utf-8') lines = [line.strip() for line in content.split('\n') if line.strip()] if len(lines) != len(set(lines)): print("❌ Duplicate entries found in timeline.txt") return False print("✅ No duplicate entries found") return True except Exception as e: print(f"❌ Error checking for duplicates: {e}") return False def verify_file_paths_exist(test_dir: Path) -> bool: """Verify that all file paths mentioned in timeline.txt actually exist.""" timeline_file = test_dir / "timeline.txt" try: content = timeline_file.read_text(encoding='utf-8') lines = [line.strip() for line in content.split('\n') if line.strip()] missing_files = [] for line in lines: # Try to extract file path from the line (more flexible) file_path_found = False # Method 1: Split by colon and check each part if ':' in line: parts = line.split(':') for part in parts: part = part.strip() if part and ('/' in part or '.' in part) and not re.match(r'^\d{4}-\d{2}-\d{2}$', part): # This looks like a file path full_path = test_dir / part if not full_path.exists(): missing_files.append(part) file_path_found = True break # Method 2: Split into words and look for path-like content if not file_path_found: words = line.split() for word in words: word = word.strip() if ('/' in word or '.' in word) and not re.match(r'^\d{4}-\d{2}-\d{2}$', word): # This looks like a file path full_path = test_dir / word if not full_path.exists(): missing_files.append(word) file_path_found = True break # Method 3: Look for path pattern in the entire line if not file_path_found: # Use regex to find path-like patterns path_pattern = r'[a-zA-Z0-9_\-\.\/]+/[a-zA-Z0-9_\-\.\/]+' path_matches = re.findall(path_pattern, line) for match in path_matches: if '.' 
in match or '/' in match: full_path = test_dir / match if not full_path.exists(): missing_files.append(match) file_path_found = True break if missing_files: print(f"❌ {len(missing_files)} referenced files do not exist") print(f" Examples: {missing_files[:3]}") return False print("✅ All referenced file paths exist") return True except Exception as e: print(f"❌ Error checking file paths: {e}") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Desktop Timeline Extraction Task...") # Define verification steps verification_steps = [ ("Timeline File Exists", verify_timeline_file_exists), ("File is Readable", verify_timeline_file_readable), ("Correct Line Count", verify_line_count), ("Line Format", verify_line_format), ("Date Format", verify_date_format), ("Chronological Order", verify_chronological_order), ("Expected Entries", verify_expected_entries), ("No Duplicates", verify_no_duplicates), ("File Paths Exist", verify_file_paths_exist), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Desktop 2 Timeline Extraction completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/desktop_template/budget_computation/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description You need to analyze all the files in the desktop environment to calculate personal life expenses and create a budget summary. ### Task Objectives 1. **Locate and analyze all files** in the desktop environment 2. **Extract personal life expenses** from the files (such as salary, food, living material, tax, expenses on the internet, ...) (exclude expenses in project/work) 3. **Create a file named `total_budget.txt`** in the main directory 4. **Format each expense entry** as `file_path;price` (one per line) 5. 
**Add total sum** as the last line, rounded to 2 decimal places ### Output Format The `total_budget.txt` file should contain: - One expense per line in format: `file_path;price` - File path should be the relative path from the main directory - Price should be rounded to 2 decimal places - Last line should be the total sum - No additional text or explanations ### Important Notes - Only include personal life expenses (not in project/work) - Use the cheapest available price when multiple options exist for one thing - The total should match the sum of all individual expenses - Hint: If a file contains 1 item for personal consumption, it means that all the entry in entire file is for personal consumption ================================================ FILE: tasks/filesystem/standard/desktop_template/budget_computation/meta.json ================================================ { "task_id": "budget_computation", "task_name": "Budget Computation", "category_id": "desktop_template", "category_name": "Desktop Template", "description": "Analyze personal expense data extracted from desktop files to create a detailed budget summary report for financial review.", "author": "Lingjun Chen", "created_at": "2025-08-14", "difficulty": "L3", "tags": [ "data extraction", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "desktop_template/\n ├── Archives/\n │ ├── backup_contacts.csv\n │ └── tax_documents_2022.csv\n ├── Desktop/\n │ └── contacts.csv\n ├── Documents/\n │ ├── Personal/\n │ │ └── tax_info_2023.csv\n │ ├── Projects/\n │ │ └── budget_tracker.csv\n │ ├── Work/\n │ │ ├── client_list.csv\n │ │ └── timesheet.csv\n │ ├── budget.csv\n │ └── important_dates.csv\n ├── Downloads/\n │ ├── expenses.csv\n │ ├── fitness_log.csv\n │ └── price_comparisons.csv\n ├── Temp/\n │ └── test_data.csv\n ├── book_list.txt\n ├── bookmark_export.txt\n ├── calculations.txt\n ├── correspondence_2023.txt\n ├── draft_letter.txt\n ├── emergency_contacts.txt\n ├── example.txt\n └── experiment_results.txt\n", "stateUrl": "https://storage.mcpmark.ai/filesystem/desktop_template.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/desktop_template/budget_computation/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Budget Computation Task """ import sys from pathlib import Path import os from collections import Counter def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_total_budget_file_exists(test_dir: Path) -> bool: """Verify that the total_budget.txt file exists.""" budget_file = test_dir / "total_budget.txt" if not budget_file.exists(): print("❌ File 'total_budget.txt' not found") return False print("✅ total_budget.txt file found") return True def verify_file_format(test_dir: Path) -> bool: """Verify that the total_budget.txt file has proper format.""" budget_file = test_dir / "total_budget.txt" try: content = budget_file.read_text() lines = [line.strip() for line in content.split('\n') if line.strip()] if len(lines) < 2: print("❌ File must contain at least 2 lines (expenses + total)") return False # Check that all lines except the last follow the format file_path;price for i, line in enumerate(lines[:-1]): if ';' not in line: print(f"❌ Line 
{i+1} does not contain ';' separator: {line}") return False parts = line.split(';') if len(parts) != 2: print(f"❌ Line {i+1} does not have exactly 2 parts: {line}") return False # Check if second part is a valid number try: float(parts[1]) except ValueError: print(f"❌ Line {i+1} price is not a valid number: {parts[1]}") return False # Check if last line is a valid number (total) try: float(lines[-1]) except ValueError: print(f"❌ Last line is not a valid number: {lines[-1]}") return False print("✅ File format is correct") return True except Exception as e: print(f"❌ Error reading or parsing file: {e}") return False def verify_expense_entries(test_dir: Path) -> bool: """Verify that all 15 required expense entries are present.""" budget_file = test_dir / "total_budget.txt" try: content = budget_file.read_text() lines = [line.strip() for line in content.split('\n') if line.strip()] # Should have 16 lines total (15 expenses + 1 total) if len(lines) != 16: print(f"❌ Expected 16 lines (15 expenses + 1 total), found {len(lines)}") return False # Check that we have exactly 15 expense entries expense_lines = lines[:-1] # All lines except the last if len(expense_lines) != 15: print(f"❌ Expected 15 expense entries, found {len(expense_lines)}") return False print("✅ File contains exactly 15 expense entries") return True except Exception as e: print(f"❌ Error checking expense entries: {e}") return False def verify_file_paths_and_counts(test_dir: Path) -> bool: """Verify that all required file paths are present with correct counts.""" budget_file = test_dir / "total_budget.txt" try: content = budget_file.read_text() lines = [line.strip() for line in content.split('\n') if line.strip()] expense_lines = lines[:-1] # All lines except the last # Extract file paths from expense lines file_paths = [] for line in expense_lines: file_path = line.split(';')[0] file_paths.append(file_path) # Count occurrences of each path path_counts = Counter(file_paths) # Expected file paths and their counts based on answer.txt expected_paths = { 'Archives/tax_documents_2022.csv': 3, 'Documents/Personal/tax_info_2023.csv': 3, 'Documents/budget.csv': 3, 'Downloads/expenses.csv': 3, 'Downloads/price_comparisons.csv': 3 } # Helper function to check if a path contains the expected path def path_matches_expected(actual_path: str, expected_path: str) -> bool: """Check if actual path contains the expected path (allowing for prefixes like './')""" # Remove common prefixes like './', '../', etc. 
normalized_actual = actual_path while normalized_actual.startswith('./') or normalized_actual.startswith('../'): normalized_actual = normalized_actual[2:] if normalized_actual.startswith('./') else normalized_actual[3:] # Check if the normalized path contains the expected path return expected_path in normalized_actual or normalized_actual == expected_path # Check if all expected paths are present with correct counts for expected_path, expected_count in expected_paths.items(): # Find matching actual paths matching_paths = [] for actual_path in path_counts.keys(): if path_matches_expected(actual_path, expected_path): matching_paths.append(actual_path) if not matching_paths: print(f"❌ Missing expected file path: {expected_path}") return False # Sum up the counts from all matching paths total_count = sum(path_counts[path] for path in matching_paths) if total_count != expected_count: print(f"❌ Path {expected_path} has wrong count: expected {expected_count}, found {total_count}") print(f" Matching paths: {matching_paths}") return False # Check if there are any completely unexpected paths (not matching any expected path) all_matching_paths = set() for expected_path in expected_paths.keys(): for actual_path in path_counts.keys(): if path_matches_expected(actual_path, expected_path): all_matching_paths.add(actual_path) unexpected_paths = set(path_counts.keys()) - all_matching_paths if unexpected_paths: print(f"❌ Unexpected file paths found: {unexpected_paths}") return False print("✅ All expected file paths are present with correct counts") return True except Exception as e: print(f"❌ Error checking file paths: {e}") return False def verify_individual_prices(test_dir: Path) -> bool: """Verify that all individual prices match the expected values.""" budget_file = test_dir / "total_budget.txt" try: content = budget_file.read_text() lines = [line.strip() for line in content.split('\n') if line.strip()] expense_lines = lines[:-1] # All lines except the last # Expected prices based on answer.txt expected_expenses = [ ('Archives/tax_documents_2022.csv', 42000.00), ('Archives/tax_documents_2022.csv', 1800.00), ('Archives/tax_documents_2022.csv', 950.00), ('Documents/Personal/tax_info_2023.csv', 45000.00), ('Documents/Personal/tax_info_2023.csv', 2500.00), ('Documents/Personal/tax_info_2023.csv', 1200.00), ('Documents/budget.csv', 250.00), ('Documents/budget.csv', 180.00), ('Documents/budget.csv', 120.00), ('Downloads/expenses.csv', 45.99), ('Downloads/expenses.csv', 99.00), ('Downloads/expenses.csv', 234.50), ('Downloads/price_comparisons.csv', 879.99), ('Downloads/price_comparisons.csv', 289.99), ('Downloads/price_comparisons.csv', 74.99) ] # Helper function to check if a path contains the expected path def path_matches_expected(actual_path: str, expected_path: str) -> bool: """Check if actual path contains the expected path (allowing for prefixes like './')""" # Remove common prefixes like './', '../', etc. 
normalized_actual = actual_path while normalized_actual.startswith('./') or normalized_actual.startswith('../'): normalized_actual = normalized_actual[2:] if normalized_actual.startswith('./') else normalized_actual[3:] # Check if the normalized path contains the expected path return expected_path in normalized_actual or normalized_actual == expected_path # Parse actual expenses actual_expenses = [] for line in expense_lines: parts = line.split(';') file_path = parts[0] price = float(parts[1]) actual_expenses.append((file_path, price)) # Create a counter for expected expenses to handle duplicates expected_expenses_counter = Counter(expected_expenses) actual_expenses_counter = Counter(actual_expenses) # Check if all expected expenses are present with correct counts for expected_expense, expected_count in expected_expenses_counter.items(): expected_path, expected_price = expected_expense # Find matching actual expenses matching_expenses = [] for actual_expense, actual_count in actual_expenses_counter.items(): actual_path, actual_price = actual_expense if path_matches_expected(actual_path, expected_path) and abs(actual_price - expected_price) < 0.01: matching_expenses.append(actual_expense) if not matching_expenses: print(f"❌ Missing expected expense: {expected_expense}") return False # Sum up the counts from all matching expenses total_count = sum(actual_expenses_counter[expense] for expense in matching_expenses) if total_count != expected_count: print(f"❌ Expense {expected_expense} has wrong count: expected {expected_count}, found {total_count}") print(f" Matching expenses: {matching_expenses}") return False # Check if there are any completely unexpected expenses (not matching any expected expense) all_matching_expenses = set() for expected_expense in expected_expenses_counter.keys(): expected_path, expected_price = expected_expense for actual_expense in actual_expenses_counter.keys(): actual_path, actual_price = actual_expense if path_matches_expected(actual_path, expected_path) and abs(actual_price - expected_price) < 0.01: all_matching_expenses.add(actual_expense) unexpected_expenses = set(actual_expenses_counter.keys()) - all_matching_expenses if unexpected_expenses: print(f"❌ Unexpected expenses found: {unexpected_expenses}") return False print("✅ All individual prices match expected values") return True except Exception as e: print(f"❌ Error checking individual prices: {e}") return False def verify_total_price(test_dir: Path) -> bool: """Verify that the total price is correct.""" budget_file = test_dir / "total_budget.txt" try: content = budget_file.read_text() lines = [line.strip() for line in content.split('\n') if line.strip()] # Get the total from the last line total_line = lines[-1] try: actual_total = float(total_line) except ValueError: print(f"❌ Last line is not a valid number: {total_line}") return False # Expected total based on answer.txt expected_total = 95624.46 if abs(actual_total - expected_total) > 0.01: # Allow small floating point differences print(f"❌ Expected total {expected_total}, found {actual_total}") return False print("✅ Total price is correct") return True except Exception as e: print(f"❌ Error checking total price: {e}") return False def verify_total_calculation(test_dir: Path) -> bool: """Verify that the total matches the sum of individual expenses.""" budget_file = test_dir / "total_budget.txt" try: content = budget_file.read_text() lines = [line.strip() for line in content.split('\n') if line.strip()] expense_lines = lines[:-1] # All lines except the 
last # Calculate sum of individual expenses calculated_total = 0.0 for line in expense_lines: price = float(line.split(';')[1]) calculated_total += price # Get the stated total from the last line stated_total = float(lines[-1]) # Check if they match (allow small floating point differences) if abs(calculated_total - stated_total) > 0.01: print(f"❌ Total calculation mismatch: calculated {calculated_total:.2f}, stated {stated_total:.2f}") return False print("✅ Total calculation is correct") return True except Exception as e: print(f"❌ Error verifying total calculation: {e}") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Budget Computation Task...") # Define verification steps verification_steps = [ ("Total Budget File Exists", verify_total_budget_file_exists), ("File Format", verify_file_format), ("Expense Entries Count", verify_expense_entries), ("File Paths and Counts", verify_file_paths_and_counts), ("Individual Prices", verify_individual_prices), ("Total Price", verify_total_price), ("Total Calculation", verify_total_calculation), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Budget computation task completed successfully!") print("🎉 All verification steps passed") print("📊 Summary:") print(" - 15 expense entries found") print(" - 5 different file paths covered") print(" - All individual prices correct") print(" - Total price: $95,624.46") print(" - Calculation verified") sys.exit(0) else: print("❌ Budget computation task verification: FAIL") print("Please check the errors above and ensure all requirements are met") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/desktop_template/contact_information/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description Your task is to compile all contact information from all the files into a single CSV table. You need to extract all people's contact information and organize it systematically. ### Task Objectives 1. **Scan all files** in the directory 2. **Extract contact information** for all individuals and organizations found 3. **Create a CSV file** named `contact_info.csv` in the main directory 4. **Structure the CSV** with the following columns: - First column: Name (required) - Second column: Email (required) - Third column: Phone (required) - Additional columns: Any other contact information types found 5. **Consolidate information** by merging the same types of information across entries into single columns 6. **Leave cells blank** if specific information is not available for a person/organization 7. Each entry from different files should be processed and listed separately, without any secondary processing. ### Expected Output - **File name**: `contact_info.csv` - **Format**: CSV with headers and data rows ### Reasoning Task After creating the contact_info.csv file, analyze the data to answer: **What is Charlie Davis's job/profession?** Hint: focus on the contact information in contact_info.csv. Write your answer in a file named `answer.txt` in the main directory. 
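As a hedged illustration only (the exact extra columns and rows depend on what the files actually contain), the consolidated CSV could be written with `csv.DictWriter`, using a blank `restval` so cells without information stay empty:

```python
import csv

# Illustrative records; the real rows come from scanning every file in the directory.
records = [
    {"Name": "John Smith", "Email": "john@email.com", "Phone": "555-0101"},
    {"Name": "Acme Corp", "Email": "acme@corp.com", "Industry": "Technology"},
]

# Keep the required columns first, then any other contact-information types found.
extra_columns = sorted({key for row in records for key in row} - {"Name", "Email", "Phone"})
fieldnames = ["Name", "Email", "Phone"] + extra_columns

with open("contact_info.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")  # blank cells when data is missing
    writer.writeheader()
    writer.writerows(records)
```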
### Important Notes - Do not modify any existing files - Only create the two new files: `contact_info.csv` and `answer.txt` ================================================ FILE: tasks/filesystem/standard/desktop_template/contact_information/meta.json ================================================ { "task_id": "contact_information", "task_name": "Contact Information", "category_id": "desktop_template", "category_name": "Desktop Template", "description": "Extract contact details from various file formats on desktop and perform reasoning analysis on the collected relationship data.", "author": "Lingjun Chen", "created_at": "2025-08-14", "difficulty": "L3", "tags": [ "data extraction", "cross-referencing" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "desktop_template/\n ├── Archives/\n │ ├── backup_contacts.csv\n │ └── tax_documents_2022.csv\n ├── Desktop/\n │ └── contacts.csv\n ├── Documents/\n │ ├── Personal/\n │ │ └── tax_info_2023.csv\n │ ├── Projects/\n │ │ └── budget_tracker.csv\n │ ├── Work/\n │ │ ├── client_list.csv\n │ │ └── timesheet.csv\n │ ├── budget.csv\n │ └── important_dates.csv\n ├── Downloads/\n │ ├── expenses.csv\n │ ├── fitness_log.csv\n │ └── price_comparisons.csv\n ├── Temp/\n │ └── test_data.csv\n ├── book_list.txt\n ├── bookmark_export.txt\n ├── calculations.txt\n ├── correspondence_2023.txt\n ├── draft_letter.txt\n ├── emergency_contacts.txt\n ├── example.txt\n └── experiment_results.txt\n", "stateUrl": "https://storage.mcpmark.ai/filesystem/desktop_template.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/desktop_template/contact_information/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Contact Information Compilation Task """ import sys from pathlib import Path import csv import os import re def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_contact_info_csv_exists(test_dir: Path) -> bool: """Verify that the contact_info.csv file exists in the main directory.""" contact_file = test_dir / "contact_info.csv" if not contact_file.exists(): print("❌ File 'contact_info.csv' not found in main directory") return False print("✅ contact_info.csv file found") return True def verify_answer_txt_exists(test_dir: Path) -> bool: """Verify that the answer.txt file exists in the main directory.""" answer_file = test_dir / "answer.txt" if not answer_file.exists(): print("❌ File 'answer.txt' not found in main directory") return False print("✅ answer.txt file found") return True def verify_csv_structure(test_dir: Path) -> bool: """Verify that the CSV file has the correct structure.""" contact_file = test_dir / "contact_info.csv" try: with open(contact_file, 'r', encoding='utf-8') as f: reader = csv.reader(f) rows = list(reader) if len(rows) < 2: # Need at least header + 1 data row print("❌ CSV file has insufficient rows") return False headers = rows[0] if not headers: print("❌ CSV file has no headers") return False # Check that Name is the first column if headers[0].lower() != 'name': print("❌ First column is not 'Name'") return False # Check that Email and Phone are present (order may vary) header_lower = [h.lower() for h in headers] if 'email' not in header_lower: print("❌ 'Email' column not found") 
return False if 'phone' not in header_lower: print("❌ 'Phone' column not found") return False print("✅ CSV structure is correct") return True except Exception as e: print(f"❌ Error reading CSV file: {e}") return False def verify_csv_content_accuracy(test_dir: Path) -> bool: """Verify that the CSV content contains all required data, regardless of row order or extra entries.""" contact_file = test_dir / "contact_info.csv" try: with open(contact_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) rows = list(reader) # Expected data from answer.csv (hardcoded as required) expected_data = [ {"Name": "John Smith", "Email": "john@email.com", "Phone": "555-0101", "Status": "", "Industry": ""}, {"Name": "Jane Doe", "Email": "jane@email.com", "Phone": "555-0102", "Status": "", "Industry": ""}, {"Name": "Bob Johnson", "Email": "bob@email.com", "Phone": "555-0103", "Status": "", "Industry": ""}, {"Name": "Alice Brown", "Email": "alice@email.com", "Phone": "555-0201", "Status": "Inactive", "Industry": ""}, {"Name": "Charlie Davis", "Email": "charlie@email.com", "Phone": "555-0202", "Status": "Active", "Industry": ""}, {"Name": "David Wilson", "Email": "david@email.com", "Phone": "555-0203", "Status": "Inactive", "Industry": ""}, {"Name": "Acme Corp", "Email": "acme@corp.com", "Phone": "", "Status": "", "Industry": "Technology"}, {"Name": "Global Inc", "Email": "global@inc.com", "Phone": "", "Status": "", "Industry": "Finance"}, {"Name": "Local Business", "Email": "local@biz.com", "Phone": "", "Status": "", "Industry": "Retail"}, {"Name": "Spouse", "Email": "", "Phone": "+1-555-0124", "Status": "", "Industry": ""}, {"Name": "Parent", "Email": "", "Phone": "+1-555-0125", "Status": "", "Industry": ""}, {"Name": "Sibling", "Email": "", "Phone": "+1-555-0126", "Status": "", "Industry": ""}, {"Name": "Primary Doctor", "Email": "", "Phone": "+1-555-0201", "Status": "", "Industry": ""}, {"Name": "Dentist", "Email": "", "Phone": "+1-555-0202", "Status": "", "Industry": ""}, {"Name": "Pharmacy", "Email": "", "Phone": "+1-555-0203", "Status": "", "Industry": ""} ] # Convert expected data to a dictionary for easier lookup # We'll use Name as the key since it should be unique expected_dict = {} for entry in expected_data: expected_dict[entry["Name"]] = entry # Check each row for accuracy, regardless of order # Allow extra entries and mixed content found_entries = set() extra_entries = [] for i, row in enumerate(rows): row_name = row.get('Name', '') if not row_name: # Skip rows without names (they're not valid entries) continue if row_name in expected_dict: # This is one of our expected entries if row_name in found_entries: print(f"❌ Duplicate name found: '{row_name}'") return False found_entries.add(row_name) expected = expected_dict[row_name] # Check all columns for this entry for key, expected_value in expected.items(): if key in row: actual_value = row[key] if row[key] else "" if actual_value != expected_value: print(f"❌ Entry '{row_name}', column '{key}': expected '{expected_value}', got '{actual_value}'") return False else: print(f"❌ Entry '{row_name}' missing column '{key}'") return False else: # This is an extra entry - record it for informational purposes extra_entries.append(row_name) # Verify all expected entries were found if len(found_entries) != len(expected_data): missing = set(expected_dict.keys()) - found_entries print(f"❌ Missing entries: {missing}") return False # Report extra entries if any if extra_entries: print(f"ℹ️ Found {len(extra_entries)} extra entries: {extra_entries}") 
print(f"✅ CSV content accuracy verified: found all {len(expected_data)} required entries (plus {len(extra_entries)} extra entries)") return True except Exception as e: print(f"❌ Error verifying CSV content: {e}") return False def verify_csv_data_completeness(test_dir: Path) -> bool: """Verify that all required data is present and no entries are missing.""" contact_file = test_dir / "contact_info.csv" try: with open(contact_file, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) rows = list(reader) # Check that all expected names are present expected_names = [ "John Smith", "Jane Doe", "Bob Johnson", "Alice Brown", "Charlie Davis", "David Wilson", "Acme Corp", "Global Inc", "Local Business", "Spouse", "Parent", "Sibling", "Primary Doctor", "Dentist", "Pharmacy" ] actual_names = [row.get('Name', '') for row in rows if row.get('Name')] missing_names = set(expected_names) - set(actual_names) if missing_names: print(f"❌ Missing names: {missing_names}") return False extra_names = set(actual_names) - set(expected_names) if extra_names: print(f"⚠️ Extra names found: {extra_names}") # This is a warning, not an error print("✅ CSV data completeness verified") return True except Exception as e: print(f"❌ Error checking data completeness: {e}") return False def verify_answer_content(test_dir: Path) -> bool: """Verify that the answer.txt contains the correct answer about Charlie Davis.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip().lower() # The answer should contain "dentist" (as per answer.txt) if "dentist" in content: print("✅ Answer about Charlie Davis's job is correct") return True else: print(f"❌ Answer does not contain 'dentist'. Found: '{content}'") return False except Exception as e: print(f"❌ Error reading answer.txt: {e}") return False def verify_file_locations(test_dir: Path) -> bool: """Verify that files are in the correct locations.""" contact_file = test_dir / "contact_info.csv" answer_file = test_dir / "answer.txt" # Check that files are in the main directory, not in subdirectories if contact_file.parent != test_dir: print(f"❌ contact_info.csv is not in main directory: {contact_file}") return False if answer_file.parent != test_dir: print(f"❌ answer.txt is not in main directory: {answer_file}") return False print("✅ Files are in correct locations") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Contact Information Compilation Task...") # Define verification steps verification_steps = [ ("Contact Info CSV Exists", verify_contact_info_csv_exists), ("Answer TXT Exists", verify_answer_txt_exists), ("Files in Correct Locations", verify_file_locations), ("CSV Structure", verify_csv_structure), ("CSV Content Accuracy (Flexible)", verify_csv_content_accuracy), ("CSV Data Completeness", verify_csv_data_completeness), ("Answer Content", verify_answer_content), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Contact Information Compilation Task completed successfully!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/desktop_template/file_arrangement/description.md ================================================ Please use FileSystem 
tools to finish the following task: ### Task Description You are tasked with organizing files on an AI researcher's desktop into a structured folder system. You need to create specific folders and move files to their designated locations according to the provided organization scheme. ### Task Objectives 1. **Create the following folder structure** in the main directory: - `work/` - for work, research and projects related files - `life/` - for personal life related files - `archives/` - for archived files or files with past dates in its file names - `temp/` - for temporary files, drafts - `others/` - for files that cannot be classified elsewhere ### Important Notes - All files must be moved from their current locations to the specified folders - The `others/` folder is for files that don't fit the other categories - Do not modify the contents of any files, only move them to the correct locations - If you are not sure about which folder it should belongs to, you can read the context in the files before making decisions - **Do not change files' name** ================================================ FILE: tasks/filesystem/standard/desktop_template/file_arrangement/meta.json ================================================ { "task_id": "file_arrangement", "task_name": "File Arrangement", "category_id": "desktop_template", "category_name": "Desktop Template", "description": "Classify and organize desktop files into appropriate categories following specified classification rules and naming convention standards.", "author": "Lingjun Chen", "created_at": "2025-08-14", "difficulty": "L3", "tags": [ "file organization" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "desktop_template/\n ├── Archives/\n │ ├── backup_contacts.csv\n │ └── tax_documents_2022.csv\n ├── Desktop/\n │ └── contacts.csv\n ├── Documents/\n │ ├── Personal/\n │ │ └── tax_info_2023.csv\n │ ├── Projects/\n │ │ └── budget_tracker.csv\n │ ├── Work/\n │ │ ├── client_list.csv\n │ │ └── timesheet.csv\n │ ├── budget.csv\n │ └── important_dates.csv\n ├── Downloads/\n │ ├── expenses.csv\n │ ├── fitness_log.csv\n │ └── price_comparisons.csv\n ├── Temp/\n │ └── test_data.csv\n ├── book_list.txt\n ├── bookmark_export.txt\n ├── calculations.txt\n ├── correspondence_2023.txt\n ├── draft_letter.txt\n ├── emergency_contacts.txt\n ├── example.txt\n └── experiment_results.txt\n", "stateUrl": "https://storage.mcpmark.ai/filesystem/desktop_template.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/desktop_template/file_arrangement/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Desktop File Organization Task """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_folder_structure(test_dir: Path) -> bool: """Verify that all required folders exist.""" required_folders = ["work", "life", "archives", "temp", "others"] missing_folders = [] for folder in required_folders: folder_path = test_dir / folder if not folder_path.exists() or not folder_path.is_dir(): missing_folders.append(folder) if missing_folders: print(f"❌ Missing required folders: {missing_folders}") return False print("✅ All required folders exist") return True def 
verify_work_folder_files(test_dir: Path) -> bool: """Verify that work folder contains the required files.""" work_dir = test_dir / "work" required_files = [ "client_list.csv", "timesheet.csv", "experiment_results.txt", "budget_tracker.csv", "expenses.csv" ] missing_files = [] for file_name in required_files: file_path = work_dir / file_name if not file_path.exists(): missing_files.append(file_name) if missing_files: print(f"❌ Missing required files in work/ folder: {missing_files}") return False # Count total files in work folder for info total_files = len([f for f in work_dir.iterdir() if f.is_file()]) print(f"✅ All required files found in work/ folder (total: {total_files} files)") return True def verify_life_folder_files(test_dir: Path) -> bool: """Verify that life folder contains the required files.""" life_dir = test_dir / "life" required_files = [ "contacts.csv", "budget.csv", "fitness_log.csv", "price_comparisons.csv", "book_list.txt", "bookmark_export.txt", "emergency_contacts.txt" ] missing_files = [] for file_name in required_files: file_path = life_dir / file_name if not file_path.exists(): missing_files.append(file_name) if missing_files: print(f"❌ Missing required files in life/ folder: {missing_files}") return False # Count total files in life folder for info total_files = len([f for f in life_dir.iterdir() if f.is_file()]) print(f"✅ All required files found in life/ folder (total: {total_files} files)") return True def verify_archives_folder_files(test_dir: Path) -> bool: """Verify that archives folder contains the required files.""" archives_dir = test_dir / "archives" required_files = [ "backup_contacts.csv", "tax_documents_2022.csv", "correspondence_2023.txt", "tax_info_2023.csv" ] missing_files = [] for file_name in required_files: file_path = archives_dir / file_name if not file_path.exists(): missing_files.append(file_name) if missing_files: print(f"❌ Missing required files in archives/ folder: {missing_files}") return False # Count total files in archives folder for info total_files = len([f for f in archives_dir.iterdir() if f.is_file()]) print(f"✅ All required files found in archives/ folder (total: {total_files} files)") return True def verify_temp_folder_files(test_dir: Path) -> bool: """Verify that temp folder contains the required files.""" temp_dir = test_dir / "temp" required_files = [ "test_data.csv", "draft_letter.txt" ] missing_files = [] for file_name in required_files: file_path = temp_dir / file_name if not file_path.exists(): missing_files.append(file_name) if missing_files: print(f"❌ Missing required files in temp/ folder: {missing_files}") return False # Count total files in temp folder for info total_files = len([f for f in temp_dir.iterdir() if f.is_file()]) print(f"✅ All required files found in temp/ folder (total: {total_files} files)") return True def verify_others_folder_files(test_dir: Path) -> bool: """Verify that others folder exists and can contain any files.""" others_dir = test_dir / "others" if not others_dir.exists() or not others_dir.is_dir(): print("❌ others/ folder not found") return False # Count files in others folder for info total_files = len([f for f in others_dir.iterdir() if f.is_file()]) print(f"✅ others/ folder exists (contains {total_files} files)") return True def verify_required_files_in_correct_folders(test_dir: Path) -> bool: """Verify that all 18 required files are in their correct designated folders.""" # Define the mapping of required files to their correct folders required_file_mapping = { "work": [ 
"client_list.csv", "timesheet.csv", "experiment_results.txt", "budget_tracker.csv", "expenses.csv", ], "life": [ "contacts.csv", "budget.csv", "fitness_log.csv", "price_comparisons.csv", "book_list.txt", "bookmark_export.txt", "emergency_contacts.txt" ], "archives": [ "backup_contacts.csv", "tax_documents_2022.csv", "correspondence_2023.txt", "tax_info_2023.csv" ], "temp": [ "test_data.csv", "draft_letter.txt" ] } missing_files = [] # Check each required file is in its correct folder for folder, files in required_file_mapping.items(): folder_path = test_dir / folder for file_name in files: file_path = folder_path / file_name if not file_path.exists(): missing_files.append(f"{folder}/{file_name}") if missing_files: print(f"❌ Missing required files: {missing_files}") return False print("✅ All 18 required files are in their correct designated folders") return True def verify_no_duplicate_required_files(test_dir: Path) -> bool: """Verify that the 18 required files are not duplicated across folders.""" required_files = [ "client_list.csv", "timesheet.csv", "experiment_results.txt", "budget_tracker.csv", "contacts.csv", "budget.csv", "expenses.csv", "fitness_log.csv", "price_comparisons.csv", "book_list.txt", "bookmark_export.txt", "emergency_contacts.txt", "backup_contacts.csv", "tax_documents_2022.csv", "correspondence_2023.txt", "tax_info_2023.csv", "test_data.csv", "draft_letter.txt" ] # Check for duplicates of required files file_locations = {} duplicates = [] for folder in ["work", "life", "archives", "temp", "others"]: folder_path = test_dir / folder if folder_path.exists() and folder_path.is_dir(): for file_path in folder_path.iterdir(): if file_path.is_file() and file_path.name in required_files: if file_path.name in file_locations: duplicates.append(f"{file_path.name} (in {file_locations[file_path.name]} and {folder}/)") else: file_locations[file_path.name] = f"{folder}/" if duplicates: print(f"❌ Duplicate required files found: {duplicates}") return False print("✅ No duplicate required files found") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Desktop File Organization Task...") # Define verification steps verification_steps = [ ("Folder Structure", verify_folder_structure), ("Required Files in Work Folder", verify_work_folder_files), ("Required Files in Life Folder", verify_life_folder_files), ("Required Files in Archives Folder", verify_archives_folder_files), ("Required Files in Temp Folder", verify_temp_folder_files), ("Others Folder Exists", verify_others_folder_files), ("All Required Files in Correct Folders", verify_required_files_in_correct_folders), ("No Duplicate Required Files", verify_no_duplicate_required_files), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Desktop file organization task completed successfully!") print("🎉 All 18 required files are correctly placed in their designated folders") print("📊 Summary:") print(" - work/ folder: 5 required files") print(" - life/ folder: 7 required files") print(" - archives/ folder: 4 required files") print(" - temp/ folder: 2 required files") print(" - others/ folder: can contain any files") print(" - Total required files: 18") print(" - Note: Other files can be placed in any folder") sys.exit(0) else: print("❌ Desktop file organization task verification: FAIL") print("Please 
check the errors above and ensure all 18 required files are in their correct locations") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/file_context/duplicates_searching/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description You are given a directory containing multiple text files. Some files have identical content and need to be organized. Your task is to identify all files with duplicate content and move them to a newly created 'duplicates' directory. ### Task Objectives 1. **Scan all text files** in the test directory to identify groups with identical content 2. **Create a 'duplicates' directory** in the test directory root 3. **Move all duplicate files** into the 'duplicates' directory 4. **Leave unique files** in their original location ### Expected Output After completing the task, the directory structure should be: - `duplicates/` directory containing all files with duplicate content - Original directory containing only files with unique content ================================================ FILE: tasks/filesystem/standard/file_context/duplicates_searching/meta.json ================================================ { "task_id": "duplicates_searching", "task_name": "Duplicates Searching", "category_id": "file_context", "category_name": "File Context", "description": "Scan directory to identify files with identical content, then organize all duplicate files into a separate dedicated directory for cleanup.", "author": "Lingjun Chen", "created_at": "2025-08-06", "difficulty": "L3", "tags": [ "pattern analysis", "file organization" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/file_context/duplicates_searching/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for File Duplicates Detection and Organization Task """ import sys from pathlib import Path import os import hashlib def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def calculate_file_hash(file_path: Path) -> str: """Calculate MD5 hash of file content.""" try: with open(file_path, 'rb') as f: return hashlib.md5(f.read()).hexdigest() except Exception as e: print(f"❌ Error reading file {file_path}: {e}") return None def verify_duplicates_directory_exists(test_dir: Path) -> bool: """Verify that the duplicates directory exists.""" duplicates_dir = test_dir / "duplicates" if not duplicates_dir.exists(): print("❌ 'duplicates' directory not found") return False if not duplicates_dir.is_dir(): print("❌ 'duplicates' exists but is not a directory") return False print("✅ 'duplicates' directory exists") return True 
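# --- Illustrative sketch (an assumption, not part of the repository's verifier) ---
# The task description only says that files with identical content must be grouped
# and moved into 'duplicates'. The helper below is a hedged example of how a solver
# might do that: hash every .txt file, then move every member of any group whose
# hash occurs more than once. The name `organize_duplicates` and the use of
# shutil.move are assumptions for illustration; hashlib and Path are already
# imported at the top of this module.
import shutil
from collections import defaultdict

def organize_duplicates(test_dir: Path) -> None:
    """Move all files whose content is not unique into test_dir / 'duplicates'."""
    groups = defaultdict(list)
    for path in sorted(test_dir.iterdir()):
        if path.is_file() and path.suffix == ".txt":
            groups[hashlib.md5(path.read_bytes()).hexdigest()].append(path)

    duplicates_dir = test_dir / "duplicates"
    duplicates_dir.mkdir(exist_ok=True)

    for paths in groups.values():
        if len(paths) > 1:  # every file in a duplicate group is moved
            for path in paths:
                shutil.move(str(path), str(duplicates_dir / path.name))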
def get_expected_duplicate_groups(): """Return the expected duplicate file groups based on content analysis.""" # Based on the answer.md and content analysis return { # Group 1: file_01.txt, file_02.txt (identical content) "group1": ["file_01.txt", "file_02.txt"], # Group 2: file_03.txt, file_04.txt (identical content) "group2": ["file_03.txt", "file_04.txt"], # Group 3: file_07.txt, file_08.txt (identical content) "group3": ["file_07.txt", "file_08.txt"], # Group 4: file_10.txt, file_11.txt (identical content) "group4": ["file_10.txt", "file_11.txt"], # Group 5: file_13.txt, file_14.txt (identical content) "group5": ["file_13.txt", "file_14.txt"], # Group 6: file_15.txt, file_16.txt (identical content) "group6": ["file_15.txt", "file_16.txt"], # Group 7: file_18.txt, file_19.txt (identical content) "group7": ["file_18.txt", "file_19.txt"] } def get_expected_unique_files(): """Return the expected unique files that should remain in original location.""" return [ "file_05.txt", "file_06.txt", "file_09.txt", "file_12.txt", "file_17.txt", "file_20.txt" ] def verify_duplicate_files_moved(test_dir: Path) -> bool: """Verify that all duplicate files have been moved to the duplicates directory.""" duplicates_dir = test_dir / "duplicates" expected_duplicate_groups = get_expected_duplicate_groups() # Check that all expected duplicate files are in the duplicates directory missing_files = [] for group_name, files in expected_duplicate_groups.items(): for filename in files: file_path = duplicates_dir / filename if not file_path.exists(): missing_files.append(filename) if missing_files: print(f"❌ Missing duplicate files in 'duplicates' directory: {missing_files}") return False print("✅ All expected duplicate files are in the 'duplicates' directory") return True def verify_unique_files_remain(test_dir: Path) -> bool: """Verify that unique files remain in the original location.""" expected_unique_files = get_expected_unique_files() missing_files = [] for filename in expected_unique_files: file_path = test_dir / filename if not file_path.exists(): missing_files.append(filename) if missing_files: print(f"❌ Missing unique files in original location: {missing_files}") return False print("✅ All expected unique files remain in the original location") return True def verify_no_duplicate_files_in_original(test_dir: Path) -> bool: """Verify that no duplicate files remain in the original location.""" expected_duplicate_groups = get_expected_duplicate_groups() remaining_duplicates = [] for group_name, files in expected_duplicate_groups.items(): for filename in files: file_path = test_dir / filename if file_path.exists(): remaining_duplicates.append(filename) if remaining_duplicates: print(f"❌ Duplicate files still exist in original location: {remaining_duplicates}") return False print("✅ No duplicate files remain in the original location") return True def verify_content_integrity(test_dir: Path) -> bool: """Verify that file content integrity is maintained after moving.""" duplicates_dir = test_dir / "duplicates" expected_duplicate_groups = get_expected_duplicate_groups() # Check that files in each duplicate group have identical content for group_name, files in expected_duplicate_groups.items(): if len(files) < 2: continue # Calculate hash of the first file in the group first_file = duplicates_dir / files[0] if not first_file.exists(): print(f"❌ First file of group {group_name} not found: {files[0]}") return False first_hash = calculate_file_hash(first_file) if first_hash is None: return False # Check that all other 
files in the group have the same hash for filename in files[1:]: file_path = duplicates_dir / filename if not file_path.exists(): print(f"❌ File in group {group_name} not found: {filename}") return False file_hash = calculate_file_hash(file_path) if file_hash is None: return False if file_hash != first_hash: print(f"❌ Files in group {group_name} have different content: {files[0]} vs {filename}") return False print("✅ Content integrity verified - duplicate files have identical content") return True def verify_total_file_count(test_dir: Path) -> bool: """Verify that the duplicates directory contains exactly 14 files.""" duplicates_dir = test_dir / "duplicates" # Count files in original location (excluding the duplicates directory itself) original_files = [f for f in test_dir.iterdir() if f.is_file()] # Count files in duplicates directory duplicate_files = [f for f in duplicates_dir.iterdir() if f.is_file()] # Expected: 14 files in duplicates directory expected_duplicates = 14 actual_duplicates = len(duplicate_files) if actual_duplicates != expected_duplicates: print(f"❌ Wrong number of files in duplicates directory. Expected: {expected_duplicates}, Actual: {actual_duplicates}") return False print(f"✅ Duplicates directory has correct number of files: {actual_duplicates}") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying File Duplicates Detection and Organization Task...") # Define verification steps verification_steps = [ ("Duplicates Directory Exists", verify_duplicates_directory_exists), ("Duplicate Files Moved", verify_duplicate_files_moved), ("Unique Files Remain", verify_unique_files_remain), ("No Duplicates in Original", verify_no_duplicate_files_in_original), ("Content Integrity", verify_content_integrity), ("Duplicates Count", verify_total_file_count), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ File duplicates detection and organization completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/file_context/file_merging/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description You are given a directory containing multiple text files of varying sizes. Your task is to identify the 10 smallest .txt files, merge their content in alphabetical order, and create a consolidated file called "merged_content.txt" with proper formatting. ### Task Objectives 1. **Identify the 10 smallest .txt files** in the test directory 2. **Sort the selected files alphabetically** by filename 3. **Merge the content** of these files into a single file 4. 
**Add file headers** (file name) before each file's content ================================================ FILE: tasks/filesystem/standard/file_context/file_merging/meta.json ================================================ { "task_id": "file_merging", "task_name": "File Merging", "category_id": "file_context", "category_name": "File Context", "description": "Identify the 10 smallest text files in the directory, then merge their content in alphabetical order into a single consolidated file.", "author": "Lingjun Chen", "created_at": "2025-08-07", "difficulty": "L3", "tags": [ "content transformation", "file organization" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/file_context/file_merging/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for File Merging Task """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def get_expected_files() -> list: """Get the expected 10 files in alphabetical order.""" # The 10 smallest files (excluding file_12.txt) in alphabetical order expected_files = [ "file_10.txt", "file_11.txt", "file_13.txt", "file_14.txt", "file_15.txt", "file_16.txt", "file_17.txt", "file_18.txt", "file_19.txt", "file_20.txt" ] return expected_files def verify_merged_file_exists(test_dir: Path) -> bool: """Verify that the merged_content.txt file exists.""" merged_file = test_dir / "merged_content.txt" if not merged_file.exists(): print("❌ File 'merged_content.txt' not found") return False print("✅ Merged content file found") return True def verify_correct_files_selected(test_dir: Path) -> bool: """Verify that the correct 10 files were selected and included.""" expected_files = get_expected_files() merged_file = test_dir / "merged_content.txt" try: content = merged_file.read_text() # Check if all expected files are present for expected_file in expected_files: if expected_file not in content: print(f"❌ Expected file '{expected_file}' not found in merged content") return False # Check if file_12.txt is NOT present (should be excluded) if "file_12.txt" in content: print("❌ file_12.txt should be excluded but was found in merged content") return False print("✅ Correct files selected and included") return True except Exception as e: print(f"❌ Error verifying file selection: {e}") return False def verify_alphabetical_order(test_dir: Path) -> bool: """Verify that files are in alphabetical order.""" expected_files = get_expected_files() merged_file = test_dir / "merged_content.txt" try: content = merged_file.read_text() lines = content.split('\n') # Extract filenames from the content (lines that contain .txt) found_files = [] for line in lines: line = line.strip() # Check if this 
line contains any of the expected filenames for expected_file in expected_files: if expected_file in line: found_files.append(expected_file) break # Check if files are in alphabetical order if found_files != expected_files: print(f"❌ Files not in correct alphabetical order") print(f" Expected: {expected_files}") print(f" Found: {found_files}") return False print("✅ Files are in correct alphabetical order") return True except Exception as e: print(f"❌ Error verifying alphabetical order: {e}") return False def verify_file_content_integrity(test_dir: Path) -> bool: """Verify that the content of each file is preserved correctly.""" expected_files = get_expected_files() merged_file = test_dir / "merged_content.txt" try: content = merged_file.read_text() lines = content.split('\n') for expected_file in expected_files: # Get the original file content original_file = test_dir / expected_file original_content = original_file.read_text().strip() # Find the line index where this file's header appears header_line_index = -1 for i, line in enumerate(lines): if expected_file in line: header_line_index = i break if header_line_index == -1: print(f"❌ Could not find header for {expected_file}") return False # Find the next header line or end of file next_header_index = len(lines) for i in range(header_line_index + 1, len(lines)): for other_file in expected_files: if other_file != expected_file and other_file in lines[i]: next_header_index = i break if next_header_index != len(lines): break # Extract content lines (from header + 1 to next header) content_lines = lines[header_line_index + 1:next_header_index] merged_content = '\n'.join(content_lines).strip() if merged_content != original_content: print(f"❌ Content mismatch for {expected_file}") print(f" Expected: {original_content}") print(f" Found: {merged_content}") return False print("✅ All file contents preserved correctly") return True except Exception as e: print(f"❌ Error verifying content integrity: {e}") return False def verify_filename_headers(test_dir: Path) -> bool: """Verify that each file section starts with the correct filename header.""" expected_files = get_expected_files() merged_file = test_dir / "merged_content.txt" try: content = merged_file.read_text() for expected_file in expected_files: # Check if the filename appears anywhere in the content (as part of a line) if expected_file not in content: print(f"❌ Filename header '{expected_file}' not found") return False print("✅ All filename headers present and correctly formatted") return True except Exception as e: print(f"❌ Error verifying filename headers: {e}") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying File Merging Task...") # Show expected files for debugging expected_files = get_expected_files() print(f"📋 Expected files (10 smallest, excluding file_12.txt): {expected_files}") # Define verification steps verification_steps = [ ("Merged File Exists", verify_merged_file_exists), ("Correct Files Selected", verify_correct_files_selected), ("Alphabetical Order", verify_alphabetical_order), ("Filename Headers", verify_filename_headers), ("Content Integrity", verify_file_content_integrity), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ File merging task completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ 
Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/file_context/file_splitting/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description You need to split a large text file into multiple smaller files with equal character counts. The task involves creating a new directory and splitting the content into exactly 10 files. ### Task Objectives 1. **Create a new directory** named `split` in the test directory 2. **Split the file** `large_file.txt` into exactly 10 files with equal character counts 3. **Name the files** as `split_01.txt`, `split_02.txt`, ..., `split_10.txt` in the `split` directory ================================================ FILE: tasks/filesystem/standard/file_context/file_splitting/meta.json ================================================ { "task_id": "file_splitting", "task_name": "File Splitting", "category_id": "file_context", "category_name": "File Context", "description": "Split a large text file into multiple equal-length segments for easier processing, distribution, and parallel handling of content.", "author": "Lingjun Chen", "created_at": "2025-08-08", "difficulty": "L3", "tags": [ "content transformation" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/file_context/file_splitting/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for File Splitting Task """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_split_directory_exists(test_dir: Path) -> bool: """Verify that the split directory exists.""" split_dir = test_dir / "split" if not split_dir.exists(): print("❌ Directory 'split' not found") return False if not split_dir.is_dir(): print("❌ 'split' exists but is not a directory") return False print("✅ Split directory found") return True def verify_all_split_files_exist(test_dir: Path) -> bool: """Verify that all 10 split files exist with correct names.""" split_dir = test_dir / "split" expected_files = [f"split_{i:02d}.txt" for i in range(1, 11)] missing_files = [] for filename in expected_files: file_path = split_dir / filename if not file_path.exists(): missing_files.append(filename) if missing_files: print(f"❌ Missing files: {missing_files}") return False print("✅ All 10 split files exist with correct names") return True def verify_equal_file_lengths(test_dir: Path) -> bool: """Verify that all split files have equal character counts.""" split_dir = test_dir / "split" file_lengths = [] for i in range(1, 11): filename = f"split_{i:02d}.txt" file_path = split_dir 
/ filename try: content = file_path.read_text() file_lengths.append(len(content)) except Exception as e: print(f"❌ Error reading {filename}: {e}") return False # Check if all lengths are equal if len(set(file_lengths)) != 1: print(f"❌ File lengths are not equal: {file_lengths}") return False print(f"✅ All files have equal length: {file_lengths[0]} characters") return True def verify_content_integrity(test_dir: Path) -> bool: """Verify that concatenated split files equal the original file.""" split_dir = test_dir / "split" original_file = test_dir / "large_file.txt" # Read original content try: original_content = original_file.read_text() except Exception as e: print(f"❌ Error reading original file: {e}") return False # Concatenate all split files concatenated_content = "" for i in range(1, 11): filename = f"split_{i:02d}.txt" file_path = split_dir / filename try: content = file_path.read_text() concatenated_content += content except Exception as e: print(f"❌ Error reading {filename}: {e}") return False # Compare content if concatenated_content != original_content: print("❌ Concatenated content does not match original file") print(f" Original length: {len(original_content)}") print(f" Concatenated length: {len(concatenated_content)}") return False print("✅ Concatenated content matches original file exactly") return True def verify_no_extra_files(test_dir: Path) -> bool: """Verify that no extra files exist in the split directory.""" split_dir = test_dir / "split" expected_files = {f"split_{i:02d}.txt" for i in range(1, 11)} actual_files = {f.name for f in split_dir.iterdir() if f.is_file()} extra_files = actual_files - expected_files if extra_files: print(f"❌ Extra files found in split directory: {extra_files}") return False print("✅ No extra files in split directory") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying File Splitting Task...") # Define verification steps verification_steps = [ ("Split Directory Exists", verify_split_directory_exists), ("All Split Files Exist", verify_all_split_files_exist), ("Equal File Lengths", verify_equal_file_lengths), ("Content Integrity", verify_content_integrity), ("No Extra Files", verify_no_extra_files), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ File splitting task completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/file_context/pattern_matching/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description Your task is to find all files that contain a substring of 30 or more characters that also appears in `large_file.txt`. **You are not allowed to use python code.** ### Task Objectives 1. **Read the reference file** `large_file.txt` to understand its content 2. **Examine each file** from file_01.txt to file_20.txt 3. **Find files** that contain a substring of 30 or more characters that matches a substring in `large_file.txt` 4. 
**Create a file `answer.txt`** and write the results to it with the following format: - One line per matching file - Format: `filename.txt,start_position` - Where start_position is the character position (1-indexed) of the matching substring in `large_file.txt` - Do not add any things else other than `filename.txt,start_position`. ================================================ FILE: tasks/filesystem/standard/file_context/pattern_matching/meta.json ================================================ { "task_id": "pattern_matching", "task_name": "Pattern Matching", "category_id": "file_context", "category_name": "File Context", "description": "Search multiple files for shared character sequences and precisely locate all matching pattern occurrences within the target files.", "author": "Lingjun Chen", "created_at": "2025-08-06", "difficulty": "L3", "tags": [ "pattern analysis", "cross-referencing" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/file_context/pattern_matching/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for File Filtering Task: Find Files with Common Substring """ import sys from pathlib import Path import os import re def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_answer_file_exists(test_dir: Path) -> bool: """Verify that the answer.txt file exists.""" answer_file = test_dir / "answer.txt" if not answer_file.exists(): print("❌ File 'answer.txt' not found") return False print("✅ Answer file found") return True def verify_answer_format(test_dir: Path) -> bool: """Verify that the answer file has the correct format.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() # If file is empty, that's acceptable (no matches found) if not content: print("✅ Answer file is empty (no matches found)") return True lines = content.split('\n') for i, line in enumerate(lines, 1): line = line.strip() if not line: continue # Check format: filename.txt,start_position parts = line.split(',') if len(parts) != 2: print(f"❌ Line {i} has incorrect format: {line}") print(" Expected format: filename.txt,start_position") return False filename, start_pos = parts # Check filename format if not filename.endswith('.txt') or not filename.startswith('file_'): print(f"❌ Line {i} has invalid filename: {filename}") return False # Check position format (should be integer) try: start_int = int(start_pos) if start_int <= 0: print(f"❌ Line {i} has invalid position: {start_pos}") return False except ValueError: print(f"❌ Line {i} has non-integer position: {start_pos}") return False print("✅ Answer format is correct") return True except Exception as e: print(f"❌ Error reading answer file: {e}") return False 
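# Hedged, illustrative sketch only (not part of the original verifier): the
# exhaustive search in find_30_plus_char_matches() below enumerates every
# substring of length >= 30. To simply detect whether a file has a qualifying
# match, sliding a fixed 30-character window over the file and probing
# large_file.txt is sufficient.
def find_first_30_char_window(file_content: str, large_content: str) -> int:
    """Illustrative helper: return the 1-indexed position in large_content of
    the first 30-character window of file_content that also occurs there,
    or -1 if no such window exists."""
    for start in range(len(file_content) - 29):
        window = file_content[start:start + 30]
        pos = large_content.find(window)
        if pos != -1:
            return pos + 1  # 1-indexed, matching the answer.txt format
    return -1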
def find_30_plus_char_matches(test_dir: Path) -> dict: """Find all matches with 30 or more characters between files and large_file.txt.""" large_file = test_dir / "large_file.txt" if not large_file.exists(): print("❌ large_file.txt not found") return {} large_content = large_file.read_text() matches = {} # Check each file from file_01.txt to file_20.txt for i in range(1, 21): filename = f"file_{i:02d}.txt" file_path = test_dir / filename if not file_path.exists(): continue file_content = file_path.read_text() # Find the longest matching substring (30+ characters) longest_match = "" longest_match_start = -1 # Check all possible substrings in the file for start_pos in range(len(file_content)): for end_pos in range(start_pos + 30, len(file_content) + 1): # At least 30 characters substring = file_content[start_pos:end_pos] # Check if this substring exists in large_file.txt if substring in large_content: if len(substring) > len(longest_match): longest_match = substring # Find the position in large_file.txt where this substring starts large_start_pos = large_content.find(substring) longest_match_start = large_start_pos + 1 # 1-indexed # If we found a match of 30+ characters, record it if longest_match and len(longest_match) >= 30: matches[filename] = longest_match_start return matches def verify_matches_are_correct(test_dir: Path) -> bool: """Verify that the matches found in answer.txt are actually correct.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() # If no content, check if there should actually be no matches if not content: expected_matches = find_30_plus_char_matches(test_dir) if expected_matches: print("❌ Answer file is empty but matches should exist") for filename, start_pos in expected_matches.items(): print(f" Expected: {filename},{start_pos}") return False else: print("✅ No matches found (correct)") return True # Parse answer file answer_matches = {} lines = content.split('\n') for line in lines: line = line.strip() if not line: continue filename, start_pos = line.split(',') answer_matches[filename] = int(start_pos) # Get expected matches expected_matches = find_30_plus_char_matches(test_dir) # Check if all answer matches are correct for filename, start_pos in answer_matches.items(): if filename not in expected_matches: print(f"❌ File {filename} listed in answer but has no valid 30+ character match") return False expected_start = expected_matches[filename] if start_pos != expected_start: print(f"❌ Incorrect match position for {filename}") print(f" Expected: {expected_start}") print(f" Found: {start_pos}") return False # Check if all expected matches are in answer for filename in expected_matches: if filename not in answer_matches: print(f"❌ Missing match for {filename} in answer file") return False print("✅ All matches are correct") return True except Exception as e: print(f"❌ Error verifying matches: {e}") return False def verify_match_length_is_30_plus(test_dir: Path) -> bool: """Verify that all matches are at least 30 characters long.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() if not content: return True # No matches to verify large_file = test_dir / "large_file.txt" large_content = large_file.read_text() lines = content.split('\n') for line in lines: line = line.strip() if not line: continue filename, start_pos = line.split(',') start_int = int(start_pos) # Get the file content to check the match file_path = test_dir / filename file_content = file_path.read_text() # Find the longest matching 
substring starting from the given position longest_match = "" for end_pos in range(start_int + 30 - 1, len(large_content) + 1): # At least 30 characters substring = large_content[start_int - 1:end_pos] # Convert to 0-indexed if substring in file_content: longest_match = substring else: break if len(longest_match) < 30: print(f"❌ Match in {filename} is {len(longest_match)} characters, less than 30") print(f" Starting position: {start_int}") return False print("✅ All matches are at least 30 characters long") return True except Exception as e: print(f"❌ Error verifying match lengths: {e}") return False def verify_files_exist(test_dir: Path) -> bool: """Verify that all files mentioned in answer.txt actually exist.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() if not content: return True # No files to verify lines = content.split('\n') for line in lines: line = line.strip() if not line: continue filename = line.split(',')[0] file_path = test_dir / filename if not file_path.exists(): print(f"❌ File mentioned in answer does not exist: {filename}") return False print("✅ All files mentioned in answer exist") return True except Exception as e: print(f"❌ Error verifying file existence: {e}") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Pattern Matching Task: Find Files with Common Substring...") # Define verification steps verification_steps = [ ("Answer File Exists", verify_answer_file_exists), ("Answer Format", verify_answer_format), ("Files Exist", verify_files_exist), ("Match Length is 30+", verify_match_length_is_30_plus), ("Matches are Correct", verify_matches_are_correct), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ File filtering task completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/file_context/uppercase/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description You need to process 10 text files (file_01.txt to file_10.txt) and convert their content to uppercase format. ### Task Objectives 1. **Create an uppercase directory** in the test environment root 2. **Convert each file** from file_01.txt to file_10.txt to uppercase 3. **Save converted files** in the uppercase/ directory with the same names 4. **Count words** in each original file (file_01.txt to file_10.txt) 5. **Create answer.txt** with word counts in the specified format. ### Specified Format of answer.txt Create a file named `answer.txt` in uppercase/ **Requirements:** - Each line should follow the format: `<filename>:<word_count>` - Include all 10 files: file_01.txt, file_02.txt, ..., file_10.txt - Use the exact filename format (file_01.txt, file_02.txt, etc.) 
- One entry per line ================================================ FILE: tasks/filesystem/standard/file_context/uppercase/meta.json ================================================ { "task_id": "uppercase", "task_name": "Uppercase", "category_id": "file_context", "category_name": "File Context", "description": "Convert the content of 10 specified files to uppercase format and calculate the total word count across all processed files.", "author": "Lingjun Chen", "created_at": "2025-08-19", "difficulty": "L3", "tags": [ "content transformation", "data extraction" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/file_context/uppercase/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for File Context Task: Convert Files to Uppercase """ import sys from pathlib import Path import os import re def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_uppercase_directory_exists(test_dir: Path) -> bool: """Verify that the uppercase directory exists.""" uppercase_dir = test_dir / "uppercase" if not uppercase_dir.exists(): print("| ❌ Directory 'uppercase' not found") return False if not uppercase_dir.is_dir(): print("| ❌ 'uppercase' exists but is not a directory") return False print("| ✓ Uppercase directory found") return True def verify_uppercase_files_exist(test_dir: Path) -> bool: """Verify that all 10 uppercase files exist.""" uppercase_dir = test_dir / "uppercase" for i in range(1, 11): filename = f"file_{i:02d}.txt" file_path = uppercase_dir / filename if not file_path.exists(): print(f"| ❌ File '{filename}' not found in uppercase directory") return False print("| ✓ All 10 uppercase files found") return True def verify_uppercase_content(test_dir: Path) -> bool: """Verify that uppercase files contain the correct uppercase content.""" uppercase_dir = test_dir / "uppercase" for i in range(1, 11): filename = f"file_{i:02d}.txt" original_file = test_dir / filename uppercase_file = uppercase_dir / filename if not original_file.exists(): print(f"| ❌ Original file '{filename}' not found") return False try: original_content = original_file.read_text() uppercase_content = uppercase_file.read_text() # Check if uppercase content is the uppercase version of original expected_uppercase = original_content.upper() if uppercase_content != expected_uppercase: print(f"| ❌ File '{filename}' content is not properly converted to uppercase") return False except Exception as e: print(f"| ❌ Error reading file '{filename}': {e}") return False print("| ✓ All uppercase files contain correct uppercase content") return True def verify_answer_file_exists(test_dir: Path) -> bool: """Verify that the answer.txt file exists in the uppercase directory.""" 
uppercase_dir = test_dir / "uppercase" answer_file = uppercase_dir / "answer.txt" if not answer_file.exists(): print("| ❌ File 'answer.txt' not found in uppercase directory") return False print("| ✓ Answer file found in uppercase directory") return True def verify_answer_format(test_dir: Path) -> bool: """Verify that the answer file has the correct format.""" uppercase_dir = test_dir / "uppercase" answer_file = uppercase_dir / "answer.txt" try: content = answer_file.read_text().strip() if not content: print("| ❌ Answer file is empty") return False lines = content.split('\n') # Check if we have exactly 10 lines if len(lines) != 10: print(f"| ❌ Answer file has {len(lines)} lines, expected 10") return False for i, line in enumerate(lines, 1): line = line.strip() if not line: print(f"| ❌ Line {i} is empty") return False # Check format: filename:word_count if ':' not in line: print(f"| ❌ Line {i} has incorrect format: {line}") print(" Expected format: filename:word_count") return False parts = line.split(':', 1) if len(parts) != 2: print(f"| ❌ Line {i} has incorrect format: {line}") print(" Expected format: filename:word_count") return False filename, word_count_str = parts # Check filename format if not filename.endswith('.txt') or not filename.startswith('file_'): print(f"| ❌ Line {i} has invalid filename: {filename}") return False # Check word count format (should be integer) try: word_count = int(word_count_str) if word_count <= 0: print(f"| ❌ Line {i} has invalid word count: {word_count_str}") return False except ValueError: print(f"| ❌ Line {i} has non-integer word count: {word_count_str}") return False print("| ✓ Answer format is correct") return True except Exception as e: print(f"| ❌ Error reading answer file: {e}") return False def count_words_in_file(file_path: Path) -> int: """Count words in a file.""" try: content = file_path.read_text() # Split by whitespace and filter out empty strings words = [word for word in content.split() if word.strip()] return len(words) except Exception as e: print(f"| ❌ Error reading file {file_path}: {e}") return 0 def verify_word_counts_are_correct(test_dir: Path) -> bool: """Verify that the word counts in answer.txt are correct.""" uppercase_dir = test_dir / "uppercase" answer_file = uppercase_dir / "answer.txt" try: content = answer_file.read_text().strip() lines = content.split('\n') # Expected word counts based on answer.md expected_counts = [22, 22, 22, 22, 18, 22, 22, 22, 18, 20] # Create a set of expected file entries for easier checking expected_entries = set() for i in range(1, 11): filename = f"file_{i:02d}.txt" expected_count = expected_counts[i - 1] if i == 6: # Special case for file_06.txt: can be 21 or 22 expected_entries.add(f"{filename}:21") expected_entries.add(f"{filename}:22") else: expected_entries.add(f"{filename}:{expected_count}") # Check each line in the answer file found_entries = set() for line in lines: line = line.strip() if line in expected_entries: found_entries.add(line) else: print(f"| ❌ Invalid entry: {line}") return False # Check if we found all expected entries if len(found_entries) != 10: print(f"| ❌ Found {len(found_entries)} entries, expected 10") missing = expected_entries - found_entries if missing: print(f" Missing entries: {missing}") return False print("| ✓ All word counts are correct") return True except Exception as e: print(f"| ❌ Error verifying word counts: {e}") return False def verify_all_files_are_included(test_dir: Path) -> bool: """Verify that all 10 files are included in the answer.""" uppercase_dir 
= test_dir / "uppercase" answer_file = uppercase_dir / "answer.txt" try: content = answer_file.read_text().strip() lines = content.split('\n') # Check that all 10 files are present found_files = set() for line in lines: parts = line.split(':', 1) filename = parts[0] found_files.add(filename) expected_files = {f"file_{i:02d}.txt" for i in range(1, 11)} if found_files != expected_files: missing = expected_files - found_files extra = found_files - expected_files if missing: print(f"| ❌ Missing files in answer: {missing}") if extra: print(f"| ❌ Extra files in answer: {extra}") return False print("| ✓ All 10 files are included in answer") return True except Exception as e: print(f"| ❌ Error verifying file inclusion: {e}") return False def main(): """Main verification function.""" try: test_dir = get_test_directory() print(f"| 🔍 Verifying Uppercase in: {test_dir}") print('|') # Run all verification checks checks = [ ("Uppercase directory exists", verify_uppercase_directory_exists), ("Uppercase files exist", verify_uppercase_files_exist), ("Uppercase content is correct", verify_uppercase_content), ("Answer file exists in uppercase directory", verify_answer_file_exists), ("Answer format is correct", verify_answer_format), ("All files are included", verify_all_files_are_included), ("Word counts are correct", verify_word_counts_are_correct), ] all_passed = True for check_name, check_func in checks: print(f"| Checking {check_name}...") if not check_func(test_dir): all_passed = False print('|') if all_passed: print("| 🎉 All verification checks passed!") sys.exit(0) else: print("| ❌ Some verification checks failed!") sys.exit(1) except Exception as e: print(f"| ❌ Verification failed with error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/file_property/size_classification/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description Classify all files in the test directory into three categories based on their file size. Create three subdirectories and move files accordingly. ### Task Objectives 1. **Create three directories** in the test directory: - `small_files/` - for files smaller than 300 bytes - `medium_files/` - for files between 300-700 bytes (inclusive) - `large_files/` - for files larger than 700 bytes 2. **Move all files** from the test directory into the appropriate subdirectory based on their size 3. **Handle all file types** - classify all files regardless of their extension (.txt, .jpg, .MOV, etc.) 
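For illustration only (the task itself is meant to be completed with the FileSystem MCP tools, not Python), the classification rule above can be sketched as follows, assuming, as the verification scripts in this repository do, that the working directory is supplied via the `FILESYSTEM_TEST_DIR` environment variable:

```python
import os
import shutil
from pathlib import Path

# Thresholds from the task description:
# <300 bytes small, 300-700 bytes medium (inclusive), >700 bytes large.
def classify(size: int) -> str:
    if size < 300:
        return "small_files"
    if size <= 700:
        return "medium_files"
    return "large_files"

test_dir = Path(os.environ["FILESYSTEM_TEST_DIR"])
for target in ("small_files", "medium_files", "large_files"):
    (test_dir / target).mkdir(exist_ok=True)

for path in test_dir.iterdir():
    # Move every regular file (any extension) into its size bucket; skip system files.
    if path.is_file() and path.name != ".DS_Store":
        shutil.move(str(path), str(test_dir / classify(path.stat().st_size) / path.name))
```

Boundary values follow the description: a 300-byte or 700-byte file counts as medium, while a 701-byte file counts as large.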
================================================ FILE: tasks/filesystem/standard/file_property/size_classification/meta.json ================================================ { "task_id": "size_classification", "task_name": "Size Classification", "category_id": "file_property", "category_name": "File Property", "description": "Classify all files in the folder by size into distinct categories (small/medium/large) and generate a comprehensive summary report with statistics.", "author": "Lingjun Chen", "created_at": "2025-08-07", "difficulty": "L3", "tags": [ "file organization", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "file_property/\n ├── bear.jpg\n ├── bridge.jpg\n ├── bus.MOV\n ├── random_file_1.txt\n ├── random_file_2.txt\n ├── random_file_3.txt\n ├── road.MOV\n └── sg.jpg", "stateUrl": "https://storage.mcpmark.ai/filesystem/file_property.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/file_property/size_classification/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for File Classification Task """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def get_expected_classification(): """Return the expected file classification based on answer.md.""" return { "small_files": ["random_file_1.txt", "random_file_3.txt"], "medium_files": ["random_file_2.txt"], "large_files": ["bear.jpg", "sg.jpg", "road.MOV", "bus.MOV", "bridge.jpg"] } def verify_directories_exist(test_dir: Path) -> bool: """Verify that all three required directories exist.""" required_dirs = ["small_files", "medium_files", "large_files"] for dir_name in required_dirs: dir_path = test_dir / dir_name if not dir_path.exists(): print(f"❌ Directory '{dir_name}' not found") return False if not dir_path.is_dir(): print(f"❌ '{dir_name}' exists but is not a directory") return False print("✅ All required directories exist") return True def verify_file_classification(test_dir: Path) -> bool: """Verify that files are correctly classified into the right directories.""" expected_classification = get_expected_classification() for dir_name, expected_files in expected_classification.items(): dir_path = test_dir / dir_name # Check that all expected files are in the directory missing_files = [] for filename in expected_files: file_path = dir_path / filename if not file_path.exists(): missing_files.append(filename) if missing_files: print(f"❌ Missing files in '{dir_name}': {missing_files}") return False # Check that no unexpected files are in the directory (ignore .DS_Store and similar system files) actual_files = [f.name for f in dir_path.iterdir() if f.is_file()] # Filter out system files that are commonly present system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store'] unexpected_files = [f for f in actual_files if f not in expected_files and f not in system_files] if unexpected_files: print(f"❌ Unexpected files in '{dir_name}': {unexpected_files}") return False print("✅ All files are correctly classified") return True def verify_no_files_in_root(test_dir: Path) -> bool: """Verify that no files remain in the root test directory.""" root_files = [f for f in test_dir.iterdir() if f.is_file()] # Filter 
out system files that are commonly present system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store'] non_system_files = [f for f in root_files if f.name not in system_files] if non_system_files: print(f"❌ Files still present in root directory: {[f.name for f in non_system_files]}") return False print("✅ No files remain in root directory") return True def verify_file_sizes(test_dir: Path) -> bool: """Verify that files are actually in the correct size categories.""" size_ranges = { "small_files": (0, 299), # < 300 bytes "medium_files": (300, 700), # 300-700 bytes (inclusive) "large_files": (701, float('inf')) # > 700 bytes } for dir_name, (min_size, max_size) in size_ranges.items(): dir_path = test_dir / dir_name for file_path in dir_path.iterdir(): if file_path.is_file(): file_size = file_path.stat().st_size if dir_name == "small_files" and file_size >= 300: print(f"❌ File {file_path.name} in small_files but size is {file_size} bytes") return False elif dir_name == "medium_files" and (file_size < 300 or file_size > 700): print(f"❌ File {file_path.name} in medium_files but size is {file_size} bytes") return False elif dir_name == "large_files" and file_size <= 700: print(f"❌ File {file_path.name} in large_files but size is {file_size} bytes") return False print("✅ All files are in correct size categories") return True def verify_total_file_count(test_dir: Path) -> bool: """Verify that all original files are accounted for.""" expected_classification = get_expected_classification() total_expected = sum(len(files) for files in expected_classification.values()) total_actual = 0 for dir_name in ["small_files", "medium_files", "large_files"]: dir_path = test_dir / dir_name if dir_path.exists(): # Count only non-system files system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store'] files_in_dir = [f for f in dir_path.iterdir() if f.is_file() and f.name not in system_files] total_actual += len(files_in_dir) if total_actual != total_expected: print(f"❌ Expected {total_expected} files total, found {total_actual}") return False print(f"✅ Total file count is correct: {total_actual}") return True def main(): """Main verification function.""" try: test_dir = get_test_directory() print(f"🔍 Verifying file classification in: {test_dir}") # Run all verification checks checks = [ ("Directory existence", verify_directories_exist), ("File classification", verify_file_classification), ("No files in root", verify_no_files_in_root), ("File size validation", verify_file_sizes), ("Total file count", verify_total_file_count) ] all_passed = True for check_name, check_func in checks: print(f"\n📋 Checking: {check_name}") if not check_func(test_dir): all_passed = False if all_passed: print("\n🎉 All verification checks passed!") sys.exit(0) else: print("\n❌ Some verification checks failed!") sys.exit(1) except Exception as e: print(f"❌ Verification failed with error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/file_property/time_classification/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description Analyze the creation time (ctime) of all files in the test directory and organize them into a hierarchical directory structure based on their creation dates. ### Task Objectives 1. **Read metadata** of all files in the test directory 2. **Analyze creation times** (ctime) of all files (excluding .DS_Store) 3. 
**Create directory structure** organized by month/day based on creation time 4. **Move files** to appropriate directories 5. **Create metadata analysis files** in each directory ### Expected Output #### Directory Structure Create directories in the format: `MM/DD/` where: - MM = month (two digits, e.g., 01, 02) - DD = day (two digits, e.g., 07, 09, 11, 26) #### Metadata Analysis Files Create a file named `metadata_analyse.txt` in each directory containing exactly only two lines: - **Line 1**: Oldest filename and its creation time (excluding .DS_Store) - **Line 2**: Latest filename and its creation time (excluding .DS_Store) ================================================ FILE: tasks/filesystem/standard/file_property/time_classification/meta.json ================================================ { "task_id": "time_classification", "task_name": "Time Classification", "category_id": "file_property", "category_name": "File Property", "description": "Organize files based on modification timestamps into temporal categories and create a detailed time-based classification report with groupings.", "author": "Lingjun Chen", "created_at": "2025-08-07", "difficulty": "L3", "tags": [ "file organization", "data extraction", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "file_property/\n ├── bear.jpg\n ├── bridge.jpg\n ├── bus.MOV\n ├── random_file_1.txt\n ├── random_file_2.txt\n ├── random_file_3.txt\n ├── road.MOV\n └── sg.jpg", "stateUrl": "https://storage.mcpmark.ai/filesystem/file_property.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/file_property/time_classification/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for File Organization by Creation Time Task """ import sys from pathlib import Path import os from datetime import datetime import re def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def get_month_mapping(): """Return mapping for both numeric and alphabetic month representations.""" return { "07": ["07", "7", "jul", "Jul", "JUL"], "08": ["08", "8", "aug", "Aug", "AUG"] } def get_day_mapping(): """Return mapping for day representations.""" return { "09": ["09", "9"], "25": ["25"], "26": ["26"], "06": ["06", "6"] } def get_expected_directory_structure(): """Return the expected directory structure based on answer.md.""" return { "07": { "09": ["sg.jpg"], "25": ["bus.MOV"], "26": ["road.MOV"] }, "08": { "06": ["bear.jpg", "bridge.jpg", "random_file_1.txt", "random_file_2.txt", "random_file_3.txt"] } } def find_month_directory(test_dir: Path, expected_month: str) -> Path: """Find the actual month directory, handling both numeric and alphabetic representations.""" month_mapping = get_month_mapping() valid_month_names = month_mapping.get(expected_month, [expected_month]) for month_name in valid_month_names: month_dir = test_dir / month_name if month_dir.exists() and month_dir.is_dir(): return month_dir return None def find_day_directory(month_dir: Path, expected_day: str) -> Path: """Find the actual day directory, handling both numeric representations.""" day_mapping = get_day_mapping() valid_day_names = day_mapping.get(expected_day, [expected_day]) for day_name in valid_day_names: day_dir = month_dir / 
day_name if day_dir.exists() and day_dir.is_dir(): return day_dir return None def verify_directory_structure(test_dir: Path) -> bool: """Verify that the correct directory structure exists.""" expected_structure = get_expected_directory_structure() for expected_month, days in expected_structure.items(): month_dir = find_month_directory(test_dir, expected_month) if month_dir is None: valid_names = get_month_mapping().get(expected_month, [expected_month]) print(f"❌ Month directory not found. Expected one of: {valid_names}") return False for day, expected_files in days.items(): day_dir = find_day_directory(month_dir, day) if day_dir is None: valid_day_names = get_day_mapping().get(day, [day]) print(f"❌ Day directory '{month_dir.name}/{day}' not found. Expected one of: {valid_day_names}") return False if not day_dir.is_dir(): print(f"❌ '{month_dir.name}/{day_dir.name}' exists but is not a directory") return False print("✅ Directory structure is correct") return True def verify_files_in_directories(test_dir: Path) -> bool: """Verify that files are in the correct directories.""" expected_structure = get_expected_directory_structure() for expected_month, days in expected_structure.items(): month_dir = find_month_directory(test_dir, expected_month) if month_dir is None: continue # Already handled in verify_directory_structure for day, expected_files in days.items(): day_dir = find_day_directory(month_dir, day) if day_dir is None: continue # Already handled in verify_directory_structure # Check that all expected files are in the directory missing_files = [] for filename in expected_files: file_path = day_dir / filename if not file_path.exists(): missing_files.append(filename) if missing_files: print(f"❌ Missing files in '{month_dir.name}/{day_dir.name}': {missing_files}") return False # Check that no unexpected files are in the directory (ignore .DS_Store and metadata_analyse.txt) actual_files = [f.name for f in day_dir.iterdir() if f.is_file()] system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store', 'metadata_analyse.txt'] unexpected_files = [f for f in actual_files if f not in expected_files and f not in system_files] if unexpected_files: print(f"❌ Unexpected files in '{month_dir.name}/{day_dir.name}': {unexpected_files}") return False print("✅ All files are in correct directories") return True def verify_metadata_analysis_files(test_dir: Path) -> bool: """Verify that metadata_analyse.txt files exist and have correct content.""" expected_structure = get_expected_directory_structure() for expected_month, days in expected_structure.items(): month_dir = find_month_directory(test_dir, expected_month) if month_dir is None: continue # Already handled in verify_directory_structure for day, expected_files in days.items(): day_dir = find_day_directory(month_dir, day) if day_dir is None: continue # Already handled in verify_directory_structure metadata_file = day_dir / "metadata_analyse.txt" if not metadata_file.exists(): print(f"❌ metadata_analyse.txt not found in '{month_dir.name}/{day_dir.name}'") return False try: content = metadata_file.read_text().strip() lines = content.split('\n') # Check that there are exactly 2 lines if len(lines) != 2: print(f"❌ metadata_analyse.txt in '{month_dir.name}/{day_dir.name}' has {len(lines)} lines, expected 2") return False # Check each line - more flexible verification for line_num, line in enumerate(lines, 1): line_lower = line.lower() # Check filename based on expected_month and day expected_filename = None if expected_month == "07" and day == "09": 
expected_filename = "sg.jpg" elif expected_month == "07" and day == "25": expected_filename = "bus.mov" elif expected_month == "07" and day == "26": expected_filename = "road.mov" elif expected_month == "08" and day == "06": # For 08/06, check if it's one of the expected files if line_num == 1: # First line should be bear.jpg expected_filename = "bear.jpg" else: # Second line should be one of the random files expected_filenames = ["random_file_1.txt", "random_file_2.txt", "random_file_3.txt"] if not any(filename in line_lower for filename in expected_filenames): print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain one of {expected_filenames}: {line}") return False continue # Skip other checks for this line if expected_filename and expected_filename not in line_lower: print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain '{expected_filename}': {line}") return False # Check month letters month_letters = None if expected_month == "07": month_letters = ["jul", "7"] elif expected_month == "08": month_letters = ["aug", "8"] if month_letters and not any(letter in line_lower for letter in month_letters): print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain month letters: {line}") return False # Check year (2025) if "2025" not in line_lower: print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain '2025': {line}") return False # Check day number - support both formats valid_day_names = get_day_mapping().get(day, [day]) if not any(day_name in line_lower for day_name in valid_day_names): print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain day '{day}' (or {valid_day_names}): {line}") return False except Exception as e: print(f"❌ Error reading metadata_analyse.txt in '{month_dir.name}/{day_dir.name}': {e}") return False print("✅ All metadata_analyse.txt files are correct") return True def verify_no_files_in_root(test_dir: Path) -> bool: """Verify that no files remain in the root test directory.""" root_files = [f for f in test_dir.iterdir() if f.is_file()] # Filter out system files that are commonly present system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store'] non_system_files = [f for f in root_files if f.name not in system_files] if non_system_files: print(f"❌ Files still present in root directory: {[f.name for f in non_system_files]}") return False print("✅ No files remain in root directory") return True def verify_total_file_count(test_dir: Path) -> bool: """Verify that all original files are accounted for.""" expected_structure = get_expected_directory_structure() total_expected = sum(len(files) for days in expected_structure.values() for files in days.values()) total_actual = 0 for expected_month, days in expected_structure.items(): month_dir = find_month_directory(test_dir, expected_month) if month_dir is None: continue for day in days: day_dir = find_day_directory(month_dir, day) if day_dir and day_dir.exists(): # Count only non-system files system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store', 'metadata_analyse.txt'] files_in_dir = [f for f in day_dir.iterdir() if f.is_file() and f.name not in system_files] total_actual += len(files_in_dir) if total_actual != total_expected: print(f"❌ Expected {total_expected} files total, found {total_actual}") return False print(f"✅ Total file count is correct: {total_actual}") return True def main(): """Main verification function.""" try: test_dir = get_test_directory() print(f"🔍 Verifying Time 
Classification in: {test_dir}") # Run all verification checks checks = [ ("Directory structure", verify_directory_structure), ("Files in directories", verify_files_in_directories), ("Metadata analysis files", verify_metadata_analysis_files), ("No files in root", verify_no_files_in_root), ("Total file count", verify_total_file_count) ] all_passed = True for check_name, check_func in checks: print(f"\n📋 Checking: {check_name}") if not check_func(test_dir): all_passed = False if all_passed: print("\n🎉 All verification checks passed!") sys.exit(0) else: print("\n❌ Some verification checks failed!") sys.exit(1) except Exception as e: print(f"❌ Verification failed with error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/folder_structure/structure_analysis/description.md ================================================ Please use FileSystem tools to finish the following task: You need to recursively traverse the entire folder structure under the main directory and generate a detailed statistical report in a file named `structure_analysis.txt`. In all tasks, ignore `.DS_Store` files. In any tasks, you should not change or delete any existed files. Do not try to use python code. --- ### 1. File Statistics Count the following information for the entire directory structure: - total number of files - total number of folders - total size of the hole folder (in bytes, include .DS_Store only in this subtask) **Format (one item per line):** total number of files: X total number of folders: Y total size of all files: Z --- ### 2. Depth Analysis Identify the deepest folder path(s) in the directory and calculate its depth level. - Use relative paths based on main directory. - **Write the folder path only up to the folder, not including the file name.For example, if the file path is `./complex_structure/A/B/C/def.txt`, then the path in your report should be `complex_structure/A/B/C`, and the depth is `4`.** - If multiple deepest paths exist, list only one. **Format (one item per line):** depth: N PATH --- ### 3. File Type Classification Categorize files by their extensions and count the number of files for each type. Files without extensions should also be included. 
**Format (one extension per line):** txt: count py: count jpg: count mov: count (no extension): count ================================================ FILE: tasks/filesystem/standard/folder_structure/structure_analysis/meta.json ================================================ { "task_id": "structure_analysis", "task_name": "Structure Analysis", "category_id": "folder_structure", "category_name": "Folder Structure", "description": "Perform thorough analysis of complex folder hierarchy to generate a detailed structural summary report with comprehensive file statistics.", "author": "Lingjun Chen", "created_at": "2025-08-16", "difficulty": "L3", "tags": [ "pattern analysis", "data extraction" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "folder_structure/\n └── complex_structure/\n ├── deeply/\n │ └── nested/\n │ └── folder/\n │ └── structure/\n ├── empty_folder/\n ├── folder_lxkHt_0_1/\n │ └── file_PeLzC_0.txt\n ├── folder_QdTAj_0_2/\n │ ├── folder_eXccj_1_0/\n │ │ ├── folder_Mqlwh_2_1/\n │ │ │ ├── folder_cKxcP_3_3/\n │ │ │ │ ├── folder_BPTMK_4_1/\n │ │ │ │ │ └── file_RHtBP_0.txt\n │ │ │ │ ├── folder_QNqjq_4_0/\n │ │ │ │ │ ├── folder_gRwPE_5_1/\n │ │ │ │ │ │ ├── file_jVlpp_0.txt\n │ │ │ │ │ │ └── file_vJuHz_1.txt\n │ │ │ │ │ ├── folder_XdXYJ_5_0/\n │ │ │ │ │ │ └── file_KvkKi_0.txt\n │ │ │ │ │ ├── file_gGxLG_2.txt\n │ │ │ │ │ ├── file_Hzkxo_0.txt\n │ │ │ │ │ └── file_XRjeh_1.txt\n │ │ │ │ ├── folder_vIBIt_4_2/\n │ │ │ │ │ ├── folder_kRDNS_5_0/\n │ │ │ │ │ │ └── file_wFSjJ_0.txt\n │ │ │ │ │ └── file_NyBSO_0.txt\n │ │ │ │ ├── file_EOCNf_1.txt\n │ │ │ │ └── file_gmrXA_0.txt\n │ │ │ ├── folder_NcruA_3_1/\n │ │ │ │ ├── file_bLWDj_1.txt\n │ │ │ │ └── file_WAftR_0.txt\n │ │ │ ├── folder_qCDFI_3_2/\n │ │ │ │ ├── file_eSMOJ_0.txt\n │ │ │ │ ├── file_oxADy_2.txt\n │ │ │ │ └── file_RTbbc_1.txt\n │ │ │ ├── folder_QVHUU_3_0/\n │ │ │ │ ├── folder_FEPTK_4_1/\n │ │ │ │ │ ├── folder_GHoMC_5_1/\n │ │ │ │ │ │ └── file_rAMYd_0.txt\n │ │ │ │ │ ├── folder_iBDUY_5_0/\n │ │ │ │ │ │ └── file_IJCaw_0.txt\n │ │ │ │ │ ├── folder_VRXgp_5_2/\n │ │ │ │ │ │ └── file_hkUmS_0.txt\n │ │ │ │ │ ├── file_nqLAf_1.txt\n │ │ │ │ │ └── file_XflmA_0.txt\n │ │ │ │ ├── folder_FlPoK_4_3/\n │ │ │ │ │ ├── folder_hSVNm_5_3/\n │ │ │ │ │ │ └── file_klnbn_0.txt\n │ │ │ │ │ ├── folder_iZuEl_5_0/\n │ │ │ │ │ │ └── file_LqAmy_0.txt\n │ │ │ │ │ ├── folder_LcURj_5_2/\n │ │ │ │ │ │ ├── file_RgwOS_1.txt\n │ │ │ │ │ │ └── file_ZHnYb_0.txt\n │ │ │ │ │ ├── folder_tuZQJ_5_1/\n │ │ │ │ │ │ └── file_LHuIx_0.txt\n │ │ │ │ │ ├── file_asJnB_1.txt\n │ │ │ │ │ └── file_EzLdu_0.txt\n │ │ │ │ ├── folder_ndhsJ_4_0/\n │ │ │ │ │ ├── folder_CUSXK_5_0/\n │ │ │ │ │ │ ├── file_DpiuM_1.txt\n │ │ │ │ │ │ └── file_pSqeG_0.txt\n │ │ │ │ │ ├── folder_pstmE_5_1/\n │ │ │ │ │ │ └── file_YwdJt_0.txt\n │ │ │ │ │ ├── folder_StlsP_5_2/\n │ │ │ │ │ │ ├── file_kriBJ_0.txt\n │ │ │ │ │ │ └── file_XCEdm_1.txt\n │ │ │ │ │ ├── file_ToDjh_1.txt\n │ │ │ │ │ └── file_xbIVx_0.txt\n │ │ │ │ ├── folder_PJBok_4_4/\n │ │ │ │ │ ├── folder_mzxaf_5_0/\n │ │ │ │ │ │ ├── file_ILBzj_2.txt\n │ │ │ │ │ │ ├── file_MTGMm_1.txt\n │ │ │ │ │ │ └── file_zBDqz_0.txt\n │ │ │ │ │ ├── folder_sULMj_5_1/\n │ │ │ │ │ │ ├── file_BHziw_1.txt\n │ │ │ │ │ │ ├── file_sIjiu_2.txt\n │ │ │ │ │ │ └── file_VqNkB_0.txt\n │ │ │ │ │ ├── folder_vypSi_5_3/\n │ │ │ │ │ │ ├── file_kZbIm_1.txt\n │ │ │ │ │ │ └── file_sOBtE_0.txt\n │ │ │ │ │ ├── folder_ZLGHy_5_2/\n │ │ │ │ │ │ ├── file_azaFF_0.txt\n │ │ │ │ │ │ └── file_nAFRe_1.txt\n │ │ │ │ │ ├── file_mIkQU_0.txt\n │ │ │ │ │ └── file_sGPxd_1.txt\n │ │ 
│ │ ├── folder_VTbEG_4_2/\n │ │ │ │ │ ├── file_HtYLg_0.txt\n │ │ │ │ │ ├── file_JXjMd_1.txt\n │ │ │ │ │ └── file_tPccB_2.txt\n │ │ │ │ ├── file_BuOSw_1.txt\n │ │ │ │ └── file_TpoqE_0.txt\n │ │ │ ├── folder_wTvun_3_4/\n │ │ │ │ ├── file_GyhyE_1.txt\n │ │ │ │ ├── file_POsla_2.txt\n │ │ │ │ └── file_tSsvk_0.txt\n │ │ │ ├── file_irNju_0.txt\n │ │ │ └── file_jYBRm_1.txt\n │ │ ├── folder_YlJLI_2_0/\n │ │ │ └── file_FpFSL_0.txt\n │ │ ├── file_cFgBr_2.txt\n │ │ ├── file_lKEWN_1.txt\n │ │ └── file_ZEWFP_0.txt\n │ └── file_ayUCH_0.txt\n ├── folder_xtgyi_0_0/\n │ └── file_BvSOB_0.txt\n ├── mixed_content/\n │ └── images_and_text/\n │ └── notes.txt\n ├── project/\n │ ├── docs/\n │ │ └── archive/\n │ │ └── 2023/\n │ │ └── reports/\n │ │ ├── report_0.txt\n │ │ ├── report_1.txt\n │ │ └── report_2.txt\n │ └── src/\n │ └── main/\n │ └── resources/\n └── m.py", "stateUrl": "https://storage.mcpmark.ai/filesystem/folder_structure.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/folder_structure/structure_analysis/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Directory Structure Analysis Task """ import sys from pathlib import Path import os import re def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_structure_analysis_file_exists(test_dir: Path) -> bool: """Verify that the structure_analysis.txt file exists.""" analysis_file = test_dir / "structure_analysis.txt" if not analysis_file.exists(): print("❌ File 'structure_analysis.txt' not found") return False print("✅ structure_analysis.txt file found") return True def verify_structure_analysis_file_readable(test_dir: Path) -> bool: """Verify that the structure_analysis.txt file is readable.""" analysis_file = test_dir / "structure_analysis.txt" try: content = analysis_file.read_text() if not content.strip(): print("❌ structure_analysis.txt file is empty") return False print("✅ structure_analysis.txt file is readable") return True except Exception as e: print(f"❌ Error reading structure_analysis.txt file: {e}") return False def verify_subtask1_file_statistics(test_dir: Path) -> bool: """Verify subtask 1: File Statistics - files must be 69, folders must be 51, 58097 allows +-1000.""" analysis_file = test_dir / "structure_analysis.txt" try: content = analysis_file.read_text() # Extract numbers from the content file_count_match = re.search(r'total number of files:\s*(\d+)', content) folder_count_match = re.search(r'total number of folders:\s*(\d+)', content) size_match = re.search(r'total size of all files:\s*(\d+)', content) if not file_count_match or not folder_count_match or not size_match: print("❌ Could not extract file statistics from structure_analysis.txt") return False file_count = int(file_count_match.group(1)) folder_count = int(folder_count_match.group(1)) total_size = int(size_match.group(1)) print(f"📊 Found: files={file_count}, folders={folder_count}, size={total_size}") # Check if file count is exactly 69 if file_count != 69: print(f"❌ File count must be 69, found: {file_count}") return False # Check if folder count is exactly 51 if folder_count != 51: print(f"❌ Folder count must be 51, found: {folder_count}") return False # Check if size is within acceptable range (58097 ± 1000) expected_size = 58097 
size_tolerance = 1000 if abs(total_size - expected_size) > size_tolerance: print(f"❌ Total size ({total_size}) is not within acceptable range ({expected_size} ± {size_tolerance})") return False print(f"✅ File statistics verified: files={file_count}, folders={folder_count}, size={total_size} (within tolerance)") return True except Exception as e: print(f"❌ Error verifying file statistics: {e}") return False def verify_subtask2_depth_analysis(test_dir: Path) -> bool: """Verify subtask 2: Depth Analysis - depth must be 7, verify path exists.""" analysis_file = test_dir / "structure_analysis.txt" try: content = analysis_file.read_text() # Extract depth and path depth_match = re.search(r'depth:\s*(\d+)', content) path_match = re.search(r'^([^\n]+)$', content, re.MULTILINE) if not depth_match: print("❌ Could not extract depth from structure_analysis.txt") return False depth = int(depth_match.group(1)) # Check if depth is exactly 7 if depth != 7: print(f"❌ Depth must be 7, found: {depth}") return False print(f"✅ Depth verified: {depth}") # Extract the path (it should be on a separate line after "depth: 7") lines = content.split('\n') path_line = None for i, line in enumerate(lines): if line.strip() == f"depth: {depth}": if i + 1 < len(lines): path_line = lines[i + 1].strip() break if not path_line: print("❌ Could not find path line after depth specification") return False print(f"📁 Found path: {path_line}") # Verify that the path depth matches the declared depth path_parts = path_line.split('/') actual_depth = len(path_parts) if actual_depth != depth: print(f"❌ Path depth mismatch: declared depth is {depth}, but path has {actual_depth} levels") print(f" Path: {path_line}") print(f" Path parts: {path_parts}") return False print(f"✅ Path depth verified: {actual_depth} levels") # Verify that this path exists in the test environment expected_path = test_dir / path_line if not expected_path.exists(): print(f"❌ Path does not exist: {expected_path}") return False if not expected_path.is_dir(): print(f"❌ Path exists but is not a directory: {expected_path}") return False print(f"✅ Path verified and exists: {path_line}") return True except Exception as e: print(f"❌ Error verifying depth analysis: {e}") return False def verify_subtask3_file_type_classification(test_dir: Path) -> bool: """Verify subtask 3: File Type Classification - 68 and 1 must be accurate.""" analysis_file = test_dir / "structure_analysis.txt" try: content = analysis_file.read_text() # Extract file type counts txt_match = re.search(r'txt:\s*(\d+)', content) py_match = re.search(r'py:\s*(\d+)', content) if not txt_match or not py_match: print("❌ Could not extract file type counts from structure_analysis.txt") return False txt_count = int(txt_match.group(1)) py_count = int(py_match.group(1)) print(f"📁 Found: txt={txt_count}, py={py_count}") # Check if txt count is exactly 68 if txt_count != 68: print(f"❌ txt count must be 68, found: {txt_count}") return False # Check if py count is exactly 1 if py_count != 1: print(f"❌ py count must be 1, found: {py_count}") return False print(f"✅ File type classification verified: txt={txt_count}, py={py_count}") return True except Exception as e: print(f"❌ Error verifying file type classification: {e}") return False def verify_file_format(test_dir: Path) -> bool: """Verify that the structure_analysis.txt file has proper format.""" analysis_file = test_dir / "structure_analysis.txt" try: content = analysis_file.read_text() lines = content.split('\n') # Check if file has the expected structure if len(lines) 
< 5: # Should have at least 5 lines print("❌ File seems too short to contain all required information") return False # Basic format check - ensure it's not completely corrupted if not content.strip(): print("❌ File is completely empty") return False print("✅ File format is acceptable") return True except Exception as e: print(f"❌ Error checking file format: {e}") return False def main(): """Main verification function.""" try: test_dir = get_test_directory() print(f"🔍 Verifying Directory Structure Analysis Task in: {test_dir}") # Define verification steps verification_steps = [ ("Structure Analysis File Exists", verify_structure_analysis_file_exists), ("File is Readable", verify_structure_analysis_file_readable), ("Subtask 1: File Statistics", verify_subtask1_file_statistics), ("Subtask 2: Depth Analysis", verify_subtask2_depth_analysis), ("Subtask 3: File Type Classification", verify_subtask3_file_type_classification), ("File Format", verify_file_format), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Directory Structure Analysis completed correctly!") print("🎉 Structure Analysis verification: PASS") sys.exit(0) else: print("❌ Structure Analysis verification: FAIL") sys.exit(1) except Exception as e: print(f"❌ Verification failed with error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/folder_structure/structure_mirror/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Copy the entire directory structure of `complex_structure/` to `complex_structure_mirror/` without copying any file contents. Do not use python code. ### Requirements - Create the entire directory structure in `complex_structure_mirror/` - Do not copy any file contents, only create directories - In each empty directory, create a `placeholder.txt` file containing the absolute path of that directory - Handle nested directories of any depth - You should also follow 2 rules: 1. **Discard any directory that directly contains more than 2 files (only count the immediate folder).** 2. 
**If a directory name contains numbers, append "_processed" to the mirror directory name** ================================================ FILE: tasks/filesystem/standard/folder_structure/structure_mirror/meta.json ================================================ { "task_id": "structure_mirror", "task_name": "Structure Mirror", "category_id": "folder_structure", "category_name": "Folder Structure", "description": "Create an exact mirror copy of the folder structure in a target location while applying specified transformation rules.", "author": "Lingjun Chen", "created_at": "2025-08-08", "difficulty": "L3", "tags": [ "file organization", "content transformation" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "folder_structure/\n └── complex_structure/\n ├── deeply/\n │ └── nested/\n │ └── folder/\n │ └── structure/\n ├── empty_folder/\n ├── folder_lxkHt_0_1/\n │ └── file_PeLzC_0.txt\n ├── folder_QdTAj_0_2/\n │ ├── folder_eXccj_1_0/\n │ │ ├── folder_Mqlwh_2_1/\n │ │ │ ├── folder_cKxcP_3_3/\n │ │ │ │ ├── folder_BPTMK_4_1/\n │ │ │ │ │ └── file_RHtBP_0.txt\n │ │ │ │ ├── folder_QNqjq_4_0/\n │ │ │ │ │ ├── folder_gRwPE_5_1/\n │ │ │ │ │ │ ├── file_jVlpp_0.txt\n │ │ │ │ │ │ └── file_vJuHz_1.txt\n │ │ │ │ │ ├── folder_XdXYJ_5_0/\n │ │ │ │ │ │ └── file_KvkKi_0.txt\n │ │ │ │ │ ├── file_gGxLG_2.txt\n │ │ │ │ │ ├── file_Hzkxo_0.txt\n │ │ │ │ │ └── file_XRjeh_1.txt\n │ │ │ │ ├── folder_vIBIt_4_2/\n │ │ │ │ │ ├── folder_kRDNS_5_0/\n │ │ │ │ │ │ └── file_wFSjJ_0.txt\n │ │ │ │ │ └── file_NyBSO_0.txt\n │ │ │ │ ├── file_EOCNf_1.txt\n │ │ │ │ └── file_gmrXA_0.txt\n │ │ │ ├── folder_NcruA_3_1/\n │ │ │ │ ├── file_bLWDj_1.txt\n │ │ │ │ └── file_WAftR_0.txt\n │ │ │ ├── folder_qCDFI_3_2/\n │ │ │ │ ├── file_eSMOJ_0.txt\n │ │ │ │ ├── file_oxADy_2.txt\n │ │ │ │ └── file_RTbbc_1.txt\n │ │ │ ├── folder_QVHUU_3_0/\n │ │ │ │ ├── folder_FEPTK_4_1/\n │ │ │ │ │ ├── folder_GHoMC_5_1/\n │ │ │ │ │ │ └── file_rAMYd_0.txt\n │ │ │ │ │ ├── folder_iBDUY_5_0/\n │ │ │ │ │ │ └── file_IJCaw_0.txt\n │ │ │ │ │ ├── folder_VRXgp_5_2/\n │ │ │ │ │ │ └── file_hkUmS_0.txt\n │ │ │ │ │ ├── file_nqLAf_1.txt\n │ │ │ │ │ └── file_XflmA_0.txt\n │ │ │ │ ├── folder_FlPoK_4_3/\n │ │ │ │ │ ├── folder_hSVNm_5_3/\n │ │ │ │ │ │ └── file_klnbn_0.txt\n │ │ │ │ │ ├── folder_iZuEl_5_0/\n │ │ │ │ │ │ └── file_LqAmy_0.txt\n │ │ │ │ │ ├── folder_LcURj_5_2/\n │ │ │ │ │ │ ├── file_RgwOS_1.txt\n │ │ │ │ │ │ └── file_ZHnYb_0.txt\n │ │ │ │ │ ├── folder_tuZQJ_5_1/\n │ │ │ │ │ │ └── file_LHuIx_0.txt\n │ │ │ │ │ ├── file_asJnB_1.txt\n │ │ │ │ │ └── file_EzLdu_0.txt\n │ │ │ │ ├── folder_ndhsJ_4_0/\n │ │ │ │ │ ├── folder_CUSXK_5_0/\n │ │ │ │ │ │ ├── file_DpiuM_1.txt\n │ │ │ │ │ │ └── file_pSqeG_0.txt\n │ │ │ │ │ ├── folder_pstmE_5_1/\n │ │ │ │ │ │ └── file_YwdJt_0.txt\n │ │ │ │ │ ├── folder_StlsP_5_2/\n │ │ │ │ │ │ ├── file_kriBJ_0.txt\n │ │ │ │ │ │ └── file_XCEdm_1.txt\n │ │ │ │ │ ├── file_ToDjh_1.txt\n │ │ │ │ │ └── file_xbIVx_0.txt\n │ │ │ │ ├── folder_PJBok_4_4/\n │ │ │ │ │ ├── folder_mzxaf_5_0/\n │ │ │ │ │ │ ├── file_ILBzj_2.txt\n │ │ │ │ │ │ ├── file_MTGMm_1.txt\n │ │ │ │ │ │ └── file_zBDqz_0.txt\n │ │ │ │ │ ├── folder_sULMj_5_1/\n │ │ │ │ │ │ ├── file_BHziw_1.txt\n │ │ │ │ │ │ ├── file_sIjiu_2.txt\n │ │ │ │ │ │ └── file_VqNkB_0.txt\n │ │ │ │ │ ├── folder_vypSi_5_3/\n │ │ │ │ │ │ ├── file_kZbIm_1.txt\n │ │ │ │ │ │ └── file_sOBtE_0.txt\n │ │ │ │ │ ├── folder_ZLGHy_5_2/\n │ │ │ │ │ │ ├── file_azaFF_0.txt\n │ │ │ │ │ │ └── file_nAFRe_1.txt\n │ │ │ │ │ ├── file_mIkQU_0.txt\n │ │ │ │ │ └── file_sGPxd_1.txt\n │ │ │ │ ├── folder_VTbEG_4_2/\n 
│ │ │ │ │ ├── file_HtYLg_0.txt\n │ │ │ │ │ ├── file_JXjMd_1.txt\n │ │ │ │ │ └── file_tPccB_2.txt\n │ │ │ │ ├── file_BuOSw_1.txt\n │ │ │ │ └── file_TpoqE_0.txt\n │ │ │ ├── folder_wTvun_3_4/\n │ │ │ │ ├── file_GyhyE_1.txt\n │ │ │ │ ├── file_POsla_2.txt\n │ │ │ │ └── file_tSsvk_0.txt\n │ │ │ ├── file_irNju_0.txt\n │ │ │ └── file_jYBRm_1.txt\n │ │ ├── folder_YlJLI_2_0/\n │ │ │ └── file_FpFSL_0.txt\n │ │ ├── file_cFgBr_2.txt\n │ │ ├── file_lKEWN_1.txt\n │ │ └── file_ZEWFP_0.txt\n │ └── file_ayUCH_0.txt\n ├── folder_xtgyi_0_0/\n │ └── file_BvSOB_0.txt\n ├── mixed_content/\n │ └── images_and_text/\n │ └── notes.txt\n ├── project/\n │ ├── docs/\n │ │ └── archive/\n │ │ └── 2023/\n │ │ └── reports/\n │ │ ├── report_0.txt\n │ │ ├── report_1.txt\n │ │ └── report_2.txt\n │ └── src/\n │ └── main/\n │ └── resources/\n └── m.py", "stateUrl": "https://storage.mcpmark.ai/filesystem/folder_structure.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/folder_structure/structure_mirror/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Directory Structure Mirroring with Smart Placeholders Task """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_mirror_directory_exists(test_dir: Path, mirror_path: Path) -> bool: """Verify that a mirror directory exists.""" if not mirror_path.exists(): print(f"❌ Mirror directory not found: {mirror_path.relative_to(test_dir)}") return False if not mirror_path.is_dir(): print(f"❌ Mirror path exists but is not a directory: {mirror_path.relative_to(test_dir)}") return False print(f"✅ Mirror directory exists: {mirror_path.relative_to(test_dir)}") return True def verify_placeholder_file_exists(mirror_path: Path, test_dir: Path) -> bool: """Verify that placeholder.txt exists in the mirror directory.""" placeholder_file = mirror_path / "placeholder.txt" if not placeholder_file.exists(): print(f"❌ placeholder.txt not found in: {mirror_path.relative_to(test_dir)}") return False if not placeholder_file.is_file(): print(f"❌ placeholder.txt exists but is not a file in: {mirror_path.relative_to(test_dir)}") return False print(f"✅ placeholder.txt exists in: {mirror_path.relative_to(test_dir)}") return True def verify_placeholder_content(mirror_path: Path, test_dir: Path) -> bool: """Verify that placeholder.txt contains the correct path ending with complex_structure_mirror/...""" placeholder_file = mirror_path / "placeholder.txt" try: content = placeholder_file.read_text().strip() # Check if content is not empty if not content: print(f"❌ placeholder.txt is empty in: {mirror_path.relative_to(test_dir)}") return False # Check if it contains the correct path ending with complex_structure_mirror/... 
expected_ending = f"complex_structure_mirror/{mirror_path.relative_to(test_dir / 'complex_structure_mirror')}" if not content.endswith(expected_ending): print(f"❌ placeholder.txt content incorrect in: {mirror_path.relative_to(test_dir)}") print(f" Expected ending: {expected_ending}") print(f" Found: {content}") return False print(f"✅ placeholder.txt content is correct in: {mirror_path.relative_to(test_dir)}") return True except Exception as e: print(f"❌ Error reading placeholder.txt in {mirror_path.relative_to(test_dir)}: {e}") return False def verify_no_files_copied(test_dir: Path) -> bool: """Verify that no file contents were copied, only directory structure.""" source_dir = test_dir / "complex_structure" mirror_dir = test_dir / "complex_structure_mirror" if not mirror_dir.exists(): print("❌ Mirror directory 'complex_structure_mirror' not found") return False # Check that no files from source were copied (except placeholder.txt files) for source_file in source_dir.rglob("*"): if source_file.is_file(): # Calculate the corresponding mirror path relative_path = source_file.relative_to(source_dir) mirror_file = mirror_dir / relative_path # Skip if this would be a placeholder.txt file if mirror_file.name == "placeholder.txt": continue if mirror_file.exists(): print(f"❌ File was copied when it shouldn't be: {relative_path}") return False print("✅ No file contents were copied, only directory structure") return True def verify_mirror_structure_completeness(test_dir: Path) -> bool: """Verify that the mirror structure is complete and matches expected structure.""" mirror_dir = test_dir / "complex_structure_mirror" if not mirror_dir.exists(): print("❌ Mirror directory 'complex_structure_mirror' not found") return False # Define expected directories that should exist (based on backup structure) expected_dirs = [ "deeply", "deeply/nested", "deeply/nested/folder", "deeply/nested/folder/structure", "empty_folder", "folder_lxkHt_0_1_processed", "folder_QdTAj_0_2_processed", "folder_xtgyi_0_0_processed", "mixed_content", "mixed_content/images_and_text", "project", "project/docs", "project/docs/archive", "project/docs/archive/2023_processed", "project/src", "project/src/main", "project/src/main/resources" ] # Define which directories should have placeholder.txt files placeholder_dirs = [ "deeply/nested/folder/structure", "empty_folder", "folder_lxkHt_0_1_processed", "folder_QdTAj_0_2_processed", "folder_xtgyi_0_0_processed", "mixed_content/images_and_text", "project/docs/archive/2023_processed", "project/src/main/resources" ] all_passed = True # Check that all expected directories exist for expected_dir in expected_dirs: mirror_path = mirror_dir / expected_dir if not verify_mirror_directory_exists(test_dir, mirror_path): all_passed = False elif expected_dir in placeholder_dirs: # Check placeholder.txt for directories that should have it if not verify_placeholder_file_exists(mirror_path, test_dir): all_passed = False elif not verify_placeholder_content(mirror_path, test_dir): all_passed = False # Check that no unexpected directories exist for mirror_subdir in mirror_dir.rglob("*"): if mirror_subdir.is_dir(): relative_path = mirror_subdir.relative_to(mirror_dir) if str(relative_path) not in expected_dirs and str(relative_path) != ".": print(f"❌ Unexpected directory found: {relative_path}") all_passed = False return all_passed def main(): """Main verification function.""" try: test_dir = get_test_directory() print(f"🔍 Verifying Directory Structure Mirroring with Smart Placeholders in: {test_dir}") # Define 
verification steps verification_steps = [ ("No files copied", verify_no_files_copied), ("Mirror structure completeness", verify_mirror_structure_completeness), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n📋 Checking: {step_name}") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Directory structure mirroring completed correctly!") print("🎉 Structure Mirror verification: PASS") sys.exit(0) else: print("❌ Structure Mirror verification: FAIL") sys.exit(1) except Exception as e: print(f"❌ Verification failed with error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/legal_document/dispute_review/description.md ================================================ Please use FileSystem tools to finish the following task: **Overview** The folder "legal_files/" contains all versions (Preferred_Stock_Purchase_Agreement_v0.txt -- Preferred_Stock_Purchase_Agreement_v10.txt) of the Stock Purchase Agreement for a corporate investment project. There are comments in these files, which come from four people: - **Bill Harvey** (Company CEO) - **Michelle Jackson** (Investor) - **David Russel** (Company Counsel) - **Tony Taylor** (Investor Counsel) Between v1 and v9, these four people made comments on the clauses. The comment format is `[name:content]`, where: - `name` is the commenter's name - `content` is the revision note **Special Note:** If the name is "All parties", it represents a joint comment from all parties, which counts as one comment but does not count toward any individual's personal comment count. ## Task Your task is to review these versions and identify all clauses that have been commented on in **v5, v6, and v7 (in the folder legal_files/)**. Generate a file named `dispute_review.txt` in the main directory. In this file, list each commented clause on a separate line and indicate the number of comments for each clause in the format "Clause number:number of comments". The clause number should be in the format X.X.
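As a minimal sketch of the counting described above (assuming clause numbers such as `4.6` open the lines that introduce each clause, and that `[name:content]` markers appear inline in the clause text), one might tally the comments as shown below; the file-layout assumption and the regexes are illustrative only, not the benchmark's reference verifier.

```python
#!/usr/bin/env python3
"""Illustrative sketch: count clause comments in v5-v7 of the agreement."""
import re
from collections import Counter
from pathlib import Path

COMMENT_RE = re.compile(r"\[([^:\]]+):[^\]]*\]")   # one [name:content] marker
CLAUSE_RE = re.compile(r"^\s*(\d+\.\d+)\b")        # assumed: clause number opens the line

def count_clause_comments(legal_dir: Path, versions=(5, 6, 7)) -> Counter:
    counts: Counter = Counter()
    for v in versions:
        text = (legal_dir / f"Preferred_Stock_Purchase_Agreement_v{v}.txt").read_text()
        current = None
        for line in text.splitlines():
            m = CLAUSE_RE.match(line)
            if m:
                current = m.group(1)
            if current:
                # Every marker counts once, including joint "All parties" comments.
                counts[current] += len(COMMENT_RE.findall(line))
    return counts

if __name__ == "__main__":
    totals = count_clause_comments(Path("legal_files"))
    lines = [f"{clause}:{n}" for clause, n in sorted(totals.items()) if n > 0]
    Path("dispute_review.txt").write_text("\n".join(lines) + "\n")
```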
================================================ FILE: tasks/filesystem/standard/legal_document/dispute_review/meta.json ================================================ { "task_id": "dispute_review", "task_name": "Dispute Review", "category_id": "legal_document", "category_name": "Legal Document", "description": "Analyze multiple versions of legal documents to track clause discussion frequency and generate a comprehensive dispute summary report.", "author": "Lingjun Chen", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "data extraction", "cross-referencing", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "legal_document/\n └── legal_files/\n ├── Preferred_Stock_Purchase_Agreement_v0.txt\n ├── Preferred_Stock_Purchase_Agreement_v1.txt\n ├── Preferred_Stock_Purchase_Agreement_v2.txt\n ├── Preferred_Stock_Purchase_Agreement_v3.txt\n ├── Preferred_Stock_Purchase_Agreement_v4.txt\n ├── Preferred_Stock_Purchase_Agreement_v5.txt\n ├── Preferred_Stock_Purchase_Agreement_v6.txt\n ├── Preferred_Stock_Purchase_Agreement_v7.txt\n ├── Preferred_Stock_Purchase_Agreement_v8.txt\n ├── Preferred_Stock_Purchase_Agreement_v9.txt\n └── Preferred_Stock_Purchase_Agreement_v10.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/legal_document.zip", "stateOriginalUrl": "https://www.cooleygo.com/documents/nvca-financing-documents" } } ================================================ FILE: tasks/filesystem/standard/legal_document/dispute_review/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Legal Document Dispute Review Task """ import sys from pathlib import Path import re import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_output_file_exists(test_dir: Path) -> bool: """Verify that the dispute_review.txt file exists.""" output_file = test_dir / "dispute_review.txt" if not output_file.exists(): print("❌ File 'dispute_review.txt' not found") return False print("✅ Output file found") return True def verify_output_format(test_dir: Path) -> bool: """Verify that the output file has the correct format.""" output_file = test_dir / "dispute_review.txt" try: content = output_file.read_text().strip() # Check if content is not empty if not content: print("❌ Output file is empty") return False # Check format: each line should be "X.X:number" lines = content.split('\n') for i, line in enumerate(lines, 1): line = line.strip() if not line: continue # Check format: X.X:number if not re.match(r'^\d+\.\d+:\d+$', line): print(f"❌ Line {i} has incorrect format: '{line}'") print(" Expected format: 'X.X:number' (e.g., '1.1:3')") return False print("✅ Output format is correct") return True except Exception as e: print(f"❌ Error reading output file: {e}") return False def verify_expected_entries(test_dir: Path) -> bool: """Verify that the output contains the expected entries with correct counts.""" output_file = test_dir / "dispute_review.txt" try: content = output_file.read_text().strip() lines = content.split('\n') # Parse the output into a dictionary output_entries = {} for line in lines: line = line.strip() if not line: continue clause, count_str = line.split(':', 1) output_entries[clause] = int(count_str) # Expected entries based on answer.txt expected_entries = { "1.1": 3, 
"1.3": 3, "4.6": [5, 6], # Can be either 5 or 6 "4.16": 5, "6.8": 4 } # Check if all expected entries are present missing_entries = [] for clause in expected_entries: if clause not in output_entries: missing_entries.append(clause) if missing_entries: print(f"❌ Missing expected entries: {missing_entries}") return False # Check if there are extra entries extra_entries = [] for clause in output_entries: if clause not in expected_entries: extra_entries.append(clause) if extra_entries: print(f"❌ Unexpected extra entries: {extra_entries}") return False # Check counts for each entry for clause, expected_count in expected_entries.items(): actual_count = output_entries[clause] if isinstance(expected_count, list): # For 4.6, accept either 5 or 6 if actual_count not in expected_count: print(f"❌ Clause {clause}: expected {expected_count}, got {actual_count}") return False else: if actual_count != expected_count: print(f"❌ Clause {clause}: expected {expected_count}, got {actual_count}") return False print("✅ All expected entries with correct counts") return True except Exception as e: print(f"❌ Error verifying entries: {e}") return False def verify_comment_count_accuracy(test_dir: Path) -> bool: """Verify that the comment counts are accurate by checking the actual files.""" # Since we already verify the expected entries in verify_expected_entries, # and the answer.txt contains the correct counts, we can skip this complex verification # to avoid false negatives due to regex matching issues. print("✅ Comment count accuracy check skipped - relying on expected entries verification") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Legal Document Dispute Review Task...") # Define verification steps verification_steps = [ ("Output File Exists", verify_output_file_exists), ("Output Format", verify_output_format), ("Expected Entries", verify_expected_entries), ("Comment Count Accuracy", verify_comment_count_accuracy), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Legal document dispute review completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/legal_document/individual_comments/description.md ================================================ Please use FileSystem tools to finish the following task: **Overview** The folder "legal_files/" contains all versions (Preferred_Stock_Purchase_Agreement_v0.txt -- Preferred_Stock_Purchase_Agreement_v10.txt) of the Stock Purchase Agreement for a corporate investment project. There are comments in it, come from four people: - **Bill Harvey** (Company CEO) - **Michelle Jackson** (Investor) - **David Russel** (Company Counsel) - **Tony Taylor** (Investor Counsel) Between v1 and v9, these four people make comments on the clauses. The comment format is `[name:content]`, where: - `name` is the commenter's name - `content` is the revision note **Special Note:** If the name is "All parties", it represents a joint comment from all parties, which counts as one comment but does not count toward any individual's personal comment count. 
## Task Your task is to count the number of comments made by Bill Harvey (Company CEO), Michelle Jackson (Investor), David Russel (Company Counsel), and Tony Taylor (Investor Counsel) in clauses 1.1, 1.3, 4.6, 4.16, 6.8, and 6.16 **in version 5-8.** Please generate `individual_comment.csv` in the **main directory** where the first row contains these clauses (1.1, 1.3, 4.6, 4.16, 6.8, 6.16) and the first column contains the four names (Bill Harvey, Michelle Jackson, David Russel, Tony Taylor). Fill in the table with the number of comments for each person and each clause. If there are no comments, write 0. ================================================ FILE: tasks/filesystem/standard/legal_document/individual_comments/meta.json ================================================ { "task_id": "individual_comments", "task_name": "Individual Comments", "category_id": "legal_document", "category_name": "Legal Document", "description": "Extract and analyze individual reviewer comments on legal clauses across multiple document versions to understand personal perspectives.", "author": "Lingjun Chen", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "data extraction", "cross-referencing", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "legal_document/\n └── legal_files/\n ├── Preferred_Stock_Purchase_Agreement_v0.txt\n ├── Preferred_Stock_Purchase_Agreement_v1.txt\n ├── Preferred_Stock_Purchase_Agreement_v2.txt\n ├── Preferred_Stock_Purchase_Agreement_v3.txt\n ├── Preferred_Stock_Purchase_Agreement_v4.txt\n ├── Preferred_Stock_Purchase_Agreement_v5.txt\n ├── Preferred_Stock_Purchase_Agreement_v6.txt\n ├── Preferred_Stock_Purchase_Agreement_v7.txt\n ├── Preferred_Stock_Purchase_Agreement_v8.txt\n ├── Preferred_Stock_Purchase_Agreement_v9.txt\n └── Preferred_Stock_Purchase_Agreement_v10.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/legal_document.zip", "stateOriginalUrl": "https://www.cooleygo.com/documents/nvca-financing-documents" } } ================================================ FILE: tasks/filesystem/standard/legal_document/individual_comments/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Legal Document Individual Comments Task """ import sys from pathlib import Path import csv import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_output_file_exists(test_dir: Path) -> bool: """Verify that the individual_comment.csv file exists.""" output_file = test_dir / "individual_comment.csv" if not output_file.exists(): print("❌ File 'individual_comment.csv' not found") return False print("✅ Output file 'individual_comment.csv' found") return True def verify_csv_format(test_dir: Path) -> bool: """Verify that the CSV file has the correct format.""" output_file = test_dir / "individual_comment.csv" try: with open(output_file, 'r', newline='', encoding='utf-8') as csvfile: reader = csv.reader(csvfile) rows = list(reader) if not rows: print("❌ CSV file is empty") return False # Check if there are at least 2 rows (header + data) if len(rows) < 2: print("❌ CSV file has insufficient rows") return False # Check if header row has correct number of columns header = rows[0] if len(header) != 7: # First column (can be anything) + 6 clauses print(f"❌ Header row 
has incorrect number of columns: {len(header)}, expected 7") return False # Check if data rows have correct number of columns for i, row in enumerate(rows[1:], 1): if len(row) != 7: print(f"❌ Data row {i} has incorrect number of columns: {len(row)}, expected 7") return False print("✅ CSV format is correct") return True except Exception as e: print(f"❌ Error reading CSV file: {e}") return False def verify_csv_content(test_dir: Path) -> bool: """Verify that the CSV content matches the expected answer exactly.""" output_file = test_dir / "individual_comment.csv" try: with open(output_file, 'r', newline='', encoding='utf-8') as csvfile: reader = csv.reader(csvfile) rows = list(reader) # Expected data based on answer.csv expected_data = { "Bill Harvey": ["0", "2", "3", "1", "1", "1"], "Michelle Jackson": ["0", "1", "2", "1", "1", "1"], "David Russel": ["2", "1", "1", "2", "1", "1"], "Tony Taylor": ["2", "0", "1", "2", "1", "1"] } # Expected header columns (excluding first column which can be anything) expected_header_columns = ["1.1", "1.3", "4.6", "4.16", "6.8", "6.16"] # Verify header has correct number of columns header = rows[0] if len(header) != 7: # First column + 6 clauses print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 7") return False # Check if all expected clause columns are present (allow order to be different) # Allow first column to be anything, so we check columns 1-6 header_clauses = header[1:7] missing_clauses = [] for expected_clause in expected_header_columns: if expected_clause not in header_clauses: missing_clauses.append(expected_clause) if missing_clauses: print(f"❌ Missing expected clause columns: {missing_clauses}") return False # Check if there are extra clause columns extra_clauses = [] for clause in header_clauses: if clause not in expected_header_columns: extra_clauses.append(clause) if extra_clauses: print(f"❌ Unexpected extra clause columns: {extra_clauses}") return False # Create a mapping from expected clause order to actual column indices clause_mapping = {} for i, clause in enumerate(header_clauses): if clause in expected_header_columns: clause_mapping[clause] = i # Parse the CSV data into a dictionary with correct column mapping csv_data = {} for row in rows[1:]: if len(row) >= 7: name = row[0] # Map values according to the expected clause order values = [] for expected_clause in expected_header_columns: col_index = clause_mapping[expected_clause] + 1 # +1 because we skip first column values.append(row[col_index]) csv_data[name] = values # Check if all expected names are present missing_names = [] for expected_name in expected_data: if expected_name not in csv_data: missing_names.append(expected_name) if missing_names: print(f"❌ Missing expected names: {missing_names}") return False # Check if there are extra names extra_names = [] for name in csv_data: if name not in expected_data: extra_names.append(name) if extra_names: print(f"❌ Unexpected extra names: {extra_names}") return False # Check values for each person for name, expected_values in expected_data.items(): actual_values = csv_data[name] if actual_values != expected_values: print(f"❌ Values mismatch for {name}:") print(f" Expected: {expected_values}") print(f" Got: {actual_values}") return False print("✅ CSV content matches expected answer exactly") return True except Exception as e: print(f"❌ Error verifying CSV content: {e}") return False def verify_data_accuracy(test_dir: Path) -> bool: """Verify that the data values are accurate (all values are non-negative 
integers).""" output_file = test_dir / "individual_comment.csv" try: with open(output_file, 'r', newline='', encoding='utf-8') as csvfile: reader = csv.reader(csvfile) rows = list(reader) # Skip header row for i, row in enumerate(rows[1:], 1): if len(row) >= 7: name = row[0] values = row[1:7] for j, value in enumerate(values, 1): try: int_val = int(value) if int_val < 0: print(f"❌ Row {i}, column {j}: negative value '{value}' for {name}") return False except ValueError: print(f"❌ Row {i}, column {j}: non-integer value '{value}' for {name}") return False print("✅ All data values are valid non-negative integers") return True except Exception as e: print(f"❌ Error verifying data accuracy: {e}") return False def verify_file_location(test_dir: Path) -> bool: """Verify that the file is in the main directory (not in a subdirectory).""" output_file = test_dir / "individual_comment.csv" if output_file.exists(): print("✅ File is located in the main directory") return True else: print("❌ File is not in the main directory") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Legal Document Individual Comments Task...") # Define verification steps verification_steps = [ ("Output File Exists", verify_output_file_exists), ("CSV Format", verify_csv_format), ("CSV Content", verify_csv_content), ("Data Accuracy", verify_data_accuracy), ("File Location", verify_file_location), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Legal document individual comments task completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/legal_document/solution_tracing/description.md ================================================ Please use FileSystem tools to finish the following task: ### Overview The folder "legal_files/" contains all versions (Preferred_Stock_Purchase_Agreement_v0.txt -- Preferred_Stock_Purchase_Agreement_v10.txt) of the Stock Purchase Agreement for a corporate investment project. There are comments in it, come from four people: - **Bill Harvey** (Company CEO) - **Michelle Jackson** (Investor) - **David Russel** (Company Counsel) - **Tony Taylor** (Investor Counsel) Between v1 and v9, these four people make comments on the clauses. The comment format is `[name:content]`, where: - `name` is the commenter's name - `content` is the revision note **Special Note:** If the name is "All parties", it represents a joint comment from all parties, which counts as one comment but does not count toward any individual's personal comment count. ### Task Description **Your task is to focus on clauses 4.6, 4.16, 6.8, and 6.16 in v5-9** to determine: 1. Who first proposed the idea that eventually led to the final agreed solution 2. In which version's comment it appeared **Important:** If the final solution was formed through multiple people's comments, count as the originator the person whose comment first provided the core motivation (or part of the idea) that shaped the final solution. The key is to identify who initially proposed the motivation for the final solution. 
### Output Requirements **File Name:** `tracing.csv` (must be placed in the main directory) **CSV Structure:** - **First row** (excluding the top-left cell): `4.6, 4.16, 6.8, 6.16` - **First column** (excluding the top-left cell): `version_number, name` - **Remaining cells:** Fill in the `version_number` (the version in which the final solution was first proposed, only write a number without any other things) and the `name` (the person who proposed it) for each clause ================================================ FILE: tasks/filesystem/standard/legal_document/solution_tracing/meta.json ================================================ { "task_id": "solution_tracing", "task_name": "Solution Tracing", "category_id": "legal_document", "category_name": "Legal Document", "description": "Trace the evolution of clause resolutions across document versions to identify who first proposed each final accepted solution.", "author": "Lingjun Chen", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "cross-referencing", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "legal_document/\n └── legal_files/\n ├── Preferred_Stock_Purchase_Agreement_v0.txt\n ├── Preferred_Stock_Purchase_Agreement_v1.txt\n ├── Preferred_Stock_Purchase_Agreement_v2.txt\n ├── Preferred_Stock_Purchase_Agreement_v3.txt\n ├── Preferred_Stock_Purchase_Agreement_v4.txt\n ├── Preferred_Stock_Purchase_Agreement_v5.txt\n ├── Preferred_Stock_Purchase_Agreement_v6.txt\n ├── Preferred_Stock_Purchase_Agreement_v7.txt\n ├── Preferred_Stock_Purchase_Agreement_v8.txt\n ├── Preferred_Stock_Purchase_Agreement_v9.txt\n └── Preferred_Stock_Purchase_Agreement_v10.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/legal_document.zip", "stateOriginalUrl": "https://www.cooleygo.com/documents/nvca-financing-documents" } } ================================================ FILE: tasks/filesystem/standard/legal_document/solution_tracing/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Legal Document Solution Tracing Task """ import sys from pathlib import Path import csv import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_output_file_exists(test_dir: Path) -> bool: """Verify that the tracing.csv file exists.""" output_file = test_dir / "tracing.csv" if not output_file.exists(): print("❌ File 'tracing.csv' not found") return False print("✅ Output file 'tracing.csv' found") return True def verify_csv_format(test_dir: Path) -> bool: """Verify that the CSV file has the correct format.""" output_file = test_dir / "tracing.csv" try: with open(output_file, 'r', newline='', encoding='utf-8') as csvfile: reader = csv.reader(csvfile) rows = list(reader) if not rows: print("❌ CSV file is empty") return False # Check if there are at least 2 rows (header + data) if len(rows) < 2: print("❌ CSV file has insufficient rows") return False # Check if header row has correct number of columns header = rows[0] if len(header) != 5: # First column (can be anything) + 4 clauses print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 5") return False # Check if data rows have correct number of columns for i, row in enumerate(rows[1:], 1): if len(row) != 5: print(f"❌ Data row {i} has incorrect number of columns: 
{len(row)}, expected 5") return False print("✅ CSV format is correct") return True except Exception as e: print(f"❌ Error reading CSV file: {e}") return False def verify_csv_content(test_dir: Path) -> bool: """Verify that the CSV content matches the expected answer exactly.""" output_file = test_dir / "tracing.csv" try: with open(output_file, 'r', newline='', encoding='utf-8') as csvfile: reader = csv.reader(csvfile) rows = list(reader) # Expected data based on answer.csv expected_data = { "version_number": ["5", "6", "7", "8"], "name": ["Bill Harvey", "Michelle Jackson", "Michelle Jackson", "Tony Taylor"] } # Expected header columns (excluding first column which can be anything) expected_header_columns = ["4.6", "4.16", "6.8", "6.16"] # Verify header has correct number of columns header = rows[0] if len(header) != 5: # First column + 4 clauses print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 5") return False # Check if all expected clause columns are present (allow order to be different) # Allow first column to be anything, so we check columns 1-4 header_clauses = header[1:5] missing_clauses = [] for expected_clause in expected_header_columns: if expected_clause not in header_clauses: missing_clauses.append(expected_clause) if missing_clauses: print(f"❌ Missing expected clause columns: {missing_clauses}") return False # Check if there are extra clause columns extra_clauses = [] for clause in header_clauses: if clause not in expected_header_columns: extra_clauses.append(clause) if extra_clauses: print(f"❌ Unexpected extra clause columns: {extra_clauses}") return False # Create a mapping from expected clause order to actual column indices clause_mapping = {} for i, clause in enumerate(header_clauses): if clause in expected_header_columns: clause_mapping[clause] = i # Parse the CSV data into a dictionary with correct column mapping csv_data = {} for row in rows[1:]: if len(row) >= 5: row_type = row[0] # version_number or name # Map values according to the expected clause order values = [] for expected_clause in expected_header_columns: col_index = clause_mapping[expected_clause] + 1 # +1 because we skip first column values.append(row[col_index]) csv_data[row_type] = values # Check if all expected row types are present missing_types = [] for expected_type in expected_data: if expected_type not in csv_data: missing_types.append(expected_type) if missing_types: print(f"❌ Missing expected row types: {missing_types}") return False # Check if there are extra row types extra_types = [] for row_type in csv_data: if row_type not in expected_data: extra_types.append(row_type) if extra_types: print(f"❌ Unexpected extra row types: {extra_types}") return False # Check values for each row type for row_type, expected_values in expected_data.items(): actual_values = csv_data[row_type] if actual_values != expected_values: print(f"❌ Values mismatch for {row_type}:") print(f" Expected: {expected_values}") print(f" Got: {actual_values}") return False print("✅ CSV content matches expected answer exactly") return True except Exception as e: print(f"❌ Error verifying CSV content: {e}") return False def verify_data_accuracy(test_dir: Path) -> bool: """Verify that the data values are accurate.""" output_file = test_dir / "tracing.csv" try: with open(output_file, 'r', newline='', encoding='utf-8') as csvfile: reader = csv.reader(csvfile) rows = list(reader) # Skip header row for i, row in enumerate(rows[1:], 1): if len(row) >= 5: row_type = row[0] values = row[1:5] # Check version_number 
row if row_type == "version_number": for j, value in enumerate(values, 1): try: int_val = int(value) if int_val < 5 or int_val > 8: print(f"❌ Row {i}, column {j}: version number '{value}' is out of expected range [5-8]") return False except ValueError: print(f"❌ Row {i}, column {j}: non-integer version number '{value}'") return False # Check name row elif row_type == "name": expected_names = ["Bill Harvey", "Michelle Jackson", "Michelle Jackson", "Tony Taylor"] for j, value in enumerate(values, 1): if value not in expected_names: print(f"❌ Row {i}, column {j}: unexpected name '{value}'") return False print("✅ All data values are accurate") return True except Exception as e: print(f"❌ Error verifying data accuracy: {e}") return False def verify_file_location(test_dir: Path) -> bool: """Verify that the file is in the main directory (not in a subdirectory).""" output_file = test_dir / "tracing.csv" if output_file.exists(): print("✅ File is located in the main directory") return True else: print("❌ File is not in the main directory") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Legal Document Solution Tracing Task...") # Define verification steps verification_steps = [ ("Output File Exists", verify_output_file_exists), ("CSV Format", verify_csv_format), ("CSV Content", verify_csv_content), ("Data Accuracy", verify_data_accuracy), ("File Location", verify_file_location), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Legal document solution tracing task completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/papers/author_folders/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description You are given a directory containing multiple paper files. You have a collection of academic papers in HTML format from arXiv. Your task is to analyze these papers, identify authors who have published multiple papers, and organize them into author-specific folders based on specified criteria. ### Task Objectives #### Part 1: Frequent Authors (≥4 papers) 1. **Extract author information** from all HTML papers in the given directory 2. **Identify authors** who appear in 4 or more papers 3. **Create a directory** `frequent_authors` 4. **Create individual folders** within this directory for each frequent author (lowercase names with underscores) 5. **Copy their papers** to their respective folders #### Part 2: Prolific 2025 Authors (≥3 papers) 1. **Extract publication dates** along with author information 2. **Identify authors** who published 3 or more papers in 2025 3. **Create a directory** `2025_authors` for 2025 authors 4. **Create individual folders** within this directory for each prolific 2025 author (lowercase names with underscores) 5. **Copy their 2025 papers** to their respective folders ### Expected Output #### Directory Structure: ``` [given_task_folder]/ ├── [original HTML files remain untouched] ├── frequent_authors/ # Authors with ≥4 papers total │ ├── smith_john/ │ │ └── [copied papers] │ ├── johnson_sarah/ │ │ └── [copied papers] │ └── ... 
└── 2025_authors/ # Authors with ≥3 papers in 2025 ├── williams_david/ │ └── [copied 2025 papers] ├── brown_emily/ │ └── [copied 2025 papers] └── ... ``` #### Requirements: - Author folder names should be **lowercase** with underscores replacing spaces/commas (e.g., `smith_john`, `williams_david`) - Papers should be **copied** (not moved) to preserve originals - Author extraction should handle various name formats correctly ================================================ FILE: tasks/filesystem/standard/papers/author_folders/meta.json ================================================ { "task_id": "author_folders", "task_name": "Author Folders", "category_id": "papers", "category_name": "Papers", "description": "Analyze academic papers to identify and organize by author, creating separate folders for frequent authors (≥4 papers) and prolific 2025 authors (≥3 papers).", "author": "Xiangyan Liu", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "data extraction", "file organization", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "papers/\n ├── 1707.06347.html\n ├── 2105.04165.html\n ├── 2201.11903.html\n ├── 2303.08774.html\n ├── 2306.08640.html\n ├── 2310.02255.html\n ├── 2310.08446.html\n ├── 2312.00849.html\n ├── 2312.07533.html\n ├── 2312.11805.html\n ├── 2402.00253.html\n ├── 2402.03300.html\n ├── 2403.05530.html\n ├── 2404.13046.html\n ├── 2404.14367.html\n ├── 2404.14396.html\n ├── 2405.09818.html\n ├── 2405.13911.html\n ├── 2405.16473.html\n ├── 2405.16640.html\n ├── 2406.08478.html\n ├── 2406.16852.html\n ├── 2406.17294.html\n ├── 2407.01284.html\n ├── 2407.01509.html\n ├── 2407.21783.html\n ├── 2408.03326.html\n ├── 2408.12528.html\n ├── 2409.19256.html\n ├── 2410.05993.html\n ├── 2410.06166.html\n ├── 2410.10563.html\n ├── 2410.13848.html\n ├── 2410.17885.html\n ├── 2410.21276.html\n ├── 2411.07975.html\n ├── 2411.10442.html\n ├── 2411.11930.html\n ├── 2411.14432.html\n ├── 2412.05271.html\n ├── 2412.08443.html\n ├── 2412.10302.html\n ├── 2412.15115.html\n ├── 2412.16720.html\n ├── 2412.17256.html\n ├── 2412.18319.html\n ├── 2412.20631.html\n ├── 2501.04686.html\n ├── 2501.06186.html\n ├── 2501.12599.html\n ├── 2501.12948.html\n ├── 2501.17811.html\n ├── 2502.01456.html\n ├── 2502.09621.html\n ├── 2502.10391.html\n ├── 2502.13923.html\n ├── 2503.01785.html\n ├── 2503.06520.html\n ├── 2503.06749.html\n ├── 2503.07065.html\n ├── 2503.07365.html\n ├── 2503.07536.html\n ├── 2503.10291.html\n ├── 2503.10615.html\n ├── 2503.12937.html\n ├── 2503.13939.html\n ├── 2503.14476.html\n ├── 2503.17352.html\n ├── 2503.18892.html\n ├── 2503.19786.html\n ├── 2503.20783.html\n ├── 2503.21620.html\n ├── 2503.21776.html\n ├── 2503.22679.html\n ├── 2504.02587.html\n ├── 2504.05599.html\n ├── 2504.07491.html\n ├── 2504.07934.html\n ├── 2504.07954.html\n ├── 2504.11455.html\n ├── 2504.14945.html\n ├── 2504.16656.html\n ├── 2505.00703.html\n └── arxiv_2025.bib", "stateUrl": "https://storage.mcpmark.ai/filesystem/papers.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/papers/author_folders/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Paper Organization Task: Author-Based Paper Categorization """ import sys from pathlib import Path import os import re from typing import Dict, List, Set from html.parser import HTMLParser from datetime import datetime def get_test_directory() -> Path: """Get the test directory from 
FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) class ArxivHTMLParser(HTMLParser): """Parser to extract author and date information from arXiv HTML papers.""" def __init__(self): super().__init__() self.authors = [] self.publication_date = None def handle_starttag(self, tag, attrs): # Look for author metadata tags if tag == 'meta': attr_dict = dict(attrs) if attr_dict.get('name') == 'citation_author': content = attr_dict.get('content', '') if content: self.authors.append(content) elif attr_dict.get('name') in ['citation_date', 'citation_online_date']: content = attr_dict.get('content', '') if content and not self.publication_date: self.publication_date = content def extract_paper_info(html_file: Path) -> tuple[List[str], str]: """Extract authors and publication year from an HTML paper.""" try: with open(html_file, 'r', encoding='utf-8', errors='ignore') as f: content = f.read() parser = ArxivHTMLParser() parser.feed(content) # Extract year from date if available year = None if parser.publication_date: # Parse year from date string (e.g., "2025/03/13") year_match = re.search(r'(\d{4})', parser.publication_date) if year_match: year = year_match.group(1) return parser.authors, year except Exception as e: print(f"Warning: Could not parse {html_file.name}: {e}") return [], None def normalize_author_name(author: str) -> str: """Normalize author name to lowercase with underscores.""" # Author names are in "Last, First Middle" format # We need to convert to "first_last" format # Remove any HTML entities or special characters that shouldn't be there author = author.strip() # Split by comma to separate last and first names parts = author.split(',', 1) if len(parts) == 2: last_name = parts[0].strip() first_names = parts[1].strip() # Take only the first name (not middle names) first_name_parts = first_names.split() if first_name_parts: first_name = first_name_parts[0] # Format as "first_last" normalized = f"{first_name}_{last_name}" else: normalized = last_name else: # If no comma, use as is normalized = author # Convert to lowercase and replace spaces/special chars with underscores normalized = re.sub(r'[^\w\s-]', '', normalized) normalized = re.sub(r'[\s-]+', '_', normalized) return normalized.lower() def verify_directories_exist(test_dir: Path) -> bool: """Verify that required directories exist.""" frequent_authors_dir = test_dir / "frequent_authors" authors_2025_dir = test_dir / "2025_authors" if not frequent_authors_dir.exists(): print("❌ 'frequent_authors' directory not found") return False if not authors_2025_dir.exists(): print("❌ '2025_authors' directory not found") return False if not frequent_authors_dir.is_dir(): print("❌ 'frequent_authors' exists but is not a directory") return False if not authors_2025_dir.is_dir(): print("❌ '2025_authors' exists but is not a directory") return False print("✅ Both required directories exist") return True def analyze_papers(test_dir: Path) -> tuple[Dict[str, List[Path]], Dict[str, List[Path]]]: """Analyze all HTML papers and return author-paper mappings.""" author_papers = {} # author -> list of papers author_2025_papers = {} # author -> list of 2025 papers # Find all HTML files html_files = list(test_dir.glob("*.html")) for html_file in html_files: authors, year = extract_paper_info(html_file) for author in authors: if not author: continue normalized_name = normalize_author_name(author) if not 
normalized_name: continue # Track all papers by author if normalized_name not in author_papers: author_papers[normalized_name] = [] author_papers[normalized_name].append(html_file) # Track 2025 papers if year == '2025': if normalized_name not in author_2025_papers: author_2025_papers[normalized_name] = [] author_2025_papers[normalized_name].append(html_file) return author_papers, author_2025_papers def verify_frequent_authors(test_dir: Path, author_papers: Dict[str, List[Path]]) -> bool: """Verify that authors with ≥4 papers have their folders and papers.""" frequent_authors_dir = test_dir / "frequent_authors" # Find authors with 4 or more papers frequent_authors = {author: papers for author, papers in author_papers.items() if len(papers) >= 4} if not frequent_authors: print("⚠️ No authors found with 4 or more papers") # This might be expected depending on the test data return True all_correct = True for author, expected_papers in frequent_authors.items(): author_dir = frequent_authors_dir / author # Check if author directory exists if not author_dir.exists(): print(f"❌ Missing directory for frequent author: {author}") all_correct = False continue # Check if all expected papers are present for paper in expected_papers: paper_copy = author_dir / paper.name if not paper_copy.exists(): print(f"❌ Missing paper {paper.name} in {author} directory") all_correct = False # Check for unexpected directories for item in frequent_authors_dir.iterdir(): if item.is_dir(): dir_name = item.name if dir_name not in frequent_authors: # Check if this author has less than 4 papers if dir_name in author_papers and len(author_papers[dir_name]) < 4: print(f"❌ Author {dir_name} has only {len(author_papers[dir_name])} papers but has a folder in frequent_authors") all_correct = False if all_correct: print(f"✅ Frequent authors correctly organized ({len(frequent_authors)} authors)") return all_correct def verify_2025_authors(test_dir: Path, author_2025_papers: Dict[str, List[Path]]) -> bool: """Verify that authors with ≥3 papers in 2025 have their folders and papers.""" authors_2025_dir = test_dir / "2025_authors" # Find authors with 3 or more papers in 2025 prolific_2025_authors = {author: papers for author, papers in author_2025_papers.items() if len(papers) >= 3} if not prolific_2025_authors: print("⚠️ No authors found with 3 or more papers in 2025") # This might be expected depending on the test data return True all_correct = True for author, expected_papers in prolific_2025_authors.items(): author_dir = authors_2025_dir / author # Check if author directory exists if not author_dir.exists(): print(f"❌ Missing directory for 2025 author: {author}") all_correct = False continue # Check if all expected 2025 papers are present for paper in expected_papers: paper_copy = author_dir / paper.name if not paper_copy.exists(): print(f"❌ Missing 2025 paper {paper.name} in {author} directory") all_correct = False # Check for unexpected directories for item in authors_2025_dir.iterdir(): if item.is_dir(): dir_name = item.name if dir_name not in prolific_2025_authors: # Check if this author has less than 3 papers in 2025 if dir_name in author_2025_papers and len(author_2025_papers[dir_name]) < 3: print(f"❌ Author {dir_name} has only {len(author_2025_papers[dir_name])} papers in 2025 but has a folder in 2025_authors") all_correct = False if all_correct: print(f"✅ 2025 authors correctly organized ({len(prolific_2025_authors)} authors)") return all_correct def verify_original_files_intact(test_dir: Path) -> bool: """Verify that 
original HTML files are still present (not moved).""" html_files = list(test_dir.glob("*.html")) if not html_files: print("❌ No original HTML files found in root directory") return False print(f"✅ Original HTML files remain intact ({len(html_files)} files)") return True def verify_naming_convention(test_dir: Path) -> bool: """Verify that author folder names follow the correct naming convention.""" frequent_authors_dir = test_dir / "frequent_authors" authors_2025_dir = test_dir / "2025_authors" all_correct = True # Check frequent_authors subdirectories for author_dir in frequent_authors_dir.iterdir(): if author_dir.is_dir(): name = author_dir.name # Check for lowercase and underscores only if not re.match(r'^[a-z0-9_]+$', name): print(f"❌ Invalid folder name in frequent_authors: {name} (should be lowercase with underscores)") all_correct = False # Check 2025_authors subdirectories for author_dir in authors_2025_dir.iterdir(): if author_dir.is_dir(): name = author_dir.name # Check for lowercase and underscores only if not re.match(r'^[a-z0-9_]+$', name): print(f"❌ Invalid folder name in 2025_authors: {name} (should be lowercase with underscores)") all_correct = False if all_correct: print("✅ All author folder names follow correct naming convention") return all_correct def main(): """Main verification function.""" try: test_dir = get_test_directory() print(f"🔍 Verifying paper organization in: {test_dir}") # Analyze papers first print("\n📊 Analyzing papers...") author_papers, author_2025_papers = analyze_papers(test_dir) # Run verification checks checks = [ ("Directory existence", lambda: verify_directories_exist(test_dir)), ("Original files intact", lambda: verify_original_files_intact(test_dir)), ("Frequent authors organization", lambda: verify_frequent_authors(test_dir, author_papers)), ("2025 authors organization", lambda: verify_2025_authors(test_dir, author_2025_papers)), ("Naming conventions", lambda: verify_naming_convention(test_dir)) ] all_passed = True for check_name, check_func in checks: print(f"\n📋 Checking: {check_name}") if not check_func(): all_passed = False if all_passed: print("\n🎉 All verification checks passed!") sys.exit(0) else: print("\n❌ Some verification checks failed!") sys.exit(1) except Exception as e: print(f"❌ Verification failed with error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/papers/find_math_paper/description.md ================================================ Please use FileSystem tools to finish the following task: You are given a directory containing multiple paper files. Please help me find a math-related benchmark paper. I don’t remember its name, but I remember it not only checks whether the answer is correct, but also analyzes whether the model suffers from insufficient knowledge, lacks generalization ability, or relies on rote memorization. After finding this paper, rename its corresponding HTML file to `answer.html`. 
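As a reading aid only (the task itself should be solved with the FileSystem MCP tools), here is a minimal local-Python sketch of the search-and-rename logic. The keyword list and the scoring heuristic are assumptions for illustration, not part of the task specification.

```python
#!/usr/bin/env python3
"""Illustrative sketch: score each HTML paper by assumed keywords, rename the best match."""
from pathlib import Path

# Assumed heuristic keywords; the real paper must be identified by actually reading the papers.
KEYWORDS = ["math", "insufficient knowledge", "generalization", "rote memorization"]

def find_and_rename(papers_dir: Path) -> None:
    best_file, best_score = None, 0
    for html_file in papers_dir.glob("*.html"):
        text = html_file.read_text(encoding="utf-8", errors="ignore").lower()
        score = sum(text.count(k) for k in KEYWORDS)
        if score > best_score:
            best_file, best_score = html_file, score
    if best_file is not None:
        # Rename in place; only the filename changes, the content is untouched.
        best_file.rename(best_file.with_name("answer.html"))

if __name__ == "__main__":
    find_and_rename(Path("papers"))  # assumed local copy of the papers directory
```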
================================================ FILE: tasks/filesystem/standard/papers/find_math_paper/meta.json ================================================ { "task_id": "find_math_paper", "task_name": "Find Math Paper", "category_id": "papers", "category_name": "Papers", "description": "Search through academic papers to identify and locate mathematics-related content that satisfies specific mathematical criteria and research requirements.", "author": "Xiangyan Liu", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "pattern analysis", "data extraction" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "papers/\n ├── 1707.06347.html\n ├── 2105.04165.html\n ├── 2201.11903.html\n ├── 2303.08774.html\n ├── 2306.08640.html\n ├── 2310.02255.html\n ├── 2310.08446.html\n ├── 2312.00849.html\n ├── 2312.07533.html\n ├── 2312.11805.html\n ├── 2402.00253.html\n ├── 2402.03300.html\n ├── 2403.05530.html\n ├── 2404.13046.html\n ├── 2404.14367.html\n ├── 2404.14396.html\n ├── 2405.09818.html\n ├── 2405.13911.html\n ├── 2405.16473.html\n ├── 2405.16640.html\n ├── 2406.08478.html\n ├── 2406.16852.html\n ├── 2406.17294.html\n ├── 2407.01284.html\n ├── 2407.01509.html\n ├── 2407.21783.html\n ├── 2408.03326.html\n ├── 2408.12528.html\n ├── 2409.19256.html\n ├── 2410.05993.html\n ├── 2410.06166.html\n ├── 2410.10563.html\n ├── 2410.13848.html\n ├── 2410.17885.html\n ├── 2410.21276.html\n ├── 2411.07975.html\n ├── 2411.10442.html\n ├── 2411.11930.html\n ├── 2411.14432.html\n ├── 2412.05271.html\n ├── 2412.08443.html\n ├── 2412.10302.html\n ├── 2412.15115.html\n ├── 2412.16720.html\n ├── 2412.17256.html\n ├── 2412.18319.html\n ├── 2412.20631.html\n ├── 2501.04686.html\n ├── 2501.06186.html\n ├── 2501.12599.html\n ├── 2501.12948.html\n ├── 2501.17811.html\n ├── 2502.01456.html\n ├── 2502.09621.html\n ├── 2502.10391.html\n ├── 2502.13923.html\n ├── 2503.01785.html\n ├── 2503.06520.html\n ├── 2503.06749.html\n ├── 2503.07065.html\n ├── 2503.07365.html\n ├── 2503.07536.html\n ├── 2503.10291.html\n ├── 2503.10615.html\n ├── 2503.12937.html\n ├── 2503.13939.html\n ├── 2503.14476.html\n ├── 2503.17352.html\n ├── 2503.18892.html\n ├── 2503.19786.html\n ├── 2503.20783.html\n ├── 2503.21620.html\n ├── 2503.21776.html\n ├── 2503.22679.html\n ├── 2504.02587.html\n ├── 2504.05599.html\n ├── 2504.07491.html\n ├── 2504.07934.html\n ├── 2504.07954.html\n ├── 2504.11455.html\n ├── 2504.14945.html\n ├── 2504.16656.html\n ├── 2505.00703.html\n └── arxiv_2025.bib", "stateUrl": "https://storage.mcpmark.ai/filesystem/papers.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/papers/find_math_paper/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Find Math Paper Task """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_answer_file_exists(test_dir: Path) -> bool: """Verify that answer.html exists in the papers directory.""" answer_file = test_dir / "answer.html" if not answer_file.exists(): print("❌ File 'answer.html' not found") return False print("✅ answer.html found") return True def verify_original_file_removed(test_dir: Path) -> bool: """Verify that the original file (2407.01284.html) no longer 
exists.""" original_file = test_dir / "2407.01284.html" if original_file.exists(): print("❌ Original file 2407.01284.html still exists") return False print("✅ Original file has been renamed") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Find Math Paper Task...") # Define verification steps verification_steps = [ ("Answer File Exists", verify_answer_file_exists), ("Original File Renamed", verify_original_file_removed), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Paper correctly renamed to answer.html!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/papers/organize_legacy_papers/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description You are given a directory containing multiple paper files. You have a collection of arXiv papers saved as HTML files in the papers directory, along with a BibTeX file. Your task is to organize the older papers (2023 and earlier) into a structured year-based hierarchy with proper documentation, while leaving newer papers in the original location. ### Task Objectives 1. **Organize by year**: Create a year-based directory structure for papers from 2023 and earlier 2. **Generate documentation**: Create INDEX.md files for each year with paper metadata 3. **Create summary**: Build a master SUMMARY.md file linking to all year indexes ### Detailed Requirements #### Step 1: Organization - Create directory structure: `organized/{year}/` where year is extracted from the arXiv ID - Example: `1707.06347.html` → `organized/2017/1707.06347.html` - Move each HTML file from 2023 and earlier to its corresponding year folder, keeping original filenames - Papers from 2024 onwards (arXiv IDs starting with `24` or `25`) should remain in the original papers directory #### Step 2: Year Index Files For each year folder, create an `INDEX.md` file containing: - A markdown table with three columns: `ArXiv ID | Authors | Local Path` - Extract authors from `<meta name="citation_author" content="..."/>` tags, keeping only the first 3 authors - If there are more than 3 authors, list the first 3 followed by "et al." - Format authors as: "Author1, Author2, Author3" or "Author1, Author2, Author3, et al." 
- Local Path should be just the filename (e.g., `1707.06347.html`) - Sort entries by arXiv ID in ascending order #### Step 3: Master Summary Create `organized/SUMMARY.md` with: - A markdown table with columns: `Year | Paper Count | Index Link` - Index Link should be a relative markdown link (e.g., `[View Index](2017/INDEX.md)`) - Sort by year in ascending order ### Expected Output Structure ``` papers/ ├── arxiv_2025.bib (remains here) ├── (2024+ HTML files remain here) └── organized/ ├── SUMMARY.md ├── 2017/ │ ├── INDEX.md │ └── 1707.06347.html ├── 2021/ │ ├── INDEX.md │ └── 2105.04165.html ├── 2022/ │ ├── INDEX.md │ └── 2201.11903.html └── 2023/ ├── INDEX.md ├── 2303.08774.html ├── 2306.08640.html ├── 2310.02255.html ├── 2310.08446.html ├── 2312.00849.html ├── 2312.07533.html └── 2312.11805.html ``` ================================================ FILE: tasks/filesystem/standard/papers/organize_legacy_papers/meta.json ================================================ { "task_id": "organize_legacy_papers", "task_name": "Organize Legacy Papers", "category_id": "papers", "category_name": "Papers", "description": "Structure and organize older academic papers from 2023 and earlier into a year-based hierarchical directory system with proper documentation.", "author": "Xiangyan Liu", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "file organization", "data extraction", "cross-referencing" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "papers/\n ├── 1707.06347.html\n ├── 2105.04165.html\n ├── 2201.11903.html\n ├── 2303.08774.html\n ├── 2306.08640.html\n ├── 2310.02255.html\n ├── 2310.08446.html\n ├── 2312.00849.html\n ├── 2312.07533.html\n ├── 2312.11805.html\n ├── 2402.00253.html\n ├── 2402.03300.html\n ├── 2403.05530.html\n ├── 2404.13046.html\n ├── 2404.14367.html\n ├── 2404.14396.html\n ├── 2405.09818.html\n ├── 2405.13911.html\n ├── 2405.16473.html\n ├── 2405.16640.html\n ├── 2406.08478.html\n ├── 2406.16852.html\n ├── 2406.17294.html\n ├── 2407.01284.html\n ├── 2407.01509.html\n ├── 2407.21783.html\n ├── 2408.03326.html\n ├── 2408.12528.html\n ├── 2409.19256.html\n ├── 2410.05993.html\n ├── 2410.06166.html\n ├── 2410.10563.html\n ├── 2410.13848.html\n ├── 2410.17885.html\n ├── 2410.21276.html\n ├── 2411.07975.html\n ├── 2411.10442.html\n ├── 2411.11930.html\n ├── 2411.14432.html\n ├── 2412.05271.html\n ├── 2412.08443.html\n ├── 2412.10302.html\n ├── 2412.15115.html\n ├── 2412.16720.html\n ├── 2412.17256.html\n ├── 2412.18319.html\n ├── 2412.20631.html\n ├── 2501.04686.html\n ├── 2501.06186.html\n ├── 2501.12599.html\n ├── 2501.12948.html\n ├── 2501.17811.html\n ├── 2502.01456.html\n ├── 2502.09621.html\n ├── 2502.10391.html\n ├── 2502.13923.html\n ├── 2503.01785.html\n ├── 2503.06520.html\n ├── 2503.06749.html\n ├── 2503.07065.html\n ├── 2503.07365.html\n ├── 2503.07536.html\n ├── 2503.10291.html\n ├── 2503.10615.html\n ├── 2503.12937.html\n ├── 2503.13939.html\n ├── 2503.14476.html\n ├── 2503.17352.html\n ├── 2503.18892.html\n ├── 2503.19786.html\n ├── 2503.20783.html\n ├── 2503.21620.html\n ├── 2503.21776.html\n ├── 2503.22679.html\n ├── 2504.02587.html\n ├── 2504.05599.html\n ├── 2504.07491.html\n ├── 2504.07934.html\n ├── 2504.07954.html\n ├── 2504.11455.html\n ├── 2504.14945.html\n ├── 2504.16656.html\n ├── 2505.00703.html\n └── arxiv_2025.bib", "stateUrl": "https://storage.mcpmark.ai/filesystem/papers.zip", "stateOriginalUrl": null } } ================================================ FILE: 
tasks/filesystem/standard/papers/organize_legacy_papers/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Papers Collection Cleanup and Organization Task """ import sys from pathlib import Path import re import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_papers_remain(test_dir: Path) -> bool: """Verify that BibTeX and 2024+ papers remain in original directory.""" papers_dir = test_dir # Check BibTeX file still exists bib_file = papers_dir / "arxiv_2025.bib" if not bib_file.exists(): print("❌ BibTeX file arxiv_2025.bib not found") return False print("✅ BibTeX file remains in place") # Check that 2024+ papers remain in original directory found_2024_plus = False if papers_dir.exists(): for html_file in papers_dir.glob("*.html"): arxiv_id = html_file.stem year_part = arxiv_id[:2] if len(arxiv_id) >= 2 else "" if year_part.isdigit(): year = int(year_part) if year >= 24: found_2024_plus = True break if found_2024_plus: print("✅ 2024+ papers remain in original directory") else: print("⚠️ No 2024+ papers found (this may be expected if none existed)") # Check that pre-2024 papers are NOT in original directory pre_2024_found = [] if papers_dir.exists(): for html_file in papers_dir.glob("*.html"): arxiv_id = html_file.stem year_part = arxiv_id[:2] if len(arxiv_id) >= 2 else "" if year_part.isdigit(): year = int(year_part) if year < 24: pre_2024_found.append(html_file.name) if pre_2024_found: print(f"❌ Pre-2024 papers still in original directory: {pre_2024_found[:3]}...") return False print("✅ Pre-2024 papers have been moved") return True def verify_directory_structure(test_dir: Path) -> bool: """Verify the organized directory structure exists.""" organized_dir = test_dir / "organized" if not organized_dir.exists(): print("❌ organized/ directory not found") return False print("✅ organized/ directory exists") # Expected years based on pre-2024 papers expected_years = ["2017", "2021", "2022", "2023"] found_years = [] for year in expected_years: year_dir = organized_dir / year if year_dir.exists() and year_dir.is_dir(): found_years.append(year) if len(found_years) != len(expected_years): print(f"❌ Expected year directories {expected_years}, found {found_years}") return False print(f"✅ All expected year directories exist: {found_years}") return True def verify_papers_moved(test_dir: Path) -> bool: """Verify papers are correctly moved to year folders.""" organized_dir = test_dir / "organized" # Expected paper distribution expected_papers = { "2017": ["1707.06347.html"], "2021": ["2105.04165.html"], "2022": ["2201.11903.html"], "2023": ["2303.08774.html", "2306.08640.html", "2310.02255.html", "2310.08446.html", "2312.00849.html", "2312.07533.html", "2312.11805.html"] } all_correct = True for year, papers in expected_papers.items(): year_dir = organized_dir / year if not year_dir.exists(): print(f"❌ Year directory {year} doesn't exist") return False actual_papers = sorted([f.name for f in year_dir.glob("*.html")]) expected_sorted = sorted(papers) if actual_papers != expected_sorted: print(f"❌ Papers in {year}/: expected {expected_sorted}, found {actual_papers}") all_correct = False else: print(f"✅ Correct papers in {year}/: {len(actual_papers)} files") return all_correct def verify_index_files(test_dir: Path) -> bool: 
"""Verify INDEX.md files exist and have correct format.""" organized_dir = test_dir / "organized" years = ["2017", "2021", "2022", "2023"] for year in years: index_file = organized_dir / year / "INDEX.md" if not index_file.exists(): print(f"❌ INDEX.md missing in {year}/") return False content = index_file.read_text() # Check for table format if "ArXiv ID" not in content or "Authors" not in content or "Local Path" not in content: print(f"❌ INDEX.md in {year}/ missing required columns") return False # Check that papers are listed year_dir = organized_dir / year html_files = list(year_dir.glob("*.html")) for html_file in html_files: arxiv_id = html_file.stem if arxiv_id not in content: print(f"❌ INDEX.md in {year}/ missing paper {arxiv_id}") return False print(f"✅ INDEX.md in {year}/ has correct format") return True def verify_author_extraction(test_dir: Path) -> bool: """Verify that authors are correctly extracted from HTML metadata (max 3 authors).""" organized_dir = test_dir / "organized" # Check a sample paper's authors sample_file = organized_dir / "2017" / "1707.06347.html" if not sample_file.exists(): print("❌ Cannot verify author extraction - sample file missing") return False # Read the HTML to get expected authors html_content = sample_file.read_text() author_pattern = r'<meta name="citation_author" content="([^"]+)"' all_authors = re.findall(author_pattern, html_content) if not all_authors: print("❌ No authors found in sample HTML file") return False # Build expected author string (max 3 authors) if len(all_authors) <= 3: expected_author_str = ", ".join(all_authors) else: expected_author_str = ", ".join(all_authors[:3]) + ", et al." # Check if INDEX.md contains these authors index_file = organized_dir / "2017" / "INDEX.md" index_content = index_file.read_text() # Find the line with this paper found = False for line in index_content.split('\n'): if "1707.06347" in line: found = True # Check if authors are correctly formatted if len(all_authors) > 3: # Should have first 3 authors and "et al." if "et al." not in line: print("❌ Missing 'et al.' for paper with >3 authors") return False # Check first 3 authors are present for author in all_authors[:3]: if author not in line: print(f"❌ Author '{author}' not found in INDEX.md") return False # Check that 4th author is NOT present if len(all_authors) > 3 and all_authors[3] in line: print(f"❌ Fourth author '{all_authors[3]}' should not be in INDEX.md") return False else: # Should have all authors, no "et al." if "et al." in line: print("❌ Should not have 'et al.' 
for paper with ≤3 authors") return False for author in all_authors: if author not in line: print(f"❌ Author '{author}' not found in INDEX.md") return False break if not found: print("❌ Paper 1707.06347 not found in INDEX.md") return False print("✅ Authors correctly extracted (max 3) from HTML metadata") # Additional check: verify 3-author limit across all papers print("\nVerifying 3-author limit across all papers...") years = ["2017", "2021", "2022", "2023"] for year in years: year_dir = organized_dir / year if not year_dir.exists(): continue index_file = year_dir / "INDEX.md" if not index_file.exists(): continue index_content = index_file.read_text() # Check each HTML file in the year directory for html_file in year_dir.glob("*.html"): arxiv_id = html_file.stem # Get actual authors from HTML html_content = html_file.read_text() authors = re.findall(r'<meta name="citation_author" content="([^"]+)"', html_content) # Find corresponding line in INDEX.md for line in index_content.split('\n'): if arxiv_id in line and '|' in line and 'ArXiv ID' not in line: # Count authors in the line (split by comma) author_parts = line.split('|')[1] if '|' in line else "" # Check et al. usage if len(authors) > 3: if "et al." not in line: print(f"❌ {year}/{arxiv_id}: Missing 'et al.' for {len(authors)} authors") return False elif "et al." in line: print(f"❌ {year}/{arxiv_id}: Unexpected 'et al.' for {len(authors)} authors") return False # Verify no more than 3 authors are listed author_count = author_parts.count(',') + 1 if author_parts.strip() else 0 if "et al." in author_parts: author_count -= 1 # Don't count "et al." as an author if author_count > 3: print(f"❌ {year}/{arxiv_id}: More than 3 authors listed") return False break print("✅ All papers respect the 3-author limit") return True def verify_summary_file(test_dir: Path) -> bool: """Verify SUMMARY.md exists and has correct content.""" summary_file = test_dir / "organized" / "SUMMARY.md" if not summary_file.exists(): print("❌ SUMMARY.md not found") return False content = summary_file.read_text() # Check for required columns if "Year" not in content or "Paper Count" not in content or "Index Link" not in content: print("❌ SUMMARY.md missing required columns") return False # Check for year entries expected_years = ["2017", "2021", "2022", "2023"] for year in expected_years: if year not in content: print(f"❌ SUMMARY.md missing year {year}") return False # Check for links to INDEX.md files expected_links = [ f"{year}/INDEX.md" for year in expected_years ] for link in expected_links: if link not in content: print(f"❌ SUMMARY.md missing link to {link}") return False # Check paper counts expected_counts = { "2017": 1, "2021": 1, "2022": 1, "2023": 7 } for year, count in expected_counts.items(): # Look for the row with this year for line in content.split('\n'): if f"| {year}" in line or f"|{year}" in line: if str(count) not in line: print(f"❌ SUMMARY.md has incorrect paper count for {year}") return False break print("✅ SUMMARY.md has correct format and content") return True def verify_sorting(test_dir: Path) -> bool: """Verify that entries are sorted correctly.""" organized_dir = test_dir / "organized" # Check SUMMARY.md year sorting summary_file = organized_dir / "SUMMARY.md" content = summary_file.read_text() # Extract years from table rows years_in_summary = [] for line in content.split('\n'): if '|' in line and any(year in line for year in ["2017", "2021", "2022", "2023"]): # Extract year from the line for year in ["2017", "2021", "2022", "2023"]: if year in 
line: years_in_summary.append(year) break if years_in_summary != sorted(years_in_summary): print(f"❌ SUMMARY.md years not sorted: {years_in_summary}") return False print("✅ SUMMARY.md years sorted correctly") # Check INDEX.md arxiv ID sorting for one year index_file = organized_dir / "2023" / "INDEX.md" if index_file.exists(): content = index_file.read_text() arxiv_ids = [] for line in content.split('\n'): if '|' in line and '23' in line and 'ArXiv ID' not in line and '---' not in line: # Extract arxiv ID match = re.search(r'23\d{2}\.\d{5}', line) if match: arxiv_ids.append(match.group()) if arxiv_ids != sorted(arxiv_ids): print(f"❌ INDEX.md arxiv IDs not sorted in 2023/") return False print("✅ INDEX.md entries sorted by arxiv ID") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Papers Collection Cleanup and Organization...") # Define verification steps verification_steps = [ ("Papers Remain/Move Verification", verify_papers_remain), ("Directory Structure", verify_directory_structure), ("Papers Moved Correctly", verify_papers_moved), ("Index Files Format", verify_index_files), ("Author Extraction", verify_author_extraction), ("Summary File", verify_summary_file), ("Sorting Verification", verify_sorting), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") try: if not verify_func(test_dir): all_passed = False except Exception as e: print(f"❌ Error in {step_name}: {e}") all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Papers organized correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/student_database/duplicate_name/description.md ================================================ Please use FileSystem tools to finish the following task: Please help me identify duplicate names from the list of all 150 students. Do not use Python code. Then generate a `namesake.txt` file to record the results in the following format, with each group written in three lines:

name: xxx
count: xxx
ids: xxx, xxx, ...

Leave one blank line between every two groups. If more than two students share the same name, simply list all of their IDs on the third line.
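The task asks you not to use Python, so the sketch below is only meant to pin down the expected `namesake.txt` layout; the group data shown is a placeholder, not real entries from the database.

```python
#!/usr/bin/env python3
"""Format sketch: write namesake groups in the required three-line layout."""
from pathlib import Path

# Placeholder mapping of duplicated name -> student IDs (assumed example values).
groups = {
    "Example Name": ["20100001", "20100002"],
    "Another Name": ["20100003", "20100004", "20100005"],
}

blocks = [
    f"name: {name}\ncount: {len(ids)}\nids: {', '.join(ids)}"
    for name, ids in groups.items()
]

# Exactly one blank line separates consecutive groups.
Path("namesake.txt").write_text("\n\n".join(blocks) + "\n")
```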
================================================ FILE: tasks/filesystem/standard/student_database/duplicate_name/meta.json ================================================ { "task_id": "duplicate_name", "task_name": "Duplicate Name", "category_id": "student_database", "category_name": "Student Database", "description": "Identify students with identical names from a 150-student database and generate a formatted namesake grouping report file.", "author": "Lingjun Chen", "created_at": "2025-08-10", "difficulty": "L3", "tags": [ "pattern analysis", "data extraction" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "student_database/\n ├── 20101250_Patricia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20101701_Isabella_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20102572_Michael_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104233_Robert_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104498_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104653_Sophia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104675_Michael_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104846_Christopher_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20107487_Mia_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20108742_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109144_Emma_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109803_Oliver_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20111634_Isabella_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20112439_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113368_William_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113603_Robert_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114397_Isabella_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114869_Ethan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115252_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115632_Elizabeth_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115753_Charlotte_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115924_Michael_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20116232_Olivia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20119528_Thomas_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122427_Karen_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122977_Evelyn_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20123376_Joseph_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20125451_Barbara_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126203_Barbara_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126394_Olivia_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126471_Ethan_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20127423_John_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128249_Oliver_Smith/\n │ ├── basic_info.txt\n │ └── 
recommendation_letter.txt\n ├── 20128879_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20129898_Jessica_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131271_Olivia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131518_Sophia_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132026_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132370_James_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132669_Noah_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20133527_Mason_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20133697_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20135821_Thomas_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136681_Benjamin_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136890_Benjamin_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20137514_Lucas_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139234_Harper_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139637_Noah_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139647_Patricia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20141421_Linda_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142085_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142383_Amelia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143406_Susan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143830_James_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146035_Christopher_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146277_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146279_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147301_James_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147789_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148681_John_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148778_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20149712_Jessica_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20151012_Harper_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153174_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153412_Charlotte_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153606_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153687_Richard_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154518_John_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154710_Benjamin_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156469_Jennifer_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156522_Jennifer_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156851_Noah_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20157943_Harper_Williams/\n │ ├── basic_info.txt\n │ └── 
recommendation_letter.txt\n ├── 20158266_Sophia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158294_Sophia_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158819_Sarah_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159113_John_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159695_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20161279_William_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162253_Mason_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162542_Mia_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20163356_Ava_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164515_Patricia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164801_Noah_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20165511_Mary_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166436_Christopher_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166487_Barbara_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166564_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166998_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168311_Lucas_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168491_Karen_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20169515_Thomas_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171050_Christopher_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171406_Mary_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171613_Ethan_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20172106_Isabella_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173259_Michael_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173492_Richard_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173501_Mary_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173517_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174207_Richard_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174369_Mary_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20175314_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176169_Lucas_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176947_Noah_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20177389_James_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20178687_Isabella_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179461_William_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179690_Linda_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20181056_Sarah_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182020_Patricia_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182390_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183149_David_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 
20183219_Charlotte_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20184489_Jessica_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186154_Charlotte_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186510_James_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187107_David_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187144_Mary_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187892_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187921_Mary_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187967_Sarah_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20188937_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189123_Mary_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189192_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189268_Emma_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189854_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20191265_Joseph_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20192725_Robert_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194054_Michael_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194160_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194164_Sarah_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194525_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195164_Jennifer_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195982_David_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196776_William_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196896_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196961_Joseph_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196998_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20198548_Evelyn_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199036_Benjamin_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199583_Mary_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199735_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199872_Sophia_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199980_James_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201385_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201800_John_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20202548_Robert_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20203855_Mia_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n └── 20204611_Sarah_Wilson/\n ├── basic_info.txt\n └── recommendation_letter.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/student_database/duplicate_name/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Student 
Database Task: Find Duplicate Names Simplified version that only checks against expected results without folder validation """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_namesake_file_exists(test_dir: Path) -> bool: """Verify that the namesake.txt file exists.""" namesake_file = test_dir / "namesake.txt" if not namesake_file.exists(): print("❌ File 'namesake.txt' not found") return False print("✅ Namesake file found") return True def parse_namesake_file(test_dir: Path) -> dict: """Parse the namesake.txt file and return structured data.""" namesake_file = test_dir / "namesake.txt" try: content = namesake_file.read_text() lines = content.strip().split('\n') namesakes = {} current_line = 0 while current_line < len(lines): # Skip blank lines if not lines[current_line].strip(): current_line += 1 continue # Check if we have enough lines for a complete group if current_line + 2 >= len(lines): print(f"❌ Incomplete group at line {current_line + 1}") return {} # Parse group name_line = lines[current_line].strip() count_line = lines[current_line + 1].strip() ids_line = lines[current_line + 2].strip() # Extract name if not name_line.startswith("name: "): print(f"❌ Invalid name line format at line {current_line + 1}: {name_line}") return {} name = name_line.replace("name: ", "").strip() # Extract count if not count_line.startswith("count: "): print(f"❌ Invalid count line format at line {current_line + 2}: {count_line}") return {} count_str = count_line.replace("count: ", "").strip() try: count = int(count_str) except ValueError: print(f"❌ Invalid count format: {count_str}") return {} # Extract IDs if not ids_line.startswith("ids: "): print(f"❌ Invalid ids line format at line {current_line + 3}: {ids_line}") return {} ids_str = ids_line.replace("ids: ", "").strip() ids = [id.strip() for id in ids_str.split(",")] namesakes[name] = { 'count': count, 'ids': ids } current_line += 4 # Skip to next group (after blank line) return namesakes except Exception as e: print(f"❌ Error parsing namesake file: {e}") return {} def verify_against_expected_results(namesakes: dict) -> bool: """Verify that the results match the expected answer.md content exactly.""" # Expected duplicate names from answer.md (hardcoded) expected_duplicates = { 'Isabella Smith': ['20132026', '20133697'], 'Ava Lopez': ['20166564', '20166998'], 'James Moore': ['20159695', '20188937'], 'William Taylor': ['20175314', '20189854'], 'Ethan Wilson': ['20182390', '20196998'], 'Christopher Taylor': ['20128879', '20187892'], 'William Anderson': ['20142085', '20146277'], 'James Anderson': ['20147789', '20153606'], 'Olivia Jones': ['20189192', '20196896'], 'Mason Johnson': ['20115252', '20199735'], 'Benjamin Jackson': ['20153174', '20194160'], 'John Taylor': ['20194525', '20201385'], 'Susan Anderson': ['20148778', '20173517'], 'Christopher Moore': ['20112439', '20146279'], 'Sarah Wilson': ['20158819', '20204611'], 'Sarah Brown': ['20104498', '20108742'] } # Check if exactly 16 duplicate names are found if len(namesakes) != 16: print(f"❌ Expected exactly 16 duplicate names, but found {len(namesakes)}") return False # Check if all expected duplicate names are present for expected_name in expected_duplicates: if expected_name not in namesakes: print(f"❌ Missing expected duplicate name: 
'{expected_name}'") return False # Check if all namesakes in the file are actually duplicates for name, data in namesakes.items(): if name not in expected_duplicates: print(f"❌ Unexpected duplicate name found: '{name}' (not in expected list)") return False expected_ids = set(expected_duplicates[name]) stated_ids = set(data['ids']) if expected_ids != stated_ids: print(f"❌ ID mismatch for '{name}':") print(f" Expected: {sorted(expected_ids)}") print(f" Stated: {sorted(stated_ids)}") return False # Verify count matches if data['count'] != 2: print(f"❌ Count mismatch for '{name}': expected 2, got {data['count']}") return False print("✅ All 16 expected duplicate names are correctly identified") print("✅ All student IDs match expected results") print("✅ All counts are correct (2 for each duplicate name)") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Student Database Task: Find Duplicate Names...") # Check if namesake file exists print("\n--- File Existence Check ---") if not verify_namesake_file_exists(test_dir): print("\n❌ Basic verification failed, cannot proceed with content verification") sys.exit(1) # Parse the file and run content verification print("\n--- Content Verification ---") namesakes = parse_namesake_file(test_dir) if not namesakes: print("❌ Failed to parse namesake file") sys.exit(1) # Verify against expected results print("\n--- Results Verification ---") if not verify_against_expected_results(namesakes): print("\n❌ Task verification: FAIL") sys.exit(1) # Final result print("\n" + "="*50) print("✅ Namesake identification completed correctly!") print(f"🎉 Found exactly {len(namesakes)} duplicate names (16 expected)") print("🎉 Task verification: PASS") sys.exit(0) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/student_database/english_talent/description.md ================================================ Please use FileSystem tools to finish the following task: We are now recruiting students proficient in English to be responsible for the school’s English media operations. To contact these students, select, from the total of 150 students, those who **meet both of the following criteria**:

1. Rated **S** or **A** grade level in `recommendation_letter.txt` by their teachers.
2. TOEFL score in the basic info is **higher than or equal to 100**.

Please compile all their names, ids, and emails into a `qualified_students.txt` file, with the format:

name: xxx
id: xxx
email: xxx

Each person’s information should occupy three lines, with one blank line between each block.
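A minimal sketch of the selection logic is shown below, assuming a local copy of `student_database/`. The exact layout of `basic_info.txt` and `recommendation_letter.txt` is not reproduced in this task description, so the regular expressions for the grade, TOEFL score, and email are assumptions for illustration only.

```python
#!/usr/bin/env python3
"""Selection sketch: filter students by assumed grade/TOEFL/email fields."""
import re
from pathlib import Path

def collect_qualified(db_dir: Path) -> list[str]:
    blocks = []
    for student_dir in sorted(db_dir.iterdir()):
        if not student_dir.is_dir():
            continue
        # Folder names look like 20132026_Isabella_Smith -> id, name.
        student_id, _, name = student_dir.name.partition("_")
        info = (student_dir / "basic_info.txt").read_text(errors="ignore")
        letter = (student_dir / "recommendation_letter.txt").read_text(errors="ignore")
        grade = re.search(r"\b([SA])\b\s*grade", letter)   # assumed phrasing in the letter
        toefl = re.search(r"TOEFL\D*(\d+)", info)          # assumed field name in basic info
        email = re.search(r"[\w.+-]+@[\w.-]+", info)
        if grade and toefl and email and int(toefl.group(1)) >= 100:
            blocks.append(f"name: {name.replace('_', ' ')}\nid: {student_id}\nemail: {email.group(0)}")
    return blocks

if __name__ == "__main__":
    # One blank line between each three-line block, as the task requires.
    qualified = collect_qualified(Path("student_database"))  # assumed local copy
    Path("qualified_students.txt").write_text("\n\n".join(qualified) + "\n")
```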
================================================ FILE: tasks/filesystem/standard/student_database/english_talent/meta.json ================================================ { "task_id": "english_talent", "task_name": "English Talent", "category_id": "student_database", "category_name": "Student Database", "description": "Select qualified students with S/A recommendation grades and TOEFL scores ≥100 for English media operations recruitment opportunities.", "author": "Lingjun Chen", "created_at": "2025-08-10", "difficulty": "L3", "tags": [ "data extraction", "cross-referencing", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "student_database/\n ├── 20101250_Patricia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20101701_Isabella_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20102572_Michael_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104233_Robert_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104498_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104653_Sophia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104675_Michael_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104846_Christopher_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20107487_Mia_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20108742_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109144_Emma_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109803_Oliver_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20111634_Isabella_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20112439_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113368_William_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113603_Robert_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114397_Isabella_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114869_Ethan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115252_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115632_Elizabeth_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115753_Charlotte_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115924_Michael_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20116232_Olivia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20119528_Thomas_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122427_Karen_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122977_Evelyn_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20123376_Joseph_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20125451_Barbara_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126203_Barbara_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126394_Olivia_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126471_Ethan_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20127423_John_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128249_Oliver_Smith/\n │ ├── 
basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128879_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20129898_Jessica_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131271_Olivia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131518_Sophia_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132026_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132370_James_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132669_Noah_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20133527_Mason_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20133697_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20135821_Thomas_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136681_Benjamin_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136890_Benjamin_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20137514_Lucas_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139234_Harper_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139637_Noah_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139647_Patricia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20141421_Linda_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142085_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142383_Amelia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143406_Susan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143830_James_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146035_Christopher_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146277_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146279_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147301_James_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147789_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148681_John_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148778_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20149712_Jessica_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20151012_Harper_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153174_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153412_Charlotte_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153606_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153687_Richard_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154518_John_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154710_Benjamin_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156469_Jennifer_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156522_Jennifer_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156851_Noah_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20157943_Harper_Williams/\n │ ├── basic_info.txt\n │ └── 
recommendation_letter.txt\n ├── 20158266_Sophia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158294_Sophia_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158819_Sarah_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159113_John_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159695_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20161279_William_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162253_Mason_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162542_Mia_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20163356_Ava_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164515_Patricia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164801_Noah_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20165511_Mary_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166436_Christopher_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166487_Barbara_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166564_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166998_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168311_Lucas_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168491_Karen_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20169515_Thomas_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171050_Christopher_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171406_Mary_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171613_Ethan_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20172106_Isabella_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173259_Michael_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173492_Richard_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173501_Mary_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173517_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174207_Richard_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174369_Mary_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20175314_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176169_Lucas_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176947_Noah_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20177389_James_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20178687_Isabella_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179461_William_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179690_Linda_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20181056_Sarah_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182020_Patricia_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182390_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183149_David_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 
20183219_Charlotte_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20184489_Jessica_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186154_Charlotte_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186510_James_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187107_David_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187144_Mary_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187892_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187921_Mary_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187967_Sarah_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20188937_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189123_Mary_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189192_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189268_Emma_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189854_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20191265_Joseph_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20192725_Robert_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194054_Michael_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194160_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194164_Sarah_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194525_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195164_Jennifer_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195982_David_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196776_William_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196896_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196961_Joseph_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196998_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20198548_Evelyn_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199036_Benjamin_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199583_Mary_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199735_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199872_Sophia_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199980_James_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201385_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201800_John_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20202548_Robert_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20203855_Mia_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n └── 20204611_Sarah_Wilson/\n ├── basic_info.txt\n └── recommendation_letter.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/student_database/english_talent/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Student 
Database Task: English Talent Recruitment """ import sys from pathlib import Path import re import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_qualified_students_file_exists(test_dir: Path) -> bool: """Verify that the qualified_students.txt file exists.""" answer_file = test_dir / "qualified_students.txt" if not answer_file.exists(): print("❌ File 'qualified_students.txt' not found") return False print("✅ Qualified students file found") return True def verify_file_format(test_dir: Path) -> bool: """Verify that the qualified_students.txt file has the correct format.""" answer_file = test_dir / "qualified_students.txt" try: content = answer_file.read_text() lines = content.strip().split('\n') if not lines: print("❌ File is empty") return False # Check if content follows the expected pattern # Each student should have 3 lines: name, id, email # Students should be separated by blank lines current_line = 0 student_count = 0 while current_line < len(lines): # Skip blank lines if not lines[current_line].strip(): current_line += 1 continue # Check if we have enough lines for a complete student if current_line + 2 >= len(lines): print(f"❌ Incomplete student entry at line {current_line + 1}") return False # Verify name line format if not lines[current_line].strip().startswith("name: "): print(f"❌ Invalid name line format at line {current_line + 1}: {lines[current_line]}") return False # Verify id line format if not lines[current_line + 1].strip().startswith("id: "): print(f"❌ Invalid id line format at line {current_line + 2}: {lines[current_line + 1]}") return False # Verify email line format if not lines[current_line + 2].strip().startswith("email: "): print(f"❌ Invalid email line format at line {current_line + 3}: {lines[current_line + 2]}") return False student_count += 1 current_line += 3 # Check for blank line separator (except for the last student) if current_line < len(lines) and lines[current_line].strip(): print(f"❌ Missing blank line separator after student {student_count}") return False current_line += 1 if student_count == 0: print("❌ No valid student entries found") return False print(f"✅ File format is correct with {student_count} students") return True except Exception as e: print(f"❌ Error reading qualified students file: {e}") return False def parse_qualified_students_file(test_dir: Path) -> list: """Parse the qualified_students.txt file and return structured data.""" answer_file = test_dir / "qualified_students.txt" try: content = answer_file.read_text() lines = content.strip().split('\n') students = [] current_line = 0 while current_line < len(lines): # Skip blank lines if not lines[current_line].strip(): current_line += 1 continue # Parse student entry name_line = lines[current_line].strip() id_line = lines[current_line + 1].strip() email_line = lines[current_line + 2].strip() # Extract name name = name_line.replace("name: ", "").strip() # Extract id student_id = id_line.replace("id: ", "").strip() # Extract email email = email_line.replace("email: ", "").strip() students.append({ 'name': name, 'id': student_id, 'email': email }) current_line += 4 # Skip to next student (after blank line) return students except Exception as e: print(f"❌ Error parsing qualified students file: {e}") return [] def verify_student_count(students: list) -> bool: """Verify 
that exactly 19 students are found.""" expected_count = 19 actual_count = len(students) if actual_count != expected_count: print(f"❌ Expected {expected_count} students, but found {actual_count}") return False print(f"✅ Found exactly {expected_count} students") return True def verify_expected_students(students: list) -> bool: """Verify that all expected students are present with correct details.""" # Expected students from answer.md expected_students = { 'James Smith': {'id': '20177389', 'email': 'james.smith30@outlook.com'}, 'Ava Lopez': {'id': '20166998', 'email': 'ava.lopez67@outlook.com'}, 'James Anderson': {'id': '20153606', 'email': 'james.anderson71@yahoo.com'}, 'Benjamin Anderson': {'id': '20136681', 'email': 'benjamin.anderson37@qq.com'}, 'Sarah Wilson': {'id': '20158819', 'email': 'sarah.wilson96@outlook.com'}, 'Isabella Davis': {'id': '20101701', 'email': 'isabella.davis89@gmail.com'}, 'James Moore': {'id': '20188937', 'email': 'james.moore62@gmail.com'}, 'Harper Williams': {'id': '20157943', 'email': 'harper.williams38@163.com'}, 'Noah Smith': {'id': '20132669', 'email': 'noah.smith45@163.com'}, 'Emma Thomas': {'id': '20109144', 'email': 'emma.thomas68@163.com'}, 'Mary Brown': {'id': '20199583', 'email': 'mary.brown27@yahoo.com'}, 'John Jones': {'id': '20201800', 'email': 'john.jones46@gmail.com'}, 'Mia Anderson': {'id': '20162542', 'email': 'mia.anderson3@outlook.com'}, 'Barbara Davis': {'id': '20126203', 'email': 'barbara.davis67@163.com'}, 'Thomas Brown': {'id': '20119528', 'email': 'thomas.brown43@163.com'}, 'Susan Anderson': {'id': '20148778', 'email': 'susan.anderson16@163.com'}, 'Mary Garcia': {'id': '20174369', 'email': 'mary.garcia58@gmail.com'}, 'Richard Wilson': {'id': '20174207', 'email': 'richard.wilson39@outlook.com'}, 'Joseph Lopez': {'id': '20191265', 'email': 'joseph.lopez93@yahoo.com'} } # Check if all expected students are present found_students = set() for student in students: found_students.add(student['name']) missing_students = set(expected_students.keys()) - found_students if missing_students: print(f"❌ Missing expected students: {missing_students}") return False # Check if all found students are expected unexpected_students = found_students - set(expected_students.keys()) if unexpected_students: print(f"❌ Unexpected students found: {unexpected_students}") return False # Check if student details match exactly for student in students: expected = expected_students[student['name']] if student['id'] != expected['id']: print(f"❌ ID mismatch for {student['name']}: expected {expected['id']}, got {student['id']}") return False if student['email'] != expected['email']: print(f"❌ Email mismatch for {student['name']}: expected {expected['email']}, got {student['email']}") return False print("✅ All expected students are present with correct details") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Student Database Task: English Talent Recruitment...") # Define verification steps verification_steps = [ ("Qualified Students File Exists", verify_qualified_students_file_exists), ("File Format", verify_file_format), ] # Run basic verification steps first all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False break if not all_passed: print("\n❌ Basic verification failed, cannot proceed with content verification") sys.exit(1) # Parse the file and run content verification print("\n--- Content Verification ---") students = 
parse_qualified_students_file(test_dir) if not students: print("❌ Failed to parse qualified students file") sys.exit(1) content_verification_steps = [ ("Student Count", lambda: verify_student_count(students)), ("Expected Students", lambda: verify_expected_students(students)), ] for step_name, verify_func in content_verification_steps: print(f"\n--- {step_name} ---") if not verify_func(): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ English talent recruitment completed correctly!") print(f"🎉 Found exactly {len(students)} qualified students") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/student_database/gradebased_score/description.md ================================================ Please use FileSystem tools to finish the following task: ### Simple Grade Calculation 1. Read Student Data: * Process all student basic_info.txt files from the database * Extract scores for Chinese, Math, and English subjects 2. Calculate Basic Grades: * Use simple grade scale: A (90+), B (80-89), C (70-79), D (60-69), F (<60) * Apply this same scale to all subjects ### Generate Output Files 1. Create student_grades.csv: * Columns: student_id, name, chinese_score, chinese_grade, math_score, math_grade, english_score, english_grade * Must contain every student * Exactly one row per student 2. Create grade_summary.txt: * Total number of students processed * Number of A's, B's, C's, D's, and F's for each subject * Simple count of students with passing grades (A, B, C) vs failing grades (D, F) for each subject ================================================ FILE: tasks/filesystem/standard/student_database/gradebased_score/meta.json ================================================ { "task_id": "gradebased_score", "task_name": "Gradebased Score", "category_id": "student_database", "category_name": "Student Database", "description": "Process student numerical scores to calculate letter grades using A-F scale and produce comprehensive grade distribution analysis reports.", "author": "Lingjun Chen", "created_at": "2025-08-10", "difficulty": "L3", "tags": [ "data extraction", "content transformation", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "student_database/\n ├── 20101250_Patricia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20101701_Isabella_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20102572_Michael_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104233_Robert_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104498_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104653_Sophia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104675_Michael_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104846_Christopher_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20107487_Mia_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20108742_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109144_Emma_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109803_Oliver_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20111634_Isabella_Thomas/\n │ ├── basic_info.txt\n │ └── 
recommendation_letter.txt\n ├── 20112439_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113368_William_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113603_Robert_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114397_Isabella_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114869_Ethan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115252_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115632_Elizabeth_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115753_Charlotte_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115924_Michael_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20116232_Olivia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20119528_Thomas_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122427_Karen_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122977_Evelyn_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20123376_Joseph_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20125451_Barbara_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126203_Barbara_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126394_Olivia_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126471_Ethan_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20127423_John_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128249_Oliver_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128879_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20129898_Jessica_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131271_Olivia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131518_Sophia_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132026_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132370_James_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132669_Noah_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20133527_Mason_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20133697_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20135821_Thomas_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136681_Benjamin_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136890_Benjamin_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20137514_Lucas_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139234_Harper_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139637_Noah_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139647_Patricia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20141421_Linda_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142085_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142383_Amelia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143406_Susan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 
20143830_James_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146035_Christopher_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146277_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146279_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147301_James_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147789_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148681_John_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148778_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20149712_Jessica_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20151012_Harper_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153174_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153412_Charlotte_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153606_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153687_Richard_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154518_John_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154710_Benjamin_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156469_Jennifer_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156522_Jennifer_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156851_Noah_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20157943_Harper_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158266_Sophia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158294_Sophia_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158819_Sarah_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159113_John_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159695_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20161279_William_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162253_Mason_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162542_Mia_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20163356_Ava_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164515_Patricia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164801_Noah_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20165511_Mary_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166436_Christopher_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166487_Barbara_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166564_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166998_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168311_Lucas_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168491_Karen_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20169515_Thomas_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171050_Christopher_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 
20171406_Mary_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171613_Ethan_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20172106_Isabella_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173259_Michael_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173492_Richard_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173501_Mary_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173517_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174207_Richard_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174369_Mary_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20175314_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176169_Lucas_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176947_Noah_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20177389_James_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20178687_Isabella_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179461_William_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179690_Linda_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20181056_Sarah_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182020_Patricia_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182390_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183149_David_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183219_Charlotte_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20184489_Jessica_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186154_Charlotte_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186510_James_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187107_David_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187144_Mary_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187892_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187921_Mary_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187967_Sarah_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20188937_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189123_Mary_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189192_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189268_Emma_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189854_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20191265_Joseph_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20192725_Robert_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194054_Michael_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194160_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194164_Sarah_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194525_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195164_Jennifer_Gonzalez/\n │ ├── basic_info.txt\n │ └── 
recommendation_letter.txt\n ├── 20195982_David_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196776_William_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196896_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196961_Joseph_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196998_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20198548_Evelyn_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199036_Benjamin_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199583_Mary_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199735_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199872_Sophia_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199980_James_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201385_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201800_John_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20202548_Robert_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20203855_Mia_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n └── 20204611_Sarah_Wilson/\n ├── basic_info.txt\n └── recommendation_letter.txt", "stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip", "stateOriginalUrl": null } } ================================================ FILE: tasks/filesystem/standard/student_database/gradebased_score/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Student Database Grade-Based Score Analysis Task """ import sys from pathlib import Path import os import re def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_grade_summary_exists(test_dir: Path) -> bool: """Verify that grade_summary.txt file exists.""" grade_summary_file = test_dir / "grade_summary.txt" if not grade_summary_file.exists(): print("❌ File 'grade_summary.txt' not found") return False print("✅ grade_summary.txt file found") return True def verify_grade_summary_readable(test_dir: Path) -> bool: """Verify that the grade_summary.txt file is readable.""" grade_summary_file = test_dir / "grade_summary.txt" try: content = grade_summary_file.read_text() if not content.strip(): print("❌ grade_summary.txt file is empty") return False print("✅ grade_summary.txt file is readable") return True except Exception as e: print(f"❌ Error reading grade_summary.txt file: {e}") return False def extract_numbers_from_text(text: str) -> list: """Extract all numbers from text.""" numbers = re.findall(r'\d+', text) return [int(num) for num in numbers] def verify_three_subjects_present(test_dir: Path) -> bool: """Verify that grade_summary.txt contains all three subjects (case insensitive).""" grade_summary_file = test_dir / "grade_summary.txt" try: content = grade_summary_file.read_text() # Check if all three subjects are mentioned (case insensitive) subjects = ["chinese", "math", "english"] missing_subjects = [] for subject in subjects: if subject.lower() not in content.lower(): missing_subjects.append(subject) if missing_subjects: print(f"❌ Missing subjects in grade_summary.txt: 
{missing_subjects}") return False print("✅ All three subjects (Chinese, Math, English) found in grade_summary.txt") return True except Exception as e: print(f"❌ Error checking subjects: {e}") return False def verify_grade_summary_content(test_dir: Path) -> bool: """Verify that grade_summary.txt contains the correct statistics from answer.md.""" grade_summary_file = test_dir / "grade_summary.txt" try: content = grade_summary_file.read_text() # Extract all numbers from the content found_numbers = extract_numbers_from_text(content) if not found_numbers: print("❌ No numbers found in grade_summary.txt") return False # Expected numbers from answer.md # Format: [total_students, chinese_A, chinese_B, chinese_C, chinese_D, chinese_pass, chinese_fail, # math_A, math_B, math_C, math_D, math_pass, math_fail, # english_A, english_B, english_C, english_D, english_F, english_pass, english_fail] expected_numbers = [ # Total students 150, # Chinese grades: A(42), B(37), C(43), D(28), Pass(122), Fail(28) 42, 37, 43, 28, 122, 28, # Math grades: A(31), B(38), C(47), D(34), Pass(116), Fail(34) 31, 38, 47, 34, 116, 34, # English grades: A(32), B(38), C(38), D(41), F(1), Pass(108), Fail(42) 32, 38, 38, 41, 1, 108, 42 ] # Check if all expected numbers are present in the found numbers missing_numbers = [] for expected in expected_numbers: if expected not in found_numbers: missing_numbers.append(expected) if missing_numbers: print(f"❌ Missing expected numbers: {missing_numbers}") print(f" Found numbers: {found_numbers}") return False # Check if the counts match (each number should appear the expected number of times) for expected in expected_numbers: expected_count = expected_numbers.count(expected) found_count = found_numbers.count(expected) if found_count < expected_count: print(f"❌ Number {expected} appears {found_count} times, expected {expected_count} times") return False print("✅ All expected grade statistics found in grade_summary.txt") return True except Exception as e: print(f"❌ Error verifying grade summary content: {e}") return False def main(): """Main verification function.""" try: test_dir = get_test_directory() print(f"🔍 Verifying Student Database Grade-Based Score Analysis in: {test_dir}") # Define verification steps verification_steps = [ ("Grade Summary File Exists", verify_grade_summary_exists), ("File is Readable", verify_grade_summary_readable), ("Three Subjects Present", verify_three_subjects_present), ("Grade Statistics Content", verify_grade_summary_content), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Student grade analysis completed correctly!") print("🎉 Grade-Based Score Analysis verification: PASS") sys.exit(0) else: print("❌ Grade-Based Score Analysis verification: FAIL") sys.exit(1) except Exception as e: print(f"❌ Verification failed with error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/threestudio/code_locating/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description ThreeStudio is a comprehensive codebase that implements various diffusion-based text-to-3D models, including NeRF-based rendering stage and diffusion guidance stage. 
Your task is to explore the codebase and identify the specific file that defines the guidance functionality for the Zero123 model. ### Task Objectives 1. **Explore the ThreeStudio codebase** using filesystem MCP tools 2. **Search through the project structure** to understand the codebase organization 3. **Identify the file** that contains the Zero123 guidance implementation 4. **Create an answer file** with the correct file path ### Expected Output Create a file named `answer.txt` in the test directory root **Requirements:** - Only include the file path, no additional text or explanation - Use forward slashes (/) for path separators - Include the full relative path from the project root - Ensure the path points to the actual file that defines Zero123 guidance ================================================ FILE: tasks/filesystem/standard/threestudio/code_locating/meta.json ================================================ { "task_id": "code_locating", "task_name": "Code Locating", "category_id": "threestudio", "category_name": "Threestudio", "description": "Navigate the ThreeStudio codebase to locate and identify the specific file that defines Zero123 guidance functionality implementation.", "author": "Lingjun Chen", "created_at": "2025-08-05", "difficulty": "L3", "tags": [ "code exploration" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "threestudio/\n ├── configs/\n │ ├── debugging/\n │ │ ├── controlnet-canny.yaml\n │ │ ├── controlnet-normal.yaml\n │ │ ├── instructpix2pix.yaml\n │ │ └── stablediffusion.yaml\n │ ├── experimental/\n │ │ ├── unified-guidance/\n │ │ │ ├── dreamfusion-sd.yaml\n │ │ │ ├── hifa.yaml\n │ │ │ ├── prolificdreamer-hifa.yaml\n │ │ │ ├── prolificdreamer.yaml\n │ │ │ └── zero123-simple.yaml\n │ │ ├── co3d-imagecondition.yaml\n │ │ ├── imagecondition.yaml\n │ │ ├── imagecondition_zero123nerf.yaml\n │ │ ├── imagecondition_zero123nerf_refine.yaml\n │ │ ├── prolificdreamer-importance.yaml\n │ │ ├── prolificdreamer-neus-importance.yaml\n │ │ ├── prolificdreamer-propnet.yaml\n │ │ └── textmesh-if-importance.yaml\n │ ├── gradio/\n │ │ ├── dreamfusion-if.yaml\n │ │ ├── dreamfusion-sd.yaml\n │ │ ├── fantasia3d.yaml\n │ │ ├── latentnerf.yaml\n │ │ ├── sjc.yaml\n │ │ └── textmesh-if.yaml\n │ ├── control4d-static.yaml\n │ ├── dreamfusion-if.yaml\n │ ├── dreamfusion-sd-eff.yaml\n │ ├── dreamfusion-sd.yaml\n │ ├── fantasia3d-texture.yaml\n │ ├── fantasia3d.yaml\n │ ├── hifa.yaml\n │ ├── instructnerf2nerf.yaml\n │ ├── latentnerf-refine.yaml\n │ ├── latentnerf.yaml\n │ ├── magic123-coarse-sd.yaml\n │ ├── magic123-hifa-coarse-sd.yaml\n │ ├── magic123-hifa-refine-sd.yaml\n │ ├── magic123-refine-sd.yaml\n │ ├── magic3d-coarse-if.yaml\n │ ├── magic3d-coarse-sd.yaml\n │ ├── magic3d-refine-sd.yaml\n │ ├── prolificdreamer-geometry.yaml\n │ ├── prolificdreamer-hifa.yaml\n │ ├── prolificdreamer-patch.yaml\n │ ├── prolificdreamer-scene-hifa.yaml\n │ ├── prolificdreamer-scene.yaml\n │ ├── prolificdreamer-texture.yaml\n │ ├── prolificdreamer.yaml\n │ ├── sdi.yaml\n │ ├── sjc.yaml\n │ ├── sketchshape-refine.yaml\n │ ├── sketchshape.yaml\n │ ├── stable-zero123.yaml\n │ ├── textmesh-if.yaml\n │ ├── zero123-geometry.yaml\n │ └── zero123.yaml\n ├── custom/\n │ └── put_custom_extensions_here\n ├── docker/\n │ ├── compose.yaml\n │ └── Dockerfile\n ├── docs/\n │ └── installation.md\n ├── extern/\n │ ├── ldm_zero123/\n │ │ ├── models/\n │ │ │ ├── diffusion/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── classifier.py\n │ │ │ │ ├── ddim.py\n │ │ │ │ ├── ddpm.py\n │ │ │ │ ├── 
plms.py\n │ │ │ │ └── sampling_util.py\n │ │ │ └── autoencoder.py\n │ │ ├── modules/\n │ │ │ ├── diffusionmodules/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── model.py\n │ │ │ │ ├── openaimodel.py\n │ │ │ │ └── util.py\n │ │ │ ├── distributions/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── distributions.py\n │ │ │ ├── encoders/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── modules.py\n │ │ │ ├── evaluate/\n │ │ │ │ ├── adm_evaluator.py\n │ │ │ │ ├── evaluate_perceptualsim.py\n │ │ │ │ ├── frechet_video_distance.py\n │ │ │ │ ├── ssim.py\n │ │ │ │ └── torch_frechet_video_distance.py\n │ │ │ ├── image_degradation/\n │ │ │ │ ├── utils/\n │ │ │ │ │ └── test.png\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── bsrgan.py\n │ │ │ │ ├── bsrgan_light.py\n │ │ │ │ └── utils_image.py\n │ │ │ ├── losses/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── contperceptual.py\n │ │ │ │ └── vqperceptual.py\n │ │ │ ├── attention.py\n │ │ │ ├── ema.py\n │ │ │ └── x_transformer.py\n │ │ ├── thirdp/\n │ │ │ └── psp/\n │ │ │ ├── helpers.py\n │ │ │ ├── id_loss.py\n │ │ │ └── model_irse.py\n │ │ ├── __init__.py\n │ │ ├── extras.py\n │ │ ├── guidance.py\n │ │ ├── lr_scheduler.py\n │ │ └── util.py\n │ ├── __init__.py\n │ └── zero123.py\n ├── load/\n │ ├── images/\n │ │ ├── anya_front.png\n │ │ ├── anya_front_depth.png\n │ │ ├── anya_front_normal.png\n │ │ ├── anya_front_rgba.png\n │ │ ├── baby_phoenix_on_ice.png\n │ │ ├── baby_phoenix_on_ice_depth.png\n │ │ ├── baby_phoenix_on_ice_normal.png\n │ │ ├── baby_phoenix_on_ice_rgba.png\n │ │ ├── beach_house_1.png\n │ │ ├── beach_house_1_depth.png\n │ │ ├── beach_house_1_normal.png\n │ │ ├── beach_house_1_rgba.png\n │ │ ├── beach_house_2.png\n │ │ ├── beach_house_2_depth.png\n │ │ ├── beach_house_2_normal.png\n │ │ ├── beach_house_2_rgba.png\n │ │ ├── bollywood_actress.png\n │ │ ├── bollywood_actress_depth.png\n │ │ ├── bollywood_actress_normal.png\n │ │ ├── bollywood_actress_rgba.png\n │ │ ├── cactus.png\n │ │ ├── cactus_depth.png\n │ │ ├── cactus_normal.png\n │ │ ├── cactus_rgba.png\n │ │ ├── catstatue.png\n │ │ ├── catstatue_depth.png\n │ │ ├── catstatue_normal.png\n │ │ ├── catstatue_rgba.png\n │ │ ├── church_ruins.png\n │ │ ├── church_ruins_depth.png\n │ │ ├── church_ruins_normal.png\n │ │ ├── church_ruins_rgba.png\n │ │ ├── dog1_rgba.png\n │ │ ├── dragon2_rgba.png\n │ │ ├── firekeeper.jpg\n │ │ ├── firekeeper_depth.png\n │ │ ├── firekeeper_normal.png\n │ │ ├── firekeeper_rgba.png\n │ │ ├── futuristic_car.png\n │ │ ├── futuristic_car_depth.png\n │ │ ├── futuristic_car_normal.png\n │ │ ├── futuristic_car_rgba.png\n │ │ ├── grootplant_rgba.png\n │ │ ├── hamburger.png\n │ │ ├── hamburger_depth.png\n │ │ ├── hamburger_rgba.png\n │ │ ├── mona_lisa.png\n │ │ ├── mona_lisa_depth.png\n │ │ ├── mona_lisa_normal.png\n │ │ ├── mona_lisa_rgba.png\n │ │ ├── robot_rgba.png\n │ │ ├── teddy.png\n │ │ ├── teddy_depth.png\n │ │ ├── teddy_normal.png\n │ │ ├── teddy_rgba.png\n │ │ └── thorhammer_rgba.png\n │ ├── lights/\n │ │ ├── bsdf_256_256.bin\n │ │ ├── LICENSE.txt\n │ │ └── mud_road_puresky_1k.hdr\n │ ├── shapes/\n │ │ ├── animal.obj\n │ │ ├── blub.obj\n │ │ ├── cabin.obj\n │ │ ├── env_sphere.obj\n │ │ ├── hand_prismatic.obj\n │ │ ├── human.obj\n │ │ ├── nascar.obj\n │ │ ├── potion.obj\n │ │ ├── README.md\n │ │ └── teddy.obj\n │ ├── tets/\n │ │ ├── 128_tets.npz\n │ │ ├── 32_tets.npz\n │ │ ├── 64_tets.npz\n │ │ └── generate_tets.py\n │ ├── zero123/\n │ │ ├── download.sh\n │ │ └── sd-objaverse-finetune-c_concat-256.yaml\n │ ├── make_prompt_library.py\n │ └── prompt_library.json\n ├── scripts/\n │ └── 
convert_zero123_to_diffusers.py\n ├── threestudio/\n │ ├── data/\n │ │ ├── __init__.py\n │ │ ├── co3d.py\n │ │ ├── image.py\n │ │ ├── multiview.py\n │ │ ├── uncond.py\n │ │ └── uncond_eff.py\n │ ├── models/\n │ │ ├── background/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── neural_environment_map_background.py\n │ │ │ ├── solid_color_background.py\n │ │ │ └── textured_background.py\n │ │ ├── exporters/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ └── mesh_exporter.py\n │ │ ├── geometry/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── custom_mesh.py\n │ │ │ ├── implicit_sdf.py\n │ │ │ ├── implicit_volume.py\n │ │ │ ├── tetrahedra_sdf_grid.py\n │ │ │ └── volume_grid.py\n │ │ ├── guidance/\n │ │ │ ├── __init__.py\n │ │ │ ├── controlnet_guidance.py\n │ │ │ ├── deep_floyd_guidance.py\n │ │ │ ├── instructpix2pix_guidance.py\n │ │ │ ├── stable_diffusion_guidance.py\n │ │ │ ├── stable_diffusion_sdi_guidance.py\n │ │ │ ├── stable_diffusion_unified_guidance.py\n │ │ │ ├── stable_diffusion_vsd_guidance.py\n │ │ │ ├── stable_zero123_guidance.py\n │ │ │ ├── zero123_guidance.py\n │ │ │ └── zero123_unified_guidance.py\n │ │ ├── materials/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── diffuse_with_point_light_material.py\n │ │ │ ├── hybrid_rgb_latent_material.py\n │ │ │ ├── neural_radiance_material.py\n │ │ │ ├── no_material.py\n │ │ │ ├── pbr_material.py\n │ │ │ └── sd_latent_adapter_material.py\n │ │ ├── prompt_processors/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deepfloyd_prompt_processor.py\n │ │ │ ├── dummy_prompt_processor.py\n │ │ │ └── stable_diffusion_prompt_processor.py\n │ │ ├── renderers/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deferred_volume_renderer.py\n │ │ │ ├── gan_volume_renderer.py\n │ │ │ ├── nerf_volume_renderer.py\n │ │ │ ├── neus_volume_renderer.py\n │ │ │ ├── nvdiff_rasterizer.py\n │ │ │ └── patch_renderer.py\n │ │ ├── __init__.py\n │ │ ├── estimators.py\n │ │ ├── isosurface.py\n │ │ ├── mesh.py\n │ │ └── networks.py\n │ ├── scripts/\n │ │ ├── make_training_vid.py\n │ │ ├── run_zero123.sh\n │ │ ├── run_zero123_comparison.sh\n │ │ ├── run_zero123_phase.sh\n │ │ ├── run_zero123_phase2.sh\n │ │ ├── run_zero123_sbatch.py\n │ │ ├── zero123_demo.py\n │ │ └── zero123_sbatch.sh\n │ ├── systems/\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── control4d_multiview.py\n │ │ ├── dreamfusion.py\n │ │ ├── eff_dreamfusion.py\n │ │ ├── fantasia3d.py\n │ │ ├── imagedreamfusion.py\n │ │ ├── instructnerf2nerf.py\n │ │ ├── latentnerf.py\n │ │ ├── magic123.py\n │ │ ├── magic3d.py\n │ │ ├── optimizers.py\n │ │ ├── prolificdreamer.py\n │ │ ├── sdi.py\n │ │ ├── sjc.py\n │ │ ├── textmesh.py\n │ │ ├── utils.py\n │ │ ├── zero123.py\n │ │ └── zero123_simple.py\n │ ├── utils/\n │ │ ├── GAN/\n │ │ │ ├── __init__.py\n │ │ │ ├── attention.py\n │ │ │ ├── discriminator.py\n │ │ │ ├── distribution.py\n │ │ │ ├── loss.py\n │ │ │ ├── mobilenet.py\n │ │ │ ├── network_util.py\n │ │ │ ├── util.py\n │ │ │ └── vae.py\n │ │ ├── perceptual/\n │ │ │ ├── __init__.py\n │ │ │ ├── perceptual.py\n │ │ │ └── utils.py\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── callbacks.py\n │ │ ├── config.py\n │ │ ├── loss.py\n │ │ ├── misc.py\n │ │ ├── ops.py\n │ │ ├── rasterize.py\n │ │ ├── saving.py\n │ │ └── typing.py\n │ └── __init__.py\n ├── .editorconfig\n ├── .pre-commit-config.yaml\n ├── .pylintrc\n ├── 2dplayground.ipynb\n ├── 2dplayground_SDI_version.ipynb\n ├── CHANGELOG.md\n ├── DOCUMENTATION.md\n ├── gradio_app.py\n ├── launch.py\n ├── LICENSE\n ├── README.md\n 
├── requirements-dev.txt\n ├── requirements.txt\n ├── setup.py\n └── threestudio.ipynb", "stateUrl": "https://storage.mcpmark.ai/filesystem/threestudio.zip", "stateOriginalUrl": "https://github.com/threestudio-project/threestudio" } } ================================================ FILE: tasks/filesystem/standard/threestudio/code_locating/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for ThreeStudio Task 1: Find Zero123 Guidance Implementation """ import sys from pathlib import Path import re import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_answer_file_exists(test_dir: Path) -> bool: """Verify that the answer.txt file exists.""" answer_file = test_dir / "answer.txt" if not answer_file.exists(): print("❌ File 'answer.txt' not found") return False print("✅ Answer file found") return True def verify_answer_format(test_dir: Path) -> bool: """Verify that the answer file has the correct format.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() # Check if content is not empty if not content: print("❌ Answer file is empty") return False # Check if it contains only the file path (no additional text) if len(content.split('\n')) > 1: print("❌ Answer file contains multiple lines or additional text") return False # Check if it uses forward slashes if '\\' in content: print("❌ Answer uses backslashes instead of forward slashes") return False # Check if it's a relative path if content.startswith('/') or ':' in content: print("❌ Answer appears to be an absolute path") return False print("✅ Answer format is correct") return True except Exception as e: print(f"❌ Error reading answer file: {e}") return False def verify_file_path_structure(test_dir: Path) -> bool: """Verify that the file path has the expected structure.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() # Expected path components for Zero123 guidance # In backup directories, the path is threestudio/models/guidance/zero123_guidance.py # In test_environments, the path is threestudio/threestudio/models/guidance/zero123_guidance.py expected_components = ["threestudio", "models", "guidance", "zero123_guidance.py"] # Check if all expected components are in the path for component in expected_components: if component not in content: print(f"❌ Path missing expected component: {component}") return False print("✅ File path structure is correct") return True except Exception as e: print(f"❌ Error verifying file path structure: {e}") return False def verify_file_exists(test_dir: Path) -> bool: """Verify that the identified file actually exists.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() # Try the path as provided in the answer file file_path = test_dir / content # If that doesn't exist, try with the correct path structure # The answer file might have threestudio/models/guidance/zero123_guidance.py # but the actual path is threestudio/threestudio/models/guidance/zero123_guidance.py if not file_path.exists(): # Try to fix the path by adding the missing threestudio prefix if content.startswith("threestudio/models/"): corrected_path = content.replace("threestudio/models/", "threestudio/threestudio/models/") file_path = test_dir / corrected_path if 
file_path.exists(): print(f"✅ File exists with corrected path: {corrected_path}") return True if not file_path.exists(): print(f"❌ Identified file does not exist: {content}") return False print("✅ Identified file exists") return True except Exception as e: print(f"❌ Error verifying file existence: {e}") return False def verify_zero123_guidance_content(test_dir: Path) -> bool: """Verify that the identified file actually contains Zero123 guidance implementation.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() # Try the path as provided in the answer file file_path = test_dir / content # If that doesn't exist, try with the correct path structure if not file_path.exists(): # Try to fix the path by adding the missing threestudio prefix if content.startswith("threestudio/models/"): corrected_path = content.replace("threestudio/models/", "threestudio/threestudio/models/") file_path = test_dir / corrected_path if not file_path.exists(): print(f"❌ Cannot find file for content verification: {content}") return False file_content = file_path.read_text() # Check for the main Zero123 guidance implementation # The main implementation should have the class name "Zero123Guidance" and register as "zero123-guidance" main_zero123_indicators = [ r'class Zero123Guidance', # Main class name r'@threestudio\.register\("zero123-guidance"\)', # Correct registration r'BaseObject', # Base class r'zero123', # General zero123 reference ] found_indicators = [] for indicator in main_zero123_indicators: if re.search(indicator, file_content, re.IGNORECASE): found_indicators.append(indicator) # Check if this is the main Zero123 guidance implementation is_main_implementation = ( 'class Zero123Guidance' in file_content and '@threestudio.register("zero123-guidance")' in file_content ) if not is_main_implementation: print(f"❌ File is not the main Zero123 guidance implementation") print(f" Expected: class Zero123Guidance and @threestudio.register('zero123-guidance')") return False print(f"✅ File contains main Zero123 guidance implementation indicators: {found_indicators}") return True except Exception as e: print(f"❌ Error verifying file content: {e}") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying ThreeStudio Task 1: Find Zero123 Guidance Implementation...") # Define verification steps verification_steps = [ ("Answer File Exists", verify_answer_file_exists), ("Answer Format", verify_answer_format), ("File Path Structure", verify_file_path_structure), ("File Exists", verify_file_exists), ("Zero123 Guidance Content", verify_zero123_guidance_content), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Zero123 guidance file path identified correctly!") print("🎉 Task 1 verification: PASS") sys.exit(0) else: print("❌ Task 1 verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/threestudio/output_analysis/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description ThreeStudio is a comprehensive codebase that implements various diffusion-based text-to-3D models, including NeRF-based rendering stage and diffusion guidance stage. 
Your task is to explore the codebase and identify the specific file that defines the guidance functionality for the Zero123 model. ### Task What is the output of `guidance_out`, returned by the code at line 137 in `threestudio/systems/zero123.py`? Clearly state the structure of it and where you find the answer (file and line numbers).Write your answer in a file named `answer.txt` in the test directory root. Do not add extra explanation or formatting beyond what is required by the task. ================================================ FILE: tasks/filesystem/standard/threestudio/output_analysis/meta.json ================================================ { "task_id": "output_analysis", "task_name": "Output Analysis", "category_id": "threestudio", "category_name": "Threestudio", "description": "Analyze the structure and components of guidance_out object returned by Zero123 guidance code at line 137 for understanding output format.", "author": "Lingjun Chen", "created_at": "2025-08-05", "difficulty": "L3", "tags": [ "code exploration", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "threestudio/\n ├── configs/\n │ ├── debugging/\n │ │ ├── controlnet-canny.yaml\n │ │ ├── controlnet-normal.yaml\n │ │ ├── instructpix2pix.yaml\n │ │ └── stablediffusion.yaml\n │ ├── experimental/\n │ │ ├── unified-guidance/\n │ │ │ ├── dreamfusion-sd.yaml\n │ │ │ ├── hifa.yaml\n │ │ │ ├── prolificdreamer-hifa.yaml\n │ │ │ ├── prolificdreamer.yaml\n │ │ │ └── zero123-simple.yaml\n │ │ ├── co3d-imagecondition.yaml\n │ │ ├── imagecondition.yaml\n │ │ ├── imagecondition_zero123nerf.yaml\n │ │ ├── imagecondition_zero123nerf_refine.yaml\n │ │ ├── prolificdreamer-importance.yaml\n │ │ ├── prolificdreamer-neus-importance.yaml\n │ │ ├── prolificdreamer-propnet.yaml\n │ │ └── textmesh-if-importance.yaml\n │ ├── gradio/\n │ │ ├── dreamfusion-if.yaml\n │ │ ├── dreamfusion-sd.yaml\n │ │ ├── fantasia3d.yaml\n │ │ ├── latentnerf.yaml\n │ │ ├── sjc.yaml\n │ │ └── textmesh-if.yaml\n │ ├── control4d-static.yaml\n │ ├── dreamfusion-if.yaml\n │ ├── dreamfusion-sd-eff.yaml\n │ ├── dreamfusion-sd.yaml\n │ ├── fantasia3d-texture.yaml\n │ ├── fantasia3d.yaml\n │ ├── hifa.yaml\n │ ├── instructnerf2nerf.yaml\n │ ├── latentnerf-refine.yaml\n │ ├── latentnerf.yaml\n │ ├── magic123-coarse-sd.yaml\n │ ├── magic123-hifa-coarse-sd.yaml\n │ ├── magic123-hifa-refine-sd.yaml\n │ ├── magic123-refine-sd.yaml\n │ ├── magic3d-coarse-if.yaml\n │ ├── magic3d-coarse-sd.yaml\n │ ├── magic3d-refine-sd.yaml\n │ ├── prolificdreamer-geometry.yaml\n │ ├── prolificdreamer-hifa.yaml\n │ ├── prolificdreamer-patch.yaml\n │ ├── prolificdreamer-scene-hifa.yaml\n │ ├── prolificdreamer-scene.yaml\n │ ├── prolificdreamer-texture.yaml\n │ ├── prolificdreamer.yaml\n │ ├── sdi.yaml\n │ ├── sjc.yaml\n │ ├── sketchshape-refine.yaml\n │ ├── sketchshape.yaml\n │ ├── stable-zero123.yaml\n │ ├── textmesh-if.yaml\n │ ├── zero123-geometry.yaml\n │ └── zero123.yaml\n ├── custom/\n │ └── put_custom_extensions_here\n ├── docker/\n │ ├── compose.yaml\n │ └── Dockerfile\n ├── docs/\n │ └── installation.md\n ├── extern/\n │ ├── ldm_zero123/\n │ │ ├── models/\n │ │ │ ├── diffusion/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── classifier.py\n │ │ │ │ ├── ddim.py\n │ │ │ │ ├── ddpm.py\n │ │ │ │ ├── plms.py\n │ │ │ │ └── sampling_util.py\n │ │ │ └── autoencoder.py\n │ │ ├── modules/\n │ │ │ ├── diffusionmodules/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── model.py\n │ │ │ │ ├── openaimodel.py\n │ │ │ │ └── util.py\n │ │ │ ├── distributions/\n │ │ │ │ 
├── __init__.py\n │ │ │ │ └── distributions.py\n │ │ │ ├── encoders/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── modules.py\n │ │ │ ├── evaluate/\n │ │ │ │ ├── adm_evaluator.py\n │ │ │ │ ├── evaluate_perceptualsim.py\n │ │ │ │ ├── frechet_video_distance.py\n │ │ │ │ ├── ssim.py\n │ │ │ │ └── torch_frechet_video_distance.py\n │ │ │ ├── image_degradation/\n │ │ │ │ ├── utils/\n │ │ │ │ │ └── test.png\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── bsrgan.py\n │ │ │ │ ├── bsrgan_light.py\n │ │ │ │ └── utils_image.py\n │ │ │ ├── losses/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── contperceptual.py\n │ │ │ │ └── vqperceptual.py\n │ │ │ ├── attention.py\n │ │ │ ├── ema.py\n │ │ │ └── x_transformer.py\n │ │ ├── thirdp/\n │ │ │ └── psp/\n │ │ │ ├── helpers.py\n │ │ │ ├── id_loss.py\n │ │ │ └── model_irse.py\n │ │ ├── __init__.py\n │ │ ├── extras.py\n │ │ ├── guidance.py\n │ │ ├── lr_scheduler.py\n │ │ └── util.py\n │ ├── __init__.py\n │ └── zero123.py\n ├── load/\n │ ├── images/\n │ │ ├── anya_front.png\n │ │ ├── anya_front_depth.png\n │ │ ├── anya_front_normal.png\n │ │ ├── anya_front_rgba.png\n │ │ ├── baby_phoenix_on_ice.png\n │ │ ├── baby_phoenix_on_ice_depth.png\n │ │ ├── baby_phoenix_on_ice_normal.png\n │ │ ├── baby_phoenix_on_ice_rgba.png\n │ │ ├── beach_house_1.png\n │ │ ├── beach_house_1_depth.png\n │ │ ├── beach_house_1_normal.png\n │ │ ├── beach_house_1_rgba.png\n │ │ ├── beach_house_2.png\n │ │ ├── beach_house_2_depth.png\n │ │ ├── beach_house_2_normal.png\n │ │ ├── beach_house_2_rgba.png\n │ │ ├── bollywood_actress.png\n │ │ ├── bollywood_actress_depth.png\n │ │ ├── bollywood_actress_normal.png\n │ │ ├── bollywood_actress_rgba.png\n │ │ ├── cactus.png\n │ │ ├── cactus_depth.png\n │ │ ├── cactus_normal.png\n │ │ ├── cactus_rgba.png\n │ │ ├── catstatue.png\n │ │ ├── catstatue_depth.png\n │ │ ├── catstatue_normal.png\n │ │ ├── catstatue_rgba.png\n │ │ ├── church_ruins.png\n │ │ ├── church_ruins_depth.png\n │ │ ├── church_ruins_normal.png\n │ │ ├── church_ruins_rgba.png\n │ │ ├── dog1_rgba.png\n │ │ ├── dragon2_rgba.png\n │ │ ├── firekeeper.jpg\n │ │ ├── firekeeper_depth.png\n │ │ ├── firekeeper_normal.png\n │ │ ├── firekeeper_rgba.png\n │ │ ├── futuristic_car.png\n │ │ ├── futuristic_car_depth.png\n │ │ ├── futuristic_car_normal.png\n │ │ ├── futuristic_car_rgba.png\n │ │ ├── grootplant_rgba.png\n │ │ ├── hamburger.png\n │ │ ├── hamburger_depth.png\n │ │ ├── hamburger_rgba.png\n │ │ ├── mona_lisa.png\n │ │ ├── mona_lisa_depth.png\n │ │ ├── mona_lisa_normal.png\n │ │ ├── mona_lisa_rgba.png\n │ │ ├── robot_rgba.png\n │ │ ├── teddy.png\n │ │ ├── teddy_depth.png\n │ │ ├── teddy_normal.png\n │ │ ├── teddy_rgba.png\n │ │ └── thorhammer_rgba.png\n │ ├── lights/\n │ │ ├── bsdf_256_256.bin\n │ │ ├── LICENSE.txt\n │ │ └── mud_road_puresky_1k.hdr\n │ ├── shapes/\n │ │ ├── animal.obj\n │ │ ├── blub.obj\n │ │ ├── cabin.obj\n │ │ ├── env_sphere.obj\n │ │ ├── hand_prismatic.obj\n │ │ ├── human.obj\n │ │ ├── nascar.obj\n │ │ ├── potion.obj\n │ │ ├── README.md\n │ │ └── teddy.obj\n │ ├── tets/\n │ │ ├── 128_tets.npz\n │ │ ├── 32_tets.npz\n │ │ ├── 64_tets.npz\n │ │ └── generate_tets.py\n │ ├── zero123/\n │ │ ├── download.sh\n │ │ └── sd-objaverse-finetune-c_concat-256.yaml\n │ ├── make_prompt_library.py\n │ └── prompt_library.json\n ├── scripts/\n │ └── convert_zero123_to_diffusers.py\n ├── threestudio/\n │ ├── data/\n │ │ ├── __init__.py\n │ │ ├── co3d.py\n │ │ ├── image.py\n │ │ ├── multiview.py\n │ │ ├── uncond.py\n │ │ └── uncond_eff.py\n │ ├── models/\n │ │ ├── background/\n │ │ │ ├── __init__.py\n │ │ │ ├── 
base.py\n │ │ │ ├── neural_environment_map_background.py\n │ │ │ ├── solid_color_background.py\n │ │ │ └── textured_background.py\n │ │ ├── exporters/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ └── mesh_exporter.py\n │ │ ├── geometry/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── custom_mesh.py\n │ │ │ ├── implicit_sdf.py\n │ │ │ ├── implicit_volume.py\n │ │ │ ├── tetrahedra_sdf_grid.py\n │ │ │ └── volume_grid.py\n │ │ ├── guidance/\n │ │ │ ├── __init__.py\n │ │ │ ├── controlnet_guidance.py\n │ │ │ ├── deep_floyd_guidance.py\n │ │ │ ├── instructpix2pix_guidance.py\n │ │ │ ├── stable_diffusion_guidance.py\n │ │ │ ├── stable_diffusion_sdi_guidance.py\n │ │ │ ├── stable_diffusion_unified_guidance.py\n │ │ │ ├── stable_diffusion_vsd_guidance.py\n │ │ │ ├── stable_zero123_guidance.py\n │ │ │ ├── zero123_guidance.py\n │ │ │ └── zero123_unified_guidance.py\n │ │ ├── materials/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── diffuse_with_point_light_material.py\n │ │ │ ├── hybrid_rgb_latent_material.py\n │ │ │ ├── neural_radiance_material.py\n │ │ │ ├── no_material.py\n │ │ │ ├── pbr_material.py\n │ │ │ └── sd_latent_adapter_material.py\n │ │ ├── prompt_processors/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deepfloyd_prompt_processor.py\n │ │ │ ├── dummy_prompt_processor.py\n │ │ │ └── stable_diffusion_prompt_processor.py\n │ │ ├── renderers/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deferred_volume_renderer.py\n │ │ │ ├── gan_volume_renderer.py\n │ │ │ ├── nerf_volume_renderer.py\n │ │ │ ├── neus_volume_renderer.py\n │ │ │ ├── nvdiff_rasterizer.py\n │ │ │ └── patch_renderer.py\n │ │ ├── __init__.py\n │ │ ├── estimators.py\n │ │ ├── isosurface.py\n │ │ ├── mesh.py\n │ │ └── networks.py\n │ ├── scripts/\n │ │ ├── make_training_vid.py\n │ │ ├── run_zero123.sh\n │ │ ├── run_zero123_comparison.sh\n │ │ ├── run_zero123_phase.sh\n │ │ ├── run_zero123_phase2.sh\n │ │ ├── run_zero123_sbatch.py\n │ │ ├── zero123_demo.py\n │ │ └── zero123_sbatch.sh\n │ ├── systems/\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── control4d_multiview.py\n │ │ ├── dreamfusion.py\n │ │ ├── eff_dreamfusion.py\n │ │ ├── fantasia3d.py\n │ │ ├── imagedreamfusion.py\n │ │ ├── instructnerf2nerf.py\n │ │ ├── latentnerf.py\n │ │ ├── magic123.py\n │ │ ├── magic3d.py\n │ │ ├── optimizers.py\n │ │ ├── prolificdreamer.py\n │ │ ├── sdi.py\n │ │ ├── sjc.py\n │ │ ├── textmesh.py\n │ │ ├── utils.py\n │ │ ├── zero123.py\n │ │ └── zero123_simple.py\n │ ├── utils/\n │ │ ├── GAN/\n │ │ │ ├── __init__.py\n │ │ │ ├── attention.py\n │ │ │ ├── discriminator.py\n │ │ │ ├── distribution.py\n │ │ │ ├── loss.py\n │ │ │ ├── mobilenet.py\n │ │ │ ├── network_util.py\n │ │ │ ├── util.py\n │ │ │ └── vae.py\n │ │ ├── perceptual/\n │ │ │ ├── __init__.py\n │ │ │ ├── perceptual.py\n │ │ │ └── utils.py\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── callbacks.py\n │ │ ├── config.py\n │ │ ├── loss.py\n │ │ ├── misc.py\n │ │ ├── ops.py\n │ │ ├── rasterize.py\n │ │ ├── saving.py\n │ │ └── typing.py\n │ └── __init__.py\n ├── .editorconfig\n ├── .pre-commit-config.yaml\n ├── .pylintrc\n ├── 2dplayground.ipynb\n ├── 2dplayground_SDI_version.ipynb\n ├── CHANGELOG.md\n ├── DOCUMENTATION.md\n ├── gradio_app.py\n ├── launch.py\n ├── LICENSE\n ├── README.md\n ├── requirements-dev.txt\n ├── requirements.txt\n ├── setup.py\n └── threestudio.ipynb", "stateUrl": "https://storage.mcpmark.ai/filesystem/threestudio.zip", "stateOriginalUrl": "https://github.com/threestudio-project/threestudio" } } 
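A minimal sketch of how the guidance_out question above can be approached before reading the verification script that follows: walk the unpacked threestudio snapshot and report every Python line that mentions `guidance_out`, with its file and line number. The `FILESYSTEM_TEST_DIR` environment variable is the same one the verify scripts read; the `grep_tree` helper name and the assumption that the snapshot is already extracted into that directory are illustrative, not part of the benchmark.

```python
#!/usr/bin/env python3
"""Illustrative sketch: locate `guidance_out` references in the extracted snapshot."""
import os
from pathlib import Path


def grep_tree(root: Path, needle: str, suffix: str = ".py"):
    """Yield (relative path, 1-based line number, stripped line) for each match."""
    for path in sorted(root.rglob(f"*{suffix}")):
        try:
            lines = path.read_text(errors="ignore").splitlines()
        except OSError:
            continue
        for number, line in enumerate(lines, start=1):
            if needle in line:
                yield path.relative_to(root), number, line.strip()


if __name__ == "__main__":
    # Assumes the threestudio snapshot has been unpacked into the test directory.
    root = Path(os.environ.get("FILESYSTEM_TEST_DIR", "."))
    for rel_path, number, line in grep_tree(root, "guidance_out"):
        print(f"{rel_path}:{number}: {line}")
```

Hits in `threestudio/systems/zero123.py` should mark the call site, while hits in `threestudio/models/guidance/zero123_guidance.py` should mark where the returned dictionary is assembled, which is the file path the verifier below checks for.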
================================================ FILE: tasks/filesystem/standard/threestudio/output_analysis/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for ThreeStudio Task 2: Analyze Zero123 Guidance Output Structure """ import sys from pathlib import Path import re import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_answer_file_exists(test_dir: Path) -> bool: """Verify that the answer.txt file exists.""" answer_file = test_dir / "answer.txt" if not answer_file.exists(): print("❌ File 'answer.txt' not found") return False print("✅ Answer file found") return True def verify_required_strings(test_dir: Path) -> bool: """Verify that the answer contains the four required strings.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text() # Check for required strings required_strings = ["loss_sds", "grad_norm", "min_step", "max_step"] missing_strings = [] for string in required_strings: if string not in content: missing_strings.append(string) if missing_strings: print(f"❌ Missing required strings: {missing_strings}") return False print("✅ All required strings found") return True except Exception as e: print(f"❌ Error reading answer file: {e}") return False def verify_line_numbers(test_dir: Path) -> bool: """Verify that line numbers contain (323 or 324) AND (327 or 328).""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text() # Check for first number (323 or 324) has_first = "323" in content or "324" in content # Check for second number (327 or 328) has_second = "327" in content or "328" in content if not has_first: print("❌ Missing first line number (323 or 324)") return False if not has_second: print("❌ Missing second line number (327 or 328)") return False print("✅ Line numbers found: contains (323 or 324) and (327 or 328)") return True except Exception as e: print(f"❌ Error verifying line numbers: {e}") return False def verify_file_path(test_dir: Path) -> bool: """Verify that the file path contains the exact expected path string.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text() # Check for the exact expected file path expected_path = "threestudio/models/guidance/zero123_guidance.py" if expected_path not in content: print(f"❌ Missing expected file path: {expected_path}") return False print("✅ File path found: threestudio/models/guidance/zero123_guidance.py") return True except Exception as e: print(f"❌ Error verifying file path: {e}") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying ThreeStudio Task 2: Analyze Zero123 Guidance Output Structure...") # Define verification steps verification_steps = [ ("Answer File Exists", verify_answer_file_exists), ("Required Strings", verify_required_strings), ("Line Numbers Range", verify_line_numbers), ("File Path Components", verify_file_path), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Zero123 guidance output structure analyzed correctly!") print("🎉 Task 2 verification: PASS") sys.exit(0) else: print("❌ Task 2 verification: FAIL") 
sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/threestudio/requirements_completion/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description The `requirements.txt` file in the ThreeStudio project is used to install necessary Python libraries. However, the Zero123-related dependencies were accidentally deleted from the file. Your task is to restore these missing dependencies. ### Task Objectives 1. **Locate the requirements.txt file** in the test environment 2. **Identify the missing Zero123 dependencies** that need to be restored 3. **Add the required dependencies** to the requirements.txt file 4. **Ensure the file format is correct** (one dependency per line) ================================================ FILE: tasks/filesystem/standard/threestudio/requirements_completion/meta.json ================================================ { "task_id": "requirements_completion", "task_name": "Requirements Completion", "category_id": "threestudio", "category_name": "Threestudio", "description": "Restore and complete missing Zero123-related dependencies in the requirements.txt file to ensure proper ThreeStudio project configuration.", "author": "Lingjun Chen", "created_at": "2025-08-05", "difficulty": "L3", "tags": [ "code exploration", "cross-referencing" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "threestudio/\n ├── configs/\n │ ├── debugging/\n │ │ ├── controlnet-canny.yaml\n │ │ ├── controlnet-normal.yaml\n │ │ ├── instructpix2pix.yaml\n │ │ └── stablediffusion.yaml\n │ ├── experimental/\n │ │ ├── unified-guidance/\n │ │ │ ├── dreamfusion-sd.yaml\n │ │ │ ├── hifa.yaml\n │ │ │ ├── prolificdreamer-hifa.yaml\n │ │ │ ├── prolificdreamer.yaml\n │ │ │ └── zero123-simple.yaml\n │ │ ├── co3d-imagecondition.yaml\n │ │ ├── imagecondition.yaml\n │ │ ├── imagecondition_zero123nerf.yaml\n │ │ ├── imagecondition_zero123nerf_refine.yaml\n │ │ ├── prolificdreamer-importance.yaml\n │ │ ├── prolificdreamer-neus-importance.yaml\n │ │ ├── prolificdreamer-propnet.yaml\n │ │ └── textmesh-if-importance.yaml\n │ ├── gradio/\n │ │ ├── dreamfusion-if.yaml\n │ │ ├── dreamfusion-sd.yaml\n │ │ ├── fantasia3d.yaml\n │ │ ├── latentnerf.yaml\n │ │ ├── sjc.yaml\n │ │ └── textmesh-if.yaml\n │ ├── control4d-static.yaml\n │ ├── dreamfusion-if.yaml\n │ ├── dreamfusion-sd-eff.yaml\n │ ├── dreamfusion-sd.yaml\n │ ├── fantasia3d-texture.yaml\n │ ├── fantasia3d.yaml\n │ ├── hifa.yaml\n │ ├── instructnerf2nerf.yaml\n │ ├── latentnerf-refine.yaml\n │ ├── latentnerf.yaml\n │ ├── magic123-coarse-sd.yaml\n │ ├── magic123-hifa-coarse-sd.yaml\n │ ├── magic123-hifa-refine-sd.yaml\n │ ├── magic123-refine-sd.yaml\n │ ├── magic3d-coarse-if.yaml\n │ ├── magic3d-coarse-sd.yaml\n │ ├── magic3d-refine-sd.yaml\n │ ├── prolificdreamer-geometry.yaml\n │ ├── prolificdreamer-hifa.yaml\n │ ├── prolificdreamer-patch.yaml\n │ ├── prolificdreamer-scene-hifa.yaml\n │ ├── prolificdreamer-scene.yaml\n │ ├── prolificdreamer-texture.yaml\n │ ├── prolificdreamer.yaml\n │ ├── sdi.yaml\n │ ├── sjc.yaml\n │ ├── sketchshape-refine.yaml\n │ ├── sketchshape.yaml\n │ ├── stable-zero123.yaml\n │ ├── textmesh-if.yaml\n │ ├── zero123-geometry.yaml\n │ └── zero123.yaml\n ├── custom/\n │ └── put_custom_extensions_here\n ├── docker/\n │ ├── compose.yaml\n │ └── Dockerfile\n ├── docs/\n │ └── installation.md\n ├── extern/\n │ ├── ldm_zero123/\n │ │ ├── models/\n │ │ │ ├── 
diffusion/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── classifier.py\n │ │ │ │ ├── ddim.py\n │ │ │ │ ├── ddpm.py\n │ │ │ │ ├── plms.py\n │ │ │ │ └── sampling_util.py\n │ │ │ └── autoencoder.py\n │ │ ├── modules/\n │ │ │ ├── diffusionmodules/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── model.py\n │ │ │ │ ├── openaimodel.py\n │ │ │ │ └── util.py\n │ │ │ ├── distributions/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── distributions.py\n │ │ │ ├── encoders/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── modules.py\n │ │ │ ├── evaluate/\n │ │ │ │ ├── adm_evaluator.py\n │ │ │ │ ├── evaluate_perceptualsim.py\n │ │ │ │ ├── frechet_video_distance.py\n │ │ │ │ ├── ssim.py\n │ │ │ │ └── torch_frechet_video_distance.py\n │ │ │ ├── image_degradation/\n │ │ │ │ ├── utils/\n │ │ │ │ │ └── test.png\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── bsrgan.py\n │ │ │ │ ├── bsrgan_light.py\n │ │ │ │ └── utils_image.py\n │ │ │ ├── losses/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── contperceptual.py\n │ │ │ │ └── vqperceptual.py\n │ │ │ ├── attention.py\n │ │ │ ├── ema.py\n │ │ │ └── x_transformer.py\n │ │ ├── thirdp/\n │ │ │ └── psp/\n │ │ │ ├── helpers.py\n │ │ │ ├── id_loss.py\n │ │ │ └── model_irse.py\n │ │ ├── __init__.py\n │ │ ├── extras.py\n │ │ ├── guidance.py\n │ │ ├── lr_scheduler.py\n │ │ └── util.py\n │ ├── __init__.py\n │ └── zero123.py\n ├── load/\n │ ├── images/\n │ │ ├── anya_front.png\n │ │ ├── anya_front_depth.png\n │ │ ├── anya_front_normal.png\n │ │ ├── anya_front_rgba.png\n │ │ ├── baby_phoenix_on_ice.png\n │ │ ├── baby_phoenix_on_ice_depth.png\n │ │ ├── baby_phoenix_on_ice_normal.png\n │ │ ├── baby_phoenix_on_ice_rgba.png\n │ │ ├── beach_house_1.png\n │ │ ├── beach_house_1_depth.png\n │ │ ├── beach_house_1_normal.png\n │ │ ├── beach_house_1_rgba.png\n │ │ ├── beach_house_2.png\n │ │ ├── beach_house_2_depth.png\n │ │ ├── beach_house_2_normal.png\n │ │ ├── beach_house_2_rgba.png\n │ │ ├── bollywood_actress.png\n │ │ ├── bollywood_actress_depth.png\n │ │ ├── bollywood_actress_normal.png\n │ │ ├── bollywood_actress_rgba.png\n │ │ ├── cactus.png\n │ │ ├── cactus_depth.png\n │ │ ├── cactus_normal.png\n │ │ ├── cactus_rgba.png\n │ │ ├── catstatue.png\n │ │ ├── catstatue_depth.png\n │ │ ├── catstatue_normal.png\n │ │ ├── catstatue_rgba.png\n │ │ ├── church_ruins.png\n │ │ ├── church_ruins_depth.png\n │ │ ├── church_ruins_normal.png\n │ │ ├── church_ruins_rgba.png\n │ │ ├── dog1_rgba.png\n │ │ ├── dragon2_rgba.png\n │ │ ├── firekeeper.jpg\n │ │ ├── firekeeper_depth.png\n │ │ ├── firekeeper_normal.png\n │ │ ├── firekeeper_rgba.png\n │ │ ├── futuristic_car.png\n │ │ ├── futuristic_car_depth.png\n │ │ ├── futuristic_car_normal.png\n │ │ ├── futuristic_car_rgba.png\n │ │ ├── grootplant_rgba.png\n │ │ ├── hamburger.png\n │ │ ├── hamburger_depth.png\n │ │ ├── hamburger_rgba.png\n │ │ ├── mona_lisa.png\n │ │ ├── mona_lisa_depth.png\n │ │ ├── mona_lisa_normal.png\n │ │ ├── mona_lisa_rgba.png\n │ │ ├── robot_rgba.png\n │ │ ├── teddy.png\n │ │ ├── teddy_depth.png\n │ │ ├── teddy_normal.png\n │ │ ├── teddy_rgba.png\n │ │ └── thorhammer_rgba.png\n │ ├── lights/\n │ │ ├── bsdf_256_256.bin\n │ │ ├── LICENSE.txt\n │ │ └── mud_road_puresky_1k.hdr\n │ ├── shapes/\n │ │ ├── animal.obj\n │ │ ├── blub.obj\n │ │ ├── cabin.obj\n │ │ ├── env_sphere.obj\n │ │ ├── hand_prismatic.obj\n │ │ ├── human.obj\n │ │ ├── nascar.obj\n │ │ ├── potion.obj\n │ │ ├── README.md\n │ │ └── teddy.obj\n │ ├── tets/\n │ │ ├── 128_tets.npz\n │ │ ├── 32_tets.npz\n │ │ ├── 64_tets.npz\n │ │ └── generate_tets.py\n │ ├── zero123/\n │ │ ├── download.sh\n │ │ └── 
sd-objaverse-finetune-c_concat-256.yaml\n │ ├── make_prompt_library.py\n │ └── prompt_library.json\n ├── scripts/\n │ └── convert_zero123_to_diffusers.py\n ├── threestudio/\n │ ├── data/\n │ │ ├── __init__.py\n │ │ ├── co3d.py\n │ │ ├── image.py\n │ │ ├── multiview.py\n │ │ ├── uncond.py\n │ │ └── uncond_eff.py\n │ ├── models/\n │ │ ├── background/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── neural_environment_map_background.py\n │ │ │ ├── solid_color_background.py\n │ │ │ └── textured_background.py\n │ │ ├── exporters/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ └── mesh_exporter.py\n │ │ ├── geometry/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── custom_mesh.py\n │ │ │ ├── implicit_sdf.py\n │ │ │ ├── implicit_volume.py\n │ │ │ ├── tetrahedra_sdf_grid.py\n │ │ │ └── volume_grid.py\n │ │ ├── guidance/\n │ │ │ ├── __init__.py\n │ │ │ ├── controlnet_guidance.py\n │ │ │ ├── deep_floyd_guidance.py\n │ │ │ ├── instructpix2pix_guidance.py\n │ │ │ ├── stable_diffusion_guidance.py\n │ │ │ ├── stable_diffusion_sdi_guidance.py\n │ │ │ ├── stable_diffusion_unified_guidance.py\n │ │ │ ├── stable_diffusion_vsd_guidance.py\n │ │ │ ├── stable_zero123_guidance.py\n │ │ │ ├── zero123_guidance.py\n │ │ │ └── zero123_unified_guidance.py\n │ │ ├── materials/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── diffuse_with_point_light_material.py\n │ │ │ ├── hybrid_rgb_latent_material.py\n │ │ │ ├── neural_radiance_material.py\n │ │ │ ├── no_material.py\n │ │ │ ├── pbr_material.py\n │ │ │ └── sd_latent_adapter_material.py\n │ │ ├── prompt_processors/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deepfloyd_prompt_processor.py\n │ │ │ ├── dummy_prompt_processor.py\n │ │ │ └── stable_diffusion_prompt_processor.py\n │ │ ├── renderers/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deferred_volume_renderer.py\n │ │ │ ├── gan_volume_renderer.py\n │ │ │ ├── nerf_volume_renderer.py\n │ │ │ ├── neus_volume_renderer.py\n │ │ │ ├── nvdiff_rasterizer.py\n │ │ │ └── patch_renderer.py\n │ │ ├── __init__.py\n │ │ ├── estimators.py\n │ │ ├── isosurface.py\n │ │ ├── mesh.py\n │ │ └── networks.py\n │ ├── scripts/\n │ │ ├── make_training_vid.py\n │ │ ├── run_zero123.sh\n │ │ ├── run_zero123_comparison.sh\n │ │ ├── run_zero123_phase.sh\n │ │ ├── run_zero123_phase2.sh\n │ │ ├── run_zero123_sbatch.py\n │ │ ├── zero123_demo.py\n │ │ └── zero123_sbatch.sh\n │ ├── systems/\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── control4d_multiview.py\n │ │ ├── dreamfusion.py\n │ │ ├── eff_dreamfusion.py\n │ │ ├── fantasia3d.py\n │ │ ├── imagedreamfusion.py\n │ │ ├── instructnerf2nerf.py\n │ │ ├── latentnerf.py\n │ │ ├── magic123.py\n │ │ ├── magic3d.py\n │ │ ├── optimizers.py\n │ │ ├── prolificdreamer.py\n │ │ ├── sdi.py\n │ │ ├── sjc.py\n │ │ ├── textmesh.py\n │ │ ├── utils.py\n │ │ ├── zero123.py\n │ │ └── zero123_simple.py\n │ ├── utils/\n │ │ ├── GAN/\n │ │ │ ├── __init__.py\n │ │ │ ├── attention.py\n │ │ │ ├── discriminator.py\n │ │ │ ├── distribution.py\n │ │ │ ├── loss.py\n │ │ │ ├── mobilenet.py\n │ │ │ ├── network_util.py\n │ │ │ ├── util.py\n │ │ │ └── vae.py\n │ │ ├── perceptual/\n │ │ │ ├── __init__.py\n │ │ │ ├── perceptual.py\n │ │ │ └── utils.py\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── callbacks.py\n │ │ ├── config.py\n │ │ ├── loss.py\n │ │ ├── misc.py\n │ │ ├── ops.py\n │ │ ├── rasterize.py\n │ │ ├── saving.py\n │ │ └── typing.py\n │ └── __init__.py\n ├── .editorconfig\n ├── .pre-commit-config.yaml\n ├── .pylintrc\n ├── 2dplayground.ipynb\n ├── 
2dplayground_SDI_version.ipynb\n ├── CHANGELOG.md\n ├── DOCUMENTATION.md\n ├── gradio_app.py\n ├── launch.py\n ├── LICENSE\n ├── README.md\n ├── requirements-dev.txt\n ├── requirements.txt\n ├── setup.py\n └── threestudio.ipynb", "stateUrl": "https://storage.mcpmark.ai/filesystem/threestudio.zip", "stateOriginalUrl": "https://github.com/threestudio-project/threestudio" } } ================================================ FILE: tasks/filesystem/standard/threestudio/requirements_completion/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for ThreeStudio Task 3: Restore Zero123 Dependencies in Requirements.txt """ import sys from pathlib import Path import re import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_requirements_file_exists(test_dir: Path) -> bool: """Verify that the requirements.txt file exists.""" requirements_file = test_dir / "requirements.txt" if not requirements_file.exists(): print("❌ File 'requirements.txt' not found") return False print("✅ Requirements.txt file found") return True def verify_requirements_file_readable(test_dir: Path) -> bool: """Verify that the requirements.txt file is readable.""" requirements_file = test_dir / "requirements.txt" try: content = requirements_file.read_text() if not content.strip(): print("❌ Requirements.txt file is empty") return False print("✅ Requirements.txt file is readable") return True except Exception as e: print(f"❌ Error reading requirements.txt file: {e}") return False def verify_required_dependencies_present(test_dir: Path) -> bool: """Verify that all required Zero123 dependencies are present.""" requirements_file = test_dir / "requirements.txt" try: content = requirements_file.read_text() # Required dependencies to check for (simplified) required_deps = [ "einops", "kornia", "taming", "openai", "clip" ] missing_deps = [] found_deps = [] for dep in required_deps: if dep.lower() in content.lower(): found_deps.append(dep) else: missing_deps.append(dep) if missing_deps: print(f"❌ Missing required dependencies: {missing_deps}") return False print(f"✅ All required dependencies found: {found_deps}") return True except Exception as e: print(f"❌ Error checking dependencies: {e}") return False def verify_specific_dependency_entries(test_dir: Path) -> bool: """Verify that the specific dependency entries are present.""" requirements_file = test_dir / "requirements.txt" try: content = requirements_file.read_text() # Check for specific dependency entries (simplified) # For taming, we only need to check if "taming" is present, not the full package name required_checks = [ ("einops", "einops"), ("kornia", "kornia"), ("taming", "taming"), # Just check for "taming" substring ] missing_entries = [] found_entries = [] for check_name, full_entry in required_checks: if check_name in content.lower(): found_entries.append(check_name) else: missing_entries.append(check_name) # Special check for openai and clip - they should be on the same line lines = content.split('\n') openai_clip_found = False for line in lines: line_lower = line.lower() if "openai" in line_lower and "clip" in line_lower: openai_clip_found = True break if openai_clip_found: found_entries.append("openai+clip") else: missing_entries.append("openai+clip") if missing_entries: print(f"❌ Missing required 
dependency checks: {missing_entries}") return False print(f"✅ All required dependency checks passed: {found_entries}") return True except Exception as e: print(f"❌ Error checking specific entries: {e}") return False def verify_file_format(test_dir: Path) -> bool: """Verify that the requirements.txt file has proper format.""" requirements_file = test_dir / "requirements.txt" try: content = requirements_file.read_text() lines = content.split('\n') # Basic format check - just ensure file is not completely empty if not content.strip(): print("❌ File is completely empty") return False print("✅ File format is acceptable") return True except Exception as e: print(f"❌ Error checking file format: {e}") return False def verify_no_duplicate_entries(test_dir: Path) -> bool: """Verify that there are no duplicate dependency entries.""" requirements_file = test_dir / "requirements.txt" try: content = requirements_file.read_text() # Simplified duplicate check - just ensure the file is not completely corrupted if len(content) < 10: # Basic sanity check print("❌ File seems too short to be valid") return False print("✅ File appears to be valid") return True except Exception as e: print(f"❌ Error checking file: {e}") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying ThreeStudio Task 3: Restore Zero123 Dependencies in Requirements.txt...") # Define verification steps verification_steps = [ ("Requirements File Exists", verify_requirements_file_exists), ("File is Readable", verify_requirements_file_readable), ("Required Dependencies Present", verify_required_dependencies_present), ("Specific Entries Present", verify_specific_dependency_entries), ("File Format", verify_file_format), ("File Validity", verify_no_duplicate_entries), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Zero123 dependencies successfully restored in requirements.txt!") print("🎉 Task 3 verification: PASS") sys.exit(0) else: print("❌ Task 3 verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/votenet/dataset_comparison/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description Analyze the codebase to map ScanNet object categories to SUN RGB-D categories and calculate object counts. ### Task Objectives 1. **Primary Goal**: Use SUN RGB-D's 10-category classification system as the target taxonomy 2. **Mapping Requirement**: Map each ScanNet object category (using the "category" field, not "raw_category") to the corresponding SUN RGB-D category 3. **Calculation**: For each SUN RGB-D category, calculate the total count of objects from ScanNet that map to that category (It only counts if the category (not raw category) name are exactly the same(night_stand = nightstand)) 4. 
**Output**: Generate an analysis.txt file in the main directory showing the mapping and counts ### Expected Output Create a file named `analysis.txt` in the test directory root with the following format: - Each SUN RGB-D category should be represented as a 2-line block - Line 1: category name - Line 2: total count - Each block should be separated by one empty line ================================================ FILE: tasks/filesystem/standard/votenet/dataset_comparison/meta.json ================================================ { "task_id": "dataset_comparison", "task_name": "Dataset Comparison", "category_id": "votenet", "category_name": "Votenet", "description": "Map ScanNet object categories to their SUN RGB-D equivalents and calculate detailed object counts for each mapped category.", "author": "Lingjun Chen", "created_at": "2025-08-13", "difficulty": "L3", "tags": [ "cross-referencing", "data extraction", "pattern analysis" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "votenet/\n ├── doc/\n │ ├── teaser.jpg\n │ └── tips.md\n ├── models/\n │ ├── ap_helper.py\n │ ├── backbone_module.py\n │ ├── boxnet.py\n │ ├── dump_helper.py\n │ ├── loss_helper.py\n │ ├── loss_helper_boxnet.py\n │ ├── proposal_module.py\n │ ├── votenet.py\n │ └── voting_module.py\n ├── pointnet2/\n │ ├── _ext_src/\n │ │ ├── include/\n │ │ │ ├── ball_query.h\n │ │ │ ├── cuda_utils.h\n │ │ │ ├── group_points.h\n │ │ │ ├── interpolate.h\n │ │ │ ├── sampling.h\n │ │ │ └── utils.h\n │ │ └── src/\n │ │ ├── ball_query.cpp\n │ │ ├── ball_query_gpu.cu\n │ │ ├── bindings.cpp\n │ │ ├── group_points.cpp\n │ │ ├── group_points_gpu.cu\n │ │ ├── interpolate.cpp\n │ │ ├── interpolate_gpu.cu\n │ │ ├── sampling.cpp\n │ │ └── sampling_gpu.cu\n │ ├── pointnet2_modules.py\n │ ├── pointnet2_test.py\n │ ├── pointnet2_utils.py\n │ ├── pytorch_utils.py\n │ └── setup.py\n ├── scannet/\n │ ├── meta_data/\n │ │ ├── scannet_means.npz\n │ │ ├── scannet_train.txt\n │ │ ├── scannetv2-labels.combined.tsv\n │ │ ├── scannetv2_test.txt\n │ │ ├── scannetv2_train.txt\n │ │ └── scannetv2_val.txt\n │ ├── scans/\n │ ├── batch_load_scannet_data.py\n │ ├── data_viz.py\n │ ├── load_scannet_data.py\n │ ├── model_util_scannet.py\n │ ├── README.md\n │ ├── scannet_detection_dataset.py\n │ └── scannet_utils.py\n ├── sunrgbd/\n │ ├── matlab/\n │ │ ├── extract_rgbd_data_v1.m\n │ │ ├── extract_rgbd_data_v2.m\n │ │ └── extract_split.m\n │ ├── OFFICIAL_SUNRGBD/\n │ ├── sunrgbd_trainval/\n │ ├── model_util_sunrgbd.py\n │ ├── README.md\n │ ├── sunrgbd_data.py\n │ ├── sunrgbd_detection_dataset.py\n │ └── sunrgbd_utils.py\n ├── utils/\n │ ├── box_util.py\n │ ├── eval_det.py\n │ ├── metric_util.py\n │ ├── nms.py\n │ ├── nn_distance.py\n │ ├── pc_util.py\n │ ├── tf_logger.py\n │ └── tf_visualizer.py\n ├── CODE_OF_CONDUCT.md\n ├── CONTRIBUTING.md\n ├── demo.py\n ├── eval.py\n ├── LICENSE\n ├── README.md\n └── train.py", "stateUrl": "https://storage.mcpmark.ai/filesystem/votenet.zip", "stateOriginalUrl": "https://github.com/facebookresearch/votenet" } } ================================================ FILE: tasks/filesystem/standard/votenet/dataset_comparison/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Votenet Dataset Comparison Task """ import sys from pathlib import Path import re import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise 
ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_analysis_file_exists(test_dir: Path) -> bool: """Verify that the analysis.txt file exists.""" analysis_file = test_dir / "analysis.txt" if not analysis_file.exists(): print("❌ File 'analysis.txt' not found") return False print("✅ Analysis file found") return True def verify_analysis_format(test_dir: Path) -> bool: """Verify that the analysis file has the correct format.""" analysis_file = test_dir / "analysis.txt" try: content = analysis_file.read_text() lines = content.split('\n') # Check if content is not empty if not content.strip(): print("❌ Analysis file is empty") return False # Check if we have enough lines for at least one category block if len(lines) < 2: print("❌ Analysis file doesn't have enough lines for a category block") return False # Check if the format follows the 2-line block pattern with empty lines between blocks # Each block should have: category_name, count, empty_line line_index = 0 block_count = 0 while line_index < len(lines): # Skip leading empty lines while line_index < len(lines) and lines[line_index].strip() == "": line_index += 1 if line_index >= len(lines): break # Check if we have at least 2 lines for a block if line_index + 1 >= len(lines): print("❌ Incomplete category block at the end") return False # Line 1 should be category name category_line = lines[line_index].strip() if not category_line: print(f"❌ Empty category name at line {line_index + 1}") return False # Line 2 should be count count_line = lines[line_index + 1].strip() if not count_line: print(f"❌ Empty count at line {line_index + 2}") return False # Check if count line contains a number if not re.search(r'\d+', count_line): print(f"❌ Count line doesn't contain a number at line {line_index + 2}: '{count_line}'") return False block_count += 1 line_index += 2 # Skip empty line between blocks (if not at the end) if line_index < len(lines) and lines[line_index].strip() == "": line_index += 1 if block_count == 0: print("❌ No valid category blocks found") return False print(f"✅ Analysis format is correct with {block_count} category blocks") return True except Exception as e: print(f"❌ Error reading analysis file: {e}") return False def verify_required_categories(test_dir: Path) -> bool: """Verify that all required SUN RGB-D categories are present.""" analysis_file = test_dir / "analysis.txt" try: content = analysis_file.read_text() lines = content.split('\n') # Extract category names from the file categories_found = [] line_index = 0 while line_index < len(lines): # Skip empty lines while line_index < len(lines) and lines[line_index].strip() == "": line_index += 1 if line_index >= len(lines): break # Get category name category_line = lines[line_index].strip() if category_line: categories_found.append(category_line.lower()) # Skip to next block line_index += 2 while line_index < len(lines) and lines[line_index].strip() == "": line_index += 1 # Required categories required_categories = { 'chair', 'table', 'bed', 'bookshelf', 'desk', 'toilet', 'dresser', 'bathtub', 'sofa', 'night_stand' } # Check if all required categories are present missing_categories = required_categories - set(categories_found) if missing_categories: print(f"❌ Missing required categories: {missing_categories}") return False # Check for extra categories extra_categories = set(categories_found) - required_categories if extra_categories: print(f"⚠️ Extra categories found: {extra_categories}") print(f"✅ All required categories present: 
{sorted(required_categories)}") return True except Exception as e: print(f"❌ Error verifying required categories: {e}") return False def verify_category_counts(test_dir: Path) -> bool: """Verify that the category counts match the expected values.""" analysis_file = test_dir / "analysis.txt" try: content = analysis_file.read_text() lines = content.split('\n') # Expected counts from answer.txt expected_counts = { 'chair': 4681, 'table': 1170, 'bed': 370, 'bookshelf': 377, 'desk': 680, 'toilet': 256, 'dresser': 213, 'bathtub': 144, 'sofa': 1, 'night_stand': 224 } # Extract category counts from the file category_counts = {} line_index = 0 while line_index < len(lines): # Skip empty lines while line_index < len(lines) and lines[line_index].strip() == "": line_index += 1 if line_index >= len(lines): break # Get category name category_line = lines[line_index].strip() if not category_line: line_index += 1 continue # Get count if line_index + 1 < len(lines): count_line = lines[line_index + 1].strip() if count_line: # Extract number from count line count_match = re.search(r'(\d+)', count_line) if count_match: category = category_line.lower() count = int(count_match.group(1)) category_counts[category] = count # Skip to next block line_index += 2 while line_index < len(lines) and lines[line_index].strip() == "": line_index += 1 # Verify counts match expected values all_counts_correct = True for category, expected_count in expected_counts.items(): if category in category_counts: actual_count = category_counts[category] if actual_count != expected_count: print(f"❌ Count mismatch for {category}: expected {expected_count}, got {actual_count}") all_counts_correct = False else: print(f"❌ Category {category} not found in analysis") all_counts_correct = False if all_counts_correct: print("✅ All category counts match expected values") return True else: return False except Exception as e: print(f"❌ Error verifying category counts: {e}") return False def verify_file_structure(test_dir: Path) -> bool: """Verify that the analysis.txt file is in the correct location.""" analysis_file = test_dir / "analysis.txt" if not analysis_file.exists(): print("❌ Analysis file not found in test directory root") return False # Check if it's directly in the test directory root, not in a subdirectory if analysis_file.parent != test_dir: print("❌ Analysis file should be in the test directory root") return False print("✅ Analysis file is in the correct location") return True def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying Votenet Dataset Comparison Task...") # Define verification steps verification_steps = [ ("Analysis File Exists", verify_analysis_file_exists), ("File Location", verify_file_structure), ("File Format", verify_analysis_format), ("Required Categories", verify_required_categories), ("Category Counts", verify_category_counts), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Votenet dataset comparison task completed correctly!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/votenet/debugging/description.md ================================================ Please use FileSystem tools to finish the following task: ### 
Task Description There is a bug in the VoteNet backbone module that needs to be identified and fixed. ### Task Objectives 1. **Examine the codebase** using filesystem MCP tools 2. **Identify the bug** inside the hole process 3. **Fix the bug** in the code 4. **Create an answer file** with the bug location ### Expected Output 1. **Fix the bug** in the code file directly 2. **Create `answer.txt`** in the test directory root with the format: `path` **Requirements:** - Only include the bug's file path in answer.txt - No additional text or explanation ### Hint **The bug is not in demo.py**, please look deeper inside the codebase. ================================================ FILE: tasks/filesystem/standard/votenet/debugging/meta.json ================================================ { "task_id": "debugging", "task_name": "Debugging", "category_id": "votenet", "category_name": "Votenet", "description": "Identify and fix bugs in the VoteNet backbone module by examining the codebase and implementing necessary corrections.", "author": "Lingjun Chen", "created_at": "2025-08-13", "difficulty": "L3", "tags": [ "code exploration" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "votenet/\n ├── doc/\n │ ├── teaser.jpg\n │ └── tips.md\n ├── models/\n │ ├── ap_helper.py\n │ ├── backbone_module.py\n │ ├── boxnet.py\n │ ├── dump_helper.py\n │ ├── loss_helper.py\n │ ├── loss_helper_boxnet.py\n │ ├── proposal_module.py\n │ ├── votenet.py\n │ └── voting_module.py\n ├── pointnet2/\n │ ├── _ext_src/\n │ │ ├── include/\n │ │ │ ├── ball_query.h\n │ │ │ ├── cuda_utils.h\n │ │ │ ├── group_points.h\n │ │ │ ├── interpolate.h\n │ │ │ ├── sampling.h\n │ │ │ └── utils.h\n │ │ └── src/\n │ │ ├── ball_query.cpp\n │ │ ├── ball_query_gpu.cu\n │ │ ├── bindings.cpp\n │ │ ├── group_points.cpp\n │ │ ├── group_points_gpu.cu\n │ │ ├── interpolate.cpp\n │ │ ├── interpolate_gpu.cu\n │ │ ├── sampling.cpp\n │ │ └── sampling_gpu.cu\n │ ├── pointnet2_modules.py\n │ ├── pointnet2_test.py\n │ ├── pointnet2_utils.py\n │ ├── pytorch_utils.py\n │ └── setup.py\n ├── scannet/\n │ ├── meta_data/\n │ │ ├── scannet_means.npz\n │ │ ├── scannet_train.txt\n │ │ ├── scannetv2-labels.combined.tsv\n │ │ ├── scannetv2_test.txt\n │ │ ├── scannetv2_train.txt\n │ │ └── scannetv2_val.txt\n │ ├── scans/\n │ ├── batch_load_scannet_data.py\n │ ├── data_viz.py\n │ ├── load_scannet_data.py\n │ ├── model_util_scannet.py\n │ ├── README.md\n │ ├── scannet_detection_dataset.py\n │ └── scannet_utils.py\n ├── sunrgbd/\n │ ├── matlab/\n │ │ ├── extract_rgbd_data_v1.m\n │ │ ├── extract_rgbd_data_v2.m\n │ │ └── extract_split.m\n │ ├── OFFICIAL_SUNRGBD/\n │ ├── sunrgbd_trainval/\n │ ├── model_util_sunrgbd.py\n │ ├── README.md\n │ ├── sunrgbd_data.py\n │ ├── sunrgbd_detection_dataset.py\n │ └── sunrgbd_utils.py\n ├── utils/\n │ ├── box_util.py\n │ ├── eval_det.py\n │ ├── metric_util.py\n │ ├── nms.py\n │ ├── nn_distance.py\n │ ├── pc_util.py\n │ ├── tf_logger.py\n │ └── tf_visualizer.py\n ├── CODE_OF_CONDUCT.md\n ├── CONTRIBUTING.md\n ├── demo.py\n ├── eval.py\n ├── LICENSE\n ├── README.md\n └── train.py", "stateUrl": "https://storage.mcpmark.ai/filesystem/votenet.zip", "stateOriginalUrl": "https://github.com/facebookresearch/votenet" } } ================================================ FILE: tasks/filesystem/standard/votenet/debugging/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for VoteNet Task: Debug Backbone Module """ import sys from pathlib import Path import re import os 
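# Context for the checks below (derived from the accepted fixes later in this script):
# the bug lives in models/backbone_module.py, where `self.fp2 = PointnetFPModule(mlp=[256,256,256])`
# declares too few input channels for the 256+256 concatenated skip features it receives, so the
# verifier accepts either `mlp=[256+256,256,256]` or the equivalent `mlp=[512,256,256]` as the fix.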
def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_answer_file_exists(test_dir: Path) -> bool: """Verify that the answer.txt file exists.""" answer_file = test_dir / "answer.txt" if not answer_file.exists(): print("❌ File 'answer.txt' not found") return False print("✅ Answer file found") return True def verify_answer_format(test_dir: Path) -> bool: """Verify that the answer file has the correct format.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() # Check if content is not empty if not content: print("❌ Answer file is empty") return False # Check if it contains only one line (no additional text) if len(content.split('\n')) > 1: print("❌ Answer file contains multiple lines or additional text") return False # Check if path contains the expected components if 'models/backbone_module.py' not in content: print("❌ Answer should contain 'models/backbone_module.py'") return False print("✅ Answer format is correct") return True except Exception as e: print(f"❌ Error reading answer file: {e}") return False def verify_file_path_structure(test_dir: Path) -> bool: """Verify that the file path has the expected structure.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() # Expected path components for backbone module expected_components = ["models", "backbone_module.py"] # Check if all expected components are in the content for component in expected_components: if component not in content: print(f"❌ Answer missing expected component: {component}") return False print("✅ Answer contains expected components") return True except Exception as e: print(f"❌ Error verifying answer structure: {e}") return False def verify_file_exists(test_dir: Path) -> bool: """Verify that the identified file actually exists.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() # Try the expected path file_path = test_dir / "models/backbone_module.py" if not file_path.exists(): print(f"❌ Expected file does not exist: models/backbone_module.py") return False print("✅ Expected file exists") return True except Exception as e: print(f"❌ Error verifying file existence: {e}") return False def verify_bug_fix(test_dir: Path) -> bool: """Verify that the bug has been fixed in the code.""" answer_file = test_dir / "answer.txt" try: content = answer_file.read_text().strip() file_path = test_dir / "models/backbone_module.py" if not file_path.exists(): print(f"❌ Cannot find file for bug fix verification: models/backbone_module.py") return False # Read the file and search for the specific line containing self.fp2 = PointnetFPModule file_content = file_path.read_text() lines = file_content.split('\n') # Find the line containing self.fp2 = PointnetFPModule target_line = None target_line_number = None for i, line in enumerate(lines): if "self.fp2 = PointnetFPModule" in line: target_line = line.strip() target_line_number = i + 1 # Convert to 1-based line number break if target_line is None: print("❌ Could not find line containing 'self.fp2 = PointnetFPModule'") return False # Check if the original buggy line still exists original_bug = "self.fp2 = PointnetFPModule(mlp=[256,256,256])" if original_bug in target_line: print(f"❌ Bug has not been fixed - original line still exists at line {target_line_number}") print(f" 
Line {target_line_number} content: {target_line}") return False # Check for the correct fix correct_fixes = [ "self.fp2 = PointnetFPModule(mlp=[256+256,256,256])", "self.fp2 = PointnetFPModule(mlp=[512,256,256])" ] fix_found = False for fix in correct_fixes: if fix in target_line: fix_found = True break if not fix_found: print(f"❌ Bug fix not found at line {target_line_number}") print(f" Line {target_line_number} content: {target_line}") print(" Expected one of:") for fix in correct_fixes: print(f" - {fix}") return False print(f"✅ Bug has been fixed correctly at line {target_line_number}") return True except Exception as e: print(f"❌ Error verifying bug fix: {e}") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying VoteNet Task: Debug Backbone Module...") # Define verification steps verification_steps = [ ("Answer File Exists", verify_answer_file_exists), ("Answer Format", verify_answer_format), ("Answer Structure", verify_file_path_structure), ("File Exists", verify_file_exists), ("Bug Fix Applied", verify_bug_fix), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- {step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ VoteNet backbone module bug has been correctly identified and fixed!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/filesystem/standard/votenet/requirements_writing/description.md ================================================ Please use FileSystem tools to finish the following task: ### Task Description The VoteNet project is a 3D object detection framework for point clouds. Your task is to create a `requirements.txt` file that lists all the necessary Python dependencies for running this codebase. ### Task Objectives 1. **Create a requirements.txt file** in the main directory 2. **Include all essential dependencies** needed to run the VoteNet codebase 3. **Ensure the file format is correct** (one dependency per line) 4. **Save the file as `requirements.txt`** in the current working directory 5. **Not just** pip install or conda install: your answer should contain **every necessary dependency in the whole process of running VoteNet**.
### Requirements The requirements.txt file should contain Python packages that are necessary for: - 3D point cloud processing - Deep learning frameworks - Computer vision libraries - Data visualization - 3D mesh processing - Network/graph operations ### Note - You can examine the codebase structure and README to understand what packages are needed - The file should be saved as `requirements.txt` in the current directory - Each dependency should be on a separate line ================================================ FILE: tasks/filesystem/standard/votenet/requirements_writing/meta.json ================================================ { "task_id": "requirements_writing", "task_name": "Requirements Writing", "category_id": "votenet", "category_name": "VoteNet", "description": "Generate a complete requirements.txt file containing all necessary Python dependencies for running the VoteNet codebase successfully.", "author": "Lingjun Chen", "created_at": "2025-08-13", "difficulty": "L3", "tags": [ "code exploration", "cross-referencing" ], "mcp": [ "filesystem" ], "meta_data": { "stateType": "text", "stateContent": "votenet/\n ├── doc/\n │ ├── teaser.jpg\n │ └── tips.md\n ├── models/\n │ ├── ap_helper.py\n │ ├── backbone_module.py\n │ ├── boxnet.py\n │ ├── dump_helper.py\n │ ├── loss_helper.py\n │ ├── loss_helper_boxnet.py\n │ ├── proposal_module.py\n │ ├── votenet.py\n │ └── voting_module.py\n ├── pointnet2/\n │ ├── _ext_src/\n │ │ ├── include/\n │ │ │ ├── ball_query.h\n │ │ │ ├── cuda_utils.h\n │ │ │ ├── group_points.h\n │ │ │ ├── interpolate.h\n │ │ │ ├── sampling.h\n │ │ │ └── utils.h\n │ │ └── src/\n │ │ ├── ball_query.cpp\n │ │ ├── ball_query_gpu.cu\n │ │ ├── bindings.cpp\n │ │ ├── group_points.cpp\n │ │ ├── group_points_gpu.cu\n │ │ ├── interpolate.cpp\n │ │ ├── interpolate_gpu.cu\n │ │ ├── sampling.cpp\n │ │ └── sampling_gpu.cu\n │ ├── pointnet2_modules.py\n │ ├── pointnet2_test.py\n │ ├── pointnet2_utils.py\n │ ├── pytorch_utils.py\n │ └── setup.py\n ├── scannet/\n │ ├── meta_data/\n │ │ ├── scannet_means.npz\n │ │ ├── scannet_train.txt\n │ │ ├── scannetv2-labels.combined.tsv\n │ │ ├── scannetv2_test.txt\n │ │ ├── scannetv2_train.txt\n │ │ └── scannetv2_val.txt\n │ ├── scans/\n │ ├── batch_load_scannet_data.py\n │ ├── data_viz.py\n │ ├── load_scannet_data.py\n │ ├── model_util_scannet.py\n │ ├── README.md\n │ ├── scannet_detection_dataset.py\n │ └── scannet_utils.py\n ├── sunrgbd/\n │ ├── matlab/\n │ │ ├── extract_rgbd_data_v1.m\n │ │ ├── extract_rgbd_data_v2.m\n │ │ └── extract_split.m\n │ ├── OFFICIAL_SUNRGBD/\n │ ├── sunrgbd_trainval/\n │ ├── model_util_sunrgbd.py\n │ ├── README.md\n │ ├── sunrgbd_data.py\n │ ├── sunrgbd_detection_dataset.py\n │ └── sunrgbd_utils.py\n ├── utils/\n │ ├── box_util.py\n │ ├── eval_det.py\n │ ├── metric_util.py\n │ ├── nms.py\n │ ├── nn_distance.py\n │ ├── pc_util.py\n │ ├── tf_logger.py\n │ └── tf_visualizer.py\n ├── CODE_OF_CONDUCT.md\n ├── CONTRIBUTING.md\n ├── demo.py\n ├── eval.py\n ├── LICENSE\n ├── README.md\n └── train.py", "stateUrl": "https://storage.mcpmark.ai/filesystem/votenet.zip", "stateOriginalUrl": "https://github.com/facebookresearch/votenet" } } ================================================ FILE: tasks/filesystem/standard/votenet/requirements_writing/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for VoteNet Task: Create Requirements.txt File """ import sys from pathlib import Path import os def get_test_directory() -> Path: """Get the test directory from FILESYSTEM_TEST_DIR 
env var.""" test_root = os.environ.get("FILESYSTEM_TEST_DIR") if not test_root: raise ValueError("FILESYSTEM_TEST_DIR environment variable is required") return Path(test_root) def verify_requirements_file_exists(test_dir: Path) -> bool: """Verify that the requirements.txt file exists.""" requirements_file = test_dir / "requirements.txt" if not requirements_file.exists(): print("❌ File 'requirements.txt' not found") return False print("✅ Requirements.txt file found") return True def verify_requirements_file_readable(test_dir: Path) -> bool: """Verify that the requirements.txt file is readable.""" requirements_file = test_dir / "requirements.txt" try: content = requirements_file.read_text() if not content.strip(): print("❌ Requirements.txt file is empty") return False print("✅ Requirements.txt file is readable") return True except Exception as e: print(f"❌ Error reading requirements.txt file: {e}") return False def verify_required_dependencies_present(test_dir: Path) -> bool: """Verify that all required dependencies are present.""" requirements_file = test_dir / "requirements.txt" try: content = requirements_file.read_text() # Required dependencies from answer.txt required_deps = [ "matplotlib", "opencv", "plyfile", "trimesh", "pointnet2", "networkx" ] missing_deps = [] found_deps = [] for dep in required_deps: if dep.lower() in content.lower(): found_deps.append(dep) else: missing_deps.append(dep) if missing_deps: print(f"❌ Missing required dependencies: {missing_deps}") return False print(f"✅ All required dependencies found: {found_deps}") return True except Exception as e: print(f"❌ Error checking dependencies: {e}") return False def verify_file_format(test_dir: Path) -> bool: """Verify that the requirements.txt file has proper format.""" requirements_file = test_dir / "requirements.txt" try: content = requirements_file.read_text() lines = content.split('\n') # Check if file has content and proper line structure if not content.strip(): print("❌ File is completely empty") return False # Check if there are multiple lines (indicating multiple dependencies) non_empty_lines = [line.strip() for line in lines if line.strip()] if len(non_empty_lines) < 3: # Should have at least 3 dependencies print("❌ File seems to have too few dependencies") return False print("✅ File format is acceptable") return True except Exception as e: print(f"❌ Error checking file format: {e}") return False def verify_no_duplicate_entries(test_dir: Path) -> bool: """Verify that there are no duplicate dependency entries.""" requirements_file = test_dir / "requirements.txt" try: content = requirements_file.read_text() lines = [line.strip().lower() for line in content.split('\n') if line.strip()] # Check for duplicates if len(lines) != len(set(lines)): print("❌ File contains duplicate entries") return False print("✅ No duplicate entries found") return True except Exception as e: print(f"❌ Error checking for duplicates: {e}") return False def main(): """Main verification function.""" test_dir = get_test_directory() print("🔍 Verifying VoteNet Task: Create Requirements.txt File...") # Define verification steps verification_steps = [ ("Requirements File Exists", verify_requirements_file_exists), ("File is Readable", verify_requirements_file_readable), ("Required Dependencies Present", verify_required_dependencies_present), ("File Format", verify_file_format), ("No Duplicate Entries", verify_no_duplicate_entries), ] # Run all verification steps all_passed = True for step_name, verify_func in verification_steps: print(f"\n--- 
{step_name} ---") if not verify_func(test_dir): all_passed = False # Final result print("\n" + "="*50) if all_passed: print("✅ Requirements.txt file successfully created with all required dependencies!") print("🎉 Task verification: PASS") sys.exit(0) else: print("❌ Task verification: FAIL") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/github/easy/build-your-own-x/close_commented_issues/description.md ================================================ Use the GitHub MCP tools to close every issue in `mcpmark-eval/build-your-own-x` that already has at least one comment. Leave all other issues unchanged. ================================================ FILE: tasks/github/easy/build-your-own-x/close_commented_issues/meta.json ================================================ { "task_id": "close_commented_issues", "task_name": "Close Commented Issues", "category_id": "build-your-own-x", "category_name": "Build Your Own X (Easy)", "description": "Use GitHub MCP tools to close every issue with comments in build-your-own-x and leave everything else alone.", "author": "Zijian Wu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "issue management" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/build-your-own-x", "stateOriginalUrl": "https://github.com/codecrafters-io/build-your-own-x" } } ================================================ FILE: tasks/github/easy/build-your-own-x/close_commented_issues/verify.py ================================================ import os import sys from typing import Optional import requests from dotenv import load_dotenv REPO_NAME = "build-your-own-x" TARGET_ISSUES = [23, 25] def _fetch_issue(org: str, token: str, number: int) -> Optional[dict]: url = f"https://api.github.com/repos/{org}/{REPO_NAME}/issues/{number}" headers = { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", } try: response = requests.get(url, headers=headers, timeout=30) except Exception as exc: print(f"Request error for issue #{number}: {exc}", file=sys.stderr) return None if response.status_code != 200: print( f"GitHub API returned {response.status_code} when fetching issue #{number}", file=sys.stderr, ) return None try: return response.json() except Exception as exc: print(f"Unable to parse issue #{number}: {exc}", file=sys.stderr) return None def verify() -> bool: load_dotenv(".mcp_env") token = os.environ.get("MCP_GITHUB_TOKEN") org = os.environ.get("GITHUB_EVAL_ORG") if not token: print("MCP_GITHUB_TOKEN is missing", file=sys.stderr) return False if not org: print("GITHUB_EVAL_ORG is missing", file=sys.stderr) return False print("Checking issue states in remote repository...") success = True for issue_number in TARGET_ISSUES: data = _fetch_issue(org, token, issue_number) if data is None: success = False continue state = data.get("state", "").lower() if state != "closed": print( f"Issue #{issue_number} is '{state}' but must be closed.", file=sys.stderr, ) success = False else: print(f"Issue #{issue_number} is closed as expected.") return success if __name__ == "__main__": sys.exit(0 if verify() else 1) ================================================ FILE: tasks/github/easy/build-your-own-x/record_recent_commits/description.md ================================================ Use the GitHub MCP tools to work in the `mcpmark-eval/build-your-own-x` repository. 1. Retrieve the newest five commits on the default branch. 2. 
Open a new issue titled exactly `Latest 5 Commit Snapshot`. 3. Set the issue body to exactly this format (newest commit first): ``` Latest 5 commits (newest first) 1. <full-sha> | <author name> | <commit subject> 2. <full-sha> | <author name> | <commit subject> 3. <full-sha> | <author name> | <commit subject> 4. <full-sha> | <author name> | <commit subject> 5. <full-sha> | <author name> | <commit subject> ``` Use the full 40-character SHA and only the first line of each commit message. The `<author name>` must come from the commit metadata's author name field (not the GitHub username/login). Leave the issue open and do not touch other issues. ================================================ FILE: tasks/github/easy/build-your-own-x/record_recent_commits/meta.json ================================================ { "task_id": "record_recent_commits", "task_name": "Record Recent Commits", "category_id": "build-your-own-x", "category_name": "Build Your Own X (Easy)", "description": "Summarize the latest five commits by opening an issue with their SHAs, authors, and subjects.", "author": "Zijian Wu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "commits", "issue" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/build-your-own-x", "stateOriginalUrl": "https://github.com/codecrafters-io/build-your-own-x" } } ================================================ FILE: tasks/github/easy/build-your-own-x/record_recent_commits/verify.py ================================================ import os import sys from typing import List, Optional import requests from dotenv import load_dotenv REPO_NAME = "build-your-own-x" BRANCH = "master" ISSUE_TITLE = "Latest 5 Commit Snapshot" EXPECTED_HEADER = "latest 5 commits (newest first)" def _request(url: str, token: str) -> Optional[requests.Response]: headers = { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", } try: response = requests.get(url, headers=headers, timeout=30) except Exception as exc: # pragma: no cover - network errors print(f"Request error for {url}: {exc}", file=sys.stderr) return None if response.status_code != 200: print( f"GitHub API returned {response.status_code} for {url}", file=sys.stderr, ) return None return response def _fetch_commits(org: str, token: str) -> Optional[List[dict]]: url = ( f"https://api.github.com/repos/{org}/{REPO_NAME}/commits" f"?per_page=5&sha={BRANCH}" ) response = _request(url, token) if response is None: return None try: return response.json() except Exception as exc: print(f"Unable to parse commits: {exc}", file=sys.stderr) return None def _find_issue(org: str, token: str) -> Optional[dict]: page = 1 while True: url = ( f"https://api.github.com/repos/{org}/{REPO_NAME}/issues" f"?state=open&per_page=100&page={page}" ) response = _request(url, token) if response is None: return None try: issues = response.json() except Exception as exc: print(f"Unable to parse issues: {exc}", file=sys.stderr) return None if not issues: break for issue in issues: if issue.get("title") == ISSUE_TITLE: # Exclude pull requests if "pull_request" in issue: continue return issue page += 1 print( f"No open issue titled '{ISSUE_TITLE}' was found.", file=sys.stderr, ) return None def verify() -> bool: load_dotenv(".mcp_env") token = os.environ.get("MCP_GITHUB_TOKEN") org = os.environ.get("GITHUB_EVAL_ORG") if not token: print("MCP_GITHUB_TOKEN is missing", file=sys.stderr) return False if not org: print("GITHUB_EVAL_ORG is 
missing", file=sys.stderr) return False commits = _fetch_commits(org, token) if commits is None: return False if len(commits) < 5: print("Less than five commits returned; cannot verify.", file=sys.stderr) return False issue = _find_issue(org, token) if issue is None: return False if issue.get("title") != ISSUE_TITLE: print( f"Found issue title '{issue.get('title')}', expected '{ISSUE_TITLE}'.", file=sys.stderr, ) return False if (issue.get("state") or "").lower() != "open": print("Issue must remain open.", file=sys.stderr) return False body = issue.get("body") or "" if not body.strip(): print("Issue body is empty.", file=sys.stderr) return False lines = [line.strip() for line in body.splitlines() if line.strip()] if not lines: print("Issue body contains no content.", file=sys.stderr) return False header = lines[0].lower() if header != EXPECTED_HEADER: print( "Issue body must start with 'Latest 5 commits (newest first)'.", file=sys.stderr, ) return False entries = lines[1:] if len(entries) != 5: print("Issue body must list exactly five commit entries.", file=sys.stderr) return False for idx in range(5): commit = commits[idx] sha = commit.get("sha", "") subject = (commit.get("commit", {}).get("message", "").splitlines()[0]).strip() author = commit.get("commit", {}).get("author", {}).get("name", "") expected_line = f"{idx + 1}. {sha} | {author} | {subject}" actual_line = entries[idx] if actual_line != expected_line: print( f"Entry {idx + 1} mismatch.\nExpected: {expected_line}\nFound: {actual_line}", file=sys.stderr, ) return False print("Issue contains the expected latest five commits.") return True if __name__ == "__main__": sys.exit(0 if verify() else 1) ================================================ FILE: tasks/github/easy/claude-code/add_terminal_shortcuts_doc/description.md ================================================ Use the GitHub MCP tools to edit the `mcpmark-eval/claude-code` repository. 1. On the `main` branch, add a new file `docs/TERMINAL_SHORTCUTS.md` containing exactly: ``` # Terminal Shortcuts - `claude plan`: Outline the next steps before making edits. - `claude apply`: Run the plan and apply the queued changes. - `claude check`: Re-run relevant tests or linters to validate the edits. ``` 2. Commit with the message `docs: add terminal shortcuts reference` and push directly to `main`. 
================================================ FILE: tasks/github/easy/claude-code/add_terminal_shortcuts_doc/meta.json ================================================ { "task_id": "add_terminal_shortcuts_doc", "task_name": "Add Terminal Shortcuts Doc", "category_id": "claude-code", "category_name": "Claude Code (Easy)", "description": "Add a simple terminal shortcuts reference file to docs/TERMINAL_SHORTCUTS.md and push it to main.", "author": "Zijian Wu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "docs update", "content creation" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/claude-code", "stateOriginalUrl": "https://github.com/anthropics/claude-code" } } ================================================ FILE: tasks/github/easy/claude-code/add_terminal_shortcuts_doc/verify.py ================================================ import base64 import os import sys from typing import Optional import requests from dotenv import load_dotenv REPO_NAME = "claude-code" TARGET_FILE = "docs/TERMINAL_SHORTCUTS.md" BRANCH = "main" EXPECTED_CONTENT = """# Terminal Shortcuts - `claude plan`: Outline the next steps before making edits. - `claude apply`: Run the plan and apply the queued changes. - `claude check`: Re-run relevant tests or linters to validate the edits. """.strip() def _download_file(org: str, token: str) -> Optional[str]: url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{TARGET_FILE}?ref={BRANCH}" headers = { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", } try: response = requests.get(url, headers=headers, timeout=30) except Exception as exc: print(f"Request error for {TARGET_FILE}: {exc}", file=sys.stderr) return None if response.status_code != 200: print( f"GitHub API returned {response.status_code} when fetching {TARGET_FILE}", file=sys.stderr, ) return None data = response.json() try: content = base64.b64decode(data.get("content", "")).decode("utf-8").strip() except Exception as exc: print(f"Unable to decode {TARGET_FILE}: {exc}", file=sys.stderr) return None return content def verify() -> bool: load_dotenv(".mcp_env") token = os.environ.get("MCP_GITHUB_TOKEN") org = os.environ.get("GITHUB_EVAL_ORG") if not token: print("MCP_GITHUB_TOKEN is missing", file=sys.stderr) return False if not org: print("GITHUB_EVAL_ORG is missing", file=sys.stderr) return False print(f"Checking {TARGET_FILE} in remote repository...") content = _download_file(org, token) if content is None: return False normalized = content.strip() if normalized != EXPECTED_CONTENT: print("TERMINAL_SHORTCUTS.md does not match the expected content.", file=sys.stderr) print("Expected:") print(EXPECTED_CONTENT) print("Found:") print(content) return False print("All checks passed! docs/TERMINAL_SHORTCUTS.md contains the expected text.") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/easy/claude-code/thank_docker_pr_author/description.md ================================================ Use the GitHub MCP tools to comment on the pull request in `mcpmark-eval/claude-code` that proposes automating Docker image builds with GitHub Actions. 1. Skim the PR description so you understand it’s the Docker workflow automation proposal. 2. Add a new comment on that PR that thanks the author and contains all of these keywords: `Docker workflow`, `automation`, `review`. 
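The task text does not give the PR number; the agent has to locate the Docker-automation PR first (the verifier below happens to pin it to #53). As a minimal sketch, the comment itself goes through the standard issues-comments endpoint, which also serves pull request conversation comments; the comment wording here is only an example that satisfies the keyword check.

```python
# Editorial sketch, not part of the task files: post the thank-you comment
# once the Docker-automation PR has been identified.
import os

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
PR_NUMBER = 53  # placeholder; discover this by browsing the open PRs

body = (
    "Thanks for putting the Docker workflow together! The automation looks "
    "solid, and I'm happy to help with the review."
)
response = requests.post(
    f"https://api.github.com/repos/{ORG}/claude-code/issues/{PR_NUMBER}/comments",
    headers={
        "Authorization": f"Bearer {TOKEN}",
        "Accept": "application/vnd.github+json",
    },
    json={"body": body},
    timeout=30,
)
response.raise_for_status()
```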
================================================ FILE: tasks/github/easy/claude-code/thank_docker_pr_author/meta.json ================================================ { "task_id": "thank_docker_pr_author", "task_name": "Thank Docker PR Author", "category_id": "claude-code", "category_name": "Claude Code (Easy)", "description": "Leave a thank-you comment on the Docker automation PR mentioning the workflow automation review keywords.", "author": "Zijian Wu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "pull request", "comment" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/claude-code", "stateOriginalUrl": "https://github.com/anthropics/claude-code" } } ================================================ FILE: tasks/github/easy/claude-code/thank_docker_pr_author/verify.py ================================================ import os import sys from typing import Optional, Union import requests from dotenv import load_dotenv REPO_NAME = "claude-code" PR_NUMBER = 53 KEYWORDS = ["docker workflow", "automation", "review"] def _github_get(org: str, token: str, path: str) -> Optional[Union[list, dict]]: url = f"https://api.github.com/repos/{org}/{REPO_NAME}/{path}" headers = { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", } try: response = requests.get(url, headers=headers, timeout=30) except Exception as exc: print(f"Request error for {path}: {exc}", file=sys.stderr) return None if response.status_code != 200: print( f"GitHub API returned {response.status_code} for {path}", file=sys.stderr, ) return None return response.json() def verify() -> bool: load_dotenv(".mcp_env") token = os.environ.get("MCP_GITHUB_TOKEN") org = os.environ.get("GITHUB_EVAL_ORG") if not token: print("MCP_GITHUB_TOKEN is missing", file=sys.stderr) return False if not org: print("GITHUB_EVAL_ORG is missing", file=sys.stderr) return False comments = _github_get(org, token, f"issues/{PR_NUMBER}/comments?per_page=100") if comments is None: return False for comment in comments: body = comment.get("body", "").strip() lowered = body.lower() if not body: continue if not any(thank_word in lowered for thank_word in ("thanks", "thank you")): continue if all(keyword in lowered for keyword in KEYWORDS): print("All checks passed! Keyword-rich thank-you comment found on PR #53.") return True print( "Did not find a thank-you comment containing all required keywords on PR #53.", file=sys.stderr, ) return False if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/easy/claude-code/triage_missing_tool_result_issue/description.md ================================================ Use the GitHub MCP tools to triage issue #24 in the `mcpmark-eval/claude-code` repository. 1. Read the issue details to understand the reported API error. 2. Add a triage comment on the issue that explicitly includes all of the following keywords: `invalid_request_error`, `toolu_01Kjp7i9iF3xJ3z9aH4pSaRw`, `tool_result`, `tool_use`. Use them while confirming the API error and asking for the missing result block. 3. Remove the `area:packaging` label from issue #24. 
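A minimal sketch of the two REST calls behind steps 2 and 3, assuming the same `.mcp_env` variables the verifiers load; an MCP agent would reach the same endpoints through its GitHub tools, and the comment text is only an example that contains all four required keywords.

```python
# Editorial sketch, not part of the task files: triage comment plus label removal.
import os
from urllib.parse import quote

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/vnd.github+json",
}
BASE = f"https://api.github.com/repos/{ORG}/claude-code/issues/24"

# Step 2: comment confirming the API error and asking for the missing block.
comment = (
    "Confirming the API returned invalid_request_error: the assistant emitted a "
    "tool_use block with id toolu_01Kjp7i9iF3xJ3z9aH4pSaRw, but no matching "
    "tool_result block followed. Could you share the missing result block?"
)
requests.post(f"{BASE}/comments", headers=HEADERS, json={"body": comment}, timeout=30).raise_for_status()

# Step 3: remove a single label with DELETE on .../labels/<name>;
# the colon in the label name is percent-encoded defensively.
requests.delete(f"{BASE}/labels/{quote('area:packaging')}", headers=HEADERS, timeout=30).raise_for_status()
```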
================================================ FILE: tasks/github/easy/claude-code/triage_missing_tool_result_issue/meta.json ================================================ { "task_id": "triage_missing_tool_result_issue", "task_name": "Triage Missing Tool Result Issue", "category_id": "claude-code", "category_name": "Claude Code (Easy)", "description": "Leave a predefined triage comment on issue #24 and remove the area:packaging label.", "author": "Zijian Wu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "issue triage", "github" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/claude-code", "stateOriginalUrl": "https://github.com/anthropics/claude-code" } } ================================================ FILE: tasks/github/easy/claude-code/triage_missing_tool_result_issue/verify.py ================================================ import os import sys from typing import Optional import requests from dotenv import load_dotenv REPO_NAME = "claude-code" ISSUE_NUMBER = 24 KEYWORDS = [ "invalid_request_error", "toolu_01kjp7i9if3xj3z9ah4psarw", "tool_result", "tool_use", ] REMOVED_LABEL = "area:packaging" def _github_get(org: str, token: str, path: str) -> Optional[dict]: url = f"https://api.github.com/repos/{org}/{REPO_NAME}/{path}" headers = { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", } try: response = requests.get(url, headers=headers, timeout=30) except Exception as exc: print(f"Request error for {path}: {exc}", file=sys.stderr) return None if response.status_code != 200: print( f"GitHub API returned {response.status_code} for {path}", file=sys.stderr, ) return None return response.json() def verify() -> bool: load_dotenv(".mcp_env") token = os.environ.get("MCP_GITHUB_TOKEN") org = os.environ.get("GITHUB_EVAL_ORG") if not token: print("MCP_GITHUB_TOKEN is missing", file=sys.stderr) return False if not org: print("GITHUB_EVAL_ORG is missing", file=sys.stderr) return False issue = _github_get(org, token, f"issues/{ISSUE_NUMBER}") if issue is None: return False label_names = {label.get("name", "") for label in issue.get("labels", [])} if REMOVED_LABEL in label_names: print(f"Label '{REMOVED_LABEL}' is still present on issue #{ISSUE_NUMBER}.", file=sys.stderr) return False comments = _github_get(org, token, f"issues/{ISSUE_NUMBER}/comments?per_page=100") if comments is None: return False found = False for comment in comments: body = comment.get("body", "").strip().lower() if all(keyword in body for keyword in KEYWORDS): found = True break if not found: print( "Did not find a triage comment containing all required keywords.", file=sys.stderr, ) return False print("All checks passed! Comment added and label removed.") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/easy/mcpmark-cicd/basic_ci_checks/description.md ================================================ Use the GitHub MCP tools to update the `mcpmark-eval/mcpmark-cicd` repository with a very small CI workflow. ## Goal Add a GitHub Actions workflow named **Basic CI Checks** that automatically runs linting and unit tests any time work is pushed to or proposed for the `main` branch. ## Requirements 1. Create a branch called `basic-ci-checks` from `main`. 2. Add `.github/workflows/basic-ci.yml` with the following characteristics: - Workflow name: `Basic CI Checks`. 
- Trigger on both `push` and `pull_request`, limited to the `main` branch. - Single job called `quality-checks` that runs on `ubuntu-latest` and uses Node.js 18 (`actions/setup-node`). - Steps must include `actions/checkout`, `npm ci`, `npm run lint`, and `npm test` in that order after Node is configured. 3. Commit the workflow to your branch, open a pull request titled `Add basic CI checks`, and merge it so the workflow exists on `main`. That's it—no caching, matrix builds, or issue automation required. Keep it lightweight and focused on verifying the existing lint/test scripts. ================================================ FILE: tasks/github/easy/mcpmark-cicd/basic_ci_checks/meta.json ================================================ { "task_id": "basic_ci_checks", "task_name": "Basic CI Checks", "category_id": "mcpmark-cicd", "category_name": "MCPMark CI/CD (Easy)", "description": "Add a lightweight GitHub Actions workflow that runs npm ci, npm run lint, and npm test whenever main is updated or receives a pull request.", "author": "Zijian Wu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "ci/cd", "github actions", "workflow basics" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd", "stateOriginalUrl": null } } ================================================ FILE: tasks/github/easy/mcpmark-cicd/basic_ci_checks/verify.py ================================================ import base64 import os import sys from typing import List, Optional import requests from dotenv import load_dotenv REPO_NAME = "mcpmark-cicd" WORKFLOW_PATH = ".github/workflows/basic-ci.yml" BRANCH = "main" def _download_file(org: str, token: str, path: str) -> Optional[str]: url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{path}?ref={BRANCH}" headers = { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", } try: response = requests.get(url, headers=headers, timeout=30) except Exception as exc: # pragma: no cover - network failure print(f"Request error for {path}: {exc}", file=sys.stderr) return None if response.status_code != 200: print( f"GitHub API returned {response.status_code} when fetching {path}", file=sys.stderr, ) return None data = response.json() try: content = base64.b64decode(data.get("content", "")).decode("utf-8") except Exception as exc: print(f"Unable to decode {path}: {exc}", file=sys.stderr) return None return content def _line_index(lines: List[str], needle: str) -> int: for idx, line in enumerate(lines): if needle in line: return idx return -1 def verify() -> bool: load_dotenv(".mcp_env") token = os.environ.get("MCP_GITHUB_TOKEN") org = os.environ.get("GITHUB_EVAL_ORG") if not token: print("MCP_GITHUB_TOKEN is missing", file=sys.stderr) return False if not org: print("GITHUB_EVAL_ORG is missing", file=sys.stderr) return False content = _download_file(org, token, WORKFLOW_PATH) if content is None: print( "Workflow file .github/workflows/basic-ci.yml was not found on main", file=sys.stderr, ) return False normalized = content.lower() normalized_lines = [line.strip().lower() for line in content.splitlines()] errors = [] required_snippets = { "workflow name": "name: basic ci checks", "job name": "quality-checks", "checkout step": "actions/checkout", "setup-node step": "actions/setup-node", "node version": "node-version: 18", "ubuntu runner": "runs-on: ubuntu-latest", "push trigger": "push:", "pull_request trigger": "pull_request:", } for label, snippet in 
required_snippets.items(): if snippet not in normalized: errors.append(f"Missing {label} ({snippet}) in workflow") branch_limited = "- main" in normalized or "[main]" in normalized if not branch_limited: errors.append("Workflow triggers must be limited to the main branch") for command in ["npm ci", "npm run lint", "npm test"]: if command not in normalized: errors.append(f"Missing '{command}' step") # Ensure npm commands happen in the expected order ci_index = _line_index(normalized_lines, "npm ci") lint_index = _line_index(normalized_lines, "npm run lint") test_index = _line_index(normalized_lines, "npm test") if ci_index == -1 or lint_index == -1 or test_index == -1: errors.append("Could not find all npm commands to validate ordering") else: if not (ci_index < lint_index < test_index): errors.append("npm commands must run in order: ci -> lint -> test") if errors: print("Verification failed:") for err in errors: print(f" - {err}", file=sys.stderr) return False print("✅ basic-ci workflow found with required steps and triggers") return True if __name__ == "__main__": sys.exit(0 if verify() else 1) ================================================ FILE: tasks/github/easy/mcpmark-cicd/issue_lint_guard/description.md ================================================ Use the GitHub MCP tools to wire up a tiny issue-triggered lint check for `mcpmark-eval/mcpmark-cicd`. ## Goal Whenever a maintainer opens the tracking issue **Lint workflow check**, the repo should automatically run `npm run lint` via GitHub Actions. Keep it simple—just prove the workflow fires for issue events. ## Requirements 1. Create a branch called `issue-lint-workflow` from `main`. 2. Add `.github/workflows/issue-lint.yml` with: - Workflow name **Issue Lint Guard**. - Trigger: `issues` with `types: [opened]` (no push/PR triggers). - Single job `lint` on `ubuntu-latest` using Node.js 18 via `actions/setup-node`. - Steps in order: `actions/checkout`, `npm ci`, `npm run lint`. 3. Open a pull request titled `Add issue lint workflow`, get it merged so the workflow exists on `main`. 4. After the merge, open a new issue titled **Lint workflow check** to trigger the workflow and wait until the matching run finishes successfully. Leave the issue open; we only care that the run went green. 
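Step 4 is the only part that involves waiting on Actions. As a rough sketch (the verifier below performs the authoritative check), opening the tracking issue and polling the workflow's runs might look like this; the endpoint paths are the standard REST ones, and the retry budget is arbitrary.

```python
# Editorial sketch, not part of the task files: trigger the workflow by opening
# the tracking issue, then poll its runs until the triggered run completes.
import os
import time

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Accept": "application/vnd.github+json",
}
REPO = f"https://api.github.com/repos/{ORG}/mcpmark-cicd"

# Open the tracking issue that fires the `issues: opened` trigger.
requests.post(
    f"{REPO}/issues",
    headers=HEADERS,
    json={"title": "Lint workflow check"},
    timeout=30,
).raise_for_status()

# Poll the issue-triggered runs of issue-lint.yml until one finishes.
for _ in range(12):
    runs = requests.get(
        f"{REPO}/actions/workflows/issue-lint.yml/runs?event=issues&per_page=5",
        headers=HEADERS,
        timeout=30,
    ).json().get("workflow_runs", [])
    if runs and runs[0]["status"] == "completed":
        print("conclusion:", runs[0]["conclusion"])
        break
    time.sleep(10)
```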
================================================ FILE: tasks/github/easy/mcpmark-cicd/issue_lint_guard/meta.json ================================================ { "task_id": "issue_lint_guard", "task_name": "Issue Lint Guard", "category_id": "mcpmark-cicd", "category_name": "MCPMark CI/CD (Easy)", "description": "Add an issue-triggered lint workflow and prove it runs when the tracking issue is opened.", "author": "Zijian Wu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "ci/cd", "github actions", "issues" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd", "stateOriginalUrl": null } } ================================================ FILE: tasks/github/easy/mcpmark-cicd/issue_lint_guard/verify.py ================================================ import base64 import os import sys import time from typing import List, Optional import requests from dotenv import load_dotenv REPO_NAME = "mcpmark-cicd" WORKFLOW_PATH = ".github/workflows/issue-lint.yml" WORKFLOW_FILE = "issue-lint.yml" TARGET_BRANCH = "main" TRACKING_ISSUE_TITLE = "Lint workflow check" MAX_POLL_ATTEMPTS = 12 POLL_INTERVAL_SECONDS = 10 def _download_file(org: str, token: str, path: str) -> Optional[str]: url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{path}?ref={TARGET_BRANCH}" headers = { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", } try: response = requests.get(url, headers=headers, timeout=30) except Exception as exc: # pragma: no cover - network error handling print(f"Request error for {path}: {exc}", file=sys.stderr) return None if response.status_code != 200: print( f"GitHub API returned {response.status_code} when fetching {path}", file=sys.stderr, ) return None data = response.json() try: content = base64.b64decode(data.get("content", "")).decode("utf-8") except Exception as exc: # pragma: no cover - decode error print(f"Unable to decode {path}: {exc}", file=sys.stderr) return None return content def _line_index(lines: List[str], needle: str) -> int: for idx, line in enumerate(lines): if needle in line: return idx return -1 def _list_workflow_runs(org: str, token: str) -> Optional[List[dict]]: url = ( f"https://api.github.com/repos/{org}/{REPO_NAME}/actions/workflows/{WORKFLOW_FILE}/runs" f"?event=issues&per_page=15" ) headers = { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", } try: response = requests.get(url, headers=headers, timeout=30) except Exception as exc: # pragma: no cover - network error handling print(f"Request error when listing workflow runs: {exc}", file=sys.stderr) return None if response.status_code != 200: print( f"GitHub API returned {response.status_code} when listing workflow runs", file=sys.stderr, ) return None data = response.json() return data.get("workflow_runs", []) def _wait_for_tracking_issue_run(org: str, token: str) -> bool: for attempt in range(1, MAX_POLL_ATTEMPTS + 1): runs = _list_workflow_runs(org, token) if runs is None: return False relevant = [ run for run in runs if run.get("display_title") == TRACKING_ISSUE_TITLE ] if not relevant: print( f"[{attempt}/{MAX_POLL_ATTEMPTS}] No Issue Lint Guard run for '{TRACKING_ISSUE_TITLE}' yet; waiting..." 
) time.sleep(POLL_INTERVAL_SECONDS) continue latest = relevant[0] status = latest.get("status") conclusion = latest.get("conclusion") html_url = latest.get("html_url") if status != "completed": print( f"[{attempt}/{MAX_POLL_ATTEMPTS}] Latest run is '{status}'; waiting for completion..." ) time.sleep(POLL_INTERVAL_SECONDS) continue if conclusion != "success": print( "Latest Issue Lint Guard run finished without success.", file=sys.stderr, ) print(f"Status: {status}, Conclusion: {conclusion}", file=sys.stderr) if html_url: print(f"Run URL: {html_url}", file=sys.stderr) return False if html_url: print(f"✅ Latest Issue Lint Guard run succeeded: {html_url}") else: print("✅ Latest Issue Lint Guard run succeeded") return True print( f"Timed out waiting for a successful Issue Lint Guard run for '{TRACKING_ISSUE_TITLE}'", file=sys.stderr, ) return False def verify() -> bool: load_dotenv(".mcp_env") token = os.environ.get("MCP_GITHUB_TOKEN") org = os.environ.get("GITHUB_EVAL_ORG") if not token: print("MCP_GITHUB_TOKEN is missing", file=sys.stderr) return False if not org: print("GITHUB_EVAL_ORG is missing", file=sys.stderr) return False content = _download_file(org, token, WORKFLOW_PATH) if content is None: print( "Workflow file .github/workflows/issue-lint.yml was not found on main", file=sys.stderr, ) return False normalized = content.lower() normalized_lines = [line.strip().lower() for line in content.splitlines()] errors = [] required_snippets = { "workflow name": "name: issue lint guard", "issues trigger": "issues:", "types opened": "types:", "job name": "lint:", "runner": "runs-on: ubuntu-latest", "checkout": "actions/checkout", "setup-node": "actions/setup-node", "node version": "node-version: 18", "npm ci": "npm ci", "npm run lint": "npm run lint", } for label, snippet in required_snippets.items(): if snippet not in normalized: errors.append(f"Missing {label} ({snippet}) in workflow") types_line = next( (line for line in normalized_lines if "types" in line and "opened" in line), None, ) if types_line is None: errors.append("issues trigger must limit types to include 'opened'") checkout_idx = _line_index(normalized_lines, "actions/checkout") setup_idx = _line_index(normalized_lines, "actions/setup-node") ci_idx = _line_index(normalized_lines, "npm ci") lint_idx = _line_index(normalized_lines, "npm run lint") if -1 in [checkout_idx, setup_idx, ci_idx, lint_idx]: errors.append("Could not determine workflow step ordering") else: if not (checkout_idx < setup_idx < ci_idx < lint_idx): errors.append( "Steps must run in order: checkout -> setup-node -> npm ci -> npm run lint" ) if errors: print("Workflow validation failed:") for err in errors: print(f" - {err}", file=sys.stderr) return False print("✅ issue-lint workflow file looks correct") return _wait_for_tracking_issue_run(org, token) if __name__ == "__main__": sys.exit(0 if verify() else 1) ================================================ FILE: tasks/github/easy/mcpmark-cicd/nightly_health_check/description.md ================================================ Use the GitHub MCP tools to add a tiny bit of automation to `mcpmark-eval/mcpmark-cicd`. Goal: every night the repo should run the existing health check script. Do the usual branch/PR flow with a branch named `nightly-health` and a PR titled `Add nightly health check`. 
Create `.github/workflows/nightly-health.yml` with: - workflow name `Nightly Health Check` - triggers: `workflow_dispatch` plus a cron schedule `0 2 * * *` - one job called `health-check` on `ubuntu-latest` - use Node.js 18 via `actions/setup-node` - steps in order: checkout, npm ci, `npm run health-check` Merge the PR so the workflow lives on `main`. ================================================ FILE: tasks/github/easy/mcpmark-cicd/nightly_health_check/meta.json ================================================ { "task_id": "nightly_health_check", "task_name": "Nightly Health Check", "category_id": "mcpmark-cicd", "category_name": "MCPMark CI/CD (Easy)", "description": "Add a scheduled workflow that runs the npm health check script every night.", "author": "Zijian Wu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "ci/cd", "github actions", "scheduling" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd", "stateOriginalUrl": null } } ================================================ FILE: tasks/github/easy/mcpmark-cicd/nightly_health_check/verify.py ================================================ import base64 import os import sys from typing import List, Optional import requests from dotenv import load_dotenv REPO_NAME = "mcpmark-cicd" WORKFLOW_PATH = ".github/workflows/nightly-health.yml" BRANCH = "main" def _download_file(org: str, token: str, path: str) -> Optional[str]: url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{path}?ref={BRANCH}" headers = { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", } try: response = requests.get(url, headers=headers, timeout=30) except Exception as exc: # pragma: no cover print(f"Request error for {path}: {exc}", file=sys.stderr) return None if response.status_code != 200: print( f"GitHub API returned {response.status_code} when fetching {path}", file=sys.stderr, ) return None data = response.json() try: content = base64.b64decode(data.get("content", "")).decode("utf-8") except Exception as exc: print(f"Unable to decode {path}: {exc}", file=sys.stderr) return None return content def _line_index(lines: List[str], needle: str) -> int: for idx, line in enumerate(lines): if needle in line: return idx return -1 def verify() -> bool: load_dotenv(".mcp_env") token = os.environ.get("MCP_GITHUB_TOKEN") org = os.environ.get("GITHUB_EVAL_ORG") if not token: print("MCP_GITHUB_TOKEN is missing", file=sys.stderr) return False if not org: print("GITHUB_EVAL_ORG is missing", file=sys.stderr) return False content = _download_file(org, token, WORKFLOW_PATH) if content is None: print( "Workflow file .github/workflows/nightly-health.yml was not found on main", file=sys.stderr, ) return False normalized = content.lower() normalized_lines = [line.strip().lower() for line in content.splitlines()] errors = [] required_bits = { "workflow name": "name: nightly health check", "workflow_dispatch trigger": "workflow_dispatch:", "schedule": "schedule:", "cron": "0 2 * * *", "job name": "health-check:", "runner": "runs-on: ubuntu-latest", "checkout": "actions/checkout", "setup-node": "actions/setup-node", "node version": "node-version: 18", "npm ci": "npm ci", "health script": "npm run health-check", } for label, snippet in required_bits.items(): if snippet not in normalized: errors.append(f"Missing {label} ({snippet}) in workflow") schedule_index = _line_index(normalized_lines, "schedule:") cron_index = _line_index(normalized_lines, "- 
cron: '0 2 * * *'") if cron_index == -1: cron_index = _line_index(normalized_lines, "cron: '0 2 * * *'") if cron_index == -1: cron_index = _line_index(normalized_lines, 'cron: "0 2 * * *"') if schedule_index == -1 or cron_index == -1 or cron_index < schedule_index: errors.append("Cron expression must appear under schedule trigger") ci_index = _line_index(normalized_lines, "npm ci") health_index = _line_index(normalized_lines, "npm run health-check") if ci_index == -1 or health_index == -1: errors.append("npm ci and npm run health-check must both appear") else: if not ci_index < health_index: errors.append("npm ci must run before npm run health-check") if errors: print("Verification failed:") for err in errors: print(f" - {err}", file=sys.stderr) return False print("✅ nightly-health workflow found with required schedule and steps") return True if __name__ == "__main__": sys.exit(0 if verify() else 1) ================================================ FILE: tasks/github/easy/missing-semester/count_translations/description.md ================================================ Use the GitHub MCP tools to inspect the `mcpmark-eval/missing-semester` repository. 1. Navigate the repository to find the list of community translations that appears on the site's home page. 2. Determine how many translation links are currently listed. 3. Record both the count and the specific file you used as evidence by creating an `ANSWER.md` file in the repository root that contains exactly: ``` Translation Count: <number> Source: <filename> ``` 4. Commit the new file and push the change to `master`. ================================================ FILE: tasks/github/easy/missing-semester/count_translations/meta.json ================================================ { "task_id": "count_translations", "task_name": "Count Translations", "category_id": "missing-semester", "category_name": "Missing Semester (Easy)", "description": "Use GitHub MCP to count the translations listed on the home page, record the value in ANSWER.md, and push the change to master.", "author": "Zijian Wu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "content search", "answer file" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/missing-semester", "stateOriginalUrl": "https://github.com/missing-semester/missing-semester" } } ================================================ FILE: tasks/github/easy/missing-semester/count_translations/verify.py ================================================ import base64 import os import sys from typing import Optional import requests from dotenv import load_dotenv REPO_NAME = "missing-semester" TARGET_FILE = "ANSWER.md" BRANCH = "master" EXPECTED_COUNT = "translation count: 14" EXPECTED_SOURCE = "source: index.md" def _download_file(org: str, token: str, path: str) -> Optional[str]: url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{path}?ref={BRANCH}" headers = { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", } try: response = requests.get(url, headers=headers, timeout=30) except Exception as exc: print(f"Request error for {path}: {exc}", file=sys.stderr) return None if response.status_code != 200: print( f"GitHub API returned {response.status_code} when fetching {path}", file=sys.stderr, ) return None data = response.json() try: content = base64.b64decode(data.get("content", "")).decode("utf-8").strip() except Exception as exc: print(f"Unable to decode {path}: {exc}", 
file=sys.stderr) return None return content def verify() -> bool: load_dotenv(".mcp_env") token = os.environ.get("MCP_GITHUB_TOKEN") org = os.environ.get("GITHUB_EVAL_ORG") if not token: print("MCP_GITHUB_TOKEN is missing", file=sys.stderr) return False if not org: print("GITHUB_EVAL_ORG is missing", file=sys.stderr) return False print("Checking ANSWER.md in remote repository...") answer_content = _download_file(org, token, TARGET_FILE) if answer_content is None: return False normalized = " ".join(answer_content.lower().split()) if EXPECTED_COUNT not in normalized: print( "ANSWER.md must include 'Translation Count: 14' (spacing/casing ignored).", file=sys.stderr, ) print("Found:") print(answer_content) return False if EXPECTED_SOURCE not in normalized: print( "ANSWER.md must include 'Source: index.md' (spacing/casing ignored).", file=sys.stderr, ) print("Found:") print(answer_content) return False print("All checks passed! ANSWER.md contains the expected count and source.") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/easy/missing-semester/find_ga_tracking_id/description.md ================================================ Use the GitHub MCP tools to inspect the `mcpmark-eval/missing-semester` repository. 1. Determine the Analytics tracking ID that the Missing Semester site declares in its configuration. 2. Create an `ANSWER.md` file in the repository root that contains exactly: ``` Analytics Tracking ID: <value you found> ``` 3. Commit the new file and push the change to `master`. ================================================ FILE: tasks/github/easy/missing-semester/find_ga_tracking_id/meta.json ================================================ { "task_id": "find_ga_tracking_id", "task_name": "Find GA Tracking ID", "category_id": "missing-semester", "category_name": "Missing Semester (Easy)", "description": "Use GitHub MCP to discover the single Google Analytics tracking ID declared in the site configuration, write it to ANSWER.md, and push the change to master.", "author": "Zijian Wu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "config search", "analytics", "answer file" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/missing-semester", "stateOriginalUrl": "https://github.com/missing-semester/missing-semester" } } ================================================ FILE: tasks/github/easy/missing-semester/find_ga_tracking_id/verify.py ================================================ import base64 import os import sys from typing import Optional import requests from dotenv import load_dotenv # Accept either wording, regardless of casing EXPECTED_VARIANTS = { "google analytics tracking id: g-p7wvhd84d1", "analytics tracking id: g-p7wvhd84d1", } REPO_NAME = "missing-semester" TARGET_FILE = "ANSWER.md" BRANCH = "master" def _download_file(org: str, token: str) -> Optional[str]: url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{TARGET_FILE}?ref={BRANCH}" headers = { "Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json", } try: response = requests.get(url, headers=headers) except Exception as exc: print(f"Request error for {TARGET_FILE}: {exc}", file=sys.stderr) return None if response.status_code != 200: print( f"GitHub API returned {response.status_code} when fetching {TARGET_FILE}", file=sys.stderr, ) return None data = response.json() try: content = 
base64.b64decode(data.get("content", "")).decode("utf-8").strip() except Exception as exc: print(f"Unable to decode {TARGET_FILE}: {exc}", file=sys.stderr) return None return content def verify() -> bool: load_dotenv(".mcp_env") token = os.environ.get("MCP_GITHUB_TOKEN") org = os.environ.get("GITHUB_EVAL_ORG") if not token: print("MCP_GITHUB_TOKEN is missing", file=sys.stderr) return False if not org: print("GITHUB_EVAL_ORG is missing", file=sys.stderr) return False print("Checking ANSWER.md in remote repository...") answer_content = _download_file(org, token) if answer_content is None: return False normalized = answer_content.strip().lower() if normalized not in EXPECTED_VARIANTS: print("ANSWER.md does not contain an accepted tracking ID format", file=sys.stderr) print("Accepted variants:", file=sys.stderr) for variant in EXPECTED_VARIANTS: print(f" - {variant}", file=sys.stderr) print(f"Found: {answer_content}", file=sys.stderr) return False print("All checks passed! ANSWER.md matches an accepted content variant.") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/build_your_own_x/find_commit_date/description.md ================================================ Find out when the entries in the Voxel Engine section were first created by Daniel Stefanovic. After finding this information, create an ANSWER.md file in the repository with the content being the date in [YYYY]-[MM]-[DD] format (e.g., 2000-06-02). ================================================ FILE: tasks/github/standard/build_your_own_x/find_commit_date/meta.json ================================================ { "task_id": "find_commit_date", "task_name": "Find Commit Date", "category_id": "build_your_own_x", "category_name": "Build Your Own X", "description": "Find when Voxel Engine entries were first created by Daniel Stefanovic and document the date.", "author": "Xiangyan Liu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "repository analysis" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/build-your-own-x", "stateOriginalUrl": "https://github.com/codecrafters-io/build-your-own-x" } } ================================================ FILE: tasks/github/standard/build_your_own_x/find_commit_date/verify.py ================================================ import sys import os import requests from typing import Dict, Optional, Tuple import base64 from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "build-your-own-x" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _get_file_content( file_path: str, headers: Dict[str, str], org: str, repo: str = "build-your-own-x", ref: str = "master", ) -> Optional[str]: """Get the content of a file from the repository.""" success, result = _get_github_api( f"contents/{file_path}?ref={ref}", headers, org, repo ) if not success or not result: return 
None try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return content except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return None def verify_task() -> bool: """Verify the find commit data task for Voxel Engine entries.""" # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"Bearer {github_token}", "Accept": "application/vnd.github.v3+json", } print("Verifying Voxel Engine commit date task...") # 1. Check if ANSWER.md exists in the repository print("1. Checking if ANSWER.md exists...") content = _get_file_content("ANSWER.md", headers, github_org) if not content: print("Error: ANSWER.md not found in repository", file=sys.stderr) return False print("✓ ANSWER.md found") # 2. Check the content format print("2. Checking content format...") content = content.strip() # The expected date when Daniel Stefanovic added Voxel Engine entries # Based on historical records, this should be 2018-07-07 expected_date = "2018-07-07" # Check if the content matches the expected date format (YYYY-MM-DD) import re date_pattern = r'^\d{4}-\d{2}-\d{2}$' if not re.match(date_pattern, content): print(f"Error: Invalid date format. Expected YYYY-MM-DD, got: {content}", file=sys.stderr) return False print("✓ Date format is correct") # 3. Verify the date is correct print("3. Verifying the date...") if content != expected_date: print(f"Error: Incorrect date. Expected {expected_date}, got: {content}", file=sys.stderr) return False print(f"✓ Date is correct: {content}") # 4. Verify README.md contains Voxel Engine section print("4. Checking if README.md contains Voxel Engine section...") readme_content = _get_file_content("README.md", headers, github_org) if not readme_content: print("Error: README.md not found in repository", file=sys.stderr) return False if "Voxel Engine" not in readme_content: print("Error: Voxel Engine section not found in README.md", file=sys.stderr) return False # Check for specific Voxel Engine entries voxel_entries = [ "Let's Make a Voxel Engine", "Java Voxel Engine Tutorial" ] for entry in voxel_entries: if entry not in readme_content: print(f"Warning: Voxel Engine entry '{entry}' not found in README.md", file=sys.stderr) print("✓ Voxel Engine section found in README.md") print("\n✅ All verification checks passed!") print("Task completed successfully:") print(f" - ANSWER.md created with date: {content}") print(" - Date format is correct (YYYY-MM-DD)") print(" - Date matches expected creation date for Voxel Engine entries by Daniel Stefanovic") print(" - Voxel Engine section exists in README.md") return True if __name__ == "__main__": success = verify_task() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/build_your_own_x/find_rag_commit/description.md ================================================ Find out the specific commit SHA of adding an entry about "RAG for Document Search". After finding this information, create an ANSWER.md file in the repository with the content being the commit SHA (e.g., 023dfa35694db2709057488ad338afdbc89fb226). 
Hint: It should be in an "AI model" section I think. ================================================ FILE: tasks/github/standard/build_your_own_x/find_rag_commit/meta.json ================================================ { "task_id": "find_rag_commit", "task_name": "Find Rag Commit", "category_id": "build_your_own_x", "category_name": "Build Your Own X", "description": "Identify the specific commit SHA that added the RAG for Document Search entry to the repository.", "author": "Xiangyan Liu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "repository analysis" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/build-your-own-x", "stateOriginalUrl": "https://github.com/codecrafters-io/build-your-own-x" } } ================================================ FILE: tasks/github/standard/build_your_own_x/find_rag_commit/verify.py ================================================ import sys import os import requests from typing import Dict, Optional, Tuple import base64 from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "build-your-own-x" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _get_file_content( file_path: str, headers: Dict[str, str], org: str, repo: str = "build-your-own-x", ref: str = "master", ) -> Optional[str]: """Get the content of a file from the repository.""" success, result = _get_github_api( f"contents/{file_path}?ref={ref}", headers, org, repo ) if not success or not result: return None try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return content except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return None def verify_task() -> bool: """Verify the find RAG commit SHA task.""" # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"Bearer {github_token}", "Accept": "application/vnd.github.v3+json", } print("Verifying RAG commit SHA task...") # Expected commit SHA for RAG for Document Search expected_sha = "048cd3b3de70e4b429057891576ea394a50cdf48" # 1. Check if ANSWER.md exists in the repository print("1. Checking if ANSWER.md exists...") content = _get_file_content("ANSWER.md", headers, github_org) if not content: print("Error: ANSWER.md not found in repository", file=sys.stderr) return False print("✓ ANSWER.md found") # 2. Check the content matches expected SHA print("2. Checking commit SHA...") content = content.strip() if content != expected_sha: print(f"Error: Incorrect commit SHA. Expected {expected_sha}, got: {content}", file=sys.stderr) return False print("✓ Commit SHA is correct") # 3. 
Verify the commit exists print("3. Verifying the commit exists...") success, commit_data = _get_github_api(f"commits/{content}", headers, github_org) if not success or not commit_data: print(f"Error: Commit {content} not found in repository", file=sys.stderr) return False print(f"✓ Commit {content} exists") print("\n✅ All verification checks passed!") print("Task completed successfully:") print(f" - ANSWER.md created with correct commit SHA: {content}") print(f" - Commit exists in the repository") print(f" - Commit message: {commit_data.get('commit', {}).get('message', '')}") return True if __name__ == "__main__": success = verify_task() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/claude-code/automated_changelog_generation/description.md ================================================ I need you to analyze all recently closed issues and open pull requests in the repository, then generate comprehensive documentation and organize them properly. **Step 1: Create Documentation Branch** Create a new branch called 'docs/changelog-and-migration' from the main branch. **Step 2: Generate Changelog from Closed Issues** Find all closed issues in the repository and create the file `CHANGELOG-GENERATED.md` on your branch with: - A heading "# Changelog - Recent Fixes" - A "### 🐛 Bug Fixes" section listing all closed issues with bug label, formatted as: "- **#[NUMBER]**: [Title] ([labels])" - A "### 📚 Documentation" section for closed issues with documentation label - A "### 🔄 Duplicates" section for issues marked as duplicate - A "### 📊 Statistics" section with: - Total number of closed issues - Distribution by platform labels (platform:macos, platform:linux, etc.) - Distribution by area labels (area:core, area:tools, etc.) 
**Step 3: Create Migration Guide for Open PRs** Analyze all open pull requests and create the file `docs/MIGRATION_GUIDE.md` with: - A heading "# Migration Guide for Pending Features" - For each open PR, create a section with: - PR title and number - Summary of changes based on the PR description - Any new configuration or environment variables mentioned - Installation or usage instructions if applicable **Step 4: Create Issue Analysis Report** Create the file `reports/ISSUE_ANALYSIS.md` with: - A heading "# Issue Analysis Report" - A "## Closed Issues by Category" section grouping closed issues by their primary label - A "## Resolution Patterns" section identifying common themes - A "## Platform Impact Analysis" section showing which platforms were most affected - Include references to specific issues that had cross-project impact or memory-related problems **Step 5: Create PR Integration Plan** Create the file `reports/PR_INTEGRATION_PLAN.md` with: - A heading "# Pull Request Integration Strategy" - A "## Open PRs Overview" section listing each open PR with a technical summary - A "## Dependencies and Conflicts" section analyzing potential conflicts between PRs - A "## Recommended Merge Order" section with reasoning - A "## Risk Assessment" section linking any risks to previously closed issues **Step 6: Create Documentation PR** Create a pull request from 'docs/changelog-and-migration' to 'main' with: - Title: "docs: Generated changelog and migration documentation" - Body including: - A "## Summary" section describing what was generated - A "## Files Created" section listing all new documentation - A "## Issues Processed" section mentioning the number of closed issues analyzed - A "## PRs Analyzed" section mentioning the open PRs reviewed **Step 7: Merge Documentation PR** Merge the documentation pull request using the "squash" merge method. 
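Steps 1 through 6 are ordinary branch, file, and pull request creation; the one easy-to-miss parameter is the merge method in Step 7. A minimal sketch of that final call, with the PR number as a placeholder for whatever Step 6 produced and the usual verifier env vars assumed:

```python
# Editorial sketch, not part of the task files: squash-merge the documentation PR.
import os

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
PR_NUMBER = 60  # placeholder for the PR opened in Step 6

response = requests.put(
    f"https://api.github.com/repos/{ORG}/claude-code/pulls/{PR_NUMBER}/merge",
    headers={
        "Authorization": f"Bearer {TOKEN}",
        "Accept": "application/vnd.github+json",
    },
    json={"merge_method": "squash"},
    timeout=30,
)
response.raise_for_status()
```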
================================================ FILE: tasks/github/standard/claude-code/automated_changelog_generation/meta.json ================================================ { "task_id": "automated_changelog_generation", "task_name": "Automated Changelog Generation", "category_id": "claude-code", "category_name": "Claude Code", "description": "Analyze closed issues and open PRs to generate comprehensive documentation including changelog, migration guide, and analysis reports.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "release coordination", "workflow automation" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/claude-code", "stateOriginalUrl": "https://github.com/anthropics/claude-code" } } ================================================ FILE: tasks/github/standard/claude-code/automated_changelog_generation/verify.py ================================================ import sys import os import requests from typing import Dict, List, Optional, Tuple import base64 from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _check_branch_exists( branch_name: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> bool: """Verify that a branch exists in the repository.""" success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo) return success def _get_file_content( file_path: str, headers: Dict[str, str], org: str, repo: str = "claude-code", ref: str = "main", ) -> Optional[str]: """Get the content of a file from the repository.""" success, result = _get_github_api( f"contents/{file_path}?ref={ref}", headers, org, repo ) if not success or not result: return None try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return content except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return None def _find_pr_by_title_keyword( keyword: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> Optional[Dict]: """Find a PR by title keyword and return the PR data.""" for state in ["open", "closed"]: success, prs = _get_github_api( f"pulls?state={state}&per_page=100", headers, org, repo ) if success and prs: for pr in prs: if keyword.lower() in pr.get("title", "").lower(): return pr return None def _get_pr_merge_commit( pr_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> Optional[Dict]: """Get the merge commit for a PR to check merge method.""" success, pr = _get_github_api(f"pulls/{pr_number}", headers, org, repo) if success and pr: merge_commit_sha = pr.get("merge_commit_sha") if merge_commit_sha: success, commit = _get_github_api( f"commits/{merge_commit_sha}", headers, org, repo ) if success: return commit return None def _check_file_sections(content: str, required_sections: List[str]) -> bool: """Check if file content contains required 
sections.""" if not content: return False return all(section in content for section in required_sections) def _check_issue_references(text: str, issue_numbers: List[int]) -> int: """Count how many of the specified issue numbers are referenced in the text.""" if not text: return 0 count = 0 for num in issue_numbers: if f"#{num}" in text: count += 1 return count def _check_pr_references(text: str, pr_numbers: List[int]) -> int: """Count how many of the specified PR numbers are referenced in the text.""" if not text: return 0 count = 0 for num in pr_numbers: if f"#{num}" in text or f"PR #{num}" in text: count += 1 return count def verify() -> bool: """ Programmatically verify that the changelog and migration documentation workflow meets the requirements described in description.md. """ # Configuration constants - these are known to us but not explicitly told to the model DOCS_BRANCH_NAME = "docs/changelog-and-migration" DOCS_PR_KEYWORD = "Generated changelog and migration" # Known issue and PR numbers for verification EXPECTED_BUG_ISSUES = [12, 13, 15, 21, 22, 23, 25, 37, 39, 48, 50] EXPECTED_OPEN_PRS = [51, 52, 53] # Expected file sections CHANGELOG_SECTIONS = [ "# Changelog - Recent Fixes", "### 🐛 Bug Fixes", "### 📚 Documentation", "### 🔄 Duplicates", "### 📊 Statistics", ] MIGRATION_GUIDE_SECTIONS = ["# Migration Guide for Pending Features"] ISSUE_ANALYSIS_SECTIONS = [ "# Issue Analysis Report", "## Closed Issues by Category", "## Resolution Patterns", "## Platform Impact Analysis", ] PR_INTEGRATION_SECTIONS = [ "# Pull Request Integration Strategy", "## Open PRs Overview", "## Dependencies and Conflicts", "## Recommended Merge Order", "## Risk Assessment", ] # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"Bearer {github_token}", "Accept": "application/vnd.github.v3+json", } # Run verification checks print("Verifying changelog and migration documentation workflow...") # 1. Check that documentation branch exists print("1. Verifying documentation branch exists...") if not _check_branch_exists(DOCS_BRANCH_NAME, headers, github_org): print(f"Error: Branch '{DOCS_BRANCH_NAME}' not found", file=sys.stderr) return False print("✓ Documentation branch created") # 2. Check changelog file print("2. 
Verifying CHANGELOG-GENERATED.md...") changelog_content = _get_file_content( "CHANGELOG-GENERATED.md", headers, github_org, "claude-code", DOCS_BRANCH_NAME ) if not changelog_content: print("Error: CHANGELOG-GENERATED.md not found", file=sys.stderr) return False if not _check_file_sections(changelog_content, CHANGELOG_SECTIONS): print( "Error: CHANGELOG-GENERATED.md missing required sections", file=sys.stderr ) return False # Check that bug issues are referenced bug_refs = _check_issue_references(changelog_content, EXPECTED_BUG_ISSUES) if bug_refs < 8: # At least 8 of the bug issues print( f"Error: CHANGELOG-GENERATED.md only references {bug_refs} bug issues, expected at least 8", file=sys.stderr, ) return False # Check for platform and area statistics if ( "platform:" not in changelog_content.lower() or "area:" not in changelog_content.lower() ): print( "Error: CHANGELOG-GENERATED.md missing platform or area distribution", file=sys.stderr, ) return False print("✓ Changelog created with proper content") # 3. Check migration guide print("3. Verifying MIGRATION_GUIDE.md...") migration_content = _get_file_content( "docs/MIGRATION_GUIDE.md", headers, github_org, "claude-code", DOCS_BRANCH_NAME ) if not migration_content: print("Error: docs/MIGRATION_GUIDE.md not found", file=sys.stderr) return False if not _check_file_sections(migration_content, MIGRATION_GUIDE_SECTIONS): print("Error: MIGRATION_GUIDE.md missing required sections", file=sys.stderr) return False # Check that all expected open PRs are mentioned pr_refs = _check_pr_references(migration_content, EXPECTED_OPEN_PRS) if pr_refs < 3: print( f"Error: MIGRATION_GUIDE.md only references {pr_refs}/3 open PRs", file=sys.stderr, ) return False print("✓ Migration guide created with proper content") # 4. Check issue analysis report print("4. Verifying ISSUE_ANALYSIS.md...") issue_analysis_content = _get_file_content( "reports/ISSUE_ANALYSIS.md", headers, github_org, "claude-code", DOCS_BRANCH_NAME, ) if not issue_analysis_content: print("Error: reports/ISSUE_ANALYSIS.md not found", file=sys.stderr) return False if not _check_file_sections(issue_analysis_content, ISSUE_ANALYSIS_SECTIONS): print("Error: ISSUE_ANALYSIS.md missing required sections", file=sys.stderr) return False # Check for cross-project and memory issue mentions if "#50" not in issue_analysis_content and "#48" not in issue_analysis_content: print( "Warning: ISSUE_ANALYSIS.md may be missing cross-project issue references", file=sys.stderr, ) print("✓ Issue analysis report created") # 5. Check PR integration plan print("5. Verifying PR_INTEGRATION_PLAN.md...") pr_plan_content = _get_file_content( "reports/PR_INTEGRATION_PLAN.md", headers, github_org, "claude-code", DOCS_BRANCH_NAME, ) if not pr_plan_content: print("Error: reports/PR_INTEGRATION_PLAN.md not found", file=sys.stderr) return False if not _check_file_sections(pr_plan_content, PR_INTEGRATION_SECTIONS): print( "Error: PR_INTEGRATION_PLAN.md missing required sections", file=sys.stderr ) return False # Check that all open PRs are analyzed pr_refs_in_plan = _check_pr_references(pr_plan_content, EXPECTED_OPEN_PRS) if pr_refs_in_plan < 3: print( f"Error: PR_INTEGRATION_PLAN.md only references {pr_refs_in_plan}/3 open PRs", file=sys.stderr, ) return False print("✓ PR integration plan created") # 6. Find and verify the documentation PR print("6. 
Verifying documentation pull request...") docs_pr = _find_pr_by_title_keyword(DOCS_PR_KEYWORD, headers, github_org) if not docs_pr: # Try alternative keyword docs_pr = _find_pr_by_title_keyword( "changelog and migration", headers, github_org ) if not docs_pr: print("Error: Documentation PR not found", file=sys.stderr) return False pr_body = docs_pr.get("body", "") pr_number = docs_pr.get("number") # Check PR body sections required_sections = [ "## Summary", "## Files Created", "## Issues Processed", "## PRs Analyzed", ] missing_sections = [] for section in required_sections: if section not in pr_body: missing_sections.append(section) if len(missing_sections) > 1: # Allow 1 missing section for flexibility print( f"Error: Documentation PR missing sections: {missing_sections}", file=sys.stderr, ) return False print("✓ Documentation PR created") # 7. Check that the documentation PR has been merged with squash method print("7. Verifying documentation PR merge with squash method...") if docs_pr.get("state") != "closed" or not docs_pr.get("merged_at"): print("Error: Documentation PR has not been merged", file=sys.stderr) return False # Check merge method was squash by examining the merge commit merge_commit = _get_pr_merge_commit(pr_number, headers, github_org) if merge_commit: # Squash merges typically have only one parent (the base branch) parents = merge_commit.get("parents", []) if len(parents) != 1: print( f"Warning: Merge commit has {len(parents)} parents, may not be squash merge", file=sys.stderr, ) # Check commit message pattern typical of squash merges commit_message = merge_commit.get("commit", {}).get("message", "") if f"#{pr_number}" not in commit_message: print( "Warning: Merge commit message may not follow squash merge pattern", file=sys.stderr, ) else: print("Warning: Could not retrieve merge commit details", file=sys.stderr) merged_at = docs_pr.get("merged_at") if not merged_at: print("Error: Documentation PR merge timestamp not found", file=sys.stderr) return False print("✓ Documentation PR merged successfully") print("\n✅ All verification checks passed!") print("Changelog and migration documentation completed successfully:") print(f" - Documentation PR #{pr_number} (merged)") print(f" - Branch: {DOCS_BRANCH_NAME}") print(" - Files created: 4 documentation files") print(f" - Bug issues referenced: {bug_refs}/{len(EXPECTED_BUG_ISSUES)}") print(f" - Open PRs analyzed: {pr_refs}/{len(EXPECTED_OPEN_PRS)}") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/claude-code/claude_collaboration_analysis/description.md ================================================ I need you to analyze the collaboration patterns between human developers and Claude (the AI assistant) in the repository by examining all available commit history, then create a comprehensive analysis report and submit it as a new file to the repository. **Step 1: Commit History Analysis** Analyze ALL commits in the repository to identify: 1. **Claude Co-Authored Commits**: Find all commits that were co-authored by Claude (look for "Co-Authored-By: Claude <noreply@anthropic.com>" in commit messages) 2. 
**Top Claude Collaborators**: Identify the top 3 human developers who most frequently collaborated with Claude **Step 2: Create Collaboration Analysis Report** Create a file called `CLAUDE_COLLABORATION_ANALYSIS.md` in the repository root with: - A "# Claude AI Collaboration Analysis" title - A "## Summary Statistics" section with these exact format requirements: - "Total commits analyzed: [NUMBER]" - "Number of Claude co-authored commits found: [NUMBER]" - "Percentage of commits with Claude collaboration: [NUMBER]%" - "Number of unique human collaborators who worked with Claude: [NUMBER]" - A "## Top Claude Collaborators" section with this exact table format: ```markdown | Developer | GitHub Username | Claude Collaborations | |-----------|----------------|----------------------| ``` Include the top 3 developers by number of Claude collaborations. **Step 3: Commit Analysis to Repository** Commit the `CLAUDE_COLLABORATION_ANALYSIS.md` file to the main branch with: - Commit message: "Add Claude AI collaboration analysis report" - Ensure all statistics are accurate based on actual commit data ================================================ FILE: tasks/github/standard/claude-code/claude_collaboration_analysis/meta.json ================================================ { "task_id": "claude_collaboration_analysis", "task_name": "Claude Collaboration Analysis", "category_id": "claude-code", "category_name": "Claude Code", "description": "Analyze Claude AI collaboration patterns in commit history and create a comprehensive report of co-authored commits and top collaborators.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "repository analysis" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/claude-code", "stateOriginalUrl": "https://github.com/anthropics/claude-code" } } ================================================ FILE: tasks/github/standard/claude-code/claude_collaboration_analysis/verify.py ================================================ import sys import os import requests from typing import Dict, List, Optional, Tuple import base64 import re from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _get_file_content( file_path: str, headers: Dict[str, str], org: str, repo: str = "claude-code", ref: str = "main", ) -> Optional[str]: """Get the content of a file from the repository.""" success, result = _get_github_api( f"contents/{file_path}?ref={ref}", headers, org, repo ) if not success or not result: return None try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return content except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return None def _parse_summary_statistics(content: str) -> Dict: """Parse the summary statistics section from the report.""" stats = {} lines = content.split("\n") in_summary = False for line 
in lines: if "## Summary Statistics" in line: in_summary = True continue if in_summary: if "##" in line and "Summary Statistics" not in line: break # Parse statistics lines if "Total commits analyzed" in line: match = re.search(r"(\d+)", line) if match: stats["total_analyzed"] = int(match.group(1)) elif "Number of Claude co-authored commits" in line: match = re.search(r"(\d+)", line) if match: stats["claude_commits"] = int(match.group(1)) elif "Percentage of commits with Claude collaboration" in line: match = re.search(r"([\d.]+)%", line) if match: stats["percentage"] = float(match.group(1)) elif "Number of unique human collaborators" in line: match = re.search(r"(\d+)", line) if match: stats["unique_collaborators"] = int(match.group(1)) return stats def _parse_collaborators_table(content: str) -> List[Dict]: """Parse the top collaborators table from the report.""" collaborators = [] lines = content.split("\n") in_table = False for line in lines: if "| Developer | GitHub Username | Claude Collaborations |" in line: in_table = True continue if in_table and line.startswith("|---"): continue if in_table and line.startswith("|"): parts = [p.strip() for p in line.split("|")] if len(parts) >= 4: # Should have 3 columns plus empty parts developer = parts[1].strip() username = parts[2].strip() collaborations = parts[3].strip() if developer and username and collaborations: try: collaborators.append( { "developer": developer, "username": username, "collaborations": int(collaborations), } ) except ValueError: pass if in_table and line and not line.startswith("|") and "##" in line: break return collaborators def verify_task() -> bool: """Verify the Claude collaboration analysis task.""" # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"Bearer {github_token}", "Accept": "application/vnd.github.v3+json", } # Pre-computed expected values based on repository analysis # These are the correct answers the agent should find EXPECTED_TOP_COLLABORATORS = [ { "username": "bcherny", "min_collaborations": 14, }, # Boris Cherny has many Claude collaborations {"username": "ashwin-ant", "min_collaborations": 5}, # Ashwin Bhat has some {"username": "ant-kurt", "min_collaborations": 3}, # Kurt Carpenter has several ] # Expected exact values for summary statistics EXPECTED_STATS = { "total_analyzed": 158, "claude_commits": 25, "percentage": 15.82, "unique_collaborators": 6, } print("Verifying Claude collaboration analysis task...") # 1. Check if CLAUDE_COLLABORATION_ANALYSIS.md exists in main branch print("1. Checking if CLAUDE_COLLABORATION_ANALYSIS.md exists...") content = _get_file_content("CLAUDE_COLLABORATION_ANALYSIS.md", headers, github_org) if not content: print( "Error: CLAUDE_COLLABORATION_ANALYSIS.md not found in main branch", file=sys.stderr, ) return False print("✓ CLAUDE_COLLABORATION_ANALYSIS.md found") # 2. Check required sections exist print("2. 
Checking required sections...") required_sections = [ "# Claude AI Collaboration Analysis", "## Summary Statistics", "## Top Claude Collaborators", ] for section in required_sections: if section not in content: print(f"Error: Missing required section '{section}'", file=sys.stderr) return False print("✓ All required sections present") # 3. Parse and validate summary statistics print("3. Validating summary statistics...") stats = _parse_summary_statistics(content) if "total_analyzed" not in stats: print("Error: Total commits analyzed not found", file=sys.stderr) return False # Check exact values against expected statistics if stats.get("total_analyzed") != EXPECTED_STATS["total_analyzed"]: print( f"Error: Total analyzed should be {EXPECTED_STATS['total_analyzed']}, found {stats.get('total_analyzed')}", file=sys.stderr, ) return False if stats.get("claude_commits") != EXPECTED_STATS["claude_commits"]: print( f"Error: Claude commits should be {EXPECTED_STATS['claude_commits']}, found {stats.get('claude_commits')}", file=sys.stderr, ) return False # Allow 0.1% tolerance for percentage expected_percentage = EXPECTED_STATS["percentage"] actual_percentage = stats.get("percentage", 0) if abs(actual_percentage - expected_percentage) > 0.1: print( f"Error: Percentage should be around {expected_percentage}% (±0.1%), found {actual_percentage}%", file=sys.stderr, ) return False if stats.get("unique_collaborators") != EXPECTED_STATS["unique_collaborators"]: print( f"Error: Unique collaborators should be {EXPECTED_STATS['unique_collaborators']}, found {stats.get('unique_collaborators')}", file=sys.stderr, ) return False print("✓ Summary statistics validated") # 4. Validate top collaborators table print("4. Validating top collaborators...") collaborators = _parse_collaborators_table(content) if len(collaborators) < 3: print( f"Error: Expected 3 top collaborators, found {len(collaborators)}", file=sys.stderr, ) return False # Check that expected top collaborators are present found_usernames = [c["username"] for c in collaborators] # The top 3 should include at least 2 of our expected collaborators expected_found = 0 for expected in EXPECTED_TOP_COLLABORATORS: if expected["username"] in found_usernames[:3]: expected_found += 1 # Also check they have reasonable collaboration counts for collab in collaborators: if collab["username"] == expected["username"]: if collab["collaborations"] < expected["min_collaborations"]: print( f"Error: {expected['username']} should have at least {expected['min_collaborations']} collaborations, found {collab['collaborations']}", file=sys.stderr, ) return False if expected_found < 2: print( f"Error: Expected to find at least 2 of the known top collaborators in top 3, found {expected_found}", file=sys.stderr, ) print( f"Expected to see at least 2 of: {[e['username'] for e in EXPECTED_TOP_COLLABORATORS]}", file=sys.stderr, ) print(f"Found: {found_usernames[:3]}", file=sys.stderr) return False print("✓ Top collaborators validated") # 5. Check commit message verification print("5. 
Verifying commit message...") success, latest_commits = _get_github_api( "commits?per_page=10", headers, github_org ) if not success: print("Error: Failed to fetch recent commits", file=sys.stderr) return False # Look for commit with expected message expected_commit_message = "Add Claude AI collaboration analysis report" commit_found = False for commit in latest_commits: if commit["commit"]["message"].startswith(expected_commit_message): commit_found = True break if not commit_found: print( f"Error: Expected commit message '{expected_commit_message}' not found in recent commits", file=sys.stderr, ) return False print("✓ Commit message verified") # 6. Additional validation: Check unique collaborators count print("6. Final validation complete...") print("✓ All statistics match expected values") print("\n✅ All verification checks passed!") print("Claude collaboration analysis completed successfully:") print(" - File: CLAUDE_COLLABORATION_ANALYSIS.md created in main branch") print(f" - Commits analyzed: {stats.get('total_analyzed', 'N/A')}") print(f" - Claude collaborations found: {stats.get('claude_commits', 'N/A')}") print(f" - Top collaborators identified: {len(collaborators)}") print(" - All statistics verified") print(" - Commit message verified") return True if __name__ == "__main__": success = verify_task() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/claude-code/critical_issue_hotfix_workflow/description.md ================================================ I need you to implement a comprehensive critical issue hotfix workflow for the repository that demonstrates advanced PR management, selective merging, and issue resolution tracking. **Step 1: Create Critical Bug Tracking Issue** Create a new issue with: - Title: "CRITICAL: Memory and Context Management Issues - Hotfix Tracking" - Body must include: - A "## Critical Issues" heading listing issues #49 and #46 - A "## Impact Assessment" heading describing user impact - A "## Resolution Strategy" heading with planned approach - References to existing issues #49, #46, and #47 using "#" notation - Keywords: "memory exhaustion", "context auto-compact", "JavaScript heap", "hotfix priority" **Step 2: Create Memory Optimization Hotfix Branch** Create a new branch called 'hotfix/memory-optimization-v1.0.72' from the main branch. **Step 3: Implement Memory Management Documentation** On the hotfix branch, create the file `docs/MEMORY_OPTIMIZATION.md` with this exact content: ```markdown # Memory Optimization Guide for Claude Code v1.0.72 ## Overview This document addresses critical memory issues identified in issues #49 and #46. ## Memory Management Issues ### Context Auto-Compact Problem (Issue #49) - **Root Cause**: Context management stuck at 0% completion - **Impact**: Tool becomes unusable on macOS platforms - **Solution**: Implement progressive context cleanup with configurable thresholds ### JavaScript Heap Exhaustion (Issue #46) - **Root Cause**: Memory allocation failure during large MCP operations - **Impact**: Complete Claude Code crash requiring restart - **Solution**: Add streaming data processing and garbage collection optimization ## Optimization Strategies ### Immediate Fixes 1. **Context Buffer Management** - Implement 10MB default context buffer limit - Add automatic context pruning at 80% threshold - Enable manual context reset via `/memory-reset` command 2. 
**MCP Operation Streaming** - Process large datasets in 1MB chunks - Implement backpressure for MongoDB operations - Add memory usage monitoring and alerts ### Configuration Options ```json { "memory": { "contextBufferLimit": "10MB", "autoCompactThreshold": 0.8, "streamingChunkSize": "1MB", "gcOptimization": true } } ``` ## Related Issues - Fixes issue #49: Context auto-compact functionality - Addresses issue #46: JavaScript heap out of memory crashes - Related to issue #47: Cross-project hook execution problems ``` ``` **Step 4: Create Pull Request with Issue Cross-References** Create a pull request from 'hotfix/memory-optimization-v1.0.72' to 'main' with: - Title: "HOTFIX: Critical memory optimization for issues #49 and #46" - Body must include: - A "## Summary" heading describing the memory fixes - A "## Critical Issues Addressed" heading listing specific problems - A "## Documentation Changes" heading describing the new guide - "Addresses #49" and "Addresses #46" pattern linking to existing issues - Reference to your tracking issue using "Tracked in #[ISSUE_NUMBER]" - Keywords: "memory optimization", "context management", "heap exhaustion", "v1.0.72 hotfix" **Step 5: Update and Merge PR #51 (Statsig Logging)** For the existing PR #51: - Update the PR description to include technical implementation details - Add a "## Technical Implementation" section mentioning "event logging integration" - Add keywords: "workflow enhancement", "issue management automation", "logging consistency" - Merge the PR using the squash merge method **Step 6: Add Implementation Comment to Tracking Issue** Add a comment to your original tracking issue with: - Reference to your hotfix PR using "PR #[NUMBER]" pattern - Reference to actions taken on PR #51 - Technical details about the memory optimization approach - Keywords: "context buffer management", "streaming optimization", "progressive cleanup" - Mention of configuration options and thresholds **Step 7: Close Tracking Issue with Resolution Summary** Close your tracking issue by updating its state to 'closed' with: - A final comment summarizing completed actions - Reference to merged PR #51 and pending hotfix PR - Keywords: "hotfix deployment", "memory issues resolved", "documentation updated" ================================================ FILE: tasks/github/standard/claude-code/critical_issue_hotfix_workflow/meta.json ================================================ { "task_id": "critical_issue_hotfix_workflow", "task_name": "Critical Issue Hotfix Workflow", "category_id": "claude-code", "category_name": "Claude Code", "description": "Implement a critical issue hotfix workflow for memory and context management issues with proper PR management and issue tracking.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "issue management", "pr workflows" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/claude-code", "stateOriginalUrl": "https://github.com/anthropics/claude-code" } } ================================================ FILE: tasks/github/standard/claude-code/critical_issue_hotfix_workflow/verify.py ================================================ import sys import os import requests from typing import Dict, List, Optional, Tuple import base64 from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and 
return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _check_branch_exists( branch_name: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> bool: """Verify that a branch exists in the repository.""" success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo) return success def _get_file_content( file_path: str, headers: Dict[str, str], org: str, repo: str = "claude-code", ref: str = "main", ) -> Optional[str]: """Get the content of a file from the repository.""" success, result = _get_github_api( f"contents/{file_path}?ref={ref}", headers, org, repo ) if not success or not result: return None try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return content except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return None def _find_issue_by_title_keyword( keyword: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> Optional[Dict]: """Find an issue by title keyword and return the issue data.""" # Check both open and closed issues for state in ["open", "closed"]: success, issues = _get_github_api( f"issues?state={state}&per_page=100", headers, org, repo ) if success and issues: for issue in issues: if keyword.lower() in issue.get("title", "").lower(): return issue return None def _find_pr_by_title_keyword( keyword: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> Optional[Dict]: """Find a PR by title keyword and return the PR data.""" # Check both open and closed PRs for state in ["open", "closed"]: success, prs = _get_github_api( f"pulls?state={state}&per_page=100", headers, org, repo ) if success and prs: for pr in prs: if keyword.lower() in pr.get("title", "").lower(): return pr return None def _get_pr_by_number( pr_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> Optional[Dict]: """Get a specific PR by number.""" success, pr = _get_github_api(f"pulls/{pr_number}", headers, org, repo) if success: return pr return None def _check_issue_references(text: str, reference_numbers: List[str]) -> bool: """Check if text contains references to specified issue numbers.""" if not text: return False return all(f"#{ref}" in text for ref in reference_numbers) def _check_addresses_pattern(pr_body: str, issue_numbers: List[str]) -> bool: """Check if PR body contains 'Addresses #X' pattern for specified issues.""" if not pr_body: return False return all( f"Addresses #{num}" in pr_body or f"addresses #{num}" in pr_body for num in issue_numbers ) def _get_issue_comments( issue_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> List[Dict]: """Get all comments for an issue.""" success, comments = _get_github_api( f"issues/{issue_number}/comments", headers, org, repo ) if success and comments: return comments return [] def _get_pr_reviews( pr_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> List[Dict]: """Get all reviews for a PR.""" success, reviews = _get_github_api(f"pulls/{pr_number}/reviews", headers, org, repo) if success and reviews: return reviews return [] def 
_check_title_keywords(title: str, required_keywords: List[str]) -> bool: """Check if title contains all required keywords.""" return all(keyword.lower() in title.lower() for keyword in required_keywords) def _check_headings_and_keywords( body: str, headings: List[str], keywords: List[str] ) -> bool: """Check if body contains required headings and keywords.""" has_headings = all(heading in body for heading in headings) has_keywords = all(keyword.lower() in body.lower() for keyword in keywords) return has_headings and has_keywords def _check_exact_file_content(content: str, expected_sections: List[str]) -> bool: """Check if file content contains expected sections.""" return all(section in content for section in expected_sections) def verify() -> bool: """ Programmatically verify that the critical issue hotfix workflow meets the requirements described in description.md. """ # Configuration constants HOTFIX_BRANCH_NAME = "hotfix/memory-optimization-v1.0.72" TRACKING_ISSUE_KEYWORD = "Memory and Context Management Issues" HOTFIX_PR_KEYWORD = "HOTFIX: Critical memory optimization" # Expected file content sections MEMORY_DOC_SECTIONS = [ "# Memory Optimization Guide for Claude Code v1.0.72", "## Overview", "### Context Auto-Compact Problem (Issue #49)", "### JavaScript Heap Exhaustion (Issue #46)", "## Optimization Strategies", "### Immediate Fixes", "### Configuration Options", "## Related Issues", ] # Issue content requirements TRACKING_ISSUE_TITLE_KEYWORDS = [ "CRITICAL", "Memory", "Context Management", "Hotfix Tracking", ] TRACKING_ISSUE_REFERENCE_NUMBERS = ["49", "46", "47"] TRACKING_ISSUE_HEADINGS = [ "## Critical Issues", "## Impact Assessment", "## Resolution Strategy", ] TRACKING_ISSUE_KEYWORDS = [ "memory exhaustion", "context auto-compact", "JavaScript heap", "hotfix priority", ] # PR content requirements HOTFIX_PR_TITLE_KEYWORDS = [ "HOTFIX", "Critical memory optimization", "issues #49", "#46", ] HOTFIX_PR_ADDRESSES_NUMBERS = ["49", "46"] HOTFIX_PR_HEADINGS = [ "## Summary", "## Critical Issues Addressed", "## Documentation Changes", ] HOTFIX_PR_KEYWORDS = [ "memory optimization", "context management", "heap exhaustion", "v1.0.72 hotfix", ] # PR #51 update requirements PR51_UPDATE_KEYWORDS = [ "Technical Implementation", "event logging integration", "workflow enhancement", ] # Issue comment requirements ISSUE_COMMENT_KEYWORDS = [ "context buffer management", "streaming optimization", "progressive cleanup", ] # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"Bearer {github_token}", "Accept": "application/vnd.github.v3+json", } # Run verification checks print("Verifying critical issue hotfix workflow completion...") # 1. Check that hotfix branch exists print("1. Verifying hotfix branch exists...") if not _check_branch_exists(HOTFIX_BRANCH_NAME, headers, github_org): print(f"Error: Branch '{HOTFIX_BRANCH_NAME}' not found", file=sys.stderr) return False print("✓ Hotfix branch created") # 2. Check that the memory optimization documentation exists with exact content print("2. 
Verifying MEMORY_OPTIMIZATION.md documentation...") memory_doc_content = _get_file_content( "docs/MEMORY_OPTIMIZATION.md", headers, github_org, "claude-code", HOTFIX_BRANCH_NAME, ) if not memory_doc_content: print( "Error: docs/MEMORY_OPTIMIZATION.md not found in hotfix branch", file=sys.stderr, ) return False if not _check_exact_file_content(memory_doc_content, MEMORY_DOC_SECTIONS): print( "Error: MEMORY_OPTIMIZATION.md missing required sections or content", file=sys.stderr, ) return False print("✓ Memory optimization documentation created with correct content") # 3. Find and verify the tracking issue print("3. Verifying tracking issue creation and content...") tracking_issue = _find_issue_by_title_keyword( TRACKING_ISSUE_KEYWORD, headers, github_org ) if not tracking_issue: print( f"Error: Tracking issue with keyword '{TRACKING_ISSUE_KEYWORD}' not found", file=sys.stderr, ) return False tracking_issue_number = tracking_issue.get("number") tracking_issue_title = tracking_issue.get("title", "") tracking_issue_body = tracking_issue.get("body", "") # Check tracking issue title keywords if not _check_title_keywords(tracking_issue_title, TRACKING_ISSUE_TITLE_KEYWORDS): print("Error: Tracking issue title missing required keywords", file=sys.stderr) return False # Check tracking issue headings, content and references if not _check_headings_and_keywords( tracking_issue_body, TRACKING_ISSUE_HEADINGS, TRACKING_ISSUE_KEYWORDS ): print( "Error: Tracking issue missing required headings or keywords", file=sys.stderr, ) return False if not _check_issue_references( tracking_issue_body, TRACKING_ISSUE_REFERENCE_NUMBERS ): print( "Error: Tracking issue does not reference required issues #49, #46, #47", file=sys.stderr, ) return False print("✓ Tracking issue created with correct content and references") # 4. Find and verify the hotfix PR print("4. Verifying hotfix pull request creation and content...") hotfix_pr = _find_pr_by_title_keyword(HOTFIX_PR_KEYWORD, headers, github_org) if not hotfix_pr: print( f"Error: Hotfix PR with keyword '{HOTFIX_PR_KEYWORD}' not found", file=sys.stderr, ) return False hotfix_pr_number = hotfix_pr.get("number") hotfix_pr_title = hotfix_pr.get("title", "") hotfix_pr_body = hotfix_pr.get("body", "") # Check hotfix PR title keywords if not _check_title_keywords(hotfix_pr_title, HOTFIX_PR_TITLE_KEYWORDS): print("Error: Hotfix PR title missing required keywords", file=sys.stderr) return False # Check hotfix PR headings and content if not _check_headings_and_keywords( hotfix_pr_body, HOTFIX_PR_HEADINGS, HOTFIX_PR_KEYWORDS ): print("Error: Hotfix PR missing required headings or keywords", file=sys.stderr) return False # Check hotfix PR addresses pattern if not _check_addresses_pattern(hotfix_pr_body, HOTFIX_PR_ADDRESSES_NUMBERS): print( "Error: Hotfix PR does not properly address issues #49 and #46", file=sys.stderr, ) return False # Check reference to tracking issue if f"#{tracking_issue_number}" not in hotfix_pr_body: print( f"Error: Hotfix PR does not reference tracking issue #{tracking_issue_number}", file=sys.stderr, ) return False print("✓ Hotfix PR created with correct content and references") # 5. Check PR #51 has been updated and merged print("5. 
Verifying PR #51 update and merge...") pr51 = _get_pr_by_number(51, headers, github_org) if not pr51: print("Error: PR #51 not found", file=sys.stderr) return False pr51_body = pr51.get("body", "") pr51_state = pr51.get("state", "") # Check PR #51 has been updated with required content if not _check_headings_and_keywords( pr51_body, ["## Technical Implementation"], PR51_UPDATE_KEYWORDS ): print( "Error: PR #51 missing updated technical implementation section", file=sys.stderr, ) return False # Check PR #51 has been merged if pr51_state != "closed" or not pr51.get("merged_at"): print("Error: PR #51 has not been merged", file=sys.stderr) return False print("✓ PR #51 updated and merged successfully") # 6. Check tracking issue has implementation comment print("6. Verifying tracking issue implementation comment...") tracking_issue_comments = _get_issue_comments( tracking_issue_number, headers, github_org ) has_implementation_comment = False for comment in tracking_issue_comments: body = comment.get("body", "") has_pr_ref = f"PR #{hotfix_pr_number}" in body has_pr51_ref = "PR #51" in body has_keywords = all( keyword.lower() in body.lower() for keyword in ISSUE_COMMENT_KEYWORDS ) if has_pr_ref and has_pr51_ref and has_keywords: has_implementation_comment = True break if not has_implementation_comment: print( f"Error: Tracking issue #{tracking_issue_number} missing implementation comment with required references and keywords", file=sys.stderr, ) return False print("✓ Tracking issue has implementation comment with PR references") # 7. Check tracking issue is closed print("7. Verifying tracking issue closure...") if tracking_issue.get("state") != "closed": print( f"Error: Tracking issue #{tracking_issue_number} is not closed", file=sys.stderr, ) return False print("✓ Tracking issue closed successfully") print("\n✅ All verification checks passed!") print("Critical issue hotfix workflow completed successfully:") print(f" - Tracking Issue #{tracking_issue_number}: {tracking_issue.get('title')}") print(f" - Hotfix PR #{hotfix_pr_number}: {hotfix_pr.get('title')}") print(f" - Branch: {HOTFIX_BRANCH_NAME}") print(" - PR #51 merged: ✓") print(" - Memory optimization documentation: ✓") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/claude-code/feature_commit_tracking/description.md ================================================ I need you to research the development history of the repository across multiple branches and commits, then create a comprehensive feature tracking document and submit it as a new file to the repository. **Step 1: Multi-Branch Feature Investigation** Research and identify the exact commit SHAs where these specific features were introduced by analyzing commits across different branches: 1. **Shell Completion Scripts**: Find when shell completion functionality was first added to the repository 2. **CHANGELOG Version 1.0.65**: Find when the changelog was updated to include version 1.0.65 3. 
**Rust Extraction Improvements**: Find when workflow improvements for Rust code extraction were implemented **Step 2: Create Feature Tracking Documentation** Create a file called `FEATURE_COMMITS.md` in the repository root with: - A "# Feature Development Tracking" title - A "## Overview" section explaining this tracks major feature additions across repository branches - A "## Feature Commit History" section with this exact table format: ```markdown | Feature Name | Commit SHA | Author | Branch | Date | Files Changed | Commit Message | |-------------|------------|---------|---------|------|---------------|----------------| ``` For each feature, populate the table with: - Exact commit SHA (full 40-character hash) - GitHub username of the commit author - Branch where the commit was made - Commit date in YYYY-MM-DD format - Number of files changed in that commit - First line of the commit message **Step 3: Commit Documentation to Repository** Commit the `FEATURE_COMMITS.md` file to the main branch with: - Commit message: "Add feature development tracking documentation" - Ensure the file is properly formatted markdown - Verify all commit SHAs in the table are accurate and verifiable The verification process will check that your table contains the correct commit SHAs for each specific feature, along with accurate author, branch, and date information. ================================================ FILE: tasks/github/standard/claude-code/feature_commit_tracking/meta.json ================================================ { "task_id": "feature_commit_tracking", "task_name": "Feature Commit Tracking", "category_id": "claude-code", "category_name": "Claude Code", "description": "Research development history across branches to track when specific features were introduced and create comprehensive documentation.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "repository analysis", "release coordination" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/claude-code", "stateOriginalUrl": "https://github.com/anthropics/claude-code" } } ================================================ FILE: tasks/github/standard/claude-code/feature_commit_tracking/verify.py ================================================ import sys import os import requests from typing import Dict, List, Optional, Tuple import base64 import re from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _get_file_content( file_path: str, headers: Dict[str, str], org: str, repo: str = "claude-code", ref: str = "main", ) -> Optional[str]: """Get the content of a file from the repository.""" success, result = _get_github_api( f"contents/{file_path}?ref={ref}", headers, org, repo ) if not success or not result: return None try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return content except Exception as e: 
print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return None def _verify_commit_exists( commit_sha: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> Tuple[bool, Optional[Dict]]: """Verify that a commit exists and return its details.""" success, commit_data = _get_github_api(f"commits/{commit_sha}", headers, org, repo) return success, commit_data def _parse_feature_table(content: str) -> List[Dict]: """Parse the feature commit table from markdown content.""" features = [] lines = content.split("\n") in_table = False for line in lines: # Look for table header if ( "| Feature Name | Commit SHA | Author | Branch | Date | Files Changed | Commit Message |" in line ): in_table = True continue if in_table and line.startswith("|---"): continue # Parse table rows if in_table and line.startswith("|"): parts = [p.strip() for p in line.split("|")] if len(parts) >= 8: # Should have 7 columns plus empty parts at start/end feature_name = parts[1].strip() commit_sha = parts[2].strip() author = parts[3].strip() branch = parts[4].strip() date = parts[5].strip() files_changed = parts[6].strip() commit_message = parts[7].strip() if feature_name and commit_sha and author and branch and date: features.append( { "name": feature_name, "sha": commit_sha, "author": author, "branch": branch, "date": date, "files_changed": files_changed, "commit_message": commit_message, } ) # Stop at end of table section if in_table and line and not line.startswith("|") and "##" in line: break return features def verify_task() -> bool: """Verify the feature commit tracking task.""" # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"Bearer {github_token}", "Accept": "application/vnd.github.v3+json", } # Expected feature commits based on exploration expected_features = { "Shell Completion Scripts": "8a0febdd09bda32f38c351c0881784460d69997d", "CHANGELOG Version 1.0.65": "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0", "Rust Extraction Improvements": "50e58affdf1bfc7d875202bc040ebe0dcfb7d332", } # Expected authors for each commit expected_authors = { "8a0febdd09bda32f38c351c0881784460d69997d": "gitmpr", "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "QwertyJack", "50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "alokdangre", } # Expected commit messages for each commit expected_messages = { "8a0febdd09bda32f38c351c0881784460d69997d": "feat: add shell completions (bash, zsh, fish)", "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "Merge branch 'anthropics:main' into main", "50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "Enhance Rust extraction and output handling in workflows", } # Expected dates for each commit (YYYY-MM-DD format) expected_dates = { "8a0febdd09bda32f38c351c0881784460d69997d": "2025-08-01", "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "2025-08-02", "50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "2025-08-09", } print("Verifying feature commit tracking task...") # 1. Check if FEATURE_COMMITS.md exists in main branch print("1. 
Checking if FEATURE_COMMITS.md exists...") content = _get_file_content("FEATURE_COMMITS.md", headers, github_org) if not content: print("Error: FEATURE_COMMITS.md not found in main branch", file=sys.stderr) return False print("✓ FEATURE_COMMITS.md found") # 2. Check required sections exist print("2. Checking required sections...") required_sections = [ "# Feature Development Tracking", "## Overview", "## Feature Commit History", ] for section in required_sections: if section not in content: print(f"Error: Missing required section '{section}'", file=sys.stderr) return False print("✓ All required sections present") # 3. Parse and validate feature table print("3. Parsing and validating feature table...") features = _parse_feature_table(content) if len(features) < 3: print( f"Error: Expected at least 3 features, found {len(features)}", file=sys.stderr, ) return False # 4. Verify each expected feature is present with correct commit SHA print("4. Verifying feature commit SHAs...") found_features = {} for feature in features: found_features[feature["name"]] = feature["sha"] for feature_name, expected_sha in expected_features.items(): if feature_name not in found_features: print( f"Error: Feature '{feature_name}' not found in table", file=sys.stderr ) return False actual_sha = found_features[feature_name] if actual_sha != expected_sha: print( f"Error: Wrong SHA for '{feature_name}'. Expected: {expected_sha}, Got: {actual_sha}", file=sys.stderr, ) return False print("✓ All feature commit SHAs are correct") # 5. Verify each commit exists and has correct author print("5. Verifying commit details...") for feature in features: if feature["sha"] in expected_features.values(): success, commit_data = _verify_commit_exists( feature["sha"], headers, github_org ) if not success: print(f"Error: Commit {feature['sha']} not found", file=sys.stderr) return False # Check author expected_author = expected_authors.get(feature["sha"]) if expected_author: actual_author = commit_data.get("author", {}).get("login", "") if actual_author != expected_author: print( f"Error: Wrong author for {feature['sha']}. Expected: {expected_author}, Got: {actual_author}", file=sys.stderr, ) return False # Check commit message (compare with table entry) expected_message = expected_messages.get(feature["sha"]) if expected_message and "commit_message" in feature: if feature["commit_message"] != expected_message: print( f"Error: Wrong commit message in table for {feature['sha']}. Expected: '{expected_message}', Got: '{feature['commit_message']}'", file=sys.stderr, ) return False # Also verify against actual commit data if expected_message: actual_message = ( commit_data.get("commit", {}).get("message", "").split("\n")[0] ) # First line only if actual_message != expected_message: print( f"Error: Wrong commit message for {feature['sha']}. Expected: '{expected_message}', Got: '{actual_message}'", file=sys.stderr, ) return False # Check date format (YYYY-MM-DD) if not re.match(r"^\d{4}-\d{2}-\d{2}$", feature["date"]): print( f"Error: Invalid date format for {feature['name']}: {feature['date']}", file=sys.stderr, ) return False # Check actual date matches expected expected_date = expected_dates.get(feature["sha"]) if expected_date: if feature["date"] != expected_date: print( f"Error: Wrong date for {feature['sha']}. Expected: {expected_date}, Got: {feature['date']}", file=sys.stderr, ) return False print("✓ All commit details verified") # 6. Verify the table format is correct print("6. 
Verifying table format...") table_header = "| Feature Name | Commit SHA | Author | Branch | Date | Files Changed | Commit Message |" if table_header not in content: print("Error: Table header format is incorrect", file=sys.stderr) return False # Check that all features have complete information for feature in features: if not all( [ feature["name"], feature["sha"], feature["author"], feature["branch"], feature["date"], feature.get("commit_message", ""), ] ): print( f"Error: Incomplete information for feature: {feature['name']}", file=sys.stderr, ) return False print("✓ Table format is correct and complete") print("\n✅ All verification checks passed!") print("Feature commit tracking completed successfully:") print(" - File: FEATURE_COMMITS.md created in main branch") print(f" - Features tracked: {len(features)}") print(" - All expected commit SHAs verified") print(" - All commit authors verified") print(" - Analysis summary complete") return True if __name__ == "__main__": success = verify_task() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/claude-code/label_color_standardization/description.md ================================================ I need you to implement a comprehensive label documentation and organization workflow for the repository. **Step 1: Create Label Documentation Issue** Create a new issue with: - Title containing: "Document label organization for better visual organization" and "label guide" - Body must include: - A "## Problem" heading describing the need for better label documentation - A "## Proposed Solution" heading about creating a comprehensive label guide for different label categories - A "## Benefits" heading listing improved visual organization and easier issue triage - Keywords: "label documentation", "visual organization", "label guide", "organization" - Labels: Initially add "enhancement" and "documentation" labels to the issue **Step 2: Create Feature Branch** Create a new branch called 'feat/label-color-guide' from main. **Step 3: Create Label Documentation** On the feature branch, create the file `docs/LABEL_COLORS.md` with: - A "# Label Organization Guide" title - A "## Label Categories" section with a table that MUST follow this exact format: ```markdown | Label Name | Category | Description | |------------|----------|-------------| ``` The table must include ALL existing labels in the repository. For each label: - Group labels by category (e.g., issue-type, platform, area, status, performance) - Include a description for each label - A "## Usage Guidelines" section explaining when to use each label category **Step 4: Apply ALL Labels to the Documentation Issue** Update the issue you created in Step 1 by adding ALL existing labels from the repository. This serves as a visual demonstration of the label organization. The issue should have every single label that exists in the repository applied to it. 
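For reference only (not part of the required workflow), Steps 3–4 boil down to two REST operations: list every label defined in the repository, then set that full label set on the documentation issue. A minimal sketch is below; it assumes the `requests` library, the `.mcp_env` variables used by this repository's verify scripts, and a placeholder issue number.

```python
# Hypothetical sketch: gather every label in the repository and apply the full
# set to the documentation issue from Step 1. ISSUE_NUMBER is a placeholder.
import os

import requests
from dotenv import load_dotenv

load_dotenv(".mcp_env")
ORG = os.environ["GITHUB_EVAL_ORG"]
HEADERS = {
    "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
REPO = "claude-code"
ISSUE_NUMBER = 999  # placeholder for the issue created in Step 1

# List every label defined in the repository (paginated, 100 per page).
labels = []
page = 1
while True:
    resp = requests.get(
        f"https://api.github.com/repos/{ORG}/{REPO}/labels",
        headers=HEADERS,
        params={"per_page": 100, "page": page},
    )
    resp.raise_for_status()
    batch = resp.json()
    if not batch:
        break
    labels.extend(label["name"] for label in batch)
    page += 1

# Replace the issue's current labels with the complete set.
resp = requests.put(
    f"https://api.github.com/repos/{ORG}/{REPO}/issues/{ISSUE_NUMBER}/labels",
    headers=HEADERS,
    json={"labels": labels},
)
resp.raise_for_status()
print(f"Applied {len(labels)} labels to issue #{ISSUE_NUMBER}")
```
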
**Step 5: Create Pull Request** Create a pull request from 'feat/label-color-guide' to 'main' with: - Title containing: "Add label organization guide" and "visual organization" - Body must include: - A "## Summary" heading explaining the label organization documentation - A "## Changes" heading with a bullet list of what was added - "Fixes #[ISSUE_NUMBER]" pattern linking to your created issue - A "## Verification" section stating that all labels have been documented - Keywords: "label documentation", "organization guide", "visual improvement", "documentation" - Labels: Add a reasonable subset of labels to the PR (at least 5-10 labels from different categories) **Step 6: Document Changes in Issue** Add a comment to the original issue with: - Confirmation that the label documentation has been created - Total count of labels documented - Reference to the PR using "PR #[NUMBER]" pattern - Keywords: "documentation created", "label guide complete", "organization complete" ================================================ FILE: tasks/github/standard/claude-code/label_color_standardization/meta.json ================================================ { "task_id": "label_color_standardization", "task_name": "Label Color Standardization", "category_id": "claude-code", "category_name": "Claude Code", "description": "Standardize label colors from default gray to a comprehensive color scheme for better visual organization and issue triage.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "issue management", "workflow automation" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/claude-code", "stateOriginalUrl": "https://github.com/anthropics/claude-code" } } ================================================ FILE: tasks/github/standard/claude-code/label_color_standardization/verify.py ================================================ import sys import os import requests from typing import Dict, List, Optional, Tuple from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _check_branch_exists( branch_name: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> bool: """Verify that a branch exists in the repository.""" success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo) return success def _check_file_content( branch: str, file_path: str, headers: Dict[str, str], org: str, repo: str = "claude-code", ) -> Optional[str]: """Get file content from a branch.""" import base64 success, result = _get_github_api( f"contents/{file_path}?ref={branch}", headers, org, repo ) if not success or not result: return None if result.get("content"): try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return content except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return None return None def _parse_label_table(content: 
str) -> List[str]: """Parse the label table from markdown content and return label names.""" documented_labels = [] # Find the table in the content lines = content.split("\n") in_table = False for line in lines: # Skip header and separator lines if "| Label Name | Category |" in line: in_table = True continue if in_table and line.startswith("|---"): continue # Parse table rows if in_table and line.startswith("|"): parts = [p.strip() for p in line.split("|")] if len(parts) >= 3: # Should have at least label, category label_name = parts[1].strip() if label_name: documented_labels.append(label_name) # Stop at end of table if in_table and line and not line.startswith("|"): break return documented_labels def _find_issue_by_title_keywords( title_keywords: List[str], headers: Dict[str, str], org: str, repo: str = "claude-code", ) -> Optional[Dict]: """Find an issue by title keywords and return the issue data.""" for state in ["open", "closed"]: success, issues = _get_github_api( f"issues?state={state}&per_page=100", headers, org, repo ) if success and issues: for issue in issues: # Skip pull requests if "pull_request" in issue: continue title = issue.get("title", "").lower() if all(keyword.lower() in title for keyword in title_keywords): return issue return None def _find_pr_by_title_keywords( title_keywords: List[str], headers: Dict[str, str], org: str, repo: str = "claude-code", ) -> Optional[Dict]: """Find a PR by title keywords and return the PR data.""" for state in ["open", "closed"]: success, prs = _get_github_api( f"pulls?state={state}&per_page=100", headers, org, repo ) if success and prs: for pr in prs: title = pr.get("title", "").lower() if all(keyword.lower() in title for keyword in title_keywords): return pr return None def _get_issue_comments( issue_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> List[Dict]: """Get all comments for an issue.""" success, comments = _get_github_api( f"issues/{issue_number}/comments", headers, org, repo ) if success and comments: return comments return [] def verify() -> bool: """ Programmatically verify that the label color standardization workflow meets the requirements described in description.md. 
""" # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False # Configuration constants BRANCH_NAME = "feat/label-color-guide" # Issue requirements ISSUE_TITLE_KEYWORDS = ["Document label organization", "label guide"] ISSUE_KEYWORDS = [ "label documentation", "visual organization", "label guide", "organization", ] # PR requirements PR_TITLE_KEYWORDS = ["label organization guide", "visual organization"] PR_KEYWORDS = [ "label documentation", "organization guide", "visual improvement", "documentation", ] # All expected labels in the repository that are actually used/discoverable via MCP tools # Note: Excludes 'wontfix', 'invalid', 'good first issue', 'help wanted' as they exist # in the repository but are not used by any issues (not discoverable via MCP search) ALL_EXPECTED_LABELS = [ "bug", "enhancement", "duplicate", "question", "documentation", "platform:macos", "platform:linux", "platform:windows", "area:core", "area:tools", "area:tui", "area:ide", "area:mcp", "area:api", "area:security", "area:model", "area:auth", "area:packaging", "has repro", "memory", "perf:memory", "external", ] headers = { "Authorization": f"token {github_token}", "Accept": "application/vnd.github.v3+json", } # Run verification checks print("Verifying label color standardization workflow completion...") # 1. Check that feature branch exists print("1. Verifying feature branch exists...") if not _check_branch_exists(BRANCH_NAME, headers, github_org): print(f"Error: Branch '{BRANCH_NAME}' not found", file=sys.stderr) return False # 2. Check documentation file exists and has correct format print("2. Verifying label documentation file...") doc_content = _check_file_content( BRANCH_NAME, "docs/LABEL_COLORS.md", headers, github_org ) if not doc_content: print("Error: docs/LABEL_COLORS.md not found", file=sys.stderr) return False # Parse the label table from documentation documented_labels = _parse_label_table(doc_content) if len(documented_labels) < 20: print( f"Error: Documentation table incomplete, found only {len(documented_labels)} labels", file=sys.stderr, ) return False # 3. Verify labels are documented print("3. Verifying expected labels are documented...") print(f" ✓ {len(ALL_EXPECTED_LABELS)} expected labels defined for verification") # 4. Find the created issue print("4. 
Verifying issue creation...") issue = _find_issue_by_title_keywords(ISSUE_TITLE_KEYWORDS, headers, github_org) if not issue: print( "Error: Issue with title containing required keywords not found", file=sys.stderr, ) return False issue_number = issue.get("number") issue_body = issue.get("body", "") # Check issue content has required sections and keywords issue_required_sections = ["## Problem", "## Proposed Solution", "## Benefits"] for section in issue_required_sections: if section not in issue_body: print(f"Error: Issue body missing required section: {section}", file=sys.stderr) return False # Check issue has required keywords if not all(keyword.lower() in issue_body.lower() for keyword in ISSUE_KEYWORDS): missing_keywords = [kw for kw in ISSUE_KEYWORDS if kw.lower() not in issue_body.lower()] print(f"Error: Issue body missing required keywords: {missing_keywords}", file=sys.stderr) return False # Check issue has initial required labels (enhancement and documentation) issue_label_names = [label["name"] for label in issue.get("labels", [])] initial_required_labels = ["enhancement", "documentation"] for required_label in initial_required_labels: if required_label not in issue_label_names: print(f"Error: Issue missing initial required label: {required_label}", file=sys.stderr) return False # 5. Find the created PR print("5. Verifying pull request creation...") pr = _find_pr_by_title_keywords(PR_TITLE_KEYWORDS, headers, github_org) if not pr: print( "Error: PR with title containing required keywords not found", file=sys.stderr, ) return False pr_number = pr.get("number") pr_body = pr.get("body", "") pr_labels = pr.get("labels", []) # Check PR references issue with correct pattern if f"Fixes #{issue_number}" not in pr_body and f"fixes #{issue_number}" not in pr_body: print(f"Error: PR does not contain 'Fixes #{issue_number}' pattern", file=sys.stderr) return False # Check PR body has required sections and keywords pr_required_sections = ["## Summary", "## Changes", "## Verification"] for section in pr_required_sections: if section not in pr_body: print(f"Error: PR body missing required section: {section}", file=sys.stderr) return False # Check PR has required keywords if not all(keyword.lower() in pr_body.lower() for keyword in PR_KEYWORDS): missing_keywords = [kw for kw in PR_KEYWORDS if kw.lower() not in pr_body.lower()] print(f"Error: PR body missing required keywords: {missing_keywords}", file=sys.stderr) return False # Check PR has sufficient labels (at least 5 from different categories) if len(pr_labels) < 5: print(f"Error: PR has only {len(pr_labels)} labels, needs at least 5", file=sys.stderr) return False # 6. Verify issue has ALL expected/usable labels applied (demonstrates organization) print("6. Verifying issue has all expected labels applied...") issue_label_names = [label["name"] for label in issue.get("labels", [])] # Use our expected labels list instead of all repo labels (excludes unused labels) expected_labels_to_check = ALL_EXPECTED_LABELS missing_labels = [] for expected_label in expected_labels_to_check: if expected_label not in issue_label_names: missing_labels.append(expected_label) if missing_labels: print( f"Error: Issue missing {len(missing_labels)} expected labels: {missing_labels[:5]}...", file=sys.stderr, ) return False print(f" ✓ Issue has all {len(expected_labels_to_check)} expected labels applied") # 7. Verify issue has comment documenting changes print("7. 
Verifying issue comment with documentation...") issue_comments = _get_issue_comments(issue_number, headers, github_org) found_update_comment = False comment_required_keywords = ["documentation created", "label guide complete", "organization complete"] for comment in issue_comments: body = comment.get("body", "") # Check for PR reference and required keywords if (f"PR #{pr_number}" in body and any(keyword.lower() in body.lower() for keyword in comment_required_keywords) and "total" in body.lower() and "labels" in body.lower()): found_update_comment = True break if not found_update_comment: print("Error: Issue missing comment documenting changes with required content", file=sys.stderr) print(" Comment should include: PR reference, label count, and completion keywords", file=sys.stderr) return False # 8. Final verification of complete workflow print("8. Final verification of workflow completion...") # Skip repository label existence check - we trust that our expected labels # are the ones actually discoverable/usable via MCP tools # Ensure expected labels are documented (not all repo labels, since some are unused) documented_label_count = len(documented_labels) expected_label_count = len(ALL_EXPECTED_LABELS) if documented_label_count < expected_label_count: print( f"Error: Documentation incomplete - {documented_label_count} documented vs {expected_label_count} expected", file=sys.stderr, ) return False # Check that all expected labels are documented missing_documented_labels = [] for expected_label in ALL_EXPECTED_LABELS: if expected_label not in documented_labels: missing_documented_labels.append(expected_label) if missing_documented_labels: print( f"Error: Documentation missing expected labels: {missing_documented_labels}", file=sys.stderr, ) return False print(f" ✓ All {expected_label_count} expected labels documented") print(f" ✓ All {len(ALL_EXPECTED_LABELS)} expected labels present and documented") print("\n✓ All verification checks passed!") print("Label documentation workflow completed successfully:") print( f" - Issue #{issue_number}: {issue.get('title')} (with all {len(issue_label_names)} labels)" ) print(f" - PR #{pr_number}: {pr.get('title')}") print(f" - Branch: {BRANCH_NAME}") print(" - Documentation: docs/LABEL_COLORS.md") print(f" - {expected_label_count} labels documented for better organization") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/easyr1/advanced_branch_strategy/description.md ================================================ The EasyR1 repository has a critical production issue: all development happens directly on the `main` branch, which is extremely risky for a project with 25 active issues. A recent commit `098931530606d22f867fd121b1dcb3225a43661f` introduced protocol changes that need to be properly managed through a structured branching workflow. I need you to implement a complete GitFlow strategy by working through a realistic development scenario. **The Scenario:** You're preparing for the v1.0.0 release while simultaneously handling a critical protocol serialization bug that was introduced in the recent data proto changes. **Step 1: Initialize GitFlow Structure** Create a `develop` branch from `main` as the new integration branch. Then create a `release/v1.0.0` branch from `develop` to prepare for the upcoming release. **Step 2: Address the Critical Bug** Create a `feature/protocol-serialization-fix` branch from `develop`. 
In this branch, create a new file called `PROTOCOL_FIXES.md` with the exact content: ``` # Protocol Serialization Fixes ## Critical Fix for Data Proto Issue - Enhanced serialization safety check implemented - Addresses issue from commit 098931530606d22f867fd121b1dcb3225a43661f - Status: Ready for integration testing ``` **Step 3: Integrate the Fix Through Proper Workflow** Create a pull request from `feature/protocol-serialization-fix` to `develop` to integrate the fix documentation. This demonstrates the feature → develop integration pattern. **Step 4: Update Release Branch and CI/CD** Merge the develop branch changes into `release/v1.0.0` branch to include the critical fix in the release. **Step 5: Document the New Process** Create an issue titled `Implement Advanced Branch Protection Strategy` with exactly these 3 checkboxes in the body: - [ ] All development flows through develop branch - [ ] Release preparation happens in release/v1.0.0 branch - [ ] Feature integration uses PR workflow Add the label `process-implementation` to this issue to track the process implementation. ================================================ FILE: tasks/github/standard/easyr1/advanced_branch_strategy/meta.json ================================================ { "task_id": "advanced_branch_strategy", "task_name": "Advanced Branch Strategy", "category_id": "easyr1", "category_name": "EasyR1", "description": "Implement GitFlow branching strategy with develop, release, and feature branches to replace risky direct-to-main development.", "author": "Xiangyan Liu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "pr workflows", "release coordination" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/EasyR1", "stateOriginalUrl": "https://github.com/hiyouga/EasyR1" } } ================================================ FILE: tasks/github/standard/easyr1/advanced_branch_strategy/verify.py ================================================ import sys import os import requests from typing import Dict, Optional, Tuple from dotenv import load_dotenv load_dotenv(".mcp_env") def _get_github_api( endpoint: str, headers: Dict[str, str] ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" github_org = os.environ.get("GITHUB_EVAL_ORG") url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _check_gitflow_branches(headers: Dict[str, str]) -> bool: """Check if GitFlow branches are properly created from correct base branches.""" success, branches_data = _get_github_api("branches", headers) if not success or not branches_data: print("Error: Could not fetch branches", file=sys.stderr) return False existing_branches = [branch.get("name", "") for branch in branches_data] required_branches = [ "develop", "release/v1.0.0", "feature/protocol-serialization-fix", ] for branch in required_branches: if branch not in existing_branches: print(f"Error: Required branch '{branch}' not found", file=sys.stderr) return False return True def _check_protocol_fixes_file(headers: Dict[str, str]) -> bool: """Check if PROTOCOL_FIXES.md file 
exists in feature branch with correct content.""" success, file_data = _get_github_api( "contents/PROTOCOL_FIXES.md?ref=feature/protocol-serialization-fix", headers ) if not success or not file_data: print("Error: PROTOCOL_FIXES.md not found in feature branch", file=sys.stderr) return False # Decode base64 content import base64 content = base64.b64decode(file_data.get("content", "")).decode("utf-8") # Check for required content elements required_elements = [ "# Protocol Serialization Fixes", "## Critical Fix for Data Proto Issue", "Enhanced serialization safety check implemented", "098931530606d22f867fd121b1dcb3225a43661f", "Status: Ready for integration testing", ] for element in required_elements: if element not in content: print( f"Error: PROTOCOL_FIXES.md missing required content: {element}", file=sys.stderr, ) return False return True def _check_integration_workflow(headers: Dict[str, str]) -> Optional[Dict]: """Verify the feature → develop integration pull request exists.""" # Check both open and closed PRs since the workflow may have completed success, prs = _get_github_api("pulls?state=all", headers) if not success or not prs: print("Error: Could not fetch pull requests", file=sys.stderr) return None for pr in prs: head_ref = pr.get("head", {}).get("ref", "") base_ref = pr.get("base", {}).get("ref", "") if head_ref == "feature/protocol-serialization-fix" and base_ref == "develop": return pr print( "Error: Integration PR from feature/protocol-serialization-fix to develop not found", file=sys.stderr, ) return None def _check_release_branch_updated(headers: Dict[str, str]) -> bool: """Check if release branch contains the develop branch changes.""" # Check if PROTOCOL_FIXES.md exists in release branch success, file_data = _get_github_api( "contents/PROTOCOL_FIXES.md?ref=release/v1.0.0", headers ) if not success or not file_data: print( "Error: PROTOCOL_FIXES.md not found in release branch - develop changes not merged", file=sys.stderr, ) return False return True def _check_process_documentation(headers: Dict[str, str]) -> Optional[Dict]: """Check if process is properly documented in an issue.""" success, issues = _get_github_api("issues", headers) if not success or not issues: print("Error: Could not fetch issues for documentation check", file=sys.stderr) return None expected_title = "Implement Advanced Branch Protection Strategy" expected_checkboxes = [ "All development flows through develop branch", "Release preparation happens in release/v1.0.0 branch", "Feature integration uses PR workflow", ] for issue in issues: title = issue.get("title", "") if title == expected_title: body = issue.get("body", "") # Check for exactly 3 checkboxes with specific content checkbox_count = body.count("- [ ]") + body.count("- [x]") if checkbox_count != 3: print( f"Error: Documentation issue should have 3 checkboxes, found {checkbox_count}", file=sys.stderr, ) return None # Check for specific checkbox content for expected_text in expected_checkboxes: if expected_text not in body: print( f"Error: Documentation issue missing required checkbox: {expected_text}", file=sys.stderr, ) return None # Check label assignment labels = issue.get("labels", []) label_names = [label.get("name") for label in labels] if "process-implementation" not in label_names: print( "Error: Documentation issue not labeled with 'process-implementation'", file=sys.stderr, ) return None return issue print("Error: Process documentation issue not found", file=sys.stderr) return None def verify() -> bool: """ Verify the complete GitFlow 
implementation following the integrated workflow described in description.md. """ # Get GitHub token github_token = os.environ.get("MCP_GITHUB_TOKEN") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"token {github_token}", "Accept": "application/vnd.github.v3+json", } print("Verifying integrated GitFlow workflow implementation...") # 1. Verify GitFlow structure initialization print("1. Checking GitFlow branch structure...") if not _check_gitflow_branches(headers): return False # 2. Verify critical bug fix implementation via new file print("2. Checking protocol serialization fix documentation...") if not _check_protocol_fixes_file(headers): return False # 3. Verify integration workflow (feature → develop PR) print("3. Checking feature integration workflow...") integration_pr = _check_integration_workflow(headers) if not integration_pr: return False # 4. Verify release branch updated and CI configured print("4. Checking release branch sync and CI configuration...") if not _check_release_branch_updated(headers): return False # 5. Verify process documentation print("5. Checking process documentation...") doc_issue = _check_process_documentation(headers) if not doc_issue: return False print("\n✓ Integrated GitFlow workflow successfully implemented!") print("✓ GitFlow structure: main → develop → release/v1.0.0 branches created") print("✓ Critical fix: Protocol fix documented in PROTOCOL_FIXES.md file") print( f"✓ Integration: PR #{integration_pr.get('number')} demonstrates feature → develop workflow" ) print( "✓ Release prep: Release branch contains develop changes, CI configured for both branches" ) print( f"✓ Documentation: Process documented in issue #{doc_issue.get('number')} with proper checkboxes" ) print( "\nThe repository now has a structured GitFlow workflow ready for implementation!" ) return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/easyr1/config_parameter_audit/description.md ================================================ I need you to perform a deep investigation into recent configuration changes in our EasyR1 repository that may be causing training instability issues. ## Task Requirements ### 1. Deep Commit Analysis Find the exact commit SHA where the `micro_batch_size_per_device_for_update` parameter was changed from `4` to `1` in the `examples/config.yaml` file. Use GitHub API to: - Examine recent commits that modified `examples/config.yaml` - Get the specific commit diff showing this parameter change - Identify the commit author and timestamp ### 2. Related Parameter Investigation In the same commit you found above, identify what value the `micro_batch_size_per_device_for_experience` parameter was changed to. Document: - The before value for this parameter - The after value for this parameter - The specific line numbers in the diff where these changes occurred ### 3. Issue Search and Verification Search through all GitHub issues (both open and closed) to find issues that contain specific keywords. 
Identify all issue numbers where the issue title or body text contains any of these exact terms: - "OOM" (case insensitive) - "memory" (case insensitive) - "batch" (case insensitive) - "显存" (GPU memory in Chinese) You must find and list ALL issues that contain any of these keywords in their titles or bodies, regardless of whether you think they're related to the parameter changes. ### 4. File Creation and Results Create a file named exactly `ANALYSIS_RESULTS.json` in the repository root with this exact structure: ```json { "target_commit_sha": "full-40-character-commit-sha", "commit_author": "author-username", "commit_date": "YYYY-MM-DD", "parameter_changes": { "micro_batch_size_per_device_for_update": { "before": 4, "after": 1, "line_number": 123 }, "micro_batch_size_per_device_for_experience": { "before": 16, "after": 2, "line_number": 124 } }, "related_issue_number_list": [9, 46] } ``` ### 5. Verification Requirements - The commit SHA must be exactly 40 hexadecimal characters - The parameter values must match the actual repository changes - The issue number must reference a real issue in the repository - All data must be obtained through GitHub API analysis, not guesswork ================================================ FILE: tasks/github/standard/easyr1/config_parameter_audit/meta.json ================================================ { "task_id": "config_parameter_audit", "task_name": "Config Parameter Audit", "category_id": "easyr1", "category_name": "EasyR1", "description": "Investigate configuration changes causing training instability by analyzing commits and identifying related memory issues.", "author": "Xiangyan Liu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "repository analysis", "issue management" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/EasyR1", "stateOriginalUrl": "https://github.com/hiyouga/EasyR1" } } ================================================ FILE: tasks/github/standard/easyr1/config_parameter_audit/verify.py ================================================ import sys import os import json import requests import re from typing import Dict, Optional, Tuple from dotenv import load_dotenv load_dotenv(".mcp_env") def _get_github_api( endpoint: str, headers: Dict[str, str] ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" github_org = os.environ.get("GITHUB_EVAL_ORG") url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _get_analysis_results(headers: Dict[str, str]) -> Optional[Dict]: """Get ANALYSIS_RESULTS.json file content.""" success, file_data = _get_github_api("contents/ANALYSIS_RESULTS.json", headers) if not success: return None # Decode base64 content import base64 content = file_data.get("content", "") if content: try: decoded_content = base64.b64decode(content).decode("utf-8") return json.loads(decoded_content) except Exception as e: print(f"Error parsing JSON: {e}", file=sys.stderr) return None return None def _verify_commit_data(results: Dict, headers: Dict[str, str]) -> bool: """Verify the commit data is accurate.""" 
commit_sha = results.get("target_commit_sha") # Validate SHA format if not re.match(r"^[a-f0-9]{40}$", commit_sha, re.IGNORECASE): print(f"Error: Invalid commit SHA format: {commit_sha}", file=sys.stderr) return False # Get commit details success, commit_data = _get_github_api(f"commits/{commit_sha}", headers) if not success: print(f"Error: Commit {commit_sha} not found in repository", file=sys.stderr) return False # Verify author expected_author = results.get("commit_author") actual_author = commit_data.get("author", {}).get("login") if expected_author != actual_author: print( f"Error: Commit author mismatch. Expected: {expected_author}, Actual: {actual_author}", file=sys.stderr, ) return False # Verify date format commit_date = results.get("commit_date") if not re.match(r"^\d{4}-\d{2}-\d{2}$", commit_date): print( f"Error: Invalid date format: {commit_date}. Expected YYYY-MM-DD", file=sys.stderr, ) return False return True def _verify_parameter_changes(results: Dict, headers: Dict[str, str]) -> bool: """Verify the parameter changes are accurate.""" param_changes = results.get("parameter_changes", {}) # Check required parameters exist required_params = [ "micro_batch_size_per_device_for_update", "micro_batch_size_per_device_for_experience", ] for param in required_params: if param not in param_changes: print(f"Error: Missing parameter change data for: {param}", file=sys.stderr) return False change_data = param_changes[param] if not all(key in change_data for key in ["before", "after", "line_number"]): print( f"Error: Incomplete change data for parameter: {param}", file=sys.stderr ) return False # Verify specific expected values based on known repository state update_param = param_changes.get("micro_batch_size_per_device_for_update", {}) if update_param.get("before") != 4 or update_param.get("after") != 1: print( "Error: Incorrect values for micro_batch_size_per_device_for_update", file=sys.stderr, ) return False experience_param = param_changes.get( "micro_batch_size_per_device_for_experience", {} ) if experience_param.get("before") != 16 or experience_param.get("after") != 2: print( "Error: Incorrect values for micro_batch_size_per_device_for_experience", file=sys.stderr, ) return False return True def _get_all_issues_with_keywords(headers: Dict[str, str]) -> set: """Find all issues in repository that contain the required keywords.""" required_keywords = ["oom", "memory", "batch", "显存"] keyword_issues = set() # Get all issues from repository (both open and closed) page = 1 while True: success, issues = _get_github_api( f"issues?state=all&per_page=100&page={page}", headers ) if not success or not issues: break for issue in issues: issue_number = issue.get("number") title = issue.get("title", "").lower() body = issue.get("body", "").lower() if issue.get("body") else "" issue_text = title + " " + body # Check if any keyword appears in title or body for keyword in required_keywords: if keyword.lower() in issue_text: keyword_issues.add(issue_number) break # If we got less than 100 issues, we're done if len(issues) < 100: break page += 1 return keyword_issues def _verify_issue_references(results: Dict, headers: Dict[str, str]) -> bool: """Verify the issue references contain the required keywords.""" issue_number_list = results.get("related_issue_number_list") if not isinstance(issue_number_list, list) or len(issue_number_list) == 0: print( "Error: related_issue_number_list must be a non-empty list", file=sys.stderr, ) return False # Required keywords to search for (case insensitive) 
required_keywords = ["oom", "memory", "batch", "显存"] # First, dynamically find all issues that contain the required keywords expected_issues = _get_all_issues_with_keywords(headers) print(expected_issues) provided_issues = set(issue_number_list) # Verify each provided issue contains at least one of the required keywords for issue_number in issue_number_list: if not isinstance(issue_number, int) or issue_number <= 0: print( f"Error: Invalid issue number format: {issue_number}", file=sys.stderr ) return False # Get issue details success, issue_data = _get_github_api(f"issues/{issue_number}", headers) if not success: print( f"Error: Issue #{issue_number} not found in repository", file=sys.stderr ) return False # Check if issue title or body contains any required keywords title = issue_data.get("title", "").lower() body = issue_data.get("body", "").lower() if issue_data.get("body") else "" issue_text = title + " " + body issue_has_keyword = False for keyword in required_keywords: if keyword.lower() in issue_text: issue_has_keyword = True break if not issue_has_keyword: print( f"Error: Issue #{issue_number} does not contain any required keywords: {required_keywords}", file=sys.stderr, ) return False # Verify agent found exactly the same issues as our dynamic search if provided_issues != expected_issues: missing = expected_issues - provided_issues extra = provided_issues - expected_issues if missing: print( f"Error: Missing issues that contain required keywords: {missing}", file=sys.stderr, ) if extra: print( f"Error: Extra issues that don't contain required keywords: {extra}", file=sys.stderr, ) return False print( f"✓ Found all {len(issue_number_list)} issues containing required keywords: {issue_number_list}" ) return True def verify() -> bool: """ Programmatically verify that the deep commit analysis meets the requirements. """ # Get GitHub token github_token = os.environ.get("MCP_GITHUB_TOKEN") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"token {github_token}", "Accept": "application/vnd.github.v3+json", } print("Verifying deep commit analysis completion...") # 1. Check ANALYSIS_RESULTS.json exists and is valid JSON print("1. Checking ANALYSIS_RESULTS.json exists and is valid...") results = _get_analysis_results(headers) if not results: print("Error: ANALYSIS_RESULTS.json not found or invalid JSON", file=sys.stderr) return False print("✓ Found valid ANALYSIS_RESULTS.json") # 2. Verify commit data accuracy print("2. Verifying commit data accuracy...") if not _verify_commit_data(results, headers): return False print("✓ Commit SHA, author, and date verified") # 3. Verify parameter changes accuracy print("3. Verifying parameter changes accuracy...") if not _verify_parameter_changes(results, headers): return False print("✓ Parameter changes verified with correct before/after values") # 4. Verify issue references print("4. 
Verifying issue references...") if not _verify_issue_references(results, headers): return False print("\n✓ Task completed successfully!") print("Deep commit analysis results verified:") print(f"- Found target commit: {results.get('target_commit_sha')}") print( "- Verified parameter changes: micro_batch_size_per_device_for_update (4→1), micro_batch_size_per_device_for_experience (16→2)" ) print( f"- Verified memory/performance issue correlations: {results.get('related_issue_number_list')}" ) print("- All data obtained through accurate GitHub API analysis") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/easyr1/performance_regression_investigation/description.md ================================================ In the EasyR1 repo, I've discovered that the recent commit `098931530606d22f867fd121b1dcb3225a43661f` (which fixed data proto) may have introduced performance regressions based on user reports in issues #39 and #41. I need you to create a systematic investigation workflow: **Step 1: Create Main Tracking Issue** Create a main issue with the exact title "Performance Regression Analysis: Data Protocol Changes" and add these 3 labels: "bug", "performance", "investigation". **Step 2: Create Investigation Branches** Create exactly 3 feature branches from main for different investigation tracks: - `investigate-protocol-changes` - for testing protocol-related performance issues - `investigate-batch-processing` - for testing batch processing performance issues - `investigate-memory-usage` - for testing memory utilization performance issues **Step 3: Create Sub-Issues** Create 3 sub-issues and link them to the main tracking issue using sub-issue functionality: - "Test Performance Impact: fix multi modal data oom" - "Test Performance Impact: upgrade vllm to 0.10" - "Test Performance Impact: non blocking false by default" **Step 4: Document Changes** Add at least 2 comments to the main tracking issue documenting the specific file changes from commit `098931530606d22f867fd121b1dcb3225a43661f`. Reference the exact files `verl/protocol.py` and `examples/config.yaml` with their commit SHA. **Step 5: Create Analysis PR** Create a pull request from the `investigate-protocol-changes` branch to main with the exact title "Performance Analysis: Protocol Changes Investigation". 
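For reference, the sub-issue linkage required in Step 3 is the same relationship that the verifier for this task reads back through the `issues/{number}/sub_issues` REST endpoint. A minimal sketch of that read-back, assuming the `MCP_GITHUB_TOKEN` / `GITHUB_EVAL_ORG` environment variables used by `verify.py` and a placeholder issue number (in the benchmark itself the issues are created through the GitHub MCP tools):

```python
import os

import requests


def list_sub_issues(parent_issue: int) -> list:
    """Return the sub-issues linked to the main tracking issue (empty list if none)."""
    org = os.environ["GITHUB_EVAL_ORG"]
    headers = {
        "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
        "Accept": "application/vnd.github.v3+json",
    }
    url = f"https://api.github.com/repos/{org}/EasyR1/issues/{parent_issue}/sub_issues"
    resp = requests.get(url, headers=headers)
    return resp.json() if resp.status_code == 200 else []


# Example (42 is a placeholder for the main tracking issue number):
# for sub in list_sub_issues(42):
#     print(sub["number"], sub["title"])
```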
================================================ FILE: tasks/github/standard/easyr1/performance_regression_investigation/meta.json ================================================ { "task_id": "performance_regression_investigation", "task_name": "Performance Regression Investigation", "category_id": "easyr1", "category_name": "EasyR1", "description": "Create systematic investigation workflow for performance regressions with tracking issues, investigation branches, and sub-issues.", "author": "Xiangyan Liu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "issue management", "repository analysis" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/EasyR1", "stateOriginalUrl": "https://github.com/hiyouga/EasyR1" } } ================================================ FILE: tasks/github/standard/easyr1/performance_regression_investigation/verify.py ================================================ import sys import os import requests from typing import Dict, List, Optional, Tuple from dotenv import load_dotenv load_dotenv(".mcp_env") def _get_github_api( endpoint: str, headers: Dict[str, str] ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" github_org = os.environ.get("GITHUB_EVAL_ORG") url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _find_main_tracking_issue(headers: Dict[str, str]) -> Optional[Dict]: """Find the main tracking issue with exact title and required labels.""" success, issues = _get_github_api("issues?state=open&per_page=50", headers) if not success or not issues: return None for issue in issues: title = issue.get("title", "") if title == "Performance Regression Analysis: Data Protocol Changes": # Check labels labels = [label.get("name", "") for label in issue.get("labels", [])] required_labels = {"bug", "performance", "investigation"} if required_labels.issubset(set(labels)): return issue return None def _check_branches_exist(branch_names: List[str], headers: Dict[str, str]) -> bool: """Check if all required branches exist.""" for branch_name in branch_names: success, _ = _get_github_api(f"branches/{branch_name}", headers) if not success: print(f"Error: Branch '{branch_name}' not found", file=sys.stderr) return False return True def _check_sub_issues( main_issue_number: int, expected_titles: List[str], headers: Dict[str, str] ) -> bool: """Check if sub-issues are created and linked to main issue.""" success, sub_issues = _get_github_api( f"issues/{main_issue_number}/sub_issues", headers ) if not success: # If sub_issues endpoint doesn't exist, check for issues mentioning the main issue success, all_issues = _get_github_api("issues?state=open&per_page=100", headers) if not success: return False sub_issues = [] for issue in all_issues: body = issue.get("body", "") title = issue.get("title", "") # Check if issue references main issue or has expected title pattern if f"#{main_issue_number}" in body or any( expected_title in title for expected_title in expected_titles ): sub_issues.append(issue) if not sub_issues or len(sub_issues) < 3: print( f"Error: Expected 3 
sub-issues linked to main issue #{main_issue_number}", file=sys.stderr, ) return False # Check if sub-issues have expected titles found_titles = [issue.get("title", "") for issue in sub_issues] for expected_title in expected_titles: if not any(expected_title in title for title in found_titles): print( f"Error: Sub-issue with title containing '{expected_title}' not found", file=sys.stderr, ) return False return True def _check_issue_comments(issue_number: int, headers: Dict[str, str]) -> bool: """Check if main issue has at least 2 comments with file references.""" success, comments = _get_github_api(f"issues/{issue_number}/comments", headers) if not success or not comments: print(f"Error: No comments found on issue #{issue_number}", file=sys.stderr) return False if len(comments) < 2: print( f"Error: Expected at least 2 comments on issue #{issue_number}", file=sys.stderr, ) return False # Check if comments reference specific files and commit required_refs = [ "verl/protocol.py", "examples/config.yaml", "0989315", ] comment_text = " ".join([comment.get("body", "") for comment in comments]) for ref in required_refs: if ref not in comment_text: print(f"Error: Comments missing reference to '{ref}'", file=sys.stderr) return False return True def _find_analysis_pr(headers: Dict[str, str]) -> Optional[Dict]: """Find the analysis PR with exact title from specific branch.""" success, prs = _get_github_api("pulls?state=open&per_page=50", headers) if not success or not prs: return None expected_title = "Performance Analysis: Protocol Changes Investigation" expected_head = "investigate-protocol-changes" for pr in prs: title = pr.get("title", "") head_ref = pr.get("head", {}).get("ref", "") if title == expected_title and head_ref == expected_head: return pr return None def verify() -> bool: """ Programmatically verify that the performance regression investigation workflow meets the requirements described in description.md. """ # Get GitHub token github_token = os.environ.get("MCP_GITHUB_TOKEN") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"token {github_token}", "Accept": "application/vnd.github.v3+json", } # Run verification checks print("Verifying performance regression investigation workflow completion...") # 1. Check main tracking issue exists with exact title and labels print("1. Checking main tracking issue with required title and labels...") main_issue = _find_main_tracking_issue(headers) if not main_issue: print( "Error: Main tracking issue not found with exact title 'Performance Regression Analysis: Data Protocol Changes' and labels 'bug', 'performance', 'investigation'", file=sys.stderr, ) return False main_issue_number = main_issue.get("number") print(f"Found main tracking issue #{main_issue_number}") # 2. Check that all 3 investigation branches exist print("2. Checking investigation branches exist...") required_branches = [ "investigate-protocol-changes", "investigate-batch-processing", "investigate-memory-usage", ] if not _check_branches_exist(required_branches, headers): return False # 3. Check sub-issues are created and linked print("3. Checking sub-issues are created and linked...") expected_sub_titles = [ "Test Performance Impact: fix multi modal data oom", "Test Performance Impact: upgrade vllm to 0.10", "Test Performance Impact: non blocking false by default", ] if not _check_sub_issues(main_issue_number, expected_sub_titles, headers): return False # 4. 
Check issue comments document file changes print("4. Checking issue comments document file changes...") if not _check_issue_comments(main_issue_number, headers): return False # 5. Check analysis PR exists with exact title from correct branch print("5. Checking analysis PR exists with exact title and branch...") analysis_pr = _find_analysis_pr(headers) if not analysis_pr: print( "Error: Analysis PR not found with title 'Performance Analysis: Protocol Changes Investigation' from branch 'investigate-protocol-changes'", file=sys.stderr, ) return False print(f"Found analysis PR #{analysis_pr.get('number')}") print("\n✓ Task completed successfully!") print( f"Main tracking issue #{main_issue_number} created with proper labels and documentation" ) print("All 3 investigation branches created for different investigation tracks") print("3 sub-issues created and linked to main tracking issue") print("Issue comments document file changes with commit SHA references") print(f"Analysis PR #{analysis_pr.get('number')} created from correct branch") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/easyr1/qwen3_issue_management/description.md ================================================ The EasyR1 repository has several Qwen3-related issues that were closed but need to be reopened for further investigation. Qwen3 is an important model variant that requires continued attention. I need you to find and reopen all closed issues related to Qwen3 and properly tag them for tracking. **Step 1: Find All Closed Qwen3 Issues** Search for ALL closed issues that mention 'qwen3' (case-insensitive) in either the title or body. Make note of each issue number and title. **Step 2: Reopen Each Qwen3 Issue** For every closed issue that contains 'qwen3' (regardless of when it was closed or any other factors), reopen it by changing its state from closed to open. **Step 3: Add Tracking Label** After reopening each issue, add the label `qwen3-related` to it. This will help track all Qwen3-related issues in the future. **Step 4: Create Summary Issue** Create a new issue titled "Reopened Qwen3 Issues Summary" with the following content in the body: ``` # Qwen3 Issues Reopened The following closed issues containing 'qwen3' have been reopened: [List each reopened issue as: - #NUMBER: TITLE] Total issues reopened: [NUMBER] All reopened issues have been tagged with the `qwen3-related` label for easy tracking. ``` Add the label `qwen3-related` to this summary issue as well. This straightforward workflow ensures all Qwen3-related closed issues are reopened and properly tagged for visibility. 
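The workflow above maps onto a handful of standard GitHub REST calls. A rough sketch, assuming the `MCP_GITHUB_TOKEN` / `GITHUB_EVAL_ORG` variables that `verify.py` reads (in the benchmark the reopening and labelling are done through the GitHub MCP tools, so this is illustrative only):

```python
import os

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
HEADERS = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}

# Step 1: closed issues mentioning 'qwen3' in title or body (first 100 results;
# more than that would need pagination).
search = requests.get(
    "https://api.github.com/search/issues",
    headers=HEADERS,
    params={"q": f"repo:{ORG}/EasyR1 qwen3 state:closed is:issue", "per_page": 100},
)
for item in search.json().get("items", []):
    number = item["number"]
    # Step 2: reopen the issue.
    requests.patch(
        f"https://api.github.com/repos/{ORG}/EasyR1/issues/{number}",
        headers=HEADERS,
        json={"state": "open"},
    )
    # Step 3: attach the tracking label.
    requests.post(
        f"https://api.github.com/repos/{ORG}/EasyR1/issues/{number}/labels",
        headers=HEADERS,
        json={"labels": ["qwen3-related"]},
    )
```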
================================================ FILE: tasks/github/standard/easyr1/qwen3_issue_management/meta.json ================================================ { "task_id": "qwen3_issue_management", "task_name": "Qwen3 Issue Management", "category_id": "easyr1", "category_name": "EasyR1", "description": "Find and reopen all closed Qwen3-related issues with proper tagging for continued tracking and investigation.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "issue management" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/EasyR1", "stateOriginalUrl": "https://github.com/hiyouga/EasyR1" } } ================================================ FILE: tasks/github/standard/easyr1/qwen3_issue_management/verify.py ================================================ import sys import os import requests from typing import Dict, List, Optional, Tuple from dotenv import load_dotenv load_dotenv(".mcp_env") def _get_github_api( endpoint: str, headers: Dict[str, str] ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" github_org = os.environ.get("GITHUB_EVAL_ORG") url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _search_github_issues( query: str, headers: Dict[str, str] ) -> Tuple[bool, Optional[List]]: """Search GitHub issues using the search API.""" url = f"https://api.github.com/search/issues?q={query}&per_page=100" try: response = requests.get(url, headers=headers) if response.status_code == 200: data = response.json() return True, data.get("items", []) else: print(f"Search API error: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Search exception: {e}", file=sys.stderr) return False, None def _check_qwen3_issues_reopened(headers: Dict[str, str]) -> Tuple[bool, List]: """Check if all Qwen3 issues have been reopened and tagged.""" # Search for all issues mentioning qwen3 (both open and closed) github_org = os.environ.get("GITHUB_EVAL_ORG") success, all_qwen3_issues = _search_github_issues( f"repo:{github_org}/EasyR1 qwen3", headers ) if not success or not all_qwen3_issues: print("Error: Could not search for Qwen3 issues", file=sys.stderr) return False, [] reopened_issues = [] issues_not_reopened = [] issues_not_tagged = [] for issue in all_qwen3_issues: issue_number = issue.get("number") issue_state = issue.get("state") issue_title = issue.get("title", "") # Check if the issue is open (should be reopened) if issue_state == "closed": issues_not_reopened.append(f"#{issue_number}: {issue_title}") continue # Check if issue has qwen3-related label labels = [label.get("name") for label in issue.get("labels", [])] if "qwen3-related" not in labels: issues_not_tagged.append(f"#{issue_number}: {issue_title}") else: reopened_issues.append(issue) # Report any issues not properly processed if issues_not_reopened: print("Error: The following Qwen3 issues are still closed:", file=sys.stderr) for issue in issues_not_reopened: print(f" - {issue}", file=sys.stderr) return False, [] if issues_not_tagged: print( "Error: The 
following reopened issues are missing 'qwen3-related' label:", file=sys.stderr, ) for issue in issues_not_tagged: print(f" - {issue}", file=sys.stderr) return False, reopened_issues return True, reopened_issues def _check_summary_issue( headers: Dict[str, str], reopened_issues: List ) -> Optional[Dict]: """Check if the summary issue exists with proper content.""" success, issues = _get_github_api("issues?state=all", headers) if not success or not issues: print("Error: Could not fetch issues for summary check", file=sys.stderr) return None expected_title = "Reopened Qwen3 Issues Summary" for issue in issues: title = issue.get("title", "") if title == expected_title: body = issue.get("body", "") # Check for required content if "# Qwen3 Issues Reopened" not in body: print("Error: Summary issue missing header", file=sys.stderr) return None if ( "The following closed issues containing 'qwen3' have been reopened:" not in body ): print("Error: Summary issue missing description", file=sys.stderr) return None if "Total issues reopened:" not in body: print("Error: Summary issue missing total count", file=sys.stderr) return None if ( "All reopened issues have been tagged with the `qwen3-related` label" not in body ): print("Error: Summary issue missing tagging note", file=sys.stderr) return None # Check if all reopened issues are listed for reopened_issue in reopened_issues: issue_num = reopened_issue.get("number") if f"#{issue_num}" not in body: print( f"Error: Summary issue missing reference to issue #{issue_num}", file=sys.stderr, ) return None # Check if summary issue has the label labels = [label.get("name") for label in issue.get("labels", [])] if "qwen3-related" not in labels: print( "Error: Summary issue missing 'qwen3-related' label", file=sys.stderr, ) return None return issue print( "Error: Summary issue 'Reopened Qwen3 Issues Summary' not found", file=sys.stderr, ) return None def verify() -> bool: """ Verify that all Qwen3-related closed issues have been reopened and tagged. """ # Get GitHub token github_token = os.environ.get("MCP_GITHUB_TOKEN") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"token {github_token}", "Accept": "application/vnd.github.v3+json", } print("Verifying Qwen3 issue reopening workflow...") # 1. Check if all Qwen3 issues have been reopened and tagged print("1. Checking if Qwen3 issues are reopened and tagged...") all_reopened, reopened_issues = _check_qwen3_issues_reopened(headers) if not all_reopened: return False if not reopened_issues: print("Error: No Qwen3 issues found or reopened", file=sys.stderr) return False # 2. Check if summary issue exists print("2. 
Checking summary issue...") summary_issue = _check_summary_issue(headers, reopened_issues) if not summary_issue: return False print("\n✓ Qwen3 issue reopening workflow successfully completed!") print(f"✓ Reopened Issues: {len(reopened_issues)} Qwen3-related issues reopened") print("✓ Tagging: All reopened issues tagged with 'qwen3-related' label") print( f"✓ Summary: Issue #{summary_issue.get('number')} created with complete list of reopened issues" ) print("\nAll Qwen3-related closed issues have been reopened and properly tagged!") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/harmony/fix_conflict/description.md ================================================ I have some pull requests that won't merge due to conflicts. Can you help me fix the merge conflicts by creating the missing infrastructure? **Step 1: Find Conflicted PR** Look through the open pull requests and find the one that has `mergeable: false` and `mergeable_state: "dirty"`. Check what file it's trying to modify - it appears to be missing a file that the PR is trying to add or modify. **Step 2: Create Infrastructure PR** Create a new branch and PR to add the missing file that the conflicted PR needs. The PR must have: - **Title**: Must contain "Add CI infrastructure" and "resolve conflicts" - **Body**: Must include: - Reference to the conflicted PR using "Fixes #[PR_NUMBER]" or "Resolves #[PR_NUMBER]" - Explanation that this "prepares infrastructure" for the other PR - Mention of "missing .github directory" and "workflow conflicts" - **File Content**: Extract the complete file content from the conflicted PR's changes and add it to main. This ensures the conflicted PR can merge cleanly without conflicts. **Step 3: Merge Infrastructure PR** Merge the infrastructure PR to main. **Step 4: Add Comment to Original PR** Add a comment to the original conflicted PR that references the infrastructure PR you just created and merged. The comment must mention the infrastructure PR number using "PR #[NUMBER]" format. **Step 5: Merge Original PR** Now merge the original conflicted PR since it should be able to merge cleanly. 
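As a starting point for Step 1, a minimal sketch of how the conflicted PR can be located. Note that `mergeable` and `mergeable_state` are only populated on the single-PR endpoint, so each open PR is fetched individually; the environment variables are assumptions carried over from `verify.py`:

```python
import os

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
HEADERS = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
BASE = f"https://api.github.com/repos/{ORG}/harmony"

open_prs = requests.get(f"{BASE}/pulls", headers=HEADERS, params={"state": "open"}).json()
for pr in open_prs:
    # The list endpoint omits mergeability, so fetch each PR on its own.
    detail = requests.get(f"{BASE}/pulls/{pr['number']}", headers=HEADERS).json()
    if detail.get("mergeable") is False and detail.get("mergeable_state") == "dirty":
        print(f"Conflicted PR: #{detail['number']} {detail['title']}")
```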
================================================ FILE: tasks/github/standard/harmony/fix_conflict/meta.json ================================================ { "task_id": "fix_conflict", "task_name": "Fix Conflict", "category_id": "harmony", "category_name": "Harmony", "description": "Resolve merge conflicts by creating missing infrastructure and ensuring conflicted PRs can merge cleanly.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "pr workflows" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/harmony", "stateOriginalUrl": "https://github.com/openai/harmony" } } ================================================ FILE: tasks/github/standard/harmony/fix_conflict/verify.py ================================================ import sys import os import requests from typing import Dict, Optional, Tuple from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _check_ci_file_exists( file_path: str, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> bool: """Check if CI file exists in main branch.""" success, _ = _get_github_api(f"contents/{file_path}?ref=main", headers, org, repo) return success def _check_pr_comments( pr_number: int, infra_pr_number: int, headers: Dict[str, str], org: str, repo: str = "harmony", ) -> bool: """Check if PR has a comment linking to the infrastructure PR using 'PR #[NUMBER]' format.""" success, comments = _get_github_api( f"issues/{pr_number}/comments", headers, org, repo ) if not success or not comments: return False # Look for "PR #123" pattern (case insensitive) import re for comment in comments: body = comment.get("body", "") if re.search(rf"PR\s*#{infra_pr_number}", body, re.IGNORECASE): return True return False def _find_infrastructure_pr( headers: Dict[str, str], org: str, repo: str = "harmony" ) -> Optional[Dict]: """Find the infrastructure PR by checking title and body content.""" success, prs = _get_github_api("pulls?state=all&per_page=50", headers, org, repo) if success and prs: for pr in prs: title = pr.get("title", "").lower() body = pr.get("body", "").lower() # Check title contains required keywords title_ok = "add ci infrastructure" in title and "resolve conflicts" in title # Check body contains required elements has_reference = "fixes #" in body or "resolves #" in body has_prep_text = "prepares infrastructure" in body has_github_text = "missing .github directory" in body has_workflow_text = "workflow conflicts" in body body_ok = ( has_reference and has_prep_text and has_github_text and has_workflow_text ) if title_ok and body_ok: return pr return None def verify() -> bool: """ Programmatically verify that the merge conflict resolution workflow meets the requirements described in description.md. 
""" # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"token {github_token}", "Accept": "application/vnd.github.v3+json", } # Run verification checks print("Verifying merge conflict resolution workflow completion...") # 1. Check that CI infrastructure file exists in main (extracted from conflicted PR) print("1. Checking CI infrastructure was added to main...") # Check for both CI.yml and ci.yml (case-insensitive) ci_exists = _check_ci_file_exists(".github/workflows/CI.yml", headers, github_org) if not ci_exists: ci_exists = _check_ci_file_exists(".github/workflows/ci.yml", headers, github_org) if not ci_exists: print("Error: Neither .github/workflows/CI.yml nor .github/workflows/ci.yml found in main", file=sys.stderr) return False # 2. Find infrastructure PR with required title and body content print("2. Finding infrastructure PR with required content...") infra_pr = _find_infrastructure_pr(headers, github_org) if not infra_pr: print( "Error: No infrastructure PR found with required title and body content", file=sys.stderr, ) print( "Required title: 'Add CI infrastructure' and 'resolve conflicts'", file=sys.stderr, ) print( "Required body: reference with 'Fixes #' or 'Resolves #', 'prepares infrastructure', 'missing .github directory', 'workflow conflicts'", file=sys.stderr, ) return False print(f"Found infrastructure PR #{infra_pr.get('number')}: {infra_pr.get('title')}") # 3. Check that infrastructure PR is merged if not infra_pr.get("merged_at"): print( f"Error: Infrastructure PR #{infra_pr.get('number')} not merged yet", file=sys.stderr, ) return False # 4. Check that PR #24 is merged print("3. Checking that PR #24 is merged...") success, pr24 = _get_github_api("pulls/24", headers, github_org) if not success or not pr24: print("Error: PR #24 not found", file=sys.stderr) return False if not pr24.get("merged_at"): print("Error: PR #24 is not merged yet", file=sys.stderr) return False # 5. Check that PR #24 has a comment linking to the infrastructure PR print("4. Checking that PR #24 has comment linking to infrastructure PR...") if not _check_pr_comments(24, infra_pr.get("number"), headers, github_org): print( f"Error: PR #24 missing comment linking to infrastructure PR #{infra_pr.get('number')}", file=sys.stderr, ) return False print("\n✓ Task completed successfully!") print( f"Infrastructure PR #{infra_pr.get('number')} extracted content from PR #24 and resolved conflicts" ) print( "PR #24 is now merged cleanly and has a comment linking to the infrastructure PR" ) return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/harmony/issue_pr_commit_workflow/description.md ================================================ I need you to implement a complete bug tracking and resolution workflow that demonstrates proper cross-referencing between issues, pull requests, and commits. 
Here's what you need to do:

**Step 1: Create Issue for Race Condition Bug**

Create a new issue with:
- Title containing: 'race condition', 'HarmonyEncoding', 'concurrent access'
- Body must include:
  - A "## Problem" heading describing threading issues
  - A "## Root Cause" heading about file locking
  - A "## Expected Solution" heading with bullet points
  - References to issues #6 and #1
  - Keywords: "multiple threads", "tokenizer file downloads", "mutex-based file locking"

**Step 2: Create Feature Branch**

Create a new branch called 'fix/race-condition-tokenizer-loading' from main.

**Step 3: Implement Thread-Safe Loading**

On the feature branch, create/update the file `src/concurrent_loading.rs` with:

```rust
use std::sync::Mutex;
use std::sync::OnceLock;

// Thread-safe tokenizer loading with file locks
static DOWNLOAD_MUTEX: OnceLock<Mutex<()>> = OnceLock::new();

pub fn load_harmony_encoding_safe(name: &str) -> Result<HarmonyEncoding, HarmonyError> {
    let _guard = DOWNLOAD_MUTEX.get_or_init(|| Mutex::new(())).lock().unwrap();
    // Implementation for thread-safe loading
    // Addresses race condition from issue #6
    Ok(HarmonyEncoding::new())
}

pub fn load_harmony_encoding_from_file(path: &str) -> Result<HarmonyEncoding, HarmonyError> {
    // Offline loading API as requested in issue #1
    HarmonyEncoding::from_file(path)
}
```

**Step 4: Create Pull Request with Cross-References**

Create a pull request from 'fix/race-condition-tokenizer-loading' to 'main' with:
- Title containing: 'Fix race condition', 'tokenizer loading', 'threading issues'
- Body must include:
  - A "## Summary" heading explaining the fix
  - A "## Changes" heading with bullet points about mutex implementation
  - A "## Testing" heading mentioning related issues
  - "Closes #[ISSUE_NUMBER]" pattern linking to your created issue
  - References to #1 and #6
  - Keywords: "thread-safe", "concurrent downloads", "offline loading API"

**Step 5: Add PR Review Comments**

Create a pending review and add a review comment to the PR with:
- Technical analysis of the implementation approach
- Discussion of thread safety mechanisms
- Keywords that must be included: "OnceLock", "mutex", "thread safety", "concurrent access"
- Reference to issue #1 and the offline loading capability
- Explanation of how the solution prevents race conditions

Then submit the review as a COMMENT type review.

**Step 6: Update Issue with Implementation Details**

Add a comment to the original issue you created with:
- Reference to the PR number using "PR #[NUMBER]" pattern
- Technical details about the mutex-based solution
- Keywords: "std::sync::Mutex", "OnceLock", "thread-safe initialization"
- Mention of key implementation changes (DOWNLOAD_MUTEX, offline loading)
- Reference back to issue #1 for offline loading requirement

**Step 7: Close the Issue**

Close the issue you created by updating its state to 'closed' with state_reason 'completed'.
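A rough, non-authoritative outline of the REST calls behind Steps 5 and 7 (in the benchmark these actions go through the GitHub MCP tools). The PR and issue numbers are placeholders, and the environment variables are the ones `verify.py` reads:

```python
import os

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
HEADERS = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
BASE = f"https://api.github.com/repos/{ORG}/harmony"
PR_NUMBER, ISSUE_NUMBER = 1, 1  # placeholders only

# Step 5: creating a review without an `event` leaves it in the PENDING state
# (inline file comments would go in a `comments` array on this call).
review = requests.post(
    f"{BASE}/pulls/{PR_NUMBER}/reviews",
    headers=HEADERS,
    json={"body": "Analysis of the OnceLock/mutex approach to thread safety and concurrent access; see issue #1."},
).json()

# ...then the pending review is submitted as a COMMENT-type review.
requests.post(
    f"{BASE}/pulls/{PR_NUMBER}/reviews/{review['id']}/events",
    headers=HEADERS,
    json={"event": "COMMENT"},
)

# Step 7: close the issue with state_reason 'completed'.
requests.patch(
    f"{BASE}/issues/{ISSUE_NUMBER}",
    headers=HEADERS,
    json={"state": "closed", "state_reason": "completed"},
)
```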
================================================ FILE: tasks/github/standard/harmony/issue_pr_commit_workflow/meta.json ================================================ { "task_id": "issue_pr_commit_workflow", "task_name": "Issue Pr Commit Workflow", "category_id": "harmony", "category_name": "Harmony", "description": "Implement complete bug tracking workflow demonstrating proper cross-referencing between issues, PRs, and commits for race condition fixes.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "issue management", "pr workflows" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/harmony", "stateOriginalUrl": "https://github.com/openai/harmony" } } ================================================ FILE: tasks/github/standard/harmony/issue_pr_commit_workflow/verify.py ================================================ import sys import os import requests from typing import Dict, List, Optional, Tuple import base64 from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _check_branch_exists( branch_name: str, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> bool: """Verify that a branch exists in the repository.""" success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo) return success def _check_file_content( branch: str, file_path: str, keywords: List[str], headers: Dict[str, str], org: str, repo: str = "harmony", ) -> bool: """Verify that a file exists in branch and contains required keywords.""" success, result = _get_github_api( f"contents/{file_path}?ref={branch}", headers, org, repo ) if not success or not result: return False if keywords and result.get("content"): try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return all(keyword in content for keyword in keywords) except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return False return True def _find_issue_by_title( title_substring: str, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> Optional[Dict]: """Find an issue by title substring and return the issue data.""" # Check both open and closed issues for state in ["open", "closed"]: success, issues = _get_github_api( f"issues?state={state}&per_page=100", headers, org, repo ) if success and issues: for issue in issues: if title_substring.lower() in issue.get("title", "").lower(): return issue return None def _find_pr_by_title( title_substring: str, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> Optional[Dict]: """Find a PR by title substring and return the PR data.""" # Check both open and closed PRs for state in ["open", "closed"]: success, prs = _get_github_api( f"pulls?state={state}&per_page=100", headers, org, repo ) if success and prs: for pr in prs: if title_substring.lower() in pr.get("title", "").lower(): return pr return None def 
_check_issue_references(issue_body: str, reference_numbers: List[str]) -> bool: """Check if issue body contains references to specified issue numbers.""" if not issue_body: return False return all(f"#{ref}" in issue_body for ref in reference_numbers) def _check_pr_references( pr_body: str, issue_number: int, reference_numbers: List[str] ) -> bool: """Check if PR body contains proper references.""" if not pr_body: return False # Check for "Closes #X" pattern closes_pattern = ( f"Closes #{issue_number}" in pr_body or f"closes #{issue_number}" in pr_body ) # Check for other references refs_present = all(f"#{ref}" in pr_body for ref in reference_numbers) return closes_pattern and refs_present def _get_issue_comments( issue_number: int, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> List[Dict]: """Get all comments for an issue.""" success, comments = _get_github_api( f"issues/{issue_number}/comments", headers, org, repo ) if success and comments: return comments return [] def _get_pr_reviews( pr_number: int, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> List[Dict]: """Get all reviews for a PR.""" success, reviews = _get_github_api(f"pulls/{pr_number}/reviews", headers, org, repo) if success and reviews: return reviews return [] def _check_issue_comment_references( comments: List[Dict], pr_number: int, keywords: List[str] ) -> bool: """Check if issue has a comment referencing the PR number with required technical keywords.""" for comment in comments: body = comment.get("body", "") has_pr_ref = ( f"PR #{pr_number}" in body or f"PR#{pr_number}" in body or f"pr #{pr_number}" in body.lower() ) has_keywords = all(keyword.lower() in body.lower() for keyword in keywords) if has_pr_ref and has_keywords: return True return False def _check_title_keywords(title: str, required_keywords: List[str]) -> bool: """Check if title contains all required keywords.""" return all(keyword.lower() in title.lower() for keyword in required_keywords) def _check_headings_and_content( body: str, headings: List[str], keywords: List[str] ) -> bool: """Check if body contains required headings and keywords.""" has_headings = all(heading in body for heading in headings) has_keywords = all(keyword.lower() in body.lower() for keyword in keywords) return has_headings and has_keywords def _check_pr_review_content(reviews: List[Dict], keywords: List[str]) -> bool: """Check if PR has review comments containing required keywords.""" for review in reviews: body = review.get("body", "") if body and all(keyword.lower() in body.lower() for keyword in keywords): return True return False def verify() -> bool: """ Programmatically verify that the issue-PR-commit workflow meets the requirements described in description.md. 
""" # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False # Configuration constants BRANCH_NAME = "fix/race-condition-tokenizer-loading" ISSUE_TITLE_SUBSTRING = "race condition in HarmonyEncoding" PR_TITLE_SUBSTRING = "Fix race condition in tokenizer loading" # File content checks RUST_FILE_KEYWORDS = [ "DOWNLOAD_MUTEX", "OnceLock<Mutex<()>>", "load_harmony_encoding_safe", "load_harmony_encoding_from_file", "Thread-safe tokenizer loading", ] # Issue content requirements ISSUE_TITLE_KEYWORDS = ["race condition", "HarmonyEncoding", "concurrent access"] ISSUE_REFERENCE_NUMBERS = ["6", "1"] ISSUE_HEADINGS = ["## Problem", "## Root Cause", "## Expected Solution"] ISSUE_KEYWORDS = [ "multiple threads", "tokenizer file downloads", "mutex-based file locking", ] # PR content requirements PR_TITLE_KEYWORDS = ["Fix race condition", "tokenizer loading", "threading issues"] PR_REFERENCE_NUMBERS = ["1", "6"] PR_HEADINGS = ["## Summary", "## Changes", "## Testing"] PR_KEYWORDS = ["thread-safe", "concurrent downloads", "offline loading API"] # Review comment requirements REVIEW_KEYWORDS = ["OnceLock", "mutex", "thread safety", "concurrent access"] # Issue comment requirements ISSUE_COMMENT_KEYWORDS = [ "std::sync::Mutex", "OnceLock", "thread-safe initialization", "DOWNLOAD_MUTEX", ] headers = { "Authorization": f"token {github_token}", "Accept": "application/vnd.github.v3+json", } # Run verification checks print("Verifying GitHub issue-PR-commit workflow completion...") # 1. Check that feature branch exists print("1. Verifying feature branch exists...") if not _check_branch_exists(BRANCH_NAME, headers, github_org): print(f"Error: Branch '{BRANCH_NAME}' not found", file=sys.stderr) return False # 2. Check that the Rust implementation file exists with required content print("2. Verifying concurrent_loading.rs implementation...") if not _check_file_content( BRANCH_NAME, "src/concurrent_loading.rs", RUST_FILE_KEYWORDS, headers, github_org, ): print( "Error: src/concurrent_loading.rs not found or missing required content", file=sys.stderr, ) return False # 3. Find the created issue print("3. Verifying issue creation and content...") issue = _find_issue_by_title(ISSUE_TITLE_SUBSTRING, headers, github_org) if not issue: print( f"Error: Issue with title containing '{ISSUE_TITLE_SUBSTRING}' not found", file=sys.stderr, ) return False issue_number = issue.get("number") issue_title = issue.get("title", "") issue_body = issue.get("body", "") # Check issue title keywords if not _check_title_keywords(issue_title, ISSUE_TITLE_KEYWORDS): print("Error: Issue title missing required keywords", file=sys.stderr) return False # Check issue headings, content and references if not _check_headings_and_content(issue_body, ISSUE_HEADINGS, ISSUE_KEYWORDS): print("Error: Issue missing required headings or keywords", file=sys.stderr) return False if not _check_issue_references(issue_body, ISSUE_REFERENCE_NUMBERS): print( "Error: Issue does not reference required issues #6 and #1", file=sys.stderr ) return False # 4. Find the created PR print("4. 
Verifying pull request creation and content...") pr = _find_pr_by_title(PR_TITLE_SUBSTRING, headers, github_org) if not pr: print( f"Error: PR with title containing '{PR_TITLE_SUBSTRING}' not found", file=sys.stderr, ) return False pr_number = pr.get("number") pr_title = pr.get("title", "") pr_body = pr.get("body", "") # Check PR title keywords if not _check_title_keywords(pr_title, PR_TITLE_KEYWORDS): print("Error: PR title missing required keywords", file=sys.stderr) return False # Check PR headings and content if not _check_headings_and_content(pr_body, PR_HEADINGS, PR_KEYWORDS): print("Error: PR missing required headings or keywords", file=sys.stderr) return False # Check PR references if not _check_pr_references(pr_body, issue_number, PR_REFERENCE_NUMBERS): print( f"Error: PR does not properly reference issue #{issue_number} or issues #1, #6", file=sys.stderr, ) return False # 5. Check PR review comments print("5. Verifying PR review comments...") reviews = _get_pr_reviews(pr_number, headers, github_org) if not _check_pr_review_content(reviews, REVIEW_KEYWORDS): print( "Error: PR missing review comment with required technical keywords", file=sys.stderr, ) return False # 6. Check issue comments for PR reference with technical keywords print("6. Verifying issue comment referencing PR...") issue_comments = _get_issue_comments(issue_number, headers, github_org) if not _check_issue_comment_references( issue_comments, pr_number, ISSUE_COMMENT_KEYWORDS ): print( f"Error: Issue #{issue_number} missing comment referencing PR #{pr_number} with required technical keywords", file=sys.stderr, ) return False # 7. Check issue is closed print("7. Verifying issue closure...") if issue.get("state") != "closed": print(f"Error: Issue #{issue_number} is not closed", file=sys.stderr) return False print("\n✓ All verification checks passed!") print("Issue-PR-commit workflow completed successfully:") print(f" - Issue #{issue_number}: {issue.get('title')}") print(f" - PR #{pr_number}: {pr.get('title')}") print(f" - Branch: {BRANCH_NAME}") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/harmony/issue_tagging_pr_closure/description.md ================================================ I need you to simulate a realistic development workflow where an enhancement issue is created, implementation is attempted via a pull request, but then the PR must be closed without merging due to technical constraints discovered during the implementation process. **Step 1: Create Enhancement Issue** Create a new issue with: - Title containing: "Upgrade JavaScript demo to use ESM imports" and "modern module system" - Body must include: - A "## Problem" heading describing CommonJS limitations - A "## Proposed Solution" heading about ESM migration - A "## Benefits" heading listing advantages - Reference to issue #26 (which is about JavaScript demo issues) - Keywords: "CommonJS", "ESM imports", "module bundling", "modern JavaScript" - Labels: Add "enhancement" label to the issue **Step 2: Create Feature Branch** Create a new branch called 'feat/esm-migration-attempt' from main. 
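For reference only (a hint, not an extra requirement): creating the branch is a two-call sequence against the Git refs API. A minimal Python sketch, assuming hypothetical `OWNER`/`REPO` placeholders for the evaluation fork and that `MCP_GITHUB_TOKEN` is set:

```python
# Illustrative sketch only: create 'feat/esm-migration-attempt' from main via the Git refs API.
# OWNER and REPO are hypothetical placeholders for the evaluation fork.
import os

import requests

OWNER, REPO = "mcpmark-source", "harmony"  # assumption
BRANCH = "feat/esm-migration-attempt"
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}

# 1) Resolve the commit SHA that main currently points to.
base = requests.get(f"{API}/git/ref/heads/main", headers=headers)
base.raise_for_status()
main_sha = base.json()["object"]["sha"]

# 2) Create refs/heads/<branch> pointing at that SHA.
resp = requests.post(
    f"{API}/git/refs",
    headers=headers,
    json={"ref": f"refs/heads/{BRANCH}", "sha": main_sha},
)
resp.raise_for_status()
```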
**Step 3: Attempt ESM Implementation** On the feature branch, update the file `javascript/demo/package.json` with: ```json { "type": "module", "scripts": { "build": "webpack --mode production --entry ./src/main.js" }, "dependencies": { "@openai/harmony": "^0.1.0", "webpack": "^5.0.0" } } ``` Also create `javascript/demo/src/main.js` with: ```javascript // ESM import attempt - fails due to harmony core requirements import { HarmonyEncoding } from '@openai/harmony'; // This breaks the existing CommonJS integration // harmony core requires specific CommonJS patterns export const initHarmony = () => { throw new Error("ESM migration incompatible with harmony core"); }; ``` **Step 4: Create Pull Request** Create a pull request from 'feat/esm-migration-attempt' to 'main' with: - Title containing: "Upgrade JavaScript demo to ESM imports" and "modern modules" - Body must include: - A "## Summary" heading explaining the attempted migration - A "## Changes" heading with bullet points about ESM implementation - A "## Issues Discovered" heading describing technical problems found - "Addresses #[ISSUE_NUMBER]" pattern linking to your created issue - Keywords: "ESM migration", "webpack configuration", "module compatibility", "breaking changes" - Labels: Add "enhancement" and "needs-investigation" labels to the PR **Step 5: Investigate and Document Problems** Add a comment to the PR explaining the technical barriers discovered. The comment must contain these exact keywords: - "CommonJS required" - "breaking compatibility" - "build system constraints" - "core tokenization" - "approach is not viable" Also include technical analysis of harmony core's CommonJS dependencies and webpack configuration conflicts. **Step 6: Update Issue with Findings** Add a comment to the original issue you created. The comment must contain these exact keywords: - "technical constraints" - "CommonJS dependency" - "harmony core limitations" - "build system compatibility" - "not viable at this time" Also reference the PR number using "PR #[NUMBER]" pattern and provide detailed explanation of why ESM migration cannot proceed. **Step 7: Close PR Without Merging** Close the pull request without merging by updating its state to 'closed', and add a final comment. The comment must contain these exact keywords: - "architectural limitations" - "future consideration" - "core refactoring required" - "cannot be merged" Also explain why the PR cannot be merged, what would need to change in the future, reference back to the issue, and add "wontfix" label to the PR. **Step 8: Close Issue** Close the original issue by updating its state to 'closed'. 
Add a final comment to the issue that must contain these exact keywords: - "closing as not planned" - "architectural constraints" - "future implementation blocked" - "requires core redesign" ================================================ FILE: tasks/github/standard/harmony/issue_tagging_pr_closure/meta.json ================================================ { "task_id": "issue_tagging_pr_closure", "task_name": "Issue Tagging Pr Closure", "category_id": "harmony", "category_name": "Harmony", "description": "Simulate development workflow where enhancement PR is closed without merging due to technical constraints discovered during implementation.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "issue management", "pr workflows" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/harmony", "stateOriginalUrl": "https://github.com/openai/harmony" } } ================================================ FILE: tasks/github/standard/harmony/issue_tagging_pr_closure/verify.py ================================================ import sys import os import requests from typing import Dict, List, Optional, Tuple from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _check_branch_exists( branch_name: str, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> bool: """Verify that a branch exists in the repository.""" success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo) return success def _check_file_content( branch: str, file_path: str, keywords: List[str], headers: Dict[str, str], org: str, repo: str = "harmony", ) -> bool: """Verify that a file exists in branch and contains required keywords.""" import base64 success, result = _get_github_api( f"contents/{file_path}?ref={branch}", headers, org, repo ) if not success or not result: return False if keywords and result.get("content"): try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return all(keyword in content for keyword in keywords) except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return False return True def _find_issue_by_title_keywords( title_keywords: List[str], headers: Dict[str, str], org: str, repo: str = "harmony" ) -> Optional[Dict]: """Find an issue by title keywords and return the issue data.""" for state in ["open", "closed"]: success, issues = _get_github_api( f"issues?state={state}&per_page=100", headers, org, repo ) if success and issues: for issue in issues: title = issue.get("title", "").lower() if all(keyword.lower() in title for keyword in title_keywords): return issue return None def _find_pr_by_title_keywords( title_keywords: List[str], headers: Dict[str, str], org: str, repo: str = "harmony" ) -> Optional[Dict]: """Find a PR by title keywords and return the PR data.""" for state in ["open", "closed"]: success, prs = _get_github_api( 
f"pulls?state={state}&per_page=100", headers, org, repo ) if success and prs: for pr in prs: title = pr.get("title", "").lower() if all(keyword.lower() in title for keyword in title_keywords): return pr return None def _check_labels(labels: List[Dict], required_labels: List[str]) -> bool: """Check if required labels are present.""" label_names = [label.get("name", "").lower() for label in labels] return all(req_label.lower() in label_names for req_label in required_labels) def _check_headings_and_keywords( body: str, headings: List[str], keywords: List[str] ) -> bool: """Check if body contains required headings and keywords.""" if not body: return False has_headings = all(heading in body for heading in headings) has_keywords = all(keyword.lower() in body.lower() for keyword in keywords) return has_headings and has_keywords def _check_issue_reference(body: str, issue_number: int) -> bool: """Check if body contains reference to the issue.""" if not body: return False return f"#{issue_number}" in body or f"Addresses #{issue_number}" in body def _get_issue_comments( issue_number: int, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> List[Dict]: """Get all comments for an issue.""" success, comments = _get_github_api( f"issues/{issue_number}/comments", headers, org, repo ) if success and comments: return comments return [] def _get_pr_comments( pr_number: int, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> List[Dict]: """Get all comments for a PR.""" success, comments = _get_github_api( f"issues/{pr_number}/comments", headers, org, repo ) if success and comments: return comments return [] def _check_pr_technical_comment(comments: List[Dict], keywords: List[str]) -> bool: """Check if PR has a comment with technical analysis containing required keywords.""" for comment in comments: body = comment.get("body", "") if body and all(keyword.lower() in body.lower() for keyword in keywords): return True return False def _check_issue_comment_with_pr_ref( comments: List[Dict], pr_number: int, keywords: List[str] ) -> bool: """Check if issue has a comment referencing the PR with required keywords.""" for comment in comments: body = comment.get("body", "") has_pr_ref = ( f"PR #{pr_number}" in body or f"PR#{pr_number}" in body or f"pr #{pr_number}" in body.lower() ) has_keywords = all(keyword.lower() in body.lower() for keyword in keywords) if has_pr_ref and has_keywords: return True return False def verify() -> bool: """ Programmatically verify that the issue tagging and PR closure workflow meets the requirements described in description.md. 
""" # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False # Configuration constants BRANCH_NAME = "feat/esm-migration-attempt" # Issue requirements ISSUE_TITLE_KEYWORDS = [ "Upgrade JavaScript demo to use ESM imports", "modern module system", ] ISSUE_HEADINGS = ["## Problem", "## Proposed Solution", "## Benefits"] ISSUE_KEYWORDS = ["CommonJS", "ESM imports", "module bundling", "modern JavaScript"] ISSUE_LABELS = ["enhancement"] # PR requirements PR_TITLE_KEYWORDS = ["Upgrade JavaScript demo to ESM imports", "modern modules"] PR_HEADINGS = ["## Summary", "## Changes", "## Issues Discovered"] PR_KEYWORDS = [ "ESM migration", "webpack configuration", "module compatibility", "breaking changes", ] PR_LABELS = ["enhancement", "needs-investigation", "wontfix"] # File content requirements PACKAGE_JSON_KEYWORDS = ['"type": "module"', "webpack", "@openai/harmony"] MAIN_JS_KEYWORDS = [ "import { HarmonyEncoding }", "ESM import attempt", "harmony core", ] # Comment requirements PR_TECHNICAL_KEYWORDS = [ "CommonJS required", "breaking compatibility", "build system constraints", "core tokenization", "approach is not viable", ] ISSUE_COMMENT_KEYWORDS = [ "technical constraints", "CommonJS dependency", "harmony core limitations", "build system compatibility", "not viable at this time", ] PR_CLOSURE_KEYWORDS = [ "architectural limitations", "future consideration", "core refactoring required", "cannot be merged", ] ISSUE_CLOSURE_KEYWORDS = [ "closing as not planned", "architectural constraints", "future implementation blocked", "requires core redesign", ] headers = { "Authorization": f"token {github_token}", "Accept": "application/vnd.github.v3+json", } # Run verification checks print("Verifying issue tagging and PR closure workflow completion...") # 1. Check that feature branch exists print("1. Verifying feature branch exists...") if not _check_branch_exists(BRANCH_NAME, headers, github_org): print(f"Error: Branch '{BRANCH_NAME}' not found", file=sys.stderr) return False # 2. Check that implementation files exist with required content print("2. Verifying ESM implementation files...") if not _check_file_content( BRANCH_NAME, "javascript/demo/package.json", PACKAGE_JSON_KEYWORDS, headers, github_org, ): print( "Error: javascript/demo/package.json not found or missing required content", file=sys.stderr, ) return False if not _check_file_content( BRANCH_NAME, "javascript/demo/src/main.js", MAIN_JS_KEYWORDS, headers, github_org, ): print( "Error: javascript/demo/src/main.js not found or missing required content", file=sys.stderr, ) return False # 3. Find the created issue print("3. 
Verifying issue creation and content...") issue = _find_issue_by_title_keywords(ISSUE_TITLE_KEYWORDS, headers, github_org) if not issue: print( "Error: Issue with title containing required keywords not found", file=sys.stderr, ) return False issue_number = issue.get("number") issue_body = issue.get("body", "") issue_labels = issue.get("labels", []) # Check issue content if not _check_headings_and_keywords(issue_body, ISSUE_HEADINGS, ISSUE_KEYWORDS): print("Error: Issue missing required headings or keywords", file=sys.stderr) return False # Check issue references #26 if "#26" not in issue_body: print("Error: Issue does not reference issue #26", file=sys.stderr) return False # Check issue labels if not _check_labels(issue_labels, ISSUE_LABELS): print(f"Error: Issue missing required labels: {ISSUE_LABELS}", file=sys.stderr) return False # 4. Find the created PR print("4. Verifying pull request creation and content...") pr = _find_pr_by_title_keywords(PR_TITLE_KEYWORDS, headers, github_org) if not pr: print( "Error: PR with title containing required keywords not found", file=sys.stderr, ) return False pr_number = pr.get("number") pr_body = pr.get("body", "") pr_labels = pr.get("labels", []) pr_state = pr.get("state") # Check PR content if not _check_headings_and_keywords(pr_body, PR_HEADINGS, PR_KEYWORDS): print("Error: PR missing required headings or keywords", file=sys.stderr) return False # Check PR references issue if not _check_issue_reference(pr_body, issue_number): print(f"Error: PR does not reference issue #{issue_number}", file=sys.stderr) return False # Check PR labels if not _check_labels(pr_labels, PR_LABELS): print(f"Error: PR missing required labels: {PR_LABELS}", file=sys.stderr) return False # 5. Check PR is closed (not merged) print("5. Verifying PR is closed without merging...") if pr_state != "closed": print(f"Error: PR #{pr_number} is not closed", file=sys.stderr) return False if pr.get("merged_at"): print( f"Error: PR #{pr_number} was merged (should be closed without merging)", file=sys.stderr, ) return False # 6. Check PR technical analysis comment print("6. Verifying PR technical analysis comment...") pr_comments = _get_pr_comments(pr_number, headers, github_org) if not _check_pr_technical_comment(pr_comments, PR_TECHNICAL_KEYWORDS): print( "Error: PR missing technical analysis comment with required keywords", file=sys.stderr, ) return False # 7. Check issue comment with PR reference print("7. Verifying issue comment referencing PR...") issue_comments = _get_issue_comments(issue_number, headers, github_org) if not _check_issue_comment_with_pr_ref( issue_comments, pr_number, ISSUE_COMMENT_KEYWORDS ): print( f"Error: Issue #{issue_number} missing comment referencing PR #{pr_number} with required keywords", file=sys.stderr, ) return False # 8. Check PR closure comment with required keywords print("8. Verifying PR closure comment...") pr_closure_comment_found = False for comment in pr_comments: body = comment.get("body", "") if body and all( keyword.lower() in body.lower() for keyword in PR_CLOSURE_KEYWORDS ): pr_closure_comment_found = True break if not pr_closure_comment_found: print( "Error: PR missing closure comment with required keywords", file=sys.stderr ) return False # 9. Verify issue is closed print("9. Verifying issue is closed...") if issue.get("state") != "closed": print(f"Error: Issue #{issue_number} should be closed", file=sys.stderr) return False # 10. Check issue closure comment with required keywords print("10. 
Verifying issue closure comment...") issue_closure_comment_found = False for comment in issue_comments: body = comment.get("body", "") if body and all( keyword.lower() in body.lower() for keyword in ISSUE_CLOSURE_KEYWORDS ): issue_closure_comment_found = True break if not issue_closure_comment_found: print( "Error: Issue missing closure comment with required keywords", file=sys.stderr, ) return False print("\n✓ All verification checks passed!") print("Issue tagging and PR closure workflow completed successfully:") print(f" - Issue #{issue_number}: {issue.get('title')} (closed)") print(f" - PR #{pr_number}: {pr.get('title')} (closed without merging)") print(f" - Branch: {BRANCH_NAME}") print(" - All comments contain required keywords") print(" - Technical constraints properly documented and communicated") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/harmony/multi_branch_commit_aggregation/description.md ================================================ I need you to create a comprehensive commit history report by aggregating changes from multiple branches. Here's what you need to do: **Step 1: Create Analysis Branch** Create a new branch called 'history-report-2025' from the main branch. **Step 2: Generate Branch Commits Report** In the 'history-report-2025' branch, create a file called `BRANCH_COMMITS.json` that contains a JSON object with the following structure: - For each of these branches: ['pr/45-googlefan256-main', 'pr/25-neuralsorcerer-patch-1', 'pr/41-amirhosseinghanipour-fix-race-conditions-and-offline-api'] - List the 3 most recent commits for each branch - Each commit must include: SHA, GitHub username, commit message, and files changed count - The JSON structure should be: ```json { "pr/45-googlefan256-main": [ { "sha": "commit_sha", "author": "github_username", "message": "commit message", "files_changed": number } ], "pr/25-neuralsorcerer-patch-1": [...], "pr/41-amirhosseinghanipour-fix-race-conditions-and-offline-api": [...] 
} ``` **Step 3: Create Cross-Branch Analysis** Create a file `CROSS_BRANCH_ANALYSIS.md` that contains: - A section "## Top Contributors" listing the 3 contributors with the most commits on the main branch, sorted by commit count (format: "github_username: X commits") - Must include keywords: "contributors" **Step 4: Generate Merge Timeline** Create a file `MERGE_TIMELINE.txt` that lists the 10 most recent merge commits from the main branch: - Format: `DATE | MERGE_COMMIT_MESSAGE | COMMIT_SHA` - List in reverse chronological order (newest first) - Only include actual merge commits (commits that have exactly 2 parent commits) - Note: While the commit messages reference PR numbers, those PRs no longer exist in the repository ================================================ FILE: tasks/github/standard/harmony/multi_branch_commit_aggregation/meta.json ================================================ { "task_id": "multi_branch_commit_aggregation", "task_name": "Multi Branch Commit Aggregation", "category_id": "harmony", "category_name": "Harmony", "description": "Generate comprehensive commit history report by aggregating changes from multiple branches with contributor analysis and merge timeline.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "repository analysis", "release coordination" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/harmony", "stateOriginalUrl": "https://github.com/openai/harmony" } } ================================================ FILE: tasks/github/standard/harmony/multi_branch_commit_aggregation/verify.py ================================================ import sys import os import requests from typing import Dict, Optional, Tuple import base64 import json from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/harmony/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _check_branch_exists(branch_name: str, headers: Dict[str, str], org: str) -> bool: """Verify that a branch exists in the repository.""" success, _ = _get_github_api(f"branches/{branch_name}", headers, org) return success def _get_file_content( branch: str, file_path: str, headers: Dict[str, str], org: str ) -> Optional[str]: """Get the content of a file from a specific branch.""" success, result = _get_github_api(f"contents/{file_path}?ref={branch}", headers, org) if not success or not result: return None try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return content except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return None def _check_branch_commits_json(content: str) -> bool: """Verify BRANCH_COMMITS.json has correct structure and expected data.""" expected_data = { "pr/45-googlefan256-main": [ { "sha": "9fa3f54cf2a2501c7dcbf554d5fbdd0de619fdda", "author": "googlefan256", "message": "Update format.md", "files_changed": 1, }, { "sha": "3efbf742533a375fc148d75513597e139329578b", "author": "scott-oai", 
"message": "Merge pull request #29 from axion66/improve-readme-and-checks", "files_changed": 1, }, { "sha": "9d653a4c7382abc42d115014d195d9354e7ad357", "author": "scott-oai", "message": "Merge pull request #30 from Yuan-ManX/harmony-format", "files_changed": 1, }, ], "pr/25-neuralsorcerer-patch-1": [ { "sha": "c505a03e9c9a388a511b6125756097eee523742a", "author": "neuralsorcerer", "message": "fix: `meta_sep` token and add to registry", "files_changed": 1, }, { "sha": "c044bf33f7e835ca6a723ccc97848de25dba5164", "author": "neuralsorcerer", "message": "fix: `meta_sep` token in `encoding.rs`", "files_changed": 1, }, { "sha": "b255cbeb6274adbea774f26fd9590922ce8874ed", "author": "scott-oai", "message": "Merge pull request #18 from openai/dev/scl/better-ci", "files_changed": 6, }, ], "pr/41-amirhosseinghanipour-fix-race-conditions-and-offline-api": [ { "sha": "1dca6392934bf4e3c403b2ecc2104e8ff3f67f45", "author": "amirhosseinghanipour", "message": "fix race conditions and add offline tokenizer loading api", "files_changed": 8, }, { "sha": "9528c7b4a00a3307fd9685fc1328aee11c3d9c90", "author": "scott-oai", "message": "version bump", "files_changed": 2, }, { "sha": "82b3afb9eb043343f322c937262cc50405e892c3", "author": "scott-oai", "message": "Merge pull request #26 from jordan-wu-97/jordan/fix-function-call-atomic-bool", "files_changed": 6, }, ], } try: data = json.loads(content) # Check if all required branches are present for branch in expected_data.keys(): if branch not in data: print( f"Missing branch {branch} in BRANCH_COMMITS.json", file=sys.stderr ) return False # Verify the exact content matches expected data for branch, expected_commits in expected_data.items(): actual_commits = data.get(branch, []) if len(actual_commits) != 3: print( f"Branch {branch} should have exactly 3 commits, found {len(actual_commits)}", file=sys.stderr, ) return False for i, expected_commit in enumerate(expected_commits): if i >= len(actual_commits): print( f"Missing commit {i + 1} for branch {branch}", file=sys.stderr ) return False actual_commit = actual_commits[i] for field in ["sha", "author", "files_changed"]: if actual_commit.get(field) != expected_commit.get(field): print( f"Mismatch in {field} for commit {i + 1} in branch {branch}", file=sys.stderr, ) print( f"Expected: {expected_commit.get(field)}, Got: {actual_commit.get(field)}", file=sys.stderr, ) return False # For message field, use substring matching to be more flexible expected_message = expected_commit.get("message", "") actual_message = actual_commit.get("message", "") if expected_message not in actual_message: print( f"Mismatch in message for commit {i + 1} in branch {branch}", file=sys.stderr, ) print( f"Expected: {expected_message}, Got: {actual_message}", file=sys.stderr, ) return False return True except json.JSONDecodeError as e: print(f"Invalid JSON in BRANCH_COMMITS.json: {e}", file=sys.stderr) return False except Exception as e: print(f"Error checking BRANCH_COMMITS.json: {e}", file=sys.stderr) return False def _check_cross_branch_analysis(content: str) -> bool: """Verify CROSS_BRANCH_ANALYSIS.md contains required sections and data.""" # Check for required section header if "## Top Contributors" not in content: print( "Missing section '## Top Contributors' in CROSS_BRANCH_ANALYSIS.md", file=sys.stderr, ) return False # Check for required keyword if "contributors" not in content.lower(): print( "Missing keyword 'contributors' in CROSS_BRANCH_ANALYSIS.md", file=sys.stderr, ) return False # Verify the top 3 contributors with correct counts 
from main branch (order matters) expected_contributors = [ "scott-oai: 35 commits", "egorsmkv: 4 commits", "axion66: 2 commits", ] for contributor in expected_contributors: if contributor not in content: print( f"Missing or incorrect contributor entry: {contributor}", file=sys.stderr, ) return False return True def _check_merge_timeline(content: str) -> bool: """Verify MERGE_TIMELINE.txt has correct format and expected merge commits.""" expected_timeline = [ "2025-08-06 | Merge pull request #29 from axion66/improve-readme-and-checks | 3efbf742533a375fc148d75513597e139329578b", "2025-08-06 | Merge pull request #30 from Yuan-ManX/harmony-format | 9d653a4c7382abc42d115014d195d9354e7ad357", "2025-08-06 | Merge pull request #28 from dkqjrm/fix-typo-format-md | 161e5fe2a57c63e9f8353c4c5b8faa3c3854bb5f", "2025-08-05 | Merge pull request #26 from jordan-wu-97/jordan/fix-function-call-atomic-bool | 82b3afb9eb043343f322c937262cc50405e892c3", "2025-08-05 | Merge pull request #18 from openai/dev/scl/better-ci | b255cbeb6274adbea774f26fd9590922ce8874ed", "2025-08-05 | Merge pull request #21 from Tialo/main | 058ef3257c24fb099aac7960c10ce51c8e55d9fe", "2025-08-05 | Merge branch 'main' into dev/scl/better-ci | 6375a15ea1b0a486cbb1468964cf8f5800ff5a5c", "2025-08-05 | Merge pull request #8 from RustedBytes/main | f6179119ca894eda4124c86d408c01fdbf5281f0", "2025-08-05 | Merge branch 'main' into main | eb86106b6980790b94f5702dc510483c66027277", "2025-08-05 | Merge pull request #17 from openai/dev/scl/add-docs-to-cargo | 64bca4cf327ebeafa0bbd0345650d86e2d02142f", ] # Verify each expected timeline entry exists in the content for i, expected_line in enumerate(expected_timeline): if expected_line not in content: print(f"Missing expected timeline entry {i + 1} in MERGE_TIMELINE.txt", file=sys.stderr) print(f"Expected: {expected_line}", file=sys.stderr) return False return True def verify_task() -> bool: """Verify the multi-branch commit aggregation task.""" # Get GitHub token from environment load_dotenv(".mcp_env") github_token = os.environ.get("MCP_GITHUB_TOKEN") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False # Get GitHub organization from environment github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"Bearer {github_token}", "Accept": "application/vnd.github.v3+json", } # 1. Check if branch 'history-report-2025' exists if not _check_branch_exists("history-report-2025", headers, github_org): print("Branch 'history-report-2025' does not exist", file=sys.stderr) return False print("✓ Branch 'history-report-2025' exists") # 2. Check BRANCH_COMMITS.json content = _get_file_content("history-report-2025", "BRANCH_COMMITS.json", headers, github_org) if not content: print( "File 'BRANCH_COMMITS.json' not found in 'history-report-2025' branch", file=sys.stderr, ) return False if not _check_branch_commits_json(content): return False print("✓ BRANCH_COMMITS.json has correct structure and data") # 3. Check CROSS_BRANCH_ANALYSIS.md content = _get_file_content( "history-report-2025", "CROSS_BRANCH_ANALYSIS.md", headers, github_org ) if not content: print( "File 'CROSS_BRANCH_ANALYSIS.md' not found in 'history-report-2025' branch", file=sys.stderr, ) return False if not _check_cross_branch_analysis(content): return False print("✓ CROSS_BRANCH_ANALYSIS.md contains required sections and data") # 4. 
Check MERGE_TIMELINE.txt content = _get_file_content("history-report-2025", "MERGE_TIMELINE.txt", headers, github_org) if not content: print( "File 'MERGE_TIMELINE.txt' not found in 'history-report-2025' branch", file=sys.stderr, ) return False if not _check_merge_timeline(content): return False print("✓ MERGE_TIMELINE.txt has correct format and data") print("\nAll verification checks passed! ✅") return True if __name__ == "__main__": success = verify_task() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/harmony/release_management_workflow/description.md ================================================ I need help implementing a comprehensive release management workflow for this harmony repository. Here's what I need you to do: **Step 1: Analyze Current State** First, analyze the current open pull requests to understand what changes they contain and their impact on the codebase. **Step 2: Create Release Branch** Create a release preparation branch called 'release-v1.1.0' from the current main branch. **Step 3: Apply Critical Bug Fixes** On the release branch, apply the MetaSep token fix from PR #25 by creating/updating the file `src/encoding.rs` with the corrected content where FormattingToken::MetaSep maps to "<|meta_sep|>" instead of "<|channel|>". Also create/update `src/registry.rs` to include the missing MetaSep and MetaEnd token registrations: ```rust (FormattingToken::MetaSep, "<|meta_sep|>"), (FormattingToken::MetaEnd, "<|meta_end|>"), ``` **Step 4: Add Missing Utility File** From PR #26, create the missing shadcn utils file `demo/harmony-demo/src/lib/utils.ts` with content: ```typescript import { clsx, type ClassValue } from "clsx" import { twMerge } from "tailwind-merge" export function cn(...inputs: ClassValue[]) { return twMerge(clsx(inputs)) } ``` And create/update `.gitignore` to add: ``` # Avoid ignoring shadcn utils !demo/harmony-demo/src/lib ``` **Step 5: Version Update** Update the version number in `Cargo.toml`: Change the `version` field in the `[package]` section to `version = "1.1.0"`. **Step 6: Create Comprehensive Changelog** Create a `CHANGELOG.md` file in the release branch with the following content: ```markdown # Changelog ## [1.1.0] - 2025-08-07 ### Added - Added missing shadcn utils.ts file for demo application - Enhanced gitignore rules to preserve shadcn utilities ### Fixed - Fixed MetaSep token mapping bug (was incorrectly mapped to channel token) - Added missing MetaSep and MetaEnd token registrations in registry - Improved tokenizer registry functionality for meta formatting tokens ### Changed - Updated version to 1.1.0 for new release cycle ### Technical Details - MetaSep token now correctly maps to `<|meta_sep|>` instead of `<|channel|>` - Registry now properly recognizes MetaSep and MetaEnd formatting tokens - Demo application now includes required utility functions for UI components ``` **Step 7: Create Release Pull Request** Create a pull request from 'release-v1.1.0' to 'main' with title "Release v1.1.0 - Bug fixes and utility additions" and a detailed description explaining all the integrated changes. **Step 8: Merge the Pull Request** After creating the PR, merge it into the main branch using the "squash and merge" method. **Step 9: Verification** Ensure the release branch contains at least 4 distinct commits before merging: 1. MetaSep token fix commit 2. Utility file addition commit 3. Version update commit 4. 
Changelog addition commit ================================================ FILE: tasks/github/standard/harmony/release_management_workflow/meta.json ================================================ { "task_id": "release_management_workflow", "task_name": "Release Management Workflow", "category_id": "harmony", "category_name": "Harmony", "description": "Implement comprehensive release management workflow including bug fixes, version updates, changelog creation, and PR merging.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "release coordination", "pr workflows" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/harmony", "stateOriginalUrl": "https://github.com/openai/harmony" } } ================================================ FILE: tasks/github/standard/harmony/release_management_workflow/verify.py ================================================ import sys import os import requests from typing import Dict, List, Optional, Tuple import base64 from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _check_branch_exists( branch_name: str, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> bool: """Verify that a branch exists in the repository.""" success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo) return success def _check_file_content( branch: str, file_path: str, keywords: List[str], headers: Dict[str, str], org: str, repo: str = "harmony", ) -> bool: """Verify that a file exists in branch and contains required keywords.""" success, result = _get_github_api( f"contents/{file_path}?ref={branch}", headers, org, repo ) if not success or not result: return False if keywords and result.get("content"): try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return all(keyword in content for keyword in keywords) except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return False return True def _check_specific_file_content( branch: str, file_path: str, expected_content: str, headers: Dict[str, str], org: str, repo: str = "harmony", min_length: int = 100, ) -> bool: """Verify that a file contains specific exact content and has reasonable size.""" success, result = _get_github_api( f"contents/{file_path}?ref={branch}", headers, org, repo ) if not success or not result: return False if result.get("content"): try: content = base64.b64decode(result.get("content", "")).decode("utf-8") # Check both that expected content exists and file has reasonable content return expected_content in content and len(content) >= min_length except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return False return False def _check_pr_merged( title_substring: str, base_branch: str, headers: Dict[str, str], org: str, repo: str = "harmony", ) -> Tuple[bool, Optional[int]]: """Check if a PR 
with specified title was merged into base branch and return PR number.""" # Check closed PRs to find merged ones success, prs = _get_github_api( "pulls?state=closed&per_page=100", headers, org, repo ) if not success or not prs: return False, None for pr in prs: title_match = title_substring.lower() in pr.get("title", "").lower() base_match = pr.get("base", {}).get("ref") == base_branch is_merged = pr.get("merged_at") is not None if title_match and base_match and is_merged: return True, pr.get("number") return False, None def _check_pr_squash_merged( pr_number: int, headers: Dict[str, str], org: str, repo: str = "harmony" ) -> bool: """Check if a PR was merged using squash and merge method.""" # Get the PR details success, pr = _get_github_api(f"pulls/{pr_number}", headers, org, repo) if not success or not pr: return False if not pr.get("merged_at"): return False merge_commit_sha = pr.get("merge_commit_sha") if not merge_commit_sha: return False # Get the merge commit details success, commit = _get_github_api(f"commits/{merge_commit_sha}", headers, org, repo) if not success or not commit: return False # For squash and merge, the commit will have exactly one parent # and the commit message typically includes the PR number parents = commit.get("parents", []) commit_message = commit.get("commit", {}).get("message", "") # Squash and merge commits have exactly 1 parent (the base branch) # Regular merge commits have 2 parents (base and head branches) if len(parents) == 1 and f"#{pr_number}" in commit_message: return True return False def verify() -> bool: """ Programmatically verify that the release management workflow meets the requirements described in description.md. """ # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False # Configuration constants RELEASE_BRANCH = "release-v1.1.0" # Expected content checks with minimum file sizes to ensure files aren't just stubs METASEP_FIX = 'FormattingToken::MetaSep => "<|meta_sep|>"' REGISTRY_FIX = '(FormattingToken::MetaSep, "<|meta_sep|>")' METAEND_FIX = '(FormattingToken::MetaEnd, "<|meta_end|>")' UTILS_CONTENT = "export function cn(...inputs: ClassValue[])" GITIGNORE_ADDITION = "!demo/harmony-demo/src/lib" VERSION_110 = 'version = "1.1.0"' CHANGELOG_KEYWORDS = [ "## [1.1.0] - 2025-08-07", "MetaSep token mapping bug", "shadcn utils.ts file", "Fixed MetaSep token", "Registry now properly recognizes", ] headers = { "Authorization": f"token {github_token}", "Accept": "application/vnd.github.v3+json", } # Run verification checks print("Verifying GitHub release management workflow completion...") # 1. Check release branch exists print("1. Verifying release branch exists...") if not _check_branch_exists(RELEASE_BRANCH, headers, github_org): print(f"Error: Branch '{RELEASE_BRANCH}' not found", file=sys.stderr) return False # 2. Check MetaSep fix in encoding.rs (with min content length to ensure file wasn't gutted) print("2. Verifying MetaSep token fix in encoding.rs...") if not _check_specific_file_content( "main", "src/encoding.rs", METASEP_FIX, headers, github_org, min_length=500 ): print( "Error: MetaSep token fix not found in src/encoding.rs or file is too small", file=sys.stderr, ) return False # 3. 
Check registry updates (both MetaSep and MetaEnd) print("3. Verifying MetaSep and MetaEnd registry additions...") if not _check_specific_file_content( "main", "src/registry.rs", REGISTRY_FIX, headers, github_org, min_length=500 ): print( "Error: MetaSep registry fix not found in src/registry.rs or file is too small", file=sys.stderr, ) return False if not _check_specific_file_content( "main", "src/registry.rs", METAEND_FIX, headers, github_org, min_length=500 ): print( "Error: MetaEnd registry fix not found in src/registry.rs", file=sys.stderr ) return False # 4. Check utils.ts file exists with correct content print("4. Verifying shadcn utils.ts file...") if not _check_specific_file_content( "main", "demo/harmony-demo/src/lib/utils.ts", UTILS_CONTENT, headers, github_org, min_length=50, ): print("Error: utils.ts file not found or incorrect content", file=sys.stderr) return False # 5. Check .gitignore update print("5. Verifying .gitignore update...") if not _check_specific_file_content( "main", ".gitignore", GITIGNORE_ADDITION, headers, github_org, min_length=100 ): print("Error: .gitignore update not found", file=sys.stderr) return False # 6. Check version update in Cargo.toml only (pyproject.toml uses dynamic versioning) print("6. Verifying version update in Cargo.toml...") if not _check_specific_file_content( "main", "Cargo.toml", VERSION_110, headers, github_org, min_length=200 ): print("Error: Version 1.1.0 not found in Cargo.toml", file=sys.stderr) return False # 7. Check CHANGELOG.md exists with required content print("7. Verifying CHANGELOG.md...") if not _check_file_content( "main", "CHANGELOG.md", CHANGELOG_KEYWORDS, headers, github_org ): print( "Error: CHANGELOG.md not found or missing required content", file=sys.stderr ) return False # 8. Check release PR was merged and get PR number print("8. Verifying release pull request was merged...") pr_merged, pr_number = _check_pr_merged( "Release v1.1.0", "main", headers, github_org ) if not pr_merged: print("Error: Release pull request not found or not merged", file=sys.stderr) return False # 9. Check PR was merged using squash and merge print("9. Verifying pull request was merged using 'squash and merge' method...") if pr_number and not _check_pr_squash_merged(pr_number, headers, github_org): print( f"Error: Pull request #{pr_number} was not merged using 'squash and merge' method", file=sys.stderr, ) return False print("\n✓ All verification checks passed!") print("Release management workflow completed successfully.") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/mcpmark-cicd/deployment_status_workflow/description.md ================================================ I need you to create a Deployment Status workflow for this Node.js project. The project currently has no GitHub Actions workflows, so you'll be building a deployment-focused CI/CD workflow from scratch that responds to push events on the main branch. Here's what needs to be implemented: ## Deployment Status Workflow Create `.github/workflows/deployment-status.yml` that triggers on `push` to `main` branch with these sequential jobs: ### 1. 
**pre-deployment** job (name: `pre-deployment`): - Runs basic quality checks (lint and test) - Creates deployment tracking issue with title: "Deployment Tracking - [commit-sha]" - Adds labels: `deployment`, `in-progress` - Captures previous commit SHA and package version information - Posts comment containing "Pre-deployment checks completed" ### 2. **rollback-preparation** job (name: `rollback-preparation`): - Depends on: pre-deployment - Creates comprehensive rollback artifacts including: * Executable rollback script with proper error handling * Configuration backups (package.json, package-lock.json, environment templates) * Dependency verification script for compatibility checking * Detailed rollback documentation with step-by-step instructions * Compressed rollback package with SHA256 checksums - Uploads rollback artifacts to GitHub Actions with 30-day retention - Posts comment on deployment issue that MUST contain the following verifiable elements: * Title: "🔄 Rollback Plan Ready" * Previous commit SHA (format: "Previous Commit: [sha]") * Current commit SHA (format: "Current Commit: [sha]") * Package version (format: "Package Version: [version]") * Artifact name (format: "Artifact: rollback-package-[commit-sha]") * At least 5 checkmarks (✅) indicating completed rollback components * Quick rollback command section with bash code block * Script verification status: "Rollback script created: true" * Backup verification status: "Configuration backup: true" * Artifact checksum (format: "SHA256: [checksum-value]") ### 3. **post-deployment** job (name: `post-deployment`): - Depends on: rollback-preparation - Removes `in-progress` label and adds `completed` label - Posts final comment containing "Deployment Completed Successfully" with rollback artifact details - Closes the deployment tracking issue ## Implementation Requirements: **Step 1: Create Feature Branch** Create a new branch called `deployment-status-workflow` from main. 
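For orientation only (the workflow itself should use `actions/github-script`, so this is not literal workflow code): the issue management that the post-deployment job described above performs reduces to a handful of REST calls. A minimal Python sketch with hypothetical `OWNER`/`REPO`/`ISSUE_NUMBER` values and a token assumed in `GITHUB_TOKEN`:

```python
# Illustrative sketch only (Python, not the github-script JavaScript the job itself runs):
# the label swap, final status comment, and closure performed by the post-deployment job.
import os

import requests

OWNER, REPO = "mcpmark-source", "mcpmark-cicd"  # assumption: evaluation fork
ISSUE_NUMBER = 1                                # hypothetical tracking issue number
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
headers = {
    "Authorization": f"token {os.environ['GITHUB_TOKEN']}",  # assumption: Actions-provided token
    "Accept": "application/vnd.github.v3+json",
}

# Remove the "in-progress" label, then add "completed".
requests.delete(f"{API}/issues/{ISSUE_NUMBER}/labels/in-progress", headers=headers)
requests.post(
    f"{API}/issues/{ISSUE_NUMBER}/labels", headers=headers, json={"labels": ["completed"]}
)

# Post the final status comment, then close the tracking issue.
requests.post(
    f"{API}/issues/{ISSUE_NUMBER}/comments",
    headers=headers,
    json={"body": "Deployment Completed Successfully (rollback artifact details go here)"},
)
requests.patch(f"{API}/issues/{ISSUE_NUMBER}", headers=headers, json={"state": "closed"})
```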
**Step 2: Implement the Workflow** Create `.github/workflows/deployment-status.yml` with proper YAML syntax: - Trigger only on push to main branch - Sequential job execution: pre-deployment → rollback-preparation → post-deployment - Use github-script actions for issue management - Avoid identifier conflicts in github-script actions (don't redeclare 'github') - Include proper error handling and script validation - Implement comprehensive rollback artifact creation and verification - Use proper fetch-depth for accessing commit history - Include artifact upload/download capabilities with checksums **Step 3: Create and Merge Pull Request** Create a comprehensive pull request and merge it to main: - Title: "Implement Deployment Status Workflow" - Detailed description of the workflow and its purpose - Merge the pull request to main branch to trigger the deployment workflow ================================================ FILE: tasks/github/standard/mcpmark-cicd/deployment_status_workflow/meta.json ================================================ { "task_id": "deployment_status_workflow", "task_name": "Deployment Status Workflow", "category_id": "mcpmark-cicd", "category_name": "MCPMark CI/CD", "description": "Create deployment status workflow with pre-deployment checks, rollback preparation, and comprehensive issue tracking for deployments.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "ci/cd automation", "workflow automation" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd", "stateOriginalUrl": null } } ================================================ FILE: tasks/github/standard/mcpmark-cicd/deployment_status_workflow/verify.py ================================================ import sys import os import requests import time from typing import Dict, List, Optional, Tuple from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], owner: str, repo: str ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _search_github_issues( query: str, headers: Dict[str, str] ) -> Tuple[bool, Optional[List]]: """Search GitHub issues using the search API.""" url = f"https://api.github.com/search/issues?q={query}&per_page=100" try: response = requests.get(url, headers=headers) if response.status_code == 200: data = response.json() return True, data.get("items", []) else: print(f"Search API error: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Search exception: {e}", file=sys.stderr) return False, None def _wait_for_workflow_completion( headers: Dict[str, str], owner: str, repo: str, max_wait: int = 90 ) -> bool: """Wait for GitHub Actions workflows to complete processing.""" print("⏳ Waiting for deployment status workflows to complete...") start_time = time.time() no_workflow_check_count = 0 while time.time() - start_time < max_wait: try: # Check workflow runs for deployment-status.yml success, response = _get_github_api( 
"actions/workflows/deployment-status.yml/runs?per_page=10", headers, owner, repo, ) if success and response: runs = response.get("workflow_runs", []) if len(runs) > 0: # Check status of recent runs running_count = 0 completed_count = 0 failed_count = 0 for run in runs[:3]: # Check recent runs status = run["status"] conclusion = run.get("conclusion") if status == "completed": completed_count += 1 if conclusion == "failure": failed_count += 1 elif status in ["in_progress", "queued"]: running_count += 1 print( f" Status: {completed_count} completed, {running_count} running/queued" ) # Wait until NO workflows are running if running_count == 0: if failed_count > 0: print( f"⚠️ Warning: {failed_count} workflow runs failed, but continuing verification..." ) print( f"✅ All workflows completed. Found {completed_count} completed runs." ) # Additional wait to ensure all processing is done print( "⏳ Additional wait for deployment processing to complete..." ) time.sleep(5) return True else: # No workflow runs found no_workflow_check_count += 1 if no_workflow_check_count == 1: print( " No workflow runs found yet, waiting 5 seconds and checking once more..." ) time.sleep(5) continue elif no_workflow_check_count >= 2: print( "⚠️ No workflow runs detected after 2 checks. Workflow may not have been triggered." ) print(" Continuing with verification...") return False print(f"⏳ Still waiting... ({int(time.time() - start_time)}s elapsed)") time.sleep(5) except Exception as e: print(f"⚠️ Error checking workflow status: {e}") time.sleep(5) print(f"⚠️ Workflow completion wait timed out after {max_wait}s") return False def _verify_workflow_runs( headers: Dict[str, str], owner: str, repo: str ) -> Tuple[bool, List[str], Optional[Dict]]: """Verify that the deployment status workflow runs have the correct jobs.""" print("\n⚙️ Verifying deployment status workflow runs...") errors = [] # Get the most recent workflow run success, response = _get_github_api( "actions/workflows/deployment-status.yml/runs?per_page=5", headers, owner, repo, ) if not success or not response: return False, ["Failed to fetch workflow runs"], None runs = response.get("workflow_runs", []) if not runs: return False, ["No workflow runs found for deployment-status.yml"], None # Find the most recent successful run latest_successful_run = None for run in runs: if run.get("conclusion") == "success": latest_successful_run = run break if not latest_successful_run: return False, ["No successful workflow runs found"], None run_id = latest_successful_run["id"] print(f" Found successful workflow run #{run_id}") # Get jobs for this run success, jobs_response = _get_github_api( f"actions/runs/{run_id}/jobs", headers, owner, repo ) if not success: return False, ["Failed to fetch workflow jobs"], None jobs = jobs_response.get("jobs", []) expected_jobs = ["pre-deployment", "rollback-preparation", "post-deployment"] found_jobs = [job["name"] for job in jobs] missing_jobs = [job for job in expected_jobs if job not in found_jobs] if missing_jobs: errors.append(f"Missing jobs: {missing_jobs}. 
Found: {found_jobs}") else: print(f" ✅ All 3 required jobs found: {found_jobs}") # Verify all jobs succeeded failed_jobs = [job["name"] for job in jobs if job["conclusion"] != "success"] if failed_jobs: errors.append(f"Failed jobs: {failed_jobs}") else: print(" ✅ All jobs completed successfully") # Verify sequential execution (each job should start after the previous one) if len(jobs) >= 3: job_times = {} for job in jobs: if job["name"] in expected_jobs and job["started_at"]: job_times[job["name"]] = job["started_at"] if len(job_times) >= 3: # Check that jobs ran in correct sequence import datetime times = { name: datetime.datetime.fromisoformat(time.replace("Z", "+00:00")) for name, time in job_times.items() } # pre-deployment should start first # rollback-preparation should start after pre-deployment # post-deployment should start after rollback-preparation if all(job in times for job in expected_jobs): if ( times["rollback-preparation"] <= times["pre-deployment"] or times["post-deployment"] <= times["rollback-preparation"] ): errors.append("Jobs did not run in correct sequential order") else: print(" ✅ Jobs ran in correct sequential order") else: errors.append( "Not enough job timing data to verify sequential execution" ) return len(errors) == 0, errors, latest_successful_run def _verify_deployment_issue( run_data: Dict, headers: Dict[str, str], owner: str, repo: str ) -> Tuple[bool, List[str]]: """Verify that a deployment tracking issue was created and closed properly.""" print("\n📋 Verifying deployment tracking issue...") errors = [] # Extract commit SHA from the workflow run head_sha = run_data.get("head_sha") if not head_sha: return False, ["Could not determine head SHA from workflow run"] short_sha = head_sha[:7] expected_title = f"Deployment Tracking - {short_sha}" # Search for the deployment tracking issue success, issues = _search_github_issues( f'repo:{owner}/{repo} "{expected_title}" is:issue', headers ) if not success: return False, ["Failed to search for deployment tracking issue"] # Find the exact issue deployment_issue = None for issue in issues: if issue.get("title") == expected_title: deployment_issue = issue break if not deployment_issue: return False, [f"Deployment tracking issue '{expected_title}' not found"] issue_number = deployment_issue["number"] print(f" Found deployment tracking issue #{issue_number}: {expected_title}") # Check that issue is closed if deployment_issue.get("state") != "closed": errors.append( f"Deployment issue #{issue_number} is not closed (state: {deployment_issue.get('state')})" ) else: print(f" ✅ Deployment issue #{issue_number} is closed") # Check required labels expected_labels = ["deployment", "completed"] actual_labels = [label["name"] for label in deployment_issue.get("labels", [])] missing_labels = [label for label in expected_labels if label not in actual_labels] if missing_labels: errors.append( f"Missing labels on deployment issue: {missing_labels}. 
Found: {actual_labels}" ) else: print(f" ✅ Required labels found: {expected_labels}") # Get issue comments to verify GitHub Actions bot comments success, comments = _get_github_api( f"issues/{issue_number}/comments", headers, owner, repo ) if not success: errors.append("Failed to get deployment issue comments") return len(errors) == 0, errors # Filter for GitHub Actions bot comments only bot_comments = [ comment for comment in comments if comment.get("user", {}).get("login") == "github-actions[bot]" ] if not bot_comments: errors.append("No comments found from GitHub Actions bot") return len(errors) == 0, errors print(f" Found {len(bot_comments)} comment(s) from GitHub Actions bot") # Get all bot comment bodies bot_comment_bodies = [comment.get("body", "") for comment in bot_comments] all_bot_comments = " ".join(bot_comment_bodies) # Check for required GitHub Actions bot comment indicators required_comment_indicators = [ "Pre-deployment checks completed", "🔄 Rollback Plan Ready", "Deployment Completed Successfully", ] for indicator in required_comment_indicators: if indicator not in all_bot_comments: errors.append( f"Missing required GitHub Actions bot comment indicator: '{indicator}'" ) else: print(f" ✅ Found GitHub Actions bot comment indicator: '{indicator}'") # Find and verify the rollback plan comment from GitHub Actions bot rollback_comment = None for comment in bot_comments: if "🔄 Rollback Plan Ready" in comment.get("body", ""): rollback_comment = comment.get("body", "") break if rollback_comment: print(" ✅ Found rollback plan comment from GitHub Actions bot") # Check for required rollback plan elements required_elements = [ "**Previous Commit**:", "**Current Commit**:", "**Package Version**:", "✅ Executable rollback script created", "✅ Configuration backups saved", "✅ Dependency verification script prepared", "✅ Comprehensive rollback documentation generated", "✅ Compressed rollback package created", "**SHA256**:", "**Artifact**:", "Quick Rollback Commands", ] for element in required_elements: if element not in rollback_comment: errors.append(f"Missing element in rollback plan: '{element}'") else: print(f" ✅ Found rollback plan element: '{element}'") # Verify commit SHAs in rollback comment if f"**Current Commit**: {head_sha}" in rollback_comment: print(f" ✅ Current commit SHA verified: {head_sha}") else: errors.append( f"Current commit SHA {head_sha} not found in rollback comment" ) # Extract and verify previous commit SHA if "**Previous Commit**:" in rollback_comment: import re prev_sha_match = re.search( r"\*\*Previous Commit\*\*:\s*([a-f0-9]{40})", rollback_comment ) if prev_sha_match: prev_sha = prev_sha_match.group(1) print(f" ✅ Previous commit SHA found: {prev_sha}") # Verify it's a valid 40-character SHA if len(prev_sha) != 40: errors.append( f"Previous commit SHA has invalid length: {len(prev_sha)}" ) else: errors.append( "Previous commit SHA format not found in rollback comment" ) else: errors.append("Previous commit SHA not found in rollback comment") # Verify SHA256 checksum is present sha256_match = re.search(r"\*\*SHA256\*\*:\s*([a-f0-9]{64})", rollback_comment) if sha256_match: sha256_value = sha256_match.group(1) print(f" ✅ SHA256 checksum found: {sha256_value[:16]}...") else: errors.append( "SHA256 checksum not found or invalid format in rollback comment" ) else: errors.append("Rollback plan comment not found from GitHub Actions bot") return len(errors) == 0, errors def verify() -> bool: """ Verify that the deployment status workflow automation is working correctly. 
""" # Load environment variables load_dotenv(".mcp_env") github_token = os.environ.get("MCP_GITHUB_TOKEN") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False # Get GitHub organization github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False # Repository configuration owner = github_org repo = "mcpmark-cicd" headers = { "Authorization": f"token {github_token}", "Accept": "application/vnd.github.v3+json", } print("🔍 Starting Deployment Status Workflow Verification") print("=" * 60) # Wait for workflows to complete workflows_completed = _wait_for_workflow_completion(headers, owner, repo) if not workflows_completed: print( "⚠️ Warning: Workflows may still be running. Continuing with verification..." ) # Verify workflow runs and jobs all_passed = True # 1. Verify workflow runs have correct jobs runs_ok, runs_errors, run_data = _verify_workflow_runs(headers, owner, repo) if not runs_ok: all_passed = False print("❌ Workflow Runs Verification Failed:") for error in runs_errors: print(f" - {error}") else: print("✅ Workflow Runs Verification Passed") # 2. Verify deployment issue if workflow runs passed if run_data: issue_ok, issue_errors = _verify_deployment_issue( run_data, headers, owner, repo ) if not issue_ok: all_passed = False print("❌ Deployment Issue Verification Failed:") for error in issue_errors: print(f" - {error}") else: print("✅ Deployment Issue Verification Passed") print("\n" + "=" * 60) if all_passed: print("🎉 All Deployment Status Workflow verifications PASSED!") print("\n📋 Summary:") print( " ✅ Workflow runs with correct 3 sequential jobs: pre-deployment, rollback-preparation, post-deployment" ) print(" ✅ Deployment tracking issue created and closed with proper labels") print(" ✅ Issue contains rollback plan with all required elements") print(" ✅ Previous and current commit SHAs are correctly tracked") print(" ✅ All workflow automation comments are present") print( "\n🤖 The GitHub Actions deployment status workflow is working correctly!" ) else: print("❌ Deployment Status Workflow verification FAILED!") print(" Some components did not meet the expected automation requirements.") return all_passed if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/mcpmark-cicd/issue_management_workflow/description.md ================================================ I need you to create an intelligent Issue Management automation workflow for this Node.js project. The project currently has no GitHub Actions workflows, so you'll be building an issue-focused automation system from scratch that responds to issue events. Here's what needs to be implemented: ## Issue Management Workflow Create `.github/workflows/issue-automation.yml` that triggers on `issues` events (opened, labeled) with these jobs: ### 1. 
**issue-triage** job:
- Auto-assigns category labels based on keywords in **issue title** (case-insensitive):
  - Title contains "bug" → adds `bug` label
  - Title contains "epic" → adds `epic` label
  - Title contains "maintenance" → adds `maintenance` label
- Auto-assigns priority labels based on keywords in **issue title OR body** (case-insensitive, highest priority wins if multiple keywords found):
  - "critical", "urgent", "production", "outage" → `priority-critical`
  - "important", "high", "blocking" → `priority-high`
  - "medium", "normal" → `priority-medium` (default if no priority keywords found)
  - "low", "nice-to-have", "minor" → `priority-low`
- All issues get `needs-triage` label initially

### 2. **task-breakdown** job:
- For issues with a title containing "Epic", create exactly 4 sub-issues with the pattern: "[SUBTASK] [Original Title] - Task N: [Task Name]"
- Task names: 1. Requirements Analysis, 2. Design and Architecture, 3. Implementation, 4. Testing and Documentation
- Links sub-issues to parent using "Related to #[parent-number]" in sub-issue body
- Updates parent issue body with "## Epic Tasks" checklist linking to sub-issue numbers
- All sub-issues get `enhancement` and `needs-review` labels

### 3. **auto-response** job:
- Checks if the issue author is creating their first issue in this repository (not first on GitHub globally, but first in this specific repo)
- If first issue in repo: adds `first-time-contributor` label and posts welcome message
- Posts different responses based on issue type:
  - `bug` issues: comment must contain "Bug Report Guidelines"
  - `epic` issues: comment must contain "Feature Request Process"
  - `maintenance` issues: comment must contain "Maintenance Guidelines"
- Sets milestone "v1.0.0" for `priority-high` and `priority-critical` issues
- Changes status from `needs-triage` to `needs-review` after response

## Label Management Requirements

The system must create and manage these specific labels:

### Category Labels:
- `bug` - Something isn't working
- `enhancement` - New feature or request
- `epic` - Large feature requiring multiple sub-tasks
- `maintenance` - Maintenance and housekeeping tasks

### Priority Labels:
- `priority-critical` - Critical priority issue
- `priority-high` - High priority issue
- `priority-medium` - Medium priority issue
- `priority-low` - Low priority issue

### Status Labels:
- `needs-triage` - Needs to be reviewed by maintainers
- `needs-review` - Awaiting review from maintainers
- `first-time-contributor` - Issue created by first-time contributor

## Implementation Requirements:

**Step 1: Create Feature Branch**
Create a new branch called `issue-management-workflow` from main.

**Step 2: Create Supporting Files**
Create these additional files on the new branch:
- `.github/ISSUE_TEMPLATE/bug_report.md` - Bug report template
- `.github/ISSUE_TEMPLATE/feature_request.md` - Feature request template
- `.github/ISSUE_TEMPLATE/maintenance_report.md` - Maintenance report template

**Step 3: Implement the Workflow**
Create `.github/workflows/issue-automation.yml` with proper YAML syntax (a rough skeleton is sketched below).
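The task leaves the exact YAML to the implementer; purely as a non-authoritative sketch of the trigger and job layout described above, the workflow might be organized like this (the action version, the `permissions` block, and the `needs:` dependencies are assumptions, not requirements):

```yaml
name: Issue Automation

on:
  issues:
    types: [opened, labeled]

permissions:
  issues: write   # assumed: needed to add labels, comments, and milestones

jobs:
  issue-triage:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/github-script@v7
        with:
          script: |
            // keyword-based category/priority labelling goes here;
            // use the injected `github` client, do not redeclare it
            const title = context.payload.issue.title.toLowerCase();

  task-breakdown:
    needs: issue-triage
    runs-on: ubuntu-latest
    steps:
      - uses: actions/github-script@v7
        with:
          script: |
            // for "Epic" titles: create the four "[SUBTASK] ..." issues
            // and append the "## Epic Tasks" checklist to the parent body

  auto-response:
    needs: issue-triage
    runs-on: ubuntu-latest
    steps:
      - uses: actions/github-script@v7
        with:
          script: |
            // first-issue check, welcome message, type-specific guidelines,
            // milestone assignment, and needs-triage → needs-review swap
```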
Include: - Appropriate triggers for issues events - Job dependencies where needed - Error handling and graceful fallbacks - Avoid identifier conflicts in github-script actions (don't redeclare 'github') **Step 4: Create and Merge Pull Request** Create a comprehensive pull request and merge it to main: - Title: "Implement Issue Management Automation Workflow" - Detailed description of the workflow and its purpose - Include all workflow files and templates created - Merge the pull request to main branch **Step 5: Test the Workflow** Create test issues to demonstrate the issue automation workflow: 1. **Bug Issue**: "Bug: Login form validation not working" - Expected: `bug`, `priority-high`, `needs-triage`→`needs-review`, milestone "v1.0.0" - Auto-response comment must contain "Bug Report Guidelines" 2. **Epic Issue**: "Epic: Redesign user dashboard interface" - Expected: `epic`, `priority-high`, `needs-triage`→`needs-review`, milestone "v1.0.0" - Must create 4 sub-issues with `enhancement` and `needs-review` labels - Parent updated with "## Epic Tasks" checklist, sub-issues linked with "Related to #[parent-number]" - Auto-response comment must contain "Feature Request Process" 3. **Maintenance Issue**: "Weekly maintenance cleanup and refactor" - Expected: `maintenance`, `priority-medium`, `needs-triage`→`needs-review`, no milestone - Auto-response comment must contain "Maintenance Guidelines" ================================================ FILE: tasks/github/standard/mcpmark-cicd/issue_management_workflow/meta.json ================================================ { "task_id": "issue_management_workflow", "task_name": "Issue Management Workflow", "category_id": "mcpmark-cicd", "category_name": "MCPMark CI/CD", "description": "Build intelligent issue management automation with auto-triage, task breakdown for epics, and first-time contributor handling.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "issue management", "workflow automation" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd", "stateOriginalUrl": null } } ================================================ FILE: tasks/github/standard/mcpmark-cicd/issue_management_workflow/verify.py ================================================ import sys import os import requests import time from typing import Dict, List, Optional, Tuple from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], owner: str, repo: str ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _search_github_issues( query: str, headers: Dict[str, str] ) -> Tuple[bool, Optional[List]]: """Search GitHub issues using the search API.""" url = f"https://api.github.com/search/issues?q={query}&per_page=100" try: response = requests.get(url, headers=headers) if response.status_code == 200: data = response.json() return True, data.get("items", []) else: print(f"Search API error: {response.status_code}", file=sys.stderr) return False, None except 
Exception as e: print(f"Search exception: {e}", file=sys.stderr) return False, None def _wait_for_workflow_completion( headers: Dict[str, str], owner: str, repo: str, max_wait: int = 90 ) -> bool: """Wait for GitHub Actions workflows to complete processing.""" print("⏳ Waiting for GitHub Actions workflows to complete...") start_time = time.time() expected_runs = 3 # We created 3 test issues no_workflow_check_count = 0 while time.time() - start_time < max_wait: try: # Check workflow runs success, response = _get_github_api( "actions/workflows/issue-automation.yml/runs?per_page=20", headers, owner, repo, ) if success and response: runs = response.get("workflow_runs", []) if len(runs) >= expected_runs: # Check status of recent runs recent_runs = runs[:expected_runs] running_count = 0 completed_count = 0 failed_count = 0 for run in recent_runs: status = run["status"] conclusion = run.get("conclusion") if status == "completed": completed_count += 1 if conclusion == "failure": failed_count += 1 elif status in ["in_progress", "queued"]: running_count += 1 print( f" Status: {completed_count} completed, {running_count} running/queued" ) # Wait until NO workflows are running and we have enough completed runs if running_count == 0 and completed_count >= expected_runs: if failed_count > 0: print( f"⚠️ Warning: {failed_count} workflow runs failed, but continuing verification..." ) print( f"✅ All workflows completed. Found {completed_count} completed runs." ) # Additional wait to ensure all issue processing is done print("⏳ Additional wait for issue processing to complete...") time.sleep(5) return True elif len(runs) == 0: # No workflow runs found no_workflow_check_count += 1 if no_workflow_check_count == 1: print( " No workflow runs found yet, waiting 5 seconds and checking once more..." ) time.sleep(5) continue elif no_workflow_check_count >= 2: print( "⚠️ No workflow runs detected after 2 checks. Workflow may not have been triggered." ) print(" Continuing with verification...") return False else: print( f" Waiting for workflow runs... Found {len(runs)}, expected {expected_runs}" ) print(f"⏳ Still waiting... ({int(time.time() - start_time)}s elapsed)") time.sleep(5) except Exception as e: print(f"⚠️ Error checking workflow status: {e}") time.sleep(5) print(f"⚠️ Workflow completion wait timed out after {max_wait}s") return False def _find_issue_by_title( title: str, headers: Dict[str, str], owner: str, repo: str ) -> Optional[Dict]: """Find an issue by exact title match.""" success, issues = _search_github_issues( f'repo:{owner}/{repo} "{title}" is:issue', headers ) if success and issues: for issue in issues: if issue.get("title") == title: return issue return None def _check_issue_labels( issue: Dict, expected_labels: List[str] ) -> Tuple[bool, List[str]]: """Check if issue has the expected labels.""" actual_labels = [label["name"] for label in issue.get("labels", [])] missing_labels = [label for label in expected_labels if label not in actual_labels] if missing_labels: return False, [f"Missing labels: {missing_labels}. Found: {actual_labels}"] return True, [] def _check_issue_milestone( issue: Dict, expected_milestone: str ) -> Tuple[bool, List[str]]: """Check if issue has the expected milestone.""" milestone = issue.get("milestone") if not milestone: if expected_milestone: return False, [f"No milestone found. Expected: {expected_milestone}"] return True, [] if milestone.get("title") != expected_milestone: return False, [ f"Wrong milestone: {milestone.get('title')}. 
Expected: {expected_milestone}" ] return True, [] def _check_issue_comments( issue_number: int, expected_content: str, headers: Dict[str, str], owner: str, repo: str, ) -> Tuple[bool, List[str]]: """Check if issue has a comment containing expected content.""" success, comments = _get_github_api( f"issues/{issue_number}/comments", headers, owner, repo ) if not success: return False, ["Failed to get issue comments"] if not comments: return False, [f"No comments found. Expected comment with: {expected_content}"] for comment in comments: if expected_content in comment.get("body", ""): return True, [] return False, [f"Expected content '{expected_content}' not found in comments"] def _find_epic_sub_issues( parent_issue_number: int, headers: Dict[str, str], owner: str, repo: str ) -> Tuple[List[Dict], List[str]]: """Find sub-issues created for an epic.""" # Search for each expected sub-task by exact title expected_subtasks = [ "[SUBTASK] Epic: Redesign user dashboard interface - Task 1: Requirements Analysis", "[SUBTASK] Epic: Redesign user dashboard interface - Task 2: Design and Architecture", "[SUBTASK] Epic: Redesign user dashboard interface - Task 3: Implementation", "[SUBTASK] Epic: Redesign user dashboard interface - Task 4: Testing and Documentation", ] subtasks = [] errors = [] for expected_title in expected_subtasks: # Search for exact title success, issues = _search_github_issues( f'repo:{owner}/{repo} "{expected_title}" is:issue', headers ) if not success: errors.append(f"Failed to search for sub-issue: {expected_title}") continue # Find exact match found = False for issue in issues: if issue.get("title") == expected_title: # Verify it references the parent issue body = issue.get("body", "") if ( f"#{parent_issue_number}" in body or f"Related to #{parent_issue_number}" in body ): subtasks.append(issue) found = True break if not found: errors.append( f"Sub-issue not found or doesn't reference parent: {expected_title}" ) return subtasks, errors def _check_epic_checklist( issue: Dict, subtask_numbers: List[int] ) -> Tuple[bool, List[str]]: """Check if epic issue has the Epic Tasks checklist with correct issue references.""" body = issue.get("body", "") errors = [] if "## Epic Tasks" not in body: return False, ["Epic Tasks section not found in issue body"] # Check that all subtask issue numbers are referenced in checkbox format for number in subtask_numbers: # Check for checkbox format: - [ ] #number if f"- [ ] #{number}" not in body: errors.append( f"Sub-issue #{number} not found in Epic Tasks checklist format (expected: '- [ ] #{number}')" ) # Also verify the expected task names are present expected_tasks = [ "Requirements Analysis", "Design and Architecture", "Implementation", "Testing and Documentation", ] for task in expected_tasks: if task not in body: errors.append(f"Task name '{task}' not found in Epic Tasks section") if errors: return False, errors return True, [] def _verify_bug_issue( headers: Dict[str, str], owner: str, repo: str ) -> Tuple[bool, List[str]]: """Verify the bug issue requirements.""" print("\n🐛 Verifying Bug Issue...") errors = [] # Find bug issue bug_issue = _find_issue_by_title( "Bug: Login form validation not working", headers, owner, repo ) if not bug_issue: return False, ["Bug issue 'Bug: Login form validation not working' not found"] issue_number = bug_issue["number"] print(f" Found bug issue #{issue_number}") # Check labels (including first-time-contributor since it's the first issue) expected_labels = ["bug", "priority-high", "needs-review", 
"first-time-contributor"] labels_ok, label_errors = _check_issue_labels(bug_issue, expected_labels) if not labels_ok: errors.extend(label_errors) else: print(f" ✅ Labels verified: {expected_labels}") # Check milestone milestone_ok, milestone_errors = _check_issue_milestone(bug_issue, "v1.0.0") if not milestone_ok: errors.extend(milestone_errors) else: print(" ✅ Milestone verified: v1.0.0") # Check comment comment_ok, comment_errors = _check_issue_comments( issue_number, "Bug Report Guidelines", headers, owner, repo ) if not comment_ok: errors.extend(comment_errors) else: print(" ✅ Bug Report Guidelines comment found") return len(errors) == 0, errors def _verify_epic_issue( headers: Dict[str, str], owner: str, repo: str ) -> Tuple[bool, List[str]]: """Verify the epic issue requirements.""" print("\n🚀 Verifying Epic Issue...") errors = [] # Find epic issue epic_issue = _find_issue_by_title( "Epic: Redesign user dashboard interface", headers, owner, repo ) if not epic_issue: return False, ["Epic issue 'Epic: Redesign user dashboard interface' not found"] issue_number = epic_issue["number"] print(f" Found epic issue #{issue_number}") # Check labels expected_labels = ["epic", "priority-high", "needs-review"] labels_ok, label_errors = _check_issue_labels(epic_issue, expected_labels) if not labels_ok: errors.extend(label_errors) else: print(f" ✅ Labels verified: {expected_labels}") # Check milestone milestone_ok, milestone_errors = _check_issue_milestone(epic_issue, "v1.0.0") if not milestone_ok: errors.extend(milestone_errors) else: print(" ✅ Milestone verified: v1.0.0") # Check comment comment_ok, comment_errors = _check_issue_comments( issue_number, "Feature Request Process", headers, owner, repo ) if not comment_ok: errors.extend(comment_errors) else: print(" ✅ Feature Request Process comment found") # Find and verify sub-issues sub_issues, sub_errors = _find_epic_sub_issues(issue_number, headers, owner, repo) if sub_errors: errors.extend(sub_errors) elif len(sub_issues) != 4: errors.append(f"Expected 4 sub-issues, found {len(sub_issues)}") else: print(f" ✅ Found {len(sub_issues)} sub-issues") # Collect sub-issue numbers for checklist verification subtask_numbers = [] # Verify each sub-issue has correct labels and link to parent for sub_issue in sub_issues: sub_number = sub_issue["number"] subtask_numbers.append(sub_number) # Check labels sub_labels = [label["name"] for label in sub_issue.get("labels", [])] expected_sub_labels = ["enhancement", "needs-review"] missing_sub_labels = [ label for label in expected_sub_labels if label not in sub_labels ] if missing_sub_labels: errors.append( f"Sub-issue #{sub_number} missing labels: {missing_sub_labels}" ) # Verify parent reference in body sub_body = sub_issue.get("body", "") if ( f"#{issue_number}" not in sub_body and f"Related to #{issue_number}" not in sub_body ): errors.append( f"Sub-issue #{sub_number} doesn't reference parent issue #{issue_number}" ) if not errors: print( " ✅ All 4 sub-tasks created with correct labels and parent references" ) # Check Epic Tasks checklist with correct issue numbers checklist_ok, checklist_errors = _check_epic_checklist( epic_issue, subtask_numbers ) if not checklist_ok: errors.extend(checklist_errors) else: print( f" ✅ Epic Tasks checklist verified with correct issue references: {subtask_numbers}" ) return len(errors) == 0, errors def _verify_maintenance_issue( headers: Dict[str, str], owner: str, repo: str ) -> Tuple[bool, List[str]]: """Verify the maintenance issue requirements.""" print("\n🔧 Verifying 
Maintenance Issue...") errors = [] # Find maintenance issue maintenance_issue = _find_issue_by_title( "Weekly maintenance cleanup and refactor", headers, owner, repo ) if not maintenance_issue: return False, [ "Maintenance issue 'Weekly maintenance cleanup and refactor' not found" ] issue_number = maintenance_issue["number"] print(f" Found maintenance issue #{issue_number}") # Check labels expected_labels = ["maintenance", "priority-medium", "needs-review"] labels_ok, label_errors = _check_issue_labels(maintenance_issue, expected_labels) if not labels_ok: errors.extend(label_errors) else: print(f" ✅ Labels verified: {expected_labels}") # Check NO milestone (maintenance issues shouldn't get v1.0.0) milestone_ok, milestone_errors = _check_issue_milestone(maintenance_issue, None) if not milestone_ok: errors.extend(milestone_errors) else: print(" ✅ No milestone assigned (correct for maintenance issue)") # Check comment comment_ok, comment_errors = _check_issue_comments( issue_number, "Maintenance Guidelines", headers, owner, repo ) if not comment_ok: errors.extend(comment_errors) else: print(" ✅ Maintenance Guidelines comment found") return len(errors) == 0, errors def verify() -> bool: """ Verify that the issue management workflow automation is working correctly. """ # Load environment variables load_dotenv(".mcp_env") github_token = os.environ.get("MCP_GITHUB_TOKEN") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False # Get GitHub organization github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False # Repository configuration owner = github_org repo = "mcpmark-cicd" headers = { "Authorization": f"token {github_token}", "Accept": "application/vnd.github.v3+json", } print("🔍 Starting Issue Management Workflow Verification") print("=" * 60) # Wait for workflows to complete workflows_completed = _wait_for_workflow_completion(headers, owner, repo) if not workflows_completed: print( "⚠️ Warning: Workflows may still be running. Continuing with verification..." ) # Verify each test issue all_passed = True # 1. Verify bug issue bug_ok, bug_errors = _verify_bug_issue(headers, owner, repo) if not bug_ok: all_passed = False print("❌ Bug Issue Verification Failed:") for error in bug_errors: print(f" - {error}") else: print("✅ Bug Issue Verification Passed") # 2. Verify epic issue epic_ok, epic_errors = _verify_epic_issue(headers, owner, repo) if not epic_ok: all_passed = False print("❌ Epic Issue Verification Failed:") for error in epic_errors: print(f" - {error}") else: print("✅ Epic Issue Verification Passed") # 3. 
Verify maintenance issue
    maintenance_ok, maintenance_errors = _verify_maintenance_issue(headers, owner, repo)
    if not maintenance_ok:
        all_passed = False
        print("❌ Maintenance Issue Verification Failed:")
        for error in maintenance_errors:
            print(f" - {error}")
    else:
        print("✅ Maintenance Issue Verification Passed")

    print("\n" + "=" * 60)
    if all_passed:
        print("🎉 All Issue Management Workflow verifications PASSED!")
        print("\n📋 Summary:")
        print(
            " ✅ Bug issue: labels (including first-time-contributor), milestone, and auto-response verified"
        )
        print(
            " ✅ Epic issue: labels, milestone, 4 sub-issues with checklist, and correct issue references verified"
        )
        print(
            " ✅ Maintenance issue: labels, no milestone, and auto-response verified"
        )
        print("\n🤖 The GitHub Actions workflow automation is working correctly!")
    else:
        print("❌ Issue Management Workflow verification FAILED!")
        print(" Some issues did not meet the expected automation requirements.")

    return all_passed


if __name__ == "__main__":
    success = verify()
    sys.exit(0 if success else 1)

================================================
FILE: tasks/github/standard/mcpmark-cicd/linting_ci_workflow/description.md
================================================
I need you to set up a proper linting workflow for our CI pipeline to ensure code quality standards are enforced on all pull requests. Here's what you need to do:

**Step 1: Create Linting Configuration Branch**
Create a new branch called 'ci/add-eslint-workflow' from the main branch.

**Step 2: Create ESLint Configuration**
On the new branch, create the file `.eslintrc.json` in the repository root with:

```json
{
  "env": {
    "browser": true,
    "es2021": true,
    "node": true
  },
  "extends": ["eslint:recommended"],
  "parserOptions": {
    "ecmaVersion": 12,
    "sourceType": "module"
  },
  "rules": {
    "no-unused-vars": "error",
    "no-console": "warn",
    "semi": ["error", "always"],
    "quotes": ["error", "single"]
  },
  "ignorePatterns": ["node_modules/", "dist/", "build/"]
}
```

**Step 3: Create GitHub Actions Linting Workflow**
Create the file `.github/workflows/lint.yml` (a minimal sketch appears after Step 6) with:
- Workflow name: "Code Linting"
- Triggers on: push to main, pull_request events
- Uses ubuntu-latest runner
- Sets up Node.js version 18 using actions/setup-node
- Installs dependencies with npm ci
- Installs ESLint globally
- Runs ESLint on all JavaScript files in src/ directories
- Fails the workflow if linting errors are found

**Step 4: Create a File That Will Fail Linting**
Create the file `src/example.js` with intentional linting violations that will cause the CI check to fail.

**Step 5: Create Pull Request**
Commit all the changes (ESLint config, workflow file, and example file with linting errors) in a single commit, then create a pull request from 'ci/add-eslint-workflow' to 'main' with:
- Title: "Add ESLint workflow for code quality enforcement"
- Body must include:
  - A "## Summary" heading describing the linting setup
  - A "## Changes" heading listing the files added
  - A "## Testing" heading explaining how to test the workflow
  - Mention that the PR intentionally includes linting errors to demonstrate the workflow

**Step 6: Fix Linting Errors and Update PR**
Fix the linting errors in `src/example.js` and commit the changes in a single commit to update the PR so that the CI check passes.
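Purely for illustration, a minimal sketch of the `lint.yml` described in Step 3 could look like the following (the checkout/setup-node versions and the exact ESLint invocation are assumptions; only the behaviors listed in Step 3 are required):

```yaml
name: Code Linting

on:
  push:
    branches: [main]
  pull_request:

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 18
      - run: npm ci
      - run: npm install -g eslint
      # eslint exits with a non-zero status when errors are found,
      # which fails this job and therefore the workflow
      - run: eslint "src/**/*.js"
```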
================================================ FILE: tasks/github/standard/mcpmark-cicd/linting_ci_workflow/meta.json ================================================ { "task_id": "linting_ci_workflow", "task_name": "Linting Ci Workflow", "category_id": "mcpmark-cicd", "category_name": "MCPMark CI/CD", "description": "Set up ESLint workflow for code quality enforcement on all pull requests with proper CI integration.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "ci/cd automation", "pr workflows" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd", "stateOriginalUrl": null } } ================================================ FILE: tasks/github/standard/mcpmark-cicd/linting_ci_workflow/verify.py ================================================ import sys import os import requests from typing import Dict, List, Optional, Tuple import base64 from dotenv import load_dotenv import time import json def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _check_branch_exists( branch_name: str, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd" ) -> bool: """Verify that a branch exists in the repository.""" success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo) return success def _get_file_content( file_path: str, headers: Dict[str, str], org: str, repo: str = "claude-code", ref: str = "main", ) -> Optional[str]: """Get the content of a file from the repository.""" success, result = _get_github_api( f"contents/{file_path}?ref={ref}", headers, org, repo ) if not success or not result: return None try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return content except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return None def _find_pr_by_title_keyword( keyword: str, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd" ) -> Optional[Dict]: """Find a PR by title keyword and return the PR data.""" for state in ["open", "closed"]: success, prs = _get_github_api( f"pulls?state={state}&per_page=100", headers, org, repo ) if success and prs: for pr in prs: if keyword.lower() in pr.get("title", "").lower(): return pr return None def _get_workflow_runs_for_pr( pr_number: int, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd" ) -> List[Dict]: """Get workflow runs for a specific PR.""" success, runs = _get_github_api( "actions/runs?event=pull_request&per_page=100", headers, org, repo ) if not success or not runs: return [] pr_runs = [] for run in runs.get("workflow_runs", []): # Check if this run is associated with our PR for pr in run.get("pull_requests", []): if pr.get("number") == pr_number: pr_runs.append(run) break return pr_runs def _get_pr_commits( pr_number: int, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd" ) -> List[Dict]: """Get commits for a specific PR.""" success, 
commits = _get_github_api(f"pulls/{pr_number}/commits", headers, org, repo) if not success or not commits: return [] return commits def _get_workflow_runs_for_commit( commit_sha: str, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd" ) -> List[Dict]: """Get workflow runs for a specific commit.""" success, runs = _get_github_api( f"actions/runs?head_sha={commit_sha}&per_page=100", headers, org, repo ) if not success or not runs: return [] return runs.get("workflow_runs", []) def verify() -> bool: """ Programmatically verify that the ESLint CI workflow setup meets the requirements described in description.md. """ # Configuration constants BRANCH_NAME = "ci/add-eslint-workflow" PR_KEYWORD = "eslint workflow" # Expected files and their content checks ESLINT_CONFIG_PATH = ".eslintrc.json" WORKFLOW_PATH = ".github/workflows/lint.yml" EXAMPLE_FILE_PATH = "src/example.js" # Expected workflow content keywords WORKFLOW_KEYWORDS = [ "Code Linting", "ubuntu-latest", "actions/setup-node", "npm ci", "eslint", "src/", ] # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"Bearer {github_token}", "Accept": "application/vnd.github.v3+json", } # Run verification checks print("Verifying ESLint CI workflow setup...") # 1. Check that branch exists print("1. Verifying CI branch exists...") if not _check_branch_exists(BRANCH_NAME, headers, github_org): print(f"Error: Branch '{BRANCH_NAME}' not found", file=sys.stderr) return False print("✓ CI branch created") # 2. Check ESLint configuration file print("2. Verifying .eslintrc.json...") eslint_content = _get_file_content( ESLINT_CONFIG_PATH, headers, github_org, "mcpmark-cicd", BRANCH_NAME ) if not eslint_content: print("Error: .eslintrc.json not found", file=sys.stderr) return False # Validate ESLint config is valid JSON and contains required rules try: eslint_config = json.loads(eslint_content) rules = eslint_config.get("rules", {}) required_rules = ["no-unused-vars", "semi", "quotes"] missing_rules = [rule for rule in required_rules if rule not in rules] if missing_rules: print( f"Error: .eslintrc.json missing rules: {missing_rules}", file=sys.stderr ) return False except json.JSONDecodeError: print("Error: .eslintrc.json is not valid JSON", file=sys.stderr) return False print("✓ ESLint configuration created with proper rules") # 3. Check GitHub Actions workflow file print("3. Verifying .github/workflows/lint.yml...") workflow_content = _get_file_content( WORKFLOW_PATH, headers, github_org, "mcpmark-cicd", BRANCH_NAME ) if not workflow_content: print("Error: .github/workflows/lint.yml not found", file=sys.stderr) return False # Check workflow contains required keywords missing_keywords = [kw for kw in WORKFLOW_KEYWORDS if kw not in workflow_content] if missing_keywords: print(f"Error: Workflow missing keywords: {missing_keywords}", file=sys.stderr) return False # Check trigger configuration if "pull_request" not in workflow_content or "push" not in workflow_content: print("Error: Workflow missing proper triggers", file=sys.stderr) return False print("✓ GitHub Actions workflow created with proper configuration") # 4. 
Check example file with linting errors initially exists print("4. Verifying src/example.js...") example_content = _get_file_content( EXAMPLE_FILE_PATH, headers, github_org, "mcpmark-cicd", BRANCH_NAME ) if not example_content: print("Error: src/example.js not found", file=sys.stderr) return False print("✓ Example file created") # 5. Find and verify the linting PR print("5. Verifying linting pull request...") lint_pr = _find_pr_by_title_keyword(PR_KEYWORD, headers, github_org) if not lint_pr: # Try alternative keywords lint_pr = _find_pr_by_title_keyword("eslint", headers, github_org) if not lint_pr: print("Error: Linting PR not found", file=sys.stderr) return False pr_body = lint_pr.get("body", "") pr_number = lint_pr.get("number") # Check PR body sections required_sections = ["## Summary", "## Changes", "## Testing"] missing_sections = [ section for section in required_sections if section not in pr_body ] if missing_sections: print( f"Error: Linting PR missing sections: {missing_sections}", file=sys.stderr ) return False print("✓ Linting PR created with proper structure") # 6. Check workflow runs and status changes print("6. Verifying workflow execution and status...") # First get the commits for this PR commits = _get_pr_commits(pr_number, headers, github_org) if len(commits) != 2: print( f"Error: Expected exactly 2 commits, found {len(commits)}", file=sys.stderr ) return False print("✓ Found exactly 2 commits as expected") # Sort commits chronologically (oldest first) commits.sort(key=lambda x: x.get("commit", {}).get("author", {}).get("date", "")) first_commit_sha = commits[0].get("sha") second_commit_sha = commits[1].get("sha") print(f"First commit (should fail): {first_commit_sha[:7]}") print(f"Second commit (should pass): {second_commit_sha[:7]}") # Wait for workflows on both commits to complete print("Waiting for workflow completion on first commit...") first_commit_runs = [] second_commit_runs = [] start_time = time.time() timeout = 90 no_workflow_check_count = 0 while time.time() - start_time < timeout: first_commit_runs = _get_workflow_runs_for_commit( first_commit_sha, headers, github_org ) second_commit_runs = _get_workflow_runs_for_commit( second_commit_sha, headers, github_org ) # Check if any workflows exist if not first_commit_runs and not second_commit_runs: no_workflow_check_count += 1 if no_workflow_check_count == 1: print( "No workflow runs found yet, waiting 5 seconds and checking once more..." ) time.sleep(5) continue elif no_workflow_check_count >= 2: print( "⚠️ No workflow runs detected after 2 checks. Workflows may not have been triggered." 
) print(" Continuing with verification...") break # Check if workflows are completed first_completed = any( run.get("status") == "completed" for run in first_commit_runs ) second_completed = any( run.get("status") == "completed" for run in second_commit_runs ) if first_completed and second_completed: break print("Waiting for workflows to complete...") time.sleep(10) # Verify first commit workflow failed first_commit_status = None for run in first_commit_runs: if run.get("status") == "completed": conclusion = run.get("conclusion") if conclusion in ["failure", "cancelled"]: first_commit_status = "failed" print("✓ First commit workflow failed as expected") break elif conclusion == "success": first_commit_status = "passed" break if first_commit_status != "failed": print( "Error: First commit workflow should have failed due to linting errors", file=sys.stderr, ) return False # Verify second commit workflow succeeded second_commit_status = None for run in second_commit_runs: if run.get("status") == "completed": conclusion = run.get("conclusion") if conclusion == "success": second_commit_status = "passed" print("✓ Second commit workflow passed as expected") break elif conclusion in ["failure", "cancelled"]: second_commit_status = "failed" break if second_commit_status != "passed": print( "Error: Second commit workflow should have passed after fixing linting errors", file=sys.stderr, ) return False print( "✓ Workflow status sequence verified: first commit failed → second commit passed" ) # 7. Verify the final state shows clean code print("7. Verifying final file state...") final_example_content = _get_file_content( EXAMPLE_FILE_PATH, headers, github_org, "mcpmark-cicd", BRANCH_NAME ) if final_example_content: # Check that obvious linting errors are fixed if ( "unusedVariable" in final_example_content or 'console.log("Hello World")' in final_example_content ): print( "Warning: Example file may still contain linting errors", file=sys.stderr, ) else: print("✓ Linting errors appear to be fixed") print("\n✅ All verification checks passed!") print("ESLint CI workflow setup completed successfully:") print(f" - Linting PR #{pr_number}") print(f" - Branch: {BRANCH_NAME}") print( " - Files created: .eslintrc.json, .github/workflows/lint.yml, src/example.js" ) print(" - Workflow configured for pull_request and push triggers") print( f" - Total workflow runs found: {len(first_commit_runs) + len(second_commit_runs)}" ) print( f" - First commit runs: {len(first_commit_runs)}, Second commit runs: {len(second_commit_runs)}" ) return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/mcpmark-cicd/pr_automation_workflow/description.md ================================================ I need you to create a comprehensive Pull Request automation workflow for this Node.js project. The project currently has no GitHub Actions workflows, so you'll be building a PR-focused CI/CD workflow from scratch that responds to pull request events. Here's what needs to be implemented: ## Pull Request Automation Workflow Create `.github/workflows/pr-automation.yml` that triggers on `pull_request` events (opened, synchronize, reopened) with these jobs: ### 1. **code-quality** job (name: `code-quality`): - Runs ESLint checks using `npm run lint` - Runs Prettier formatting checks - Posts code quality results as PR comment (must include keywords: "Code Quality Report", "ESLint", "Prettier") ### 2. 
**testing-suite** job (name: `testing-suite`):
- Runs full test suite with `npm test`
- Generates test coverage report
- Posts coverage summary as PR comment (must include keywords: "Test Coverage Report")
- Uploads coverage artifacts

### 3. **security-scan** job (name: `security-scan`):
- Runs dependency vulnerability checks
- Scans for secrets in code changes
- Creates security report as PR comment (must include keywords: "Security Scan Report", "Vulnerabilities", "Dependencies")

### 4. **build-validation** job (name: `build-validation`):
- Attempts to build the application
- Validates all endpoints are accessible
- Creates deployment preview artifacts
- Posts build status as PR comment (must include keywords: "Build Validation")

**IMPORTANT: All four jobs must run in parallel.**

## Implementation Requirements:

**Step 1: Create Feature Branch**
Create a new branch called `pr-automation-workflow` from main.

**Step 2: Create the Workflow**
Create `.github/workflows/pr-automation.yml` with proper YAML syntax:
- Appropriate triggers for pull_request events
- All four jobs configured to run in parallel
- Avoid identifier conflicts in github-script actions

**Step 3: Create and Merge Pull Request**
Create a comprehensive pull request and merge it to main:
- Title: "Implement Pull Request Automation Workflow"
- Detailed description of the workflow and its purpose
- Merge the pull request to main branch

## Important Notes:
- **All jobs MUST run in parallel**
- Ensure your PR satisfies ALL required checks
- The workflow should handle edge cases, have proper error recovery, and provide clear logging

================================================
FILE: tasks/github/standard/mcpmark-cicd/pr_automation_workflow/meta.json
================================================
{
  "task_id": "pr_automation_workflow",
  "task_name": "Pr Automation Workflow",
  "category_id": "mcpmark-cicd",
  "category_name": "MCPMark CI/CD",
  "description": "Create comprehensive PR automation with parallel jobs for code quality, testing, security scanning, and build validation.",
  "author": "Zijian Wu",
  "created_at": "2025-08-15",
  "difficulty": "L3",
  "tags": [
    "pr workflows",
    "ci/cd automation",
    "workflow automation"
  ],
  "mcp": [
    "github"
  ],
  "meta_data": {
    "stateType": "url",
    "stateContent": null,
    "stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
    "stateOriginalUrl": null
  }
}

================================================
FILE: tasks/github/standard/mcpmark-cicd/pr_automation_workflow/verify.py
================================================
import sys
import os
import requests
import time
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
import base64


def _get_github_api(
    endpoint: str, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, Optional[Dict]]:
    """Make a GET request to GitHub API and return (success, response)."""
    url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}"
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return True, response.json()
        elif response.status_code == 404:
            return False, None
        else:
            print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
            return False, None
    except Exception as e:
        print(f"Exception for {endpoint}: {e}", file=sys.stderr)
        return False, None


def _post_github_api(
    endpoint: str, headers: Dict[str, str], owner: str, repo: str, data: Dict
) -> Tuple[bool, Optional[Dict]]:
    """Make a POST request to GitHub API and return (success, response)."""
    url =
f"https://api.github.com/repos/{owner}/{repo}/{endpoint}" try: response = requests.post(url, headers=headers, json=data) if response.status_code in [200, 201]: return True, response.json() else: print( f"API error for {endpoint}: {response.status_code} - {response.text}", file=sys.stderr, ) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _patch_github_api( endpoint: str, headers: Dict[str, str], owner: str, repo: str, data: Dict ) -> Tuple[bool, Optional[Dict]]: """Make a PATCH request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}" try: response = requests.patch(url, headers=headers, json=data) if response.status_code == 200: return True, response.json() else: print( f"API error for {endpoint}: {response.status_code} - {response.text}", file=sys.stderr, ) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _get_file_content( file_path: str, headers: Dict[str, str], owner: str, repo: str, ref: str = "main", ) -> Optional[str]: """Get the content of a file from the repository.""" success, result = _get_github_api( f"contents/{file_path}?ref={ref}", headers, owner, repo ) if not success or not result: return None try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return content except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return None def _find_pr_by_title( title: str, headers: Dict[str, str], owner: str, repo: str ) -> Optional[Dict]: """Find a PR by exact title match.""" for state in ["closed", "open"]: success, prs = _get_github_api( f"pulls?state={state}&per_page=100", headers, owner, repo ) if success and prs: for pr in prs: if pr.get("title") == title: return pr return None def _wait_for_workflow_completion( headers: Dict[str, str], owner: str, repo: str, workflow_file: str, max_wait: int = 90, ) -> bool: """Wait for GitHub Actions workflows to complete processing.""" print(f"⏳ Waiting for {workflow_file} workflows to complete...") start_time = time.time() no_workflow_check_count = 0 while time.time() - start_time < max_wait: try: success, response = _get_github_api( f"actions/workflows/{workflow_file}/runs?per_page=10", headers, owner, repo, ) if success and response: runs = response.get("workflow_runs", []) if len(runs) > 0: running_count = 0 completed_count = 0 for run in runs[:5]: # Check recent runs status = run["status"] if status == "completed": completed_count += 1 elif status in ["in_progress", "queued"]: running_count += 1 print( f" Status: {completed_count} completed, {running_count} running/queued" ) if running_count == 0: print(f"✅ All {workflow_file} workflows completed.") return True else: # No workflow runs found no_workflow_check_count += 1 if no_workflow_check_count == 1: print( " No workflow runs found yet, waiting 5 seconds and checking once more..." ) time.sleep(5) continue elif no_workflow_check_count >= 2: print( f"⚠️ No workflow runs detected after 2 checks. {workflow_file} may not have been triggered." ) print(" Continuing with verification...") return False print(f"⏳ Still waiting... 
({int(time.time() - start_time)}s elapsed)") time.sleep(10) except Exception as e: print(f"⚠️ Error checking workflow status: {e}") time.sleep(10) print(f"⚠️ Workflow completion wait timed out after {max_wait}s") return False def _verify_workflow_file( headers: Dict[str, str], owner: str, repo: str ) -> Tuple[bool, List[str]]: """Verify that the workflow file exists and has correct content.""" print("\n📄 Verifying workflow file...") errors = [] workflow_content = _get_file_content( ".github/workflows/pr-automation.yml", headers, owner, repo ) if not workflow_content: return False, [ "Workflow file .github/workflows/pr-automation.yml not found in main branch" ] print(" ✅ Workflow file exists in main branch") # Verify required components required_events = ["opened", "synchronize", "reopened"] required_jobs = [ "code-quality", "testing-suite", "security-scan", "build-validation", ] if "pull_request:" not in workflow_content: errors.append("Workflow missing pull_request trigger") else: print(" ✅ Pull request trigger found") for event in required_events: if event not in workflow_content: errors.append(f"Missing event trigger: {event}") if not errors: print(f" ✅ Required events found: {required_events}") for job in required_jobs: if f"{job}:" not in workflow_content: errors.append(f"Missing job: {job}") if not errors: print(f" ✅ All 4 required jobs found: {required_jobs}") return len(errors) == 0, errors def _verify_main_pr_merged( headers: Dict[str, str], owner: str, repo: str ) -> Tuple[bool, List[str], Optional[Dict]]: """Verify that the main PR implementing the workflow was merged.""" print("\n🔍 Verifying main PR was merged...") errors = [] pr = _find_pr_by_title( "Implement Pull Request Automation Workflow", headers, owner, repo ) if not pr: return ( False, ["Main PR 'Implement Pull Request Automation Workflow' not found"], None, ) pr_number = pr["number"] print(f" Found PR #{pr_number}") if not pr.get("merged_at", False): errors.append(f"PR #{pr_number} was not merged") else: print(f" ✅ PR #{pr_number} was merged") if pr.get("head", {}).get("ref") != "pr-automation-workflow": errors.append(f"PR #{pr_number} was not from pr-automation-workflow branch") else: print(" ✅ PR was from pr-automation-workflow branch") if pr.get("base", {}).get("ref") != "main": errors.append(f"PR #{pr_number} was not merged to main branch") else: print(" ✅ PR was merged to main branch") return len(errors) == 0, errors, pr def _verify_workflow_runs( pr_data: Dict, headers: Dict[str, str], owner: str, repo: str ) -> Tuple[bool, List[str]]: """Verify that workflow runs occurred for the PR and all 4 jobs ran in parallel.""" print("\n⚙️ Verifying workflow runs...") errors = [] pr_number = pr_data["number"] # Get workflow runs for the PR success, runs_response = _get_github_api( "actions/runs?event=pull_request&per_page=50", headers, owner, repo ) if not success: return False, ["Failed to fetch workflow runs"] pr_runs = [] pr_head_sha = pr_data.get("head", {}).get("sha") for run in runs_response.get("workflow_runs", []): # Method 1: Check if this run is associated with the PR's head SHA if pr_head_sha and run.get("head_sha") == pr_head_sha: pr_runs.append(run) continue # Method 2: Check pull_requests field (may be empty for merged PRs) for pr in run.get("pull_requests", []): if pr.get("number") == pr_number: pr_runs.append(run) break if not pr_runs: # Try alternative approach: get runs by head branch pr_head_ref = pr_data.get("head", {}).get("ref") if pr_head_ref: success, branch_runs = _get_github_api( 
f"actions/runs?branch={pr_head_ref}&per_page=50", headers, owner, repo ) if success: pr_runs = branch_runs.get("workflow_runs", []) if not pr_runs: return False, [ f"No workflow runs found for PR #{pr_number} (head_sha: {pr_head_sha})" ] print(f" Found {len(pr_runs)} workflow run(s) for PR #{pr_number}") # Check the most recent run latest_run = pr_runs[0] # GitHub returns runs in descending order by creation time run_id = latest_run["id"] if latest_run["conclusion"] != "success": errors.append( f"Latest workflow run {run_id} did not succeed (conclusion: {latest_run['conclusion']})" ) else: print(f" ✅ Latest workflow run {run_id} succeeded") # Get jobs for this run success, jobs_response = _get_github_api( f"actions/runs/{run_id}/jobs", headers, owner, repo ) if not success: return False, ["Failed to fetch workflow jobs"] jobs = jobs_response.get("jobs", []) expected_jobs = [ "code-quality", "testing-suite", "security-scan", "build-validation", ] found_jobs = [job["name"] for job in jobs] missing_jobs = [job for job in expected_jobs if job not in found_jobs] if missing_jobs: errors.append(f"Missing jobs: {missing_jobs}. Found: {found_jobs}") else: print(f" ✅ All 4 required jobs found: {found_jobs}") # Verify all jobs succeeded failed_jobs = [job["name"] for job in jobs if job["conclusion"] != "success"] if failed_jobs: errors.append(f"Failed jobs: {failed_jobs}") else: print(" ✅ All jobs completed successfully") # Verify jobs ran in parallel (started around the same time) if len(jobs) >= 4: start_times = [job["started_at"] for job in jobs if job["started_at"]] if len(start_times) >= 4: # Check if all jobs started within 2 minutes of each other import datetime start_dt = [ datetime.datetime.fromisoformat(t.replace("Z", "+00:00")) for t in start_times ] time_diff = max(start_dt) - min(start_dt) if time_diff.total_seconds() > 120: # 2 minutes errors.append( f"Jobs did not run in parallel (time span: {time_diff.total_seconds()}s)" ) else: print(" ✅ Jobs ran in parallel") else: errors.append("Not enough job start times to verify parallel execution") return len(errors) == 0, errors def _verify_pr_comments( pr_data: Dict, headers: Dict[str, str], owner: str, repo: str ) -> Tuple[bool, List[str]]: """Verify that PR has required automation comments from GitHub Actions bot.""" print("\n💬 Verifying PR comments...") errors = [] pr_number = pr_data["number"] success, comments = _get_github_api( f"issues/{pr_number}/comments", headers, owner, repo ) if not success: return False, ["Failed to fetch PR comments"] # Filter for GitHub Actions bot comments only bot_comments = [ comment for comment in comments if comment.get("user", {}).get("login") == "github-actions[bot]" ] if not bot_comments: return False, ["No comments found from GitHub Actions bot"] print(f" Found {len(bot_comments)} comment(s) from GitHub Actions bot") # Get all bot comment bodies bot_comment_bodies = [comment.get("body", "") for comment in bot_comments] # Define required automation reports with their keywords required_reports = [ { "name": "Code Quality Report", "main_keywords": ["Code Quality Report"], "sub_keywords": ["ESLint", "Prettier"], "found": False, }, { "name": "Test Coverage Report", "main_keywords": ["Test Coverage Report"], "sub_keywords": [], "found": False, }, { "name": "Security Scan Report", "main_keywords": ["Security Scan Report"], "sub_keywords": ["Vulnerabilities", "Dependencies"], "found": False, }, { "name": "Build Validation Report", "main_keywords": ["Build Validation"], "sub_keywords": [], "found": False, }, ] 
# Check each bot comment for the required reports for comment_body in bot_comment_bodies: for report in required_reports: # Check if this comment contains any of the main keywords for this report if any(keyword in comment_body for keyword in report["main_keywords"]): if not report["found"]: # Only mark as found once report["found"] = True print(f" ✅ Found {report['name']}") # Verify sub-keywords are present in this specific comment for sub_keyword in report["sub_keywords"]: if sub_keyword not in comment_body: errors.append( f"Missing sub-keyword '{sub_keyword}' in {report['name']}" ) else: print( f" ✅ Found sub-keyword '{sub_keyword}' in {report['name']}" ) # Check if all required reports were found for report in required_reports: if not report["found"]: errors.append(f"Missing {report['name']} from GitHub Actions bot") # Verify we have exactly 4 automation reports found_reports = sum(1 for report in required_reports if report["found"]) if found_reports != 4: errors.append(f"Expected 4 automation reports, but found {found_reports}") else: print(" ✅ All 4 required automation reports found from GitHub Actions bot") return len(errors) == 0, errors def _create_test_pr( title: str, branch: str, content: str, file_path: str, headers: Dict[str, str], owner: str, repo: str, ) -> Optional[int]: """Create a test PR with specific content designed to fail a check.""" print(f" Creating test PR: {title}") # Create branch success, main_ref = _get_github_api("git/ref/heads/main", headers, owner, repo) if not success: print(" ❌ Failed to get main branch reference") return None main_sha = main_ref["object"]["sha"] branch_data = {"ref": f"refs/heads/{branch}", "sha": main_sha} success, _ = _post_github_api("git/refs", headers, owner, repo, branch_data) if not success: # Branch might already exist, try to delete and recreate print(f" Branch {branch} already exists, trying to delete and recreate...") import requests # Force delete existing branch delete_url = ( f"https://api.github.com/repos/{owner}/{repo}/git/refs/heads/{branch}" ) delete_response = requests.delete(delete_url, headers=headers) if delete_response.status_code == 204: print(f" Successfully deleted existing branch {branch}") # Wait a moment for deletion to complete import time time.sleep(2) # Try creating again success, _ = _post_github_api("git/refs", headers, owner, repo, branch_data) if not success: print(f" ❌ Failed to create branch {branch} after cleanup") return None else: print(f" ✅ Successfully created branch {branch} after cleanup") else: print( f" ❌ Failed to delete existing branch {branch}: {delete_response.status_code}" ) return None # Create or update file file_content = base64.b64encode(content.encode()).decode() file_data = { "message": f"Test commit for {title}", "content": file_content, "branch": branch, } # Check if file exists in main branch first success, file_info = _get_github_api( f"contents/{file_path}?ref=main", headers, owner, repo ) if success and file_info: # File exists, need SHA for update file_data["sha"] = file_info["sha"] print(f" File {file_path} exists, updating with SHA") else: print(f" Creating new file {file_path}") # Use PUT method for file creation/update url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}" try: import requests response = requests.put(url, headers=headers, json=file_data) if response.status_code in [200, 201]: print(f" ✅ Successfully created/updated file {file_path}") else: print( f" ❌ Failed to create/update file {file_path}: {response.status_code} - {response.text}" ) 
return None except Exception as e: print(f" ❌ Exception creating file {file_path}: {e}") return None # Create PR pr_data = { "title": title, "head": branch, "base": "main", "body": f"Test PR to validate that {title.split(':')[1].strip()} check fails correctly.", } success, pr_response = _post_github_api("pulls", headers, owner, repo, pr_data) if not success: print(" ❌ Failed to create PR") return None pr_number = pr_response["number"] print(f" ✅ Created test PR #{pr_number}") return pr_number def _close_pr(pr_number: int, headers: Dict[str, str], owner: str, repo: str) -> bool: """Close a PR.""" success, _ = _patch_github_api( f"pulls/{pr_number}", headers, owner, repo, {"state": "closed"} ) return success def _run_unit_tests( headers: Dict[str, str], owner: str, repo: str ) -> Tuple[bool, List[str]]: """Create test PRs to verify workflow correctly fails on bad code.""" print("\n🧪 Running unit tests with failing PRs...") errors = [] created_prs = [] test_cases = [ { "title": "Test: Code Quality Failure", "branch": "test-code-quality-fail", "file_path": "src/lint-fail-test.js", "content": "// This file contains intentional ESLint violations\nvar unused_variable = 'this will trigger unused-vars rule'\nconsole.log('missing semicolon - will trigger semi rule')\nconst badly_spaced = 'too many spaces'\nif(true){console.log('missing spaces around braces')}\nfunction unusedFunction() { return 'unused'; }\neeval('alert(\"dangerous eval\")');\nwith (Math) { var x = cos(3 * PI) + sin(LN10) }\nvar a = 1; var a = 2; // redeclared variable", "expected_failure": "code-quality", }, { "title": "Test: Testing Suite Failure", "branch": "test-testing-fail", "file_path": "tests/fail-test.test.js", "content": "const request = require('supertest');\n\ndescribe('Intentional Test Failures', () => {\n test('This test should always fail', () => {\n expect(2 + 2).toBe(5); // Intentionally wrong\n });\n \n test('Another failing test', () => {\n expect(true).toBe(false); // Intentionally wrong\n });\n \n test('Math failure', () => {\n expect(Math.max(1, 2, 3)).toBe(1); // Intentionally wrong\n });\n});", "expected_failure": "testing-suite", }, { "title": "Test: Security Scan Failure", "branch": "test-security-fail", "file_path": "src/security-fail-test.js", "content": "// This file contains patterns that should trigger secret detection\nconst hardcodedPassword = 'admin123password';\nconst fakeApiKey = 'sk_test_' + 'fake123key456here789';\nconst awsLikeKey = 'AKIA' + 'FAKEKEY7EXAMPLE';\nconst dbPassword = 'password' + '=' + 'supersecret123';\nconst tokenPattern = 'token' + '=' + 'ghp_1234567890abcdef';\n\n// These patterns should trigger secret detection\nconsole.log('Password:', hardcodedPassword);\nconsole.log('API Key:', fakeApiKey);\nconsole.log('AWS Key:', awsLikeKey);\nconsole.log('DB Password:', dbPassword);\nconsole.log('Token:', tokenPattern);\n\nmodule.exports = {\n password: hardcodedPassword,\n apiKey: fakeApiKey\n};", "expected_failure": "security-scan", }, { "title": "Test: Build Validation Failure", "branch": "test-build-fail", "file_path": "src/build-fail-test.js", "content": "// This file will cause build/startup failures\nconst express = require('express');\nconst nonExistentModule = require('this-module-does-not-exist-anywhere');\nconst anotherMissing = require('@fake/missing-package');\n\n// This will cause runtime errors during startup\nconst app = express();\n\n// Define a route that will cause issues\napp.get('/test', (req, res) => {\n // Try to use non-existent modules\n 
nonExistentModule.doSomething();\n anotherMissing.initialize();\n res.send('This should never work');\n});\n\n// Override the listen method to always fail\nconst originalListen = app.listen;\napp.listen = function(port, callback) {\n console.log('Attempting to start server...');\n // This will crash during build validation\n throw new Error('Intentional build failure for testing');\n};\n\nmodule.exports = app;", "expected_failure": "build-validation", }, ] for test_case in test_cases: pr_number = _create_test_pr( test_case["title"], test_case["branch"], test_case["content"], test_case["file_path"], headers, owner, repo, ) if pr_number: created_prs.append(pr_number) else: errors.append(f"Failed to create test PR: {test_case['title']}") if created_prs: print(f" Created {len(created_prs)} test PRs, waiting for workflows...") # Wait a bit for workflows to start time.sleep(5) # Wait for workflows to complete _wait_for_workflow_completion( headers, owner, repo, "pr-automation.yml", max_wait=90 ) # Verify each test PR failed appropriately for i, pr_number in enumerate(created_prs): test_case = test_cases[i] print( f" Checking test PR #{pr_number} ({test_case['expected_failure']} failure)..." ) # Get workflow runs for this PR success, runs_response = _get_github_api( "actions/runs?event=pull_request&per_page=20", headers, owner, repo ) if success: pr_runs = [] for run in runs_response.get("workflow_runs", []): # Check pull_requests field for pr in run.get("pull_requests", []): if pr.get("number") == pr_number: pr_runs.append(run) break # If no runs found via pull_requests, try matching by branch if not pr_runs: branch_name = test_case["branch"] for run in runs_response.get("workflow_runs", []): if run.get("head_branch") == branch_name: pr_runs.append(run) if pr_runs: latest_run = pr_runs[0] if latest_run["conclusion"] != "failure": errors.append( f"Test PR #{pr_number} should have failed but got: {latest_run['conclusion']}" ) else: print(f" ✅ Test PR #{pr_number} correctly failed") else: errors.append(f"No workflow runs found for test PR #{pr_number}") # Clean up test PRs and branches print(" Cleaning up test PRs and branches...") for i, pr_number in enumerate(created_prs): if _close_pr(pr_number, headers, owner, repo): print(f" ✅ Closed test PR #{pr_number}") else: print(f" ⚠️ Failed to close test PR #{pr_number}") # Delete test branch branch_name = test_cases[i]["branch"] import requests url = f"https://api.github.com/repos/{owner}/{repo}/git/refs/heads/{branch_name}" response = requests.delete(url, headers=headers) if response.status_code == 204: print(f" ✅ Deleted test branch {branch_name}") else: print(f" ⚠️ Failed to delete test branch {branch_name}") return len(errors) == 0, errors def verify() -> bool: """ Verify that the PR automation workflow is working correctly. """ load_dotenv(".mcp_env") github_token = os.environ.get("MCP_GITHUB_TOKEN") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False owner = github_org repo = "mcpmark-cicd" headers = { "Authorization": f"token {github_token}", "Accept": "application/vnd.github.v3+json", } print("🔍 Starting PR Automation Workflow Verification") print("=" * 60) all_passed = True # 1. 
Verify workflow file exists workflow_ok, workflow_errors = _verify_workflow_file(headers, owner, repo) if not workflow_ok: all_passed = False print("❌ Workflow File Verification Failed:") for error in workflow_errors: print(f" - {error}") else: print("✅ Workflow File Verification Passed") # 2. Verify main PR was merged pr_ok, pr_errors, pr_data = _verify_main_pr_merged(headers, owner, repo) if not pr_ok: all_passed = False print("❌ Main PR Verification Failed:") for error in pr_errors: print(f" - {error}") else: print("✅ Main PR Verification Passed") # 3. Verify workflow runs (only if PR verification passed) if pr_ok and pr_data: runs_ok, runs_errors = _verify_workflow_runs(pr_data, headers, owner, repo) if not runs_ok: all_passed = False print("❌ Workflow Runs Verification Failed:") for error in runs_errors: print(f" - {error}") else: print("✅ Workflow Runs Verification Passed") # 4. Verify PR comments comments_ok, comments_errors = _verify_pr_comments( pr_data, headers, owner, repo ) if not comments_ok: all_passed = False print("❌ PR Comments Verification Failed:") for error in comments_errors: print(f" - {error}") else: print("✅ PR Comments Verification Passed") # 5. Run unit tests with failing PRs tests_ok, tests_errors = _run_unit_tests(headers, owner, repo) if not tests_ok: all_passed = False print("❌ Unit Tests Failed:") for error in tests_errors: print(f" - {error}") else: print("✅ Unit Tests Passed") print("\n" + "=" * 60) if all_passed: print("🎉 All PR Automation Workflow verifications PASSED!") print("\n📋 Summary:") print(" ✅ Workflow file exists with correct triggers and 4 parallel jobs") print(" ✅ Main PR was merged from pr-automation-workflow to main") print(" ✅ Workflow runs show all 4 jobs executed in parallel and succeeded") print(" ✅ PR comments contain required automation reports") print(" ✅ Unit tests confirmed workflow correctly fails on problematic code") print("\n🤖 The GitHub Actions PR automation workflow is working correctly!") else: print("❌ PR Automation Workflow verification FAILED!") print(" Some components did not meet the expected automation requirements.") return all_passed if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/missing-semester/assign_contributor_labels/description.md ================================================ Assign assignees for each open issue and open PR by adding labels instead of using direct assignees. Only contributors who appeared in the past 100 commits are considered. First, collect all such contributors and identify the most frequent author among them. For each open issue or PR, assign using labels according to the following rules: • If the comments mention an author with @username, add a label in the format assigned-username. • If multiple authors are mentioned, add labels in the same format for all of them. • If no authors are mentioned in the comments, add a label for the most frequent contributor from the past 100 commits, using the format assigned-username. 
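
Below is a minimal, illustrative sketch (not a file in this repository) of how this label-assignment task could be scripted against the GitHub REST API with `requests`. It ignores pagination and error handling, and the `owner`, `repo`, and `token` parameters are assumed to be supplied by the caller.

```python
import re
from collections import Counter

import requests


def assign_contributor_labels(owner: str, repo: str, token: str) -> None:
    """Add 'assigned-<username>' labels to every open issue/PR per the rules above."""
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json",
    }
    base = f"https://api.github.com/repos/{owner}/{repo}"

    # Contributors from the past 100 commits, plus the most frequent author among them.
    commits = requests.get(f"{base}/commits", headers=headers, params={"per_page": 100}).json()
    authors = [c["author"]["login"] for c in commits if c.get("author")]
    contributors = set(authors)
    top_author = Counter(authors).most_common(1)[0][0]  # assumes at least one linked author

    # The /issues endpoint returns both open issues and open PRs.
    items = requests.get(f"{base}/issues", headers=headers, params={"state": "open", "per_page": 100}).json()
    for item in items:
        comments = requests.get(item["comments_url"], headers=headers).json()
        mentioned = {
            login
            for comment in comments
            for login in re.findall(r"@([A-Za-z0-9-]+)", comment.get("body", "") or "")
            if login in contributors
        }
        targets = mentioned or {top_author}  # fall back to the most frequent contributor
        labels = [f"assigned-{login}" for login in sorted(targets)]
        requests.post(f"{base}/issues/{item['number']}/labels", headers=headers, json={"labels": labels})
```
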
================================================ FILE: tasks/github/standard/missing-semester/assign_contributor_labels/meta.json ================================================ { "task_id": "assign_contributor_labels", "task_name": "Assign Contributor Labels", "category_id": "missing-semester", "category_name": "Missing Semester", "description": "Assign labels to open issues and PRs based on contributors mentioned in comments or the most frequent contributor from past 100 commits, using assigned-username format.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "issue management", "label automation", "contributor analysis" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/missing-semester", "stateOriginalUrl": "https://github.com/missing-semester/missing-semester" } } ================================================ FILE: tasks/github/standard/missing-semester/assign_contributor_labels/verify.py ================================================ import sys import os import requests from typing import Dict, Optional, Tuple, List from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "missing-semester" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _get_issue_labels( issue_number: int, headers: Dict[str, str], org: str, repo: str = "missing-semester" ) -> Optional[List[str]]: """Get labels for a specific issue/PR.""" success, result = _get_github_api(f"issues/{issue_number}", headers, org, repo) if not success or not result: return None labels = result.get("labels", []) return [label["name"] for label in labels] def verify() -> bool: """ Programmatically verify that the labels were assigned correctly to issues and PRs. 
""" # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"Bearer {github_token}", "Accept": "application/vnd.github.v3+json", } print("Verifying contributor labels assignment task completion...") # Expected labels configuration expected_labels = { # Issues 9: ["assigned-jonhoo", "assigned-anishathalye"], # Issue #9 14: ["assigned-jonhoo", "assigned-anishathalye"], # Issue #14 15: ["assigned-anishathalye"], # Issue #15 # PRs 21: ["assigned-anishathalye"], # PR #21 22: ["assigned-anishathalye"], # PR #22 23: ["assigned-anishathalye"], # PR #23 24: ["assigned-anishathalye"], # PR #24 } all_passed = True for item_number, expected in expected_labels.items(): item_type = "Issue" if item_number in [9, 14, 15] else "PR" print(f"\nChecking {item_type} #{item_number}...") labels = _get_issue_labels(item_number, headers, github_org, "missing-semester") if labels is None: print(f" ❌ Failed to retrieve {item_type} #{item_number}", file=sys.stderr) all_passed = False continue # Sort both lists for comparison labels_sorted = sorted(labels) expected_sorted = sorted(expected) if labels_sorted == expected_sorted: print(f" ✅ {item_type} #{item_number} has correct labels: {labels_sorted}") else: print(f" ❌ {item_type} #{item_number} has incorrect labels", file=sys.stderr) print(f" Expected: {expected_sorted}", file=sys.stderr) print(f" Found: {labels_sorted}", file=sys.stderr) all_passed = False if all_passed: print("\n✅ All verification checks passed!") print("Contributor labels assignment task completed successfully:") print(" - Issues #9 and #14 have both 'assigned-jonhoo' and 'assigned-anishathalye' labels") print(" - Issue #15 and all 4 open PRs have 'assigned-anishathalye' label") else: print("\n❌ Some verification checks failed", file=sys.stderr) return all_passed if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/missing-semester/find_legacy_name/description.md ================================================ I remember that a long time ago, *The Missing Semester of Your CS Education* had a different name and domain. There should be some related commit history. Please find the old name and domain and create an **ANSWER.md** file with them, formatted as: [title](url) Then push the file to the `master` branch. 
================================================ FILE: tasks/github/standard/missing-semester/find_legacy_name/meta.json ================================================ { "task_id": "find_legacy_name", "task_name": "Find Legacy Name", "category_id": "missing-semester", "category_name": "Missing Semester", "description": "Find the old name and domain of The Missing Semester course from commit history and document the findings.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "repository analysis" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/missing-semester", "stateOriginalUrl": "https://github.com/missing-semester/missing-semester" } } ================================================ FILE: tasks/github/standard/missing-semester/find_legacy_name/verify.py ================================================ import sys import os import requests import base64 from typing import Dict, Optional, Tuple from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "missing-semester" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _get_file_content( file_path: str, headers: Dict[str, str], org: str, repo: str = "missing-semester", ref: str = "master", ) -> Optional[str]: """Get the content of a file from the repository.""" success, result = _get_github_api( f"contents/{file_path}?ref={ref}", headers, org, repo ) if not success or not result: return None try: content = base64.b64decode(result.get("content", "")).decode("utf-8") return content except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return None def verify() -> bool: """ Programmatically verify that the legacy name finding task was completed correctly. Checks for ANSWER.md file in master branch with the correct content. """ # Expected answer content (accept both with and without trailing slash) EXPECTED_CONTENTS = { "[Hacker Tools](https://hacker-tools.github.io)", "[Hacker Tools](https://hacker-tools.github.io/)", } # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"Bearer {github_token}", "Accept": "application/vnd.github.v3+json", } # Run verification checks print("Verifying legacy name finding task completion...") # 1. Check that ANSWER.md exists in master branch print("1. 
Checking ANSWER.md exists in master branch...") answer_content = _get_file_content("ANSWER.md", headers, github_org, "missing-semester", "master") if not answer_content: print("Error: ANSWER.md not found in master branch", file=sys.stderr) return False print("✓ ANSWER.md found in master branch") # 2. Check that the content matches expected answer print("2. Verifying ANSWER.md content...") answer_content = answer_content.strip() if answer_content not in EXPECTED_CONTENTS: print(f"Error: ANSWER.md content does not match expected answer(s)", file=sys.stderr) print(f"Expected one of: {sorted(EXPECTED_CONTENTS)}", file=sys.stderr) print(f"Found: {answer_content}", file=sys.stderr) return False print("✓ ANSWER.md contains correct legacy name and URL") print("\n✅ All verification checks passed!") print("Legacy name finding task completed successfully:") print(f" - ANSWER.md created in master branch") print(f" - Content accepted: {answer_content}") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/github/standard/missing-semester/find_salient_file/description.md ================================================ I want to know which file has been modified most frequently in the past 100 commits. However, I don't want to consider files related to GitHub Actions. Please find the file and create an ANSWER.md, then write the file name in it. ================================================ FILE: tasks/github/standard/missing-semester/find_salient_file/meta.json ================================================ { "task_id": "find_salient_file", "task_name": "Find Salient File", "category_id": "missing-semester", "category_name": "Missing Semester", "description": "Identify the most frequently modified file in the past 100 commits, excluding GitHub Actions related files, and create an ANSWER.md with the file name.", "author": "Zijian Wu", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "commit analysis", "file tracking", "git history" ], "mcp": [ "github" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://github.com/mcpmark-source/missing-semester", "stateOriginalUrl": "https://github.com/missing-semester/missing-semester" } } ================================================ FILE: tasks/github/standard/missing-semester/find_salient_file/verify.py ================================================ import sys import os import requests import base64 from typing import Dict, Optional, Tuple from dotenv import load_dotenv def _get_github_api( endpoint: str, headers: Dict[str, str], org: str, repo: str = "missing-semester" ) -> Tuple[bool, Optional[Dict]]: """Make a GET request to GitHub API and return (success, response).""" url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}" try: response = requests.get(url, headers=headers) if response.status_code == 200: return True, response.json() elif response.status_code == 404: return False, None else: print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr) return False, None except Exception as e: print(f"Exception for {endpoint}: {e}", file=sys.stderr) return False, None def _get_file_content( file_path: str, headers: Dict[str, str], org: str, repo: str = "missing-semester", ref: str = "master", ) -> Optional[str]: """Get the content of a file from the repository.""" success, result = _get_github_api( f"contents/{file_path}?ref={ref}", headers, org, repo ) if not success or not result: return None try: 
content = base64.b64decode(result.get("content", "")).decode("utf-8") return content except Exception as e: print(f"Content decode error for {file_path}: {e}", file=sys.stderr) return None def verify() -> bool: """ Programmatically verify that the most frequently modified file was identified correctly. Checks for ANSWER.md file in master branch with the correct content. """ # Expected answer content (excluding GitHub Actions files) EXPECTED_CONTENT = "index.md" # Load environment variables from .mcp_env load_dotenv(".mcp_env") # Get GitHub token and org github_token = os.environ.get("MCP_GITHUB_TOKEN") github_org = os.environ.get("GITHUB_EVAL_ORG") if not github_token: print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr) return False if not github_org: print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr) return False headers = { "Authorization": f"Bearer {github_token}", "Accept": "application/vnd.github.v3+json", } # Run verification checks print("Verifying salient file identification task completion...") # 1. Check that ANSWER.md exists in master branch print("1. Checking ANSWER.md exists in master branch...") answer_content = _get_file_content("ANSWER.md", headers, github_org, "missing-semester", "master") if not answer_content: print("Error: ANSWER.md not found in master branch", file=sys.stderr) return False print("✅ ANSWER.md found in master branch") # 2. Check that the content matches expected answer print("2. Verifying ANSWER.md content...") answer_content = answer_content.strip() if answer_content != EXPECTED_CONTENT: print(f"Error: ANSWER.md content does not match expected answer", file=sys.stderr) print(f"Expected: {EXPECTED_CONTENT}", file=sys.stderr) print(f"Found: {answer_content}", file=sys.stderr) return False print("✅ ANSWER.md contains correct filename") print("\n✅ All verification checks passed!") print("Salient file identification task completed successfully:") print(f" - ANSWER.md created in master branch") print(f" - Content: {EXPECTED_CONTENT}") return True if __name__ == "__main__": success = verify() sys.exit(0 if success else 1) ================================================ FILE: tasks/notion/easy/.gitkeep ================================================ ================================================ FILE: tasks/notion/easy/computer_science_student_dashboard/simple__code_snippets_go/description.md ================================================ Find the page named "Computer Science Student Dashboard" and extend the **Code Snippets** section with Go content. **Task Requirements:** 1. Add a bold paragraph that contains exactly the text `Go` to mark the start of the Go snippets. 2. Directly under that heading, add three code blocks configured with `language` set to **go**: a. **Basic Go program** – Caption must be `Basic Go program` and the code content must be exactly: ```go package main import "fmt" func main() { fmt.Println("Hello, World!") } ``` b. **For loop in Go** – Caption must be `For loop in Go` and the code content must be exactly: ```go for i := 0; i < 5; i++ { fmt.Println(i) } ``` c. 
**Function definition in Go** – Caption must be `Function definition in Go` and the code content must be exactly: ```go func add(a, b int) int { return a + b } ``` ================================================ FILE: tasks/notion/easy/computer_science_student_dashboard/simple__code_snippets_go/meta.json ================================================ { "task_id": "simple__code_snippets_go", "task_name": "Simple Code Snippets Go", "category_id": "computer_science_student_dashboard", "category_name": "Computer Science Student Dashboard", "description": "Add a new Go column to the Code Snippets section between Python and JavaScript columns.", "author": "Xiangyan Liu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "content organization", "visual formatting", "template population" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard" } } ================================================ FILE: tasks/notion/easy/computer_science_student_dashboard/simple__code_snippets_go/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils # Expected code blocks (language=go) EXPECTED_CODE_BLOCKS = [ { "caption": "Basic Go program", "code": ( 'package main\n\nimport "fmt"\n\nfunc main() {\n fmt.Println("Hello, World!")\n}' ), }, { "caption": "For loop in Go", "code": ("for i := 0; i < 5; i++ {\n fmt.Println(i)\n}"), }, { "caption": "Function definition in Go", "code": ("func add(a, b int) int {\n return a + b\n}"), }, ] HEADER_TEXT = "Go" def _normalize(text: str) -> str: """Remove trailing spaces on each line and strip leading/trailing blank lines.""" return "\n".join(line.rstrip() for line in text.strip().splitlines()) def _find_page(notion: Client, main_id: str | None) -> str | None: """Return a page_id to verify against or None if not found.""" page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "Computer Science Student Dashboard") return page_id def _has_bold_header_text(block, text: str) -> bool: """Generic bold header/paragraph check for a given text.""" block_type = block.get("type") if block_type not in {"paragraph", "heading_1", "heading_2", "heading_3"}: return False rich_text_list = block.get(block_type, {}).get("rich_text", []) if not rich_text_list: return False plain = "".join(rt.get("plain_text", "") for rt in rich_text_list).strip() if plain != text: return False return any(rt.get("annotations", {}).get("bold", False) for rt in rich_text_list) def _collect_code_blocks(blocks): """Return list of (code_content, caption) tuples for code blocks with language 'go'.""" collected = [] for block in blocks: if block.get("type") != "code": continue code_data = block.get("code", {}) if code_data.get("language") != "go": continue code_plain = "".join( rt.get("plain_text", "") for rt in code_data.get("rich_text", []) ) caption_plain = "".join( rt.get("plain_text", "") for rt in code_data.get("caption", []) ) collected.append((code_plain, caption_plain)) return collected def verify(notion: Client, main_id: str | None = None) -> bool: page_id = _find_page(notion, main_id) if not page_id: 
print("Error: Target page not found.", file=sys.stderr) return False all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) # Verify header header_ok = any(_has_bold_header_text(b, HEADER_TEXT) for b in all_blocks) if not header_ok: print("Failure: Bold header 'Go' not found.", file=sys.stderr) return False # Verify code blocks code_blocks_found = _collect_code_blocks(all_blocks) remaining = EXPECTED_CODE_BLOCKS.copy() for code, caption in code_blocks_found: norm_code = _normalize(code) for expected in remaining: if ( _normalize(expected["code"]) == norm_code and expected["caption"] == caption ): remaining.remove(expected) break if remaining: missing = ", ".join(exp["caption"] for exp in remaining) print( f"Failure: Missing or incorrect Go code blocks: {missing}", file=sys.stderr ) return False print( "Success: Verified Go header and required Go code blocks." ) return True def main(): notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None sys.exit(0 if verify(notion, main_id) else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/easy/computer_science_student_dashboard/simple__study_session_tracker/description.md ================================================ Create a new study-session entry on the **Computer Science Student Dashboard** page. 1. Locate the ☑️ Habit tracker section of the page. 2. **Insert a new date mention** for `2025-01-29` immediately **after the existing `2022-09-02` items but before the divider block** that follows them. Match the formatting of the existing dates (bold text with a Notion date mention). ================================================ FILE: tasks/notion/easy/computer_science_student_dashboard/simple__study_session_tracker/meta.json ================================================ { "task_id": "simple__study_session_tracker", "task_name": "Simple Study Session Tracker", "category_id": "computer_science_student_dashboard", "category_name": "Computer Science Student Dashboard", "description": "Create a new study-session entry in the Habit tracker section with four unchecked to-do items.", "author": "Xiangyan Liu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "content organization", "visual formatting", "status tracking" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard" } } ================================================ FILE: tasks/notion/easy/computer_science_student_dashboard/simple__study_session_tracker/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str | None = None) -> bool: """Verify that the new study-session entry for 2025-01-29 was added correctly. The script checks that: 1. A bold date-mention with start=2025-01-29 exists. 2. The mention sits after the 2022-09-02 section but before the divider that originally followed that section. 
""" # --------------------------------------------------------------------- # Locate the main page ------------------------------------------------- # --------------------------------------------------------------------- page_id: str | None = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "Computer Science Student Dashboard") if not page_id: print( "Error: Page 'Computer Science Student Dashboard' not found.", file=sys.stderr, ) return False # --------------------------------------------------------------------- # Fetch all blocks under the page (flattened order) -------------------- # --------------------------------------------------------------------- all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) # --------------------------------------------------------------------- # Locate reference blocks --------------------------------------------- # --------------------------------------------------------------------- TARGET_DATE = "2025-01-29" PREVIOUS_DATE = "2022-09-02" index_previous_date: int | None = None index_new_date: int | None = None index_divider_after_previous: int | None = None for idx, block in enumerate(all_blocks): # Divider detection (we care only about the first divider that appears after # the 2022-09-02 block) if block.get("type") == "divider": if index_previous_date is not None and index_divider_after_previous is None: index_divider_after_previous = idx # We only need to inspect paragraph blocks that contain a date mention if block.get("type") != "paragraph": continue rich_text_list = block["paragraph"].get("rich_text", []) for rt in rich_text_list: if ( rt.get("type") != "mention" or rt.get("mention", {}).get("type") != "date" ): continue date_start = rt["mention"]["date"].get("start") if date_start == PREVIOUS_DATE and index_previous_date is None: index_previous_date = idx if date_start == TARGET_DATE and index_new_date is None: index_new_date = idx # (1) Verify bold annotation if not rt.get("annotations", {}).get("bold", False): print( "Error: The 2025-01-29 date mention is not bold.", file=sys.stderr, ) return False # Ensure all reference indices were found if index_previous_date is None: print("Error: Could not locate the 2022-09-02 date section.", file=sys.stderr) return False if index_divider_after_previous is None: print( "Error: Could not locate the divider that follows the 2022-09-02 section.", file=sys.stderr, ) return False if index_new_date is None: print( "Error: Could not locate the new 2025-01-29 date mention.", file=sys.stderr ) return False # (2) Verify ordering if not (index_previous_date < index_new_date < index_divider_after_previous): print( "Error: The 2025-01-29 section is positioned incorrectly.", file=sys.stderr ) return False # --------------------------------------------------------------------- # Success -------------------------------------------------------------- # --------------------------------------------------------------------- print("Success: Date mention for 2025-01-29 added in the correct position.") return True # ------------------------------------------------------------------------- # Command-line entry-point ------------------------------------------------- # ------------------------------------------------------------------------- def main() -> None: notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else 
None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/easy/it_trouble_shooting_hub/simple__asset_retirement_migration/description.md ================================================ Please migrate expiring assets out of the **IT Inventory** database using the simplified checklist below. Your changes will be verified automatically, so match the details exactly. --- Task Steps 1. Inside the **IT Trouble Shooting Hub** page, locate the database named **IT Inventory**. 2. Collect every page in **IT Inventory** whose **Status** is **Expired** or **To be returned**. 3. Create a **new full-page database** under the same hub titled **IT Asset Retirement Queue** with exactly these properties (names and types must match): • Serial – title • Status – select • Expiration date – date 4. For every item gathered in step 2, create a page in **IT Asset Retirement Queue**, copy over the Serial, Status, and Expiration date values, then archive the original inventory page once the copy is made. ================================================ FILE: tasks/notion/easy/it_trouble_shooting_hub/simple__asset_retirement_migration/meta.json ================================================ { "task_id": "simple__asset_retirement_migration", "task_name": "Simple Asset Retirement Migration", "category_id": "it_trouble_shooting_hub", "category_name": "IT Trouble Shooting Hub", "description": "Restructure the IT Inventory database by migrating expired assets to a new IT Asset Retirement Queue database.", "author": "Xiangyan Liu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "database manipulation", "automated migration", "conditional filtering", "data aggregation", "report generation" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub" } } ================================================ FILE: tasks/notion/easy/it_trouble_shooting_hub/simple__asset_retirement_migration/verify.py ================================================ import sys from typing import Dict from notion_client import Client from tasks.utils import notion_utils def _get_database(root_page_id: str, notion: Client, name: str) -> str | None: """Helper that finds a child database by title inside a page.""" return notion_utils.find_database_in_block(notion, root_page_id, name) def _check_property(props: Dict, name: str, expected_type: str) -> bool: if name not in props: print(f"Error: Property '{name}' missing in database.", file=sys.stderr) return False if props[name]["type"] != expected_type: print( f"Error: Property '{name}' expected type '{expected_type}', found '{props[name]['type']}'.", file=sys.stderr, ) return False return True def verify(notion: Client, main_id: str | None = None) -> bool: """Verifies that the IT Asset Retirement Queue was created and populated correctly.""" # ------------------------------------------------------------------------- # Resolve the root IT Trouble Shooting Hub page # ------------------------------------------------------------------------- root_page_id = None if main_id: found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id) if found_id and obj_type == "page": root_page_id = found_id if not root_page_id: root_page_id = 
notion_utils.find_page(notion, "IT Trouble Shooting Hub") if not root_page_id: print( "Error: Could not locate the 'IT Trouble Shooting Hub' page.", file=sys.stderr, ) return False # ------------------------------------------------------------------------- # Locate the original and new databases # ------------------------------------------------------------------------- inventory_db_id = _get_database(root_page_id, notion, "IT Inventory") if not inventory_db_id: print("Error: 'IT Inventory' database not found.", file=sys.stderr) return False retirement_db_id = _get_database(root_page_id, notion, "IT Asset Retirement Queue") if not retirement_db_id: print("Error: 'IT Asset Retirement Queue' database not found.", file=sys.stderr) return False # ------------------------------------------------------------------------- # Validate schema of the retirement queue database # ------------------------------------------------------------------------- retirement_db = notion.databases.retrieve(database_id=retirement_db_id) r_props = retirement_db["properties"] required_schema = { "Serial": "title", "Status": "select", "Expiration date": "date", } for pname, ptype in required_schema.items(): if not _check_property(r_props, pname, ptype): return False # ------------------------------------------------------------------------- # Validate that inventory items are moved & archived # ------------------------------------------------------------------------- expired_filter = { "property": "Status", "select": {"equals": "Expired"}, } to_return_filter = { "property": "Status", "select": {"equals": "To be returned"}, } compound_filter = {"or": [expired_filter, to_return_filter]} # Query for any *active* items that still match these statuses remaining_items = notion.databases.query( database_id=inventory_db_id, filter=compound_filter, archived=False, ).get("results", []) if remaining_items: print( f"Error: {len(remaining_items)} 'Expired' / 'To be returned' items still present in IT Inventory.", file=sys.stderr, ) return False # There should be at least one entry in the retirement queue retirement_pages = notion.databases.query(database_id=retirement_db_id).get( "results", [] ) expected_serials = {"65XYQ/GB", "36x10PIQ"} if len(retirement_pages) != len(expected_serials): print( f"Error: Expected {len(expected_serials)} retirement pages, found {len(retirement_pages)}.", file=sys.stderr, ) return False serials_seen = set() for page in retirement_pages: props = page["properties"] # Collect Serial title title_rich = props.get("Serial", {}).get("title", []) serial_val = "".join([t.get("plain_text", "") for t in title_rich]).strip() serials_seen.add(serial_val) if serials_seen != expected_serials: print( f"Error: Serial values mismatch. Expected {sorted(expected_serials)}, found {sorted(serials_seen)}.", file=sys.stderr, ) return False print("Success: All verification criteria satisfied.") return True def main(): notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/easy/japan_travel_planner/simple__remove_osaka_itinerary/description.md ================================================ Go to Japan Travel Planner, and go to the Travel Itineray database, and remove the itinerary in OSAKA after 6 PM (excluding 6 PM) in Day 1 and Day 2. 
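
A minimal sketch (not a file in this repository) of how the clean-up itself might be performed with `notion_client`, assuming the `Group`, `Day`, and `Notes` property names and the "7 PM" / "7:30 PM" time format that the verifier below relies on; pages are removed by archiving them.

```python
import re

from notion_client import Client


def _to_minutes(time_str: str):
    """Parse strings like '6 PM' or '7:30 PM' into minutes since midnight."""
    match = re.match(r"(\d{1,2})(?::(\d{2}))?\s*(AM|PM)", time_str.strip().upper())
    if not match:
        return None
    hours, minutes, period = int(match.group(1)), int(match.group(2) or 0), match.group(3)
    if period == "PM" and hours != 12:
        hours += 12
    if period == "AM" and hours == 12:
        hours = 0
    return hours * 60 + minutes


def remove_late_osaka_items(notion: Client, itinerary_db_id: str) -> None:
    """Archive Osaka itinerary pages on Day 1/Day 2 whose time is strictly after 6 PM."""
    query = notion.databases.query(
        database_id=itinerary_db_id,
        filter={
            "and": [
                {"property": "Group", "select": {"equals": "Osaka"}},
                {
                    "or": [
                        {"property": "Day", "select": {"equals": "Day 1"}},
                        {"property": "Day", "select": {"equals": "Day 2"}},
                    ]
                },
            ]
        },
    )
    for page in query.get("results", []):
        notes = page["properties"].get("Notes", {}).get("rich_text", [])
        time_str = notes[0].get("plain_text", "") if notes else ""
        minutes = _to_minutes(time_str)
        if minutes is not None and minutes > 18 * 60:  # exclude 6 PM itself
            notion.pages.update(page_id=page["id"], archived=True)
```
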
================================================ FILE: tasks/notion/easy/japan_travel_planner/simple__remove_osaka_itinerary/meta.json ================================================ { "task_id": "simple__remove_osaka_itinerary", "task_name": "Simple Remove Osaka Itinerary", "category_id": "japan_travel_planner", "category_name": "Japan Travel Planner", "description": "Remove the itinerary items in Osaka after 6 PM from Day 1 and Day 2 travel schedules.", "author": "Xiangyan Liu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "conditional filtering", "automated migration" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101" } } ================================================ FILE: tasks/notion/easy/japan_travel_planner/simple__remove_osaka_itinerary/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def get_page_title(page_result): """Extract title from a page result""" properties = page_result.get('properties', {}) name_property = properties.get('Name', {}) if name_property.get('type') == 'title': title_array = name_property.get('title', []) if title_array and len(title_array) > 0: return title_array[0].get('plain_text', '') return '' def get_page_time(page_result): """Extract time from Notes field""" properties = page_result.get('properties', {}) notes_property = properties.get('Notes', {}) if notes_property.get('type') == 'rich_text': rich_text_array = notes_property.get('rich_text', []) if rich_text_array and len(rich_text_array) > 0: notes_text = rich_text_array[0].get('plain_text', '') return notes_text.strip() return '' def get_page_group(page_result): """Extract group/location from page""" properties = page_result.get('properties', {}) group_property = properties.get('Group', {}) if group_property.get('type') == 'select': select = group_property.get('select') if select: return select.get('name', '') return '' def get_page_day(page_result): """Extract day from page""" properties = page_result.get('properties', {}) day_property = properties.get('Day', {}) if day_property.get('type') == 'select': select = day_property.get('select') if select: return select.get('name', '') return '' def parse_time_to_minutes(time_str): """Convert time string to minutes for comparison Returns None if time cannot be parsed""" if not time_str: return None # Clean the time string time_str = time_str.strip().upper() # Remove any text after the time (e.g., "7:30 PM\n" -> "7:30 PM") time_str = time_str.split('\n')[0].strip() # Extract time components try: if 'PM' in time_str: time_part = time_str.replace('PM', '').strip() if ':' in time_part: hours, minutes = time_part.split(':') hours = int(hours) minutes = int(minutes) else: hours = int(time_part) minutes = 0 # Convert PM hours (add 12 for PM times except 12 PM) if hours != 12: hours += 12 return hours * 60 + minutes elif 'AM' in time_str: time_part = time_str.replace('AM', '').strip() if ':' in time_part: hours, minutes = time_part.split(':') hours = int(hours) minutes = int(minutes) else: hours = int(time_part) minutes = 0 # Handle 12 AM (midnight) if hours == 12: hours = 0 return hours * 60 + minutes except: return None return None def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that all OSAKA events after 6PM 
have been removed from Day 1 and Day 2 in the Japan Travel Planner. Expected items that should be deleted (all in OSAKA, after 6PM, on Day 1 or Day 2): 1. Rikuro's Namba Main Branch - 7 PM (Day 1) 2. Shin Sekai "New World" - 8 PM (Day 2) 3. Katsudon Chiyomatsu - 7:30 PM (Day 2) 4. Ebisubashi Bridge - 9 PM (Day 1) Note: Kuromon Ichiba Market at 6 PM should NOT be deleted (it's at 6PM, not after) Items after 6PM on other days (Day 3-8) should NOT be deleted """ # Step 1: Find the main Japan Travel Planner page if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if not found_id or object_type != 'page': print("Error: Japan Travel Planner page not found.", file=sys.stderr) return False else: # Try to find the page by searching found_id = notion_utils.find_page(notion, "Japan Travel Planner") if not found_id: print("Error: Japan Travel Planner page not found.", file=sys.stderr) return False print(f"Found Japan Travel Planner page: {found_id}") # Step 2: Find the Travel Itinerary database all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id) travel_itinerary_db_id = None for block in all_blocks: if block and block.get("type") == "child_database": title = block.get("child_database", {}).get("title", "") if "Travel Itinerary" in title: travel_itinerary_db_id = block.get("id") print(f"Found Travel Itinerary database: {travel_itinerary_db_id}") break if not travel_itinerary_db_id: print("Error: Travel Itinerary database not found", file=sys.stderr) return False # Step 3: Query the database for OSAKA items on Day 1 and Day 2 try: query_result = notion.databases.query( database_id=travel_itinerary_db_id, filter={ "and": [ {"property": "Group", "select": {"equals": "Osaka"}}, {"or": [ {"property": "Day", "select": {"equals": "Day 1"}}, {"property": "Day", "select": {"equals": "Day 2"}} ]} ] } ) except Exception as e: print(f"Error querying Travel Itinerary database: {e}", file=sys.stderr) return False # Step 4: Check for items that should have been deleted six_pm_minutes = 18 * 60 # 6 PM in minutes (18:00) # Expected deleted items (4 specific items after 6 PM on Day 1 and Day 2) expected_deleted = { "Rikuro's Namba Main Branch": {"time": "7 PM", "day": "Day 1", "found": False}, "Shin Sekai \"New World\"": {"time": "8 PM", "day": "Day 2", "found": False}, "Katsudon Chiyomatsu": {"time": "7:30 PM", "day": "Day 2", "found": False}, "Ebisubashi Bridge": {"time": "9 PM", "day": "Day 1", "found": False} } # Items that should remain (at or before 6 PM) expected_remaining = { "Kuromon Ichiba Market": {"time": "6 PM", "found": False} } osaka_items_after_6pm = [] osaka_items_at_or_before_6pm = [] # Debug: Show total query results print(f"Debug: Found {len(query_result.get('results', []))} total OSAKA items on Day 1 and Day 2") # Process all OSAKA items on Day 1 and Day 2 for page in query_result.get('results', []): page_title = get_page_title(page).strip() page_time = get_page_time(page) page_group = get_page_group(page) page_day = get_page_day(page) if page_group != "Osaka": continue # Parse time to check if after 6 PM time_minutes = parse_time_to_minutes(page_time) if time_minutes is not None and time_minutes > six_pm_minutes: osaka_items_after_6pm.append({ "title": page_title, "time": page_time, "day": page_day, "id": page.get('id') }) # Check if this is one of the expected deleted items for expected_title, expected_info in expected_deleted.items(): # Clean up the titles for comparison clean_page_title = page_title.strip().lower() clean_expected_title 
= expected_title.strip().lower() # Check for "Rikuro's" or "Rikuro's" (different apostrophe types) if "rikuro" in clean_page_title and "rikuro" in clean_expected_title: title_match = True elif clean_page_title == clean_expected_title: title_match = True elif clean_expected_title in clean_page_title or clean_page_title in clean_expected_title: title_match = True else: title_match = False if title_match and page_day == expected_info["day"]: print(f"Debug: Found '{page_title}' on {page_day} at {page_time} - matches expected '{expected_title}'") expected_deleted[expected_title]["found"] = True elif time_minutes is not None and time_minutes <= six_pm_minutes: osaka_items_at_or_before_6pm.append({ "title": page_title, "time": page_time, "day": page_day, "id": page.get('id') }) # Check if this is one of the expected remaining items for expected_title in expected_remaining: if expected_title.lower() in page_title.lower() or page_title.lower() in expected_title.lower(): expected_remaining[expected_title]["found"] = True # Step 5: Verify results print(f"\nVerification Summary:") print(f"=" * 50) all_passed = True # Check that the 4 expected items after 6 PM have been deleted print("\n4 Items that should be deleted (after 6 PM on Day 1 and Day 2):") for item_name, item_info in expected_deleted.items(): if item_info["found"]: # If found = True, it means the item still exists (was not deleted) print(f"✗ {item_name} ({item_info['day']}, {item_info['time']}) - Still exists, should be deleted", file=sys.stderr) all_passed = False else: # If found = False, it means the item was deleted correctly print(f"✓ {item_name} ({item_info['day']}, {item_info['time']}) - Correctly deleted") # Check that items at or before 6 PM remain print("\nItems that should remain (at or before 6 PM on Day 1 and Day 2):") for item_name, item_info in expected_remaining.items(): if item_info["found"]: print(f"✓ {item_name} ({item_info['time']}) - Correctly retained") else: print(f"✗ {item_name} ({item_info['time']}) - Missing, should not be deleted", file=sys.stderr) all_passed = False # Report any items after 6 PM that still exist if osaka_items_after_6pm: print(f"\n✗ Found {len(osaka_items_after_6pm)} OSAKA item(s) after 6 PM on Day 1/Day 2:", file=sys.stderr) for item in osaka_items_after_6pm: print(f" - {item['title']} at {item['time']} ({item['day']})", file=sys.stderr) else: print(f"\n✓ No OSAKA items found after 6 PM on Day 1/Day 2 (all correctly deleted)") # Report count summary print(f"\nCount Summary:") print(f"- OSAKA items after 6 PM on Day 1/Day 2 found: {len(osaka_items_after_6pm)} (should be 0)") print(f"- OSAKA items at/before 6 PM on Day 1/Day 2 found: {len(osaka_items_at_or_before_6pm)}") print(f"- Expected deletions verified: {sum(1 for item in expected_deleted.values() if not item['found'])}/4") return all_passed def main(): """ Executes the verification process and exits with a status code. 
""" notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): print("\nVerification passed: All 4 required OSAKA events after 6 PM on Day 1 and Day 2 have been removed") sys.exit(0) else: print("\nVerification failed: Some OSAKA events after 6 PM on Day 1/Day 2 still exist") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/easy/online_resume/simple__skills_development_tracker/description.md ================================================ Create a comprehensive skills audit system by performing the following tasks: **Task Requirements:** 1. Create a new database named "Skills Development Tracker" as a child database in the main resume page with the following properties: - Name (title property) - Current Skill (relation to Skills database) - Current Proficiency (rollup from related skill's "Skill Level" property) - Target Proficiency (number property with format "percent") - Gap (formula: Target Proficiency - Current Proficiency) - Learning Resources (rich text property) - Progress Notes (rich text property) 2. Populate the Skills Development Tracker database with entries for all skills that have a proficiency level below 70% (0.7): - For each qualifying skill, create an entry with: - Name: "[Skill Name] Development Plan" - Link to the corresponding skill in Skills database - Target Proficiency: Set to Current + 25% (capped at 95%) - Learning Resources: "Online courses and practice projects" - Progress Notes: "Initial assessment completed" ================================================ FILE: tasks/notion/easy/online_resume/simple__skills_development_tracker/meta.json ================================================ { "task_id": "simple__skills_development_tracker", "task_name": "Simple Skills Development Tracker", "category_id": "online_resume", "category_name": "Online Resume", "description": "Create a comprehensive skills audit system with development tracking for skills below 70% proficiency.", "author": "Xiangyan Liu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "database manipulation", "cross-reference linking", "conditional filtering", "data aggregation", "template population", "visual formatting" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume" } } ================================================ FILE: tasks/notion/easy/online_resume/simple__skills_development_tracker/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the Skills Development Tracker database was created correctly. 
""" page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "New Online Resume") if not page_id: print("Error: Page 'New Online Resume' not found.", file=sys.stderr) return False # Step 1: Verify Skills Development Tracker database exists tracker_db_id = notion_utils.find_database_in_block( notion, page_id, "Skills Development Tracker" ) if not tracker_db_id: print( "Error: Database 'Skills Development Tracker' not found.", file=sys.stderr ) return False # Step 2: Verify database schema try: db_info = notion.databases.retrieve(database_id=tracker_db_id) properties = db_info.get("properties", {}) # Check required properties required_props = { "Name": "title", "Current Skill": "relation", "Current Proficiency": "rollup", "Target Proficiency": "number", "Gap": "formula", "Learning Resources": "rich_text", "Progress Notes": "rich_text", } for prop_name, expected_type in required_props.items(): if prop_name not in properties: print( f"Error: Property '{prop_name}' not found in database.", file=sys.stderr, ) return False if properties[prop_name]["type"] != expected_type: print( f"Error: Property '{prop_name}' has incorrect type. Expected '{expected_type}', got '{properties[prop_name]['type']}'.", file=sys.stderr, ) return False # Verify Target Proficiency is percent format if ( properties["Target Proficiency"].get("number", {}).get("format") != "percent" ): print( "Error: Target Proficiency should have 'percent' format.", file=sys.stderr, ) return False except Exception as e: print(f"Error retrieving database info: {e}", file=sys.stderr) return False # Step 3: Get Skills database to check entries skills_db_id = notion_utils.find_database_in_block(notion, page_id, "Skills") if not skills_db_id: print("Error: Skills database not found.", file=sys.stderr) return False # Get all skills with proficiency < 70% skills_below_70 = [] try: skills_results = notion.databases.query(database_id=skills_db_id).get( "results", [] ) for skill in skills_results: skill_level = ( skill.get("properties", {}).get("Skill Level", {}).get("number", 1.0) ) if skill_level < 0.7: skill_name = ( skill.get("properties", {}).get("Skill", {}).get("title", []) ) if skill_name: skill_name_text = skill_name[0].get("text", {}).get("content", "") skills_below_70.append( { "name": skill_name_text, "id": skill["id"], "level": skill_level, } ) except Exception as e: print(f"Error querying Skills database: {e}", file=sys.stderr) return False if not skills_below_70: print("Warning: No skills found with proficiency below 70%.", file=sys.stderr) # This might be OK if all skills are above 70% # Step 4: Verify entries in Skills Development Tracker try: tracker_results = notion.databases.query(database_id=tracker_db_id).get( "results", [] ) # Check that we have entries for skills below 70% if len(skills_below_70) > 0 and len(tracker_results) == 0: print( "Error: No entries found in Skills Development Tracker database.", file=sys.stderr, ) return False # Verify each entry for entry in tracker_results: props = entry.get("properties", {}) # Check name format name_prop = props.get("Name", {}).get("title", []) if not name_prop: print("Error: Entry missing Name property.", file=sys.stderr) return False name_text = name_prop[0].get("text", {}).get("content", "") if not name_text.endswith(" Development Plan"): print( f"Error: Entry name '{name_text}' doesn't follow expected format.", 
file=sys.stderr, ) return False # Check relation to Skills database skill_relation = props.get("Current Skill", {}).get("relation", []) if not skill_relation: print( f"Error: Entry '{name_text}' missing Current Skill relation.", file=sys.stderr, ) return False # Check Target Proficiency (should be set) target_prof = props.get("Target Proficiency", {}).get("number") if target_prof is None: print( f"Error: Entry '{name_text}' missing Target Proficiency.", file=sys.stderr, ) return False # Check Learning Resources learning_resources = props.get("Learning Resources", {}).get( "rich_text", [] ) if not learning_resources: print( f"Error: Entry '{name_text}' missing Learning Resources.", file=sys.stderr, ) return False # Check Progress Notes progress_notes = props.get("Progress Notes", {}).get("rich_text", []) if not progress_notes: print( f"Error: Entry '{name_text}' missing Progress Notes.", file=sys.stderr, ) return False except Exception as e: print(f"Error querying Skills Development Tracker: {e}", file=sys.stderr) return False print("Success: Skills Development Tracker database verified successfully.") return True def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/easy/python_roadmap/simple__expert_level_lessons/description.md ================================================ # Task: Expert Level Learning Path (Simplified) ## Objective Extend the Python Roadmap with a new Expert Level chapter, create a bridge lesson, and add two expert lessons that build on existing material. ## Requirements ### 1. Add the Expert Level chapter - **Database**: Chapters - **Name**: `Expert Level` - **Icon**: 🟣 (purple circle emoji) - Make sure it is linked into the roadmap alongside the existing chapters. ### 2. Create the bridge lesson Create a lesson that connects advanced material to the new chapter: - **Title**: `Advanced Foundations Review` - **Status**: Done - **Chapter**: Link it to `Expert Level` - **Parent item**: Link to the lesson whose title contains "Control" (e.g., "Control Flow") - **Sub-items**: Include links to the lessons containing "Decorators" and "Calling API" ### 3. Add two expert lessons Add the following entries to the Steps database: | Lesson Title | Status | Chapter | Parent item | Date | |--------------|--------|---------|-------------|------| | `Metaprogramming and AST Manipulation` | To Do | Expert Level | Advanced Foundations Review | 2025-09-15 | | `Async Concurrency Patterns` | To Do | Expert Level | Calling API | 2025-09-20 | The lessons must inherit the correct chapter link, parent relationship, and due date as shown above. 
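As a point of reference, here is a minimal sketch of how the chapter and one of the expert lessons could be created with the `notion-client` Python SDK, the same client the task's verifier uses. The auth token, database IDs, and the bridge-lesson page ID are hypothetical placeholders that would have to be resolved against the live roadmap page; the property names (`Name`, `Lessons`, `Status`, `Chapters`, `Parent item`, `Date`) mirror the schema the verifier checks, but the snippet is an illustration rather than the reference solution.

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")                       # placeholder token
CHAPTERS_DB_ID = "<chapters-database-id>"                    # hypothetical: resolve from the page
STEPS_DB_ID = "<steps-database-id>"                          # hypothetical: resolve from the page
BRIDGE_LESSON_ID = "<advanced-foundations-review-page-id>"   # hypothetical

# Create the Expert Level chapter with the purple circle emoji icon.
chapter = notion.pages.create(
    parent={"database_id": CHAPTERS_DB_ID},
    icon={"type": "emoji", "emoji": "🟣"},
    properties={"Name": {"title": [{"text": {"content": "Expert Level"}}]}},
)

# Add one of the two expert lessons, linking chapter, parent, and due date.
notion.pages.create(
    parent={"database_id": STEPS_DB_ID},
    properties={
        "Lessons": {"title": [{"text": {"content": "Metaprogramming and AST Manipulation"}}]},
        "Status": {"status": {"name": "To Do"}},
        "Chapters": {"relation": [{"id": chapter["id"]}]},
        "Parent item": {"relation": [{"id": BRIDGE_LESSON_ID}]},
        "Date": {"date": {"start": "2025-09-15"}},
    },
)
```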
================================================ FILE: tasks/notion/easy/python_roadmap/simple__expert_level_lessons/meta.json ================================================ { "task_id": "expert_level_lessons", "task_name": "Expert Level Lessons", "category_id": "python_roadmap", "category_name": "Python Roadmap", "description": "Create an Expert Level chapter with sophisticated prerequisite chains and four expert-level lessons.", "author": "Xiangyan Liu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "database manipulation", "cross-reference linking", "conditional filtering", "status tracking", "template population" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Python-Roadmap-25281626b6d78012bf2bce1fa8711f4d", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/python-roadmap" } } ================================================ FILE: tasks/notion/easy/python_roadmap/simple__expert_level_lessons/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils TARGET_PAGE_TITLE = "Python Roadmap" CHAPTER_NAME = "Expert Level" CHAPTER_ICON = "🟣" BRIDGE_TITLE = "Advanced Foundations Review" REQUIRED_SUBITEM_TITLES = ["Decorators", "Calling API"] LESSON_REQUIREMENTS = [ { "title": "Metaprogramming and AST Manipulation", "status": "To Do", "date": "2025-09-15", "parent_title": BRIDGE_TITLE, }, { "title": "Async Concurrency Patterns", "status": "To Do", "date": "2025-09-20", "parent_title": "Calling API", }, ] def _get_database_ids(notion: Client, page_id: str) -> tuple[str | None, str | None]: """Return the block IDs for the Chapters and Steps databases on the page.""" chapters_db_id = None steps_db_id = None blocks = notion_utils.get_all_blocks_recursively(notion, page_id) for block in blocks: if block.get("type") != "child_database": continue title = block.get("child_database", {}).get("title", "") if "Chapters" in title and not chapters_db_id: chapters_db_id = block["id"] elif "Steps" in title and not steps_db_id: steps_db_id = block["id"] return chapters_db_id, steps_db_id def _query_step_by_title(notion: Client, database_id: str, title: str, *, exact: bool = True): """Return the first step entry matching the given title pattern.""" title_filter = {"equals": title} if exact else {"contains": title} response = notion.databases.query( database_id=database_id, filter={"property": "Lessons", "title": title_filter}, page_size=5, ) results = response.get("results", []) return results[0] if results else None def verify(notion: Client, main_id: str | None = None) -> bool: """Verify the simplified Expert Level learning path setup.""" # Resolve the roadmap page. if main_id: page_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if not page_id or object_type != "page": print("Error: Python Roadmap page not found.", file=sys.stderr) return False else: page_id = notion_utils.find_page(notion, TARGET_PAGE_TITLE) if not page_id: print("Error: Python Roadmap page not found.", file=sys.stderr) return False # Locate the Chapters and Steps databases. chapters_db_id, steps_db_id = _get_database_ids(notion, page_id) if not chapters_db_id: print("Error: Chapters database not found on the page.", file=sys.stderr) return False if not steps_db_id: print("Error: Steps database not found on the page.", file=sys.stderr) return False # Ensure the Expert Level chapter exists with the purple icon. 
try: chapter_resp = notion.databases.query( database_id=chapters_db_id, filter={"property": "Name", "title": {"equals": CHAPTER_NAME}}, page_size=1, ) except Exception as exc: print(f"Error querying Chapters database: {exc}", file=sys.stderr) return False results = chapter_resp.get("results", []) if not results: print("Error: Expert Level chapter not found.", file=sys.stderr) return False expert_chapter = results[0] expert_chapter_id = expert_chapter["id"] icon = expert_chapter.get("icon") or {} if icon.get("type") != "emoji" or icon.get("emoji") != CHAPTER_ICON: print("Error: Expert Level chapter must use the purple circle emoji icon.", file=sys.stderr) return False print("✓ Expert Level chapter exists with the correct icon.") # Locate prerequisite lessons (Control Flow, Decorators, Calling API). control_lesson = _query_step_by_title(notion, steps_db_id, "Control", exact=False) if not control_lesson: print("Error: Could not find a lesson containing 'Control' in its title.", file=sys.stderr) return False control_lesson_id = control_lesson["id"] prerequisite_ids = {} for title in REQUIRED_SUBITEM_TITLES: lesson = _query_step_by_title(notion, steps_db_id, title, exact=False) if not lesson: print(f"Error: Required lesson containing '{title}' not found.", file=sys.stderr) return False prerequisite_ids[title] = lesson["id"] # Verify the bridge lesson. bridge_lesson = _query_step_by_title(notion, steps_db_id, BRIDGE_TITLE, exact=True) if not bridge_lesson: print("Error: Advanced Foundations Review lesson not found.", file=sys.stderr) return False status = (bridge_lesson["properties"].get("Status", {}).get("status") or {}).get("name") if status != "Done": print("Error: Advanced Foundations Review must have status 'Done'.", file=sys.stderr) return False # Ensure chapter relation includes Expert Level. chapter_rel = bridge_lesson["properties"].get("Chapters", {}).get("relation", []) if not any(rel["id"] == expert_chapter_id for rel in chapter_rel): print("Error: Advanced Foundations Review must link to the Expert Level chapter.", file=sys.stderr) return False # Parent item should be the control lesson. parent_rel = bridge_lesson["properties"].get("Parent item", {}).get("relation", []) if not parent_rel or parent_rel[0]["id"] != control_lesson_id: print("Error: Advanced Foundations Review should use the control lesson as its Parent item.", file=sys.stderr) return False # Sub-items must include the required lessons. sub_rel = bridge_lesson["properties"].get("Sub-item", {}).get("relation", []) sub_ids = {rel["id"] for rel in sub_rel} missing = [title for title, rel_id in prerequisite_ids.items() if rel_id not in sub_ids] if missing: print( f"Error: Advanced Foundations Review must include these lessons as sub-items: {', '.join(missing)}.", file=sys.stderr, ) return False print("✓ Bridge lesson configured with the correct status, chapter, parent, and sub-items.") # Verify the two expert lessons. overall_success = True for spec in LESSON_REQUIREMENTS: lesson = _query_step_by_title(notion, steps_db_id, spec["title"], exact=True) if not lesson: print(f"Error: Lesson '{spec['title']}' not found.", file=sys.stderr) overall_success = False continue lesson_ok = True # Status check. status_name = (lesson["properties"].get("Status", {}).get("status") or {}).get("name") if status_name != spec["status"]: print( f"Error: Lesson '{spec['title']}' should have status '{spec['status']}', found '{status_name}'.", file=sys.stderr, ) lesson_ok = False # Chapter relation check. 
lesson_chapters = lesson["properties"].get("Chapters", {}).get("relation", []) if not any(rel["id"] == expert_chapter_id for rel in lesson_chapters): print(f"Error: Lesson '{spec['title']}' must link to the Expert Level chapter.", file=sys.stderr) lesson_ok = False # Parent relation check. parent_title = spec["parent_title"] if parent_title == BRIDGE_TITLE: expected_parent_id = bridge_lesson["id"] else: expected_parent_id = prerequisite_ids.get(parent_title) parent_relation = lesson["properties"].get("Parent item", {}).get("relation", []) if not expected_parent_id: print( f"Error: Could not resolve expected parent '{parent_title}' for lesson '{spec['title']}'.", file=sys.stderr, ) lesson_ok = False else: if not parent_relation or parent_relation[0]["id"] != expected_parent_id: print( f"Error: Lesson '{spec['title']}' should have '{parent_title}' as its Parent item.", file=sys.stderr, ) lesson_ok = False # Date check. date_prop = lesson["properties"].get("Date", {}).get("date") or {} if date_prop.get("start") != spec["date"]: print( f"Error: Lesson '{spec['title']}' should use date {spec['date']}, found {date_prop.get('start')}.", file=sys.stderr, ) lesson_ok = False if lesson_ok: print(f"✓ Lesson '{spec['title']}' has the expected properties.") else: overall_success = False if not overall_success: return False print("Success: Expert Level chapter, bridge lesson, and expert lessons configured correctly.") return True def main() -> None: notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/easy/self_assessment/simple__faq_column_layout/description.md ================================================ Navigate to the "Self Assessment" page and reorganize the FAQ toggle content to make it easier to scan. **Task Requirements:** 1. Add a column list with two columns inside the FAQ toggle. 2. Move the first two existing Q&A pairs from the FAQ into the left column. 3. Move the third existing Q&A pair into the right column, keeping the original heading/paragraph formatting. ================================================ FILE: tasks/notion/easy/self_assessment/simple__faq_column_layout/meta.json ================================================ { "task_id": "simple__faq_column_layout", "task_name": "Simple FAQ Column Layout", "category_id": "self_assessment", "category_name": "Self Assessment", "description": "Reorganize the FAQ section content into a two-column layout with balanced Q&A pairs.", "author": "Xiangyan Liu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "content organization", "visual formatting", "template population" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d", "stateOriginalUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d" } } ================================================ FILE: tasks/notion/easy/self_assessment/simple__faq_column_layout/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the FAQ toggle has been properly reorganized with a column list. 
""" # Start from main_id if provided page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: # Try to find the Self Assessment page page_id = notion_utils.find_page(notion, "Self Assessment") if not page_id: print("Error: Self Assessment page not found.", file=sys.stderr) return False # Get all blocks recursively from the page all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) # Find the FAQ toggle block faq_toggle_block = None faq_toggle_id = None for block in all_blocks: if block.get("type") == "toggle": block_text = notion_utils.get_block_plain_text(block) if "FAQ" in block_text: faq_toggle_block = block faq_toggle_id = block.get("id") print(f"Found FAQ toggle block: {block_text}") break if not faq_toggle_block: print("Error: FAQ toggle block not found.", file=sys.stderr) return False # Find column_list inside the FAQ toggle column_list_block = None for block in all_blocks: if ( block.get("type") == "column_list" and block.get("parent", {}).get("block_id") == faq_toggle_id ): column_list_block = block break if not column_list_block: print("Error: No column_list found inside FAQ toggle.", file=sys.stderr) return False # Check that there are no Q&A pairs directly under FAQ toggle (outside column_list) direct_faq_children = [] for block in all_blocks: if block.get("parent", {}).get("block_id") == faq_toggle_id and block.get( "id" ) != column_list_block.get("id"): direct_faq_children.append(block) # Check if any of these are heading_3 or paragraph blocks (Q&A content) for block in direct_faq_children: if block.get("type") in ["heading_3", "paragraph"]: print( f"Error: Found Q&A content outside column_list: {notion_utils.get_block_plain_text(block)[:50]}...", file=sys.stderr, ) return False # Find the two columns columns = [] column_list_id = column_list_block.get("id") for block in all_blocks: if ( block.get("type") == "column" and block.get("parent", {}).get("block_id") == column_list_id ): columns.append(block) if len(columns) != 2: print(f"Error: Expected 2 columns, found {len(columns)}.", file=sys.stderr) return False # Count Q&A pairs in each column qa_counts = [] total_pairs = 0 for i, column in enumerate(columns[:2]): column_id = column.get("id") column_blocks = [ block for block in all_blocks if block.get("parent", {}).get("block_id") == column_id ] qa_pairs = 0 j = 0 while j < len(column_blocks): if ( column_blocks[j].get("type") == "heading_3" and j + 1 < len(column_blocks) and column_blocks[j + 1].get("type") == "paragraph" ): qa_pairs += 1 j += 2 else: j += 1 qa_counts.append(qa_pairs) total_pairs += qa_pairs print(f"Column {i + 1}: Found {qa_pairs} Q&A pairs") if qa_counts[0] < 2: print( f"Error: Left column should contain at least 2 Q&A pairs, found {qa_counts[0]}.", file=sys.stderr, ) return False if qa_counts[1] < 1: print( f"Error: Right column should contain at least 1 Q&A pair, found {qa_counts[1]}.", file=sys.stderr, ) return False if total_pairs < 3: print( f"Error: Expected at least 3 total Q&A pairs across both columns, found {total_pairs}.", file=sys.stderr, ) return False print( "Success: FAQ toggle organized with two columns holding the existing Q&A pairs (two on the left, one on the right)." ) return True def main(): """ Executes the verification process and exits with a status code. 
""" notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/easy/standard_operating_procedure/simple__section_organization/description.md ================================================ # Task: Reorganize Standard Operating Procedure Page Sections ## Objective Modify the structure of the Standard Operating Procedure page in Notion by updating the order of two sections. ## Requirements - Navigate to the Standard Operating Procedure page - Swap the positions of the "Terminologies" and "Roles & responsibilities" sections - Preserve all content within each section exactly as is - Maintain the original formatting and structure of each section ================================================ FILE: tasks/notion/easy/standard_operating_procedure/simple__section_organization/meta.json ================================================ { "task_id": "simple__section_organization", "task_name": "Simple Section Organization", "category_id": "standard_operating_procedure", "category_name": "Standard Operating Procedure", "description": "Reorganize the Standard Operating Procedure page by swapping sections and creating a column layout.", "author": "Xiangyan Liu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "content organization", "cross-reference linking", "visual formatting" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Standard-Operating-Procedure-24381626b6d780a8b678f9e62ae5b152", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/standard-operating-procedure" } } ================================================ FILE: tasks/notion/easy/standard_operating_procedure/simple__section_organization/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils TARGET_PAGE_TITLE = "Standard Operating Procedure" ROLES_HEADING = "Roles & responsibilities" TERMINOLOGIES_HEADING = "Terminologies" def _find_heading_indices(blocks: list[dict]) -> tuple[int | None, int | None]: """Return the indices of the target headings within the flattened block list.""" roles_index = None terminologies_index = None for index, block in enumerate(blocks): if block.get("type") != "heading_2": continue rich_text = block.get("heading_2", {}).get("rich_text", []) if not rich_text: continue heading_text = rich_text[0].get("text", {}).get("content", "") if heading_text == ROLES_HEADING and roles_index is None: roles_index = index elif heading_text == TERMINOLOGIES_HEADING and terminologies_index is None: terminologies_index = index return roles_index, terminologies_index def verify(notion: Client, main_id: str | None = None) -> bool: """Ensure the Roles & responsibilities section appears before Terminologies.""" # Resolve page id. if main_id: page_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if not page_id or object_type != "page": print("Error: Standard Operating Procedure page not found.", file=sys.stderr) return False else: page_id = notion_utils.find_page(notion, TARGET_PAGE_TITLE) if not page_id: print("Error: Standard Operating Procedure page not found.", file=sys.stderr) return False # Fetch all blocks (flattened order from top to bottom). 
blocks = notion_utils.get_all_blocks_recursively(notion, page_id) roles_index, terminologies_index = _find_heading_indices(blocks) if roles_index is None: print("Error: 'Roles & responsibilities' section not found.", file=sys.stderr) return False if terminologies_index is None: print("Error: 'Terminologies' section not found.", file=sys.stderr) return False if roles_index >= terminologies_index: print( "Error: Sections are not swapped. 'Roles & responsibilities' should appear before 'Terminologies'.", file=sys.stderr, ) return False print("Success: Section order updated so 'Roles & responsibilities' precedes 'Terminologies'.") return True def main() -> None: notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/easy/team_projects/simple__swap_tasks/description.md ================================================ Go to the Team Projects page, find the person responsible for the most tasks (10 in total) and the person responsible for the fewest tasks (3 in total), then swap their assigned tasks. ================================================ FILE: tasks/notion/easy/team_projects/simple__swap_tasks/meta.json ================================================ { "task_id": "simple__swap_tasks", "task_name": "Simple Swap Tasks", "category_id": "team_projects", "category_name": "Team Projects", "description": "Find the person responsible for the most and fewest tasks, then swap their assigned tasks.", "author": "Xiangyan Liu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "data aggregation", "automated migration", "conditional filtering" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Team-Projects-24e81626b6d7809c982fdb7a25825898", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/gantt-chart" } } ================================================ FILE: tasks/notion/easy/team_projects/simple__swap_tasks/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the task assignees have been swapped correctly. Checks: 1. "Develop a plan for promotion" and "Evaluate different third-party services" have swapped assignees 2. 
The person with most tasks and person with least tasks have swapped all their tasks """ # Step 1: Find the Team Projects page if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if not found_id or object_type != 'page': print("Error: Team Projects page not found.", file=sys.stderr) return False else: # Try to find the page by searching found_id = notion_utils.find_page(notion, "Team Projects") if not found_id: print("Error: Team Projects page not found.", file=sys.stderr) return False # Get all blocks from the page to find database references all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id) # Find Tasks database ID from the page tasks_db_id = None for block in all_blocks: if block and block.get("type") == "child_database": db_title = block.get("child_database", {}).get("title", "") if "Tasks" in db_title: tasks_db_id = block["id"] break if not tasks_db_id: print("Error: Tasks database not found.", file=sys.stderr) return False print("\n📋 Starting verification...") # Step 2: Query all tasks to analyze assignees try: all_tasks_response = notion.databases.query( database_id=tasks_db_id, page_size=100 ) if not all_tasks_response.get("results"): print("Error: No tasks found in Tasks database.", file=sys.stderr) return False tasks = all_tasks_response["results"] except Exception as e: print(f"Error querying Tasks database: {e}", file=sys.stderr) return False # Step 3: Check specific tasks have swapped assignees develop_plan_task = None evaluate_services_task = None for task in tasks: task_name = task["properties"]["Name"]["title"][0]["text"]["content"] if task_name == "Develop a plan for promotion": develop_plan_task = task elif task_name == "Evaluate different third-party services": evaluate_services_task = task if not develop_plan_task or not evaluate_services_task: print("Error: Could not find both required tasks.", file=sys.stderr) return False # Get assignees for these tasks develop_plan_assignees = develop_plan_task["properties"]["Assigned"]["people"] evaluate_services_assignees = evaluate_services_task["properties"]["Assigned"]["people"] if not develop_plan_assignees or not evaluate_services_assignees: print("Error: Tasks don't have assignees.", file=sys.stderr) return False develop_plan_assignee_id = develop_plan_assignees[0]["id"] evaluate_services_assignee_id = evaluate_services_assignees[0]["id"] # These should be different (swapped) if develop_plan_assignee_id == evaluate_services_assignee_id: print("Error: Tasks should have different assignees after swap.", file=sys.stderr) return False # Step 4: Count tasks per person task_counts = {} unassigned_count = 0 for task in tasks: assignees = task["properties"]["Assigned"]["people"] if assignees: assignee_id = assignees[0]["id"] if assignee_id not in task_counts: task_counts[assignee_id] = [] task_counts[assignee_id].append(task["properties"]["Name"]["title"][0]["text"]["content"]) else: unassigned_count += 1 # Sort by task count sorted_assignees = sorted(task_counts.items(), key=lambda x: len(x[1])) if len(sorted_assignees) < 2: print("Error: Need at least 2 people with tasks to verify swap.", file=sys.stderr) return False # Get person with least and most tasks person_with_least = sorted_assignees[0] person_with_most = sorted_assignees[-1] least_id, least_tasks = person_with_least most_id, most_tasks = person_with_most # Step 5: Verify the swap pattern # Original distribution (before swap): # - 5ac96c02-49a4-4320-8de6-b663ba83126b had 3 tasks (least) # - 
ac7a3bd0-c111-4464-8f45-8a857a1abc8a had 10 tasks (most) # After complete swap, we expect: # - 5ac96c02-49a4-4320-8de6-b663ba83126b should have 10 tasks # - ac7a3bd0-c111-4464-8f45-8a857a1abc8a should have 3 tasks original_least_id = "5ac96c02-49a4-4320-8de6-b663ba83126b" original_most_id = "ac7a3bd0-c111-4464-8f45-8a857a1abc8a" # Check if the swap has been completed swap_completed = False for assignee_id, assignee_tasks in task_counts.items(): if assignee_id == original_least_id and len(assignee_tasks) == 10: # Person who had 3 now has 10 for other_id, other_tasks in task_counts.items(): if other_id == original_most_id and len(other_tasks) == 3: # Person who had 10 now has 3 swap_completed = True break # Step 6: Summary print(f"\n📊 Task Distribution:") print(f" • Total tasks: {len(tasks)}") print(f" • Assigned tasks: {len(tasks) - unassigned_count}") print(f" • Unassigned tasks: {unassigned_count}") print(f" • People with tasks: {len(task_counts)}") print(f"\n Task counts by person:") for assignee_id, assignee_tasks in sorted_assignees: print(f" - {assignee_id[:8]}...: {len(assignee_tasks)} tasks") # Step 7: Final verification print("\n🔍 Verification Results:") # Check that the swap has created a significant difference if len(most_tasks) - len(least_tasks) < 5: print(f"Warning: Difference between most and least is only {len(most_tasks) - len(least_tasks)} tasks", file=sys.stderr) # Verify specific expected outcomes verification_passed = True # Check 1: Specific tasks have been swapped specific_tasks_swapped = develop_plan_assignee_id != evaluate_services_assignee_id if specific_tasks_swapped: print(" ✓ Specific tasks have been swapped") else: print(" ✗ Specific tasks were not swapped", file=sys.stderr) verification_passed = False # Check 2: Task distribution shows a complete swap if swap_completed: print(" ✓ Complete task swap verified (3↔10 tasks)") else: # Show actual distribution for debugging person1_tasks = len(task_counts.get(original_least_id, [])) person2_tasks = len(task_counts.get(original_most_id, [])) print(f" ✗ Swap incomplete! Expected 5ac96c02→10 tasks, ac7a3bd0→3 tasks", file=sys.stderr) print(f" Actual: 5ac96c02→{person1_tasks} tasks, ac7a3bd0→{person2_tasks} tasks", file=sys.stderr) verification_passed = False # Check 3: Total task count is preserved total_assigned_tasks = sum(len(tasks) for _, tasks in task_counts.items()) expected_total = len(tasks) - unassigned_count if total_assigned_tasks == expected_total: print(f" ✓ Total task count preserved ({total_assigned_tasks} assigned)") else: print(f" ✗ Task count mismatch: {total_assigned_tasks} vs {expected_total} expected", file=sys.stderr) verification_passed = False if verification_passed: print("\n✅ All verification checks passed!") return True else: print("\n❌ Verification failed", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/easy/toronto_guide/simple__change_color/description.md ================================================ Open the **Toronto Guide** page and refresh the colors of the tags in the **Food** database. ## Requirements 1. Find and open the Toronto Guide page in Notion. 2. Locate the *Food* database on that page. 3. 
Update every tag in the Food database that is currently pink so that it uses a different color of your choice (any non-pink color is fine). 4. Do not modify callouts or tags in the other databases. ================================================ FILE: tasks/notion/easy/toronto_guide/simple__change_color/meta.json ================================================ { "task_id": "simple__change_color", "task_name": "Simple Change Color", "category_id": "toronto_guide", "category_name": "Toronto Guide", "description": "Navigate to the Toronto Guide page and change all pink-colored elements to different colors.", "author": "Xiangyan Liu", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "visual formatting", "conditional filtering" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Toronto-Guide-25281626b6d7802caa7cc394647e901c", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/conquering-toronto-a-destination-guide" } } ================================================ FILE: tasks/notion/easy/toronto_guide/simple__change_color/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils TARGET_PAGE_TITLE = "Toronto Guide" FOOD_DATABASE_KEYWORD = "Food" TARGET_TAG_NAMES = [ "Middle Eastern", "Jamaican", "Indian", ] def _get_food_database_id(notion: Client, page_id: str) -> str | None: """Return the block ID of the Food database shown on the target page.""" all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) for block in all_blocks: if not block or block.get("type") != "child_database": continue title = block.get("child_database", {}).get("title", "") if FOOD_DATABASE_KEYWORD.lower() in title.lower(): return block.get("id") return None def verify(notion: Client, main_id: str | None = None) -> bool: """Check that all target tags in the Food database are no longer pink.""" # Resolve the Toronto Guide page ID. if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if not found_id or object_type != "page": print("Error: Toronto Guide page not found.", file=sys.stderr) return False page_id = found_id else: page_id = notion_utils.find_page(notion, TARGET_PAGE_TITLE) if not page_id: print("Error: Toronto Guide page not found.", file=sys.stderr) return False # Locate the Food database block. food_db_id = _get_food_database_id(notion, page_id) if not food_db_id: print("Error: Food database not found on the Toronto Guide page.", file=sys.stderr) return False # Fetch database definition and inspect tag options. 
try: db_info = notion.databases.retrieve(database_id=food_db_id) except Exception as exc: print(f"Error: Unable to retrieve Food database ({exc}).", file=sys.stderr) return False tags_property = db_info.get("properties", {}).get("Tags", {}) if tags_property.get("type") != "multi_select": print("Error: Food database does not have a multi-select Tags property.", file=sys.stderr) return False options = tags_property.get("multi_select", {}).get("options", []) remaining_targets = set(TARGET_TAG_NAMES) failures = False for option in options: tag_name = option.get("name", "").strip() if tag_name not in remaining_targets: continue remaining_targets.discard(tag_name) color = option.get("color") if color == "pink": print(f"Error: Tag '{tag_name}' in Food database is still pink.", file=sys.stderr) failures = True else: print(f"✓ Tag '{tag_name}' color updated to '{color}'.") if remaining_targets: print( f"Error: Food tags not found (expected to exist): {sorted(remaining_targets)}.", file=sys.stderr, ) return False if failures: return False print("Success: All Food database tags are now non-pink.") return True def main() -> None: notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/company_in_a_box/employee_onboarding/description.md ================================================ Build an integrated **Employee Onboarding** system for the existing **Company In A Box** page. **Task Requirements:** 1. Create a new **database** titled **Employee Onboarding Checklist** with the following properties *exactly*: • **Employee Name** – title • **Start Date** – date • **Department** – select (options: Product, Marketing, Sales, HR, Engineering) Populate this database with **3** sample new-hire pages covering three different departments. Every property in each entry must be filled. 2. Under the top-level page **Company In A Box**, create a new child page titled **Onboarding Hub** containing, in order: 1) The **Employee Onboarding Checklist** database embedded at the top. 2) A section headed **Benefits Overview** that includes linked mentions (@-mentions or link-to-page blocks) to **≥ 3** distinct benefit-policy pages from the **Company Wiki** (for example *Benefits policy*, *Vacation Policy*, *Corporate travel*). 3) A section headed **30-Day Timeline** that presents a numbered list with **7** steps covering the first 30 days. **Each step must reference (via @-mention) an existing page or database**. 4) A section headed **Feedback Form** that provides **≥ 3** to-do items for new hires to check off. 
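A minimal sketch of step 1, assuming the `notion-client` Python SDK used elsewhere in this repository: create the checklist database under the Company In A Box page and add one of the three sample entries. The auth token, parent page ID, and the sample employee data are hypothetical placeholders, not values taken from the template.

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")               # placeholder token
PARENT_PAGE_ID = "<company-in-a-box-page-id>"        # hypothetical: look up at runtime

# Database with exactly the required properties.
onboarding_db = notion.databases.create(
    parent={"type": "page_id", "page_id": PARENT_PAGE_ID},
    title=[{"type": "text", "text": {"content": "Employee Onboarding Checklist"}}],
    properties={
        "Employee Name": {"title": {}},
        "Start Date": {"date": {}},
        "Department": {
            "select": {
                "options": [
                    {"name": name}
                    for name in ("Product", "Marketing", "Sales", "HR", "Engineering")
                ]
            }
        },
    },
)

# One of the three sample new-hire pages; every property is filled.
notion.pages.create(
    parent={"database_id": onboarding_db["id"]},
    properties={
        "Employee Name": {"title": [{"text": {"content": "Jordan Lee"}}]},  # illustrative data
        "Start Date": {"date": {"start": "2024-10-07"}},                    # illustrative data
        "Department": {"select": {"name": "Engineering"}},
    },
)
```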
================================================ FILE: tasks/notion/standard/company_in_a_box/employee_onboarding/meta.json ================================================ { "task_id": "employee_onboarding", "task_name": "Employee Onboarding", "category_id": "company_in_a_box", "category_name": "Company In A Box", "description": "Build an integrated Employee Onboarding system for the existing Company In A Box page with a checklist database, onboarding hub, and feedback form.", "author": "Zijian Wu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "database manipulation", "template population", "cross-reference linking", "status tracking" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Company-In-A-Box-23d81626b6d7800098f3d0e64a706cd8", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/company-in-a-box" } } ================================================ FILE: tasks/notion/standard/company_in_a_box/employee_onboarding/verify.py ================================================ import sys from typing import Dict, Set from notion_client import Client from tasks.utils import notion_utils def _check_db_schema(db_props: Dict[str, Dict], required: Dict[str, str]) -> bool: """Return True if every required property exists with the correct type.""" for prop_name, expected_type in required.items(): if prop_name not in db_props: print( f"Error: Property '{prop_name}' missing from database.", file=sys.stderr ) return False actual_type = db_props[prop_name]["type"] if actual_type != expected_type: print( f"Error: Property '{prop_name}' has type '{actual_type}', expected '{expected_type}'.", file=sys.stderr, ) return False return True def verify(notion: Client, main_id: str | None = None) -> bool: # noqa: C901 """Programmatically verify the onboarding system described in description.md.""" DB_TITLE = "Employee Onboarding Checklist" HUB_PAGE_TITLE = "Onboarding Hub" DEPARTMENT_OPTIONS: Set[str] = { "Product", "Marketing", "Sales", "HR", "Engineering", } REQUIRED_DB_PROPERTIES = { "Employee Name": "title", "Start Date": "date", "Department": "select", } # 1. Locate onboarding database db_id = notion_utils.find_database(notion, DB_TITLE) if not db_id: print(f"Error: Database '{DB_TITLE}' not found.", file=sys.stderr) return False try: db_obj = notion.databases.retrieve(database_id=db_id) except Exception as exc: print(f"Error retrieving database: {exc}", file=sys.stderr) return False db_props = db_obj.get("properties", {}) if not _check_db_schema(db_props, REQUIRED_DB_PROPERTIES): return False # Extra: validate select options dept_options = {opt["name"] for opt in db_props["Department"]["select"]["options"]} if not DEPARTMENT_OPTIONS.issubset(dept_options): print( f"Error: Department select options must include {sorted(DEPARTMENT_OPTIONS)}. Current: {sorted(dept_options)}", file=sys.stderr, ) return False # Check there are at least 3 entries in the database try: db_pages = notion.databases.query(database_id=db_id).get("results", []) except Exception as exc: print(f"Error querying database: {exc}", file=sys.stderr) return False if len(db_pages) < 3: print( "Error: Less than 3 onboarding entries found in the database.", file=sys.stderr, ) return False # 2. Locate Onboarding Hub page hub_page_id = notion_utils.find_page(notion, HUB_PAGE_TITLE) if not hub_page_id: print(f"Error: Page '{HUB_PAGE_TITLE}' not found.", file=sys.stderr) return False # 3. 
Ensure the onboarding database is embedded in the hub page embedded_db_id = notion_utils.find_database_in_block(notion, hub_page_id, DB_TITLE) if embedded_db_id != db_id: print( "Error: The Employee Onboarding Checklist database is not embedded in the Onboarding Hub page.", file=sys.stderr, ) return False # 4. Analyse blocks within the hub page for linked mentions, timeline, and feedback form all_blocks = notion_utils.get_all_blocks_recursively(notion, hub_page_id) seen_link_targets: Set[str] = set() numbered_list_count = 0 todo_count = 0 for blk in all_blocks: blk_type = blk.get("type") # Direct link-to-page blocks if blk_type == "link_to_page": info = blk.get("link_to_page", {}) target_id = info.get("page_id") or info.get("database_id") if target_id: seen_link_targets.add(target_id) continue # Rich-text mentions inside content blocks if blk_type in { "paragraph", "numbered_list_item", "bulleted_list_item", "to_do", }: content = blk.get(blk_type, {}) for rt in content.get("rich_text", []): if rt.get("type") == "mention": mention = rt.get("mention", {}) if mention.get("type") in {"page", "database"}: target_id = mention.get("page", {}).get("id") or mention.get( "database", {} ).get("id") if target_id: seen_link_targets.add(target_id) # Count numbered list items if blk_type == "numbered_list_item": numbered_list_count += 1 # Count to-do items in Feedback Form if blk_type == "to_do": todo_count += 1 if len(seen_link_targets) < 3: print( "Error: Fewer than 3 linked mentions to benefit policy pages found in the Benefits Overview section.", file=sys.stderr, ) return False if numbered_list_count < 7: print( "Error: Numbered list contains fewer than 7 steps in the 30-Day Timeline section.", file=sys.stderr, ) return False if todo_count < 3: print( "Error: Feedback Form section contains fewer than 3 to-do items.", file=sys.stderr, ) return False print( "Success: Verified Employee Onboarding Checklist database, Onboarding Hub page, and all required sections." ) return True def main(): notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/company_in_a_box/goals_restructure/description.md ================================================ Please restructure the **Current Goals** section on my **Company In A Box** page as follows: 1. **Add a new goal heading** — create a new `heading_3` block titled: `🔄 Digital Transformation Initiative` 2. **Convert all four goal headings to toggles** — the three existing goals * ⚙️ Expand Operations to LATAM * 🛠️ Push for Enterprise * 🩶 Boost Employee Engagement * 🔄 Digital Transformation Initiative 3. **Move descriptions inside the toggles** — every paragraph or list that originally sat directly under a goal heading should become a **child block** of that heading after it is made toggleable. 4. **Preserve content & order** — apart from the changes above, do **not** modify the text, formatting, or order of existing goal descriptions. The end result should be a clean **Current Goals** section containing four toggleable goal headings, each with its corresponding details tucked inside. 
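For orientation, a minimal sketch of the two mechanics this task relies on, creating a toggleable heading and nesting content beneath it, using the `notion-client` SDK. The public API exposes no move operation, so "moving" a description amounts to re-creating it under the toggle heading and deleting the original block; all IDs and the sample paragraph text below are hypothetical placeholders, and the snippet re-creates blocks rather than editing them in place to avoid relying on fields the update endpoint may not accept.

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")                            # placeholder token
GOALS_PARENT_ID = "<parent-block-of-the-current-goals-section>"   # hypothetical
ORIGINAL_DESC_ID = "<original-description-block-id>"              # hypothetical

# Append the new goal as a toggleable heading_3.
resp = notion.blocks.children.append(
    block_id=GOALS_PARENT_ID,
    children=[{
        "object": "block",
        "type": "heading_3",
        "heading_3": {
            "rich_text": [{"type": "text", "text": {"content": "🔄 Digital Transformation Initiative"}}],
            "is_toggleable": True,
        },
    }],
)
toggle_heading_id = resp["results"][0]["id"]

# Re-create a goal description as a child of the toggle, then remove the original.
notion.blocks.children.append(
    block_id=toggle_heading_id,
    children=[{
        "object": "block",
        "type": "paragraph",
        "paragraph": {
            "rich_text": [{"type": "text", "text": {"content": "Description text copied from the original paragraph."}}],
        },
    }],
)
notion.blocks.delete(block_id=ORIGINAL_DESC_ID)
```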
================================================ FILE: tasks/notion/standard/company_in_a_box/goals_restructure/meta.json ================================================ { "task_id": "goals_restructure", "task_name": "Goals Restructure", "category_id": "company_in_a_box", "category_name": "Company In A Box", "description": "Restructure the Current Goals section on the Company In A Box page by adding a new goal heading and converting all goal headings to toggles with content inside.", "author": "Zijian Wu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "content organization", "visual formatting" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Company-In-A-Box-23d81626b6d7800098f3d0e64a706cd8", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/company-in-a-box" } } ================================================ FILE: tasks/notion/standard/company_in_a_box/goals_restructure/verify.py ================================================ import sys from typing import List from notion_client import Client from tasks.utils import notion_utils # Expected new goal heading text (including emoji) NEW_GOAL_HEADING = "🔄 Digital Transformation Initiative" # Section title to look for GOALS_SECTION_TITLE = "Current Goals" def _plain(block) -> str: """Return concatenated plain text of a block.""" return notion_utils.get_block_plain_text(block) # Some Notion rich-text strings may include non-breaking spaces (\xa0) after emoji. # Normalize them to plain spaces so text matching is robust. def _normalize_string(s: str) -> str: return s.replace("\xa0", " ") def _is_heading(block) -> bool: return block.get("type") in ["heading_1", "heading_2", "heading_3"] def _is_toggle(block) -> bool: """Determine whether a block is a toggle (standard toggle block or toggle-able heading).""" btype = block.get("type") # In our scenario, goal blocks are headings (usually heading_3) marked as toggleable. if btype in ["heading_1", "heading_2", "heading_3"]: heading_data = block.get(btype, {}) return heading_data.get("is_toggleable", False) # Some Notion pages may contain classic toggle blocks (type == "toggle"). They are # not expected in this task, but keeping this check allows broader compatibility. return btype == "toggle" def _get_children(notion: Client, block_id: str) -> List[dict]: """Retrieve **direct** children of a block (no pagination handling needed for small test pages).""" try: return notion.blocks.children.list(block_id=block_id).get("results", []) except Exception: return [] def verify(notion: Client, main_id: str = None) -> bool: """Verifies that the Company in a Box page has been updated per the task requirements.""" # 1. Locate the main page page_id = None if main_id: found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id) if found_id and obj_type == "page": page_id = found_id if not page_id: # Try a few case variations just in case for title in [ "Company In A Box", ]: page_id = notion_utils.find_page(notion, title) if page_id: break if not page_id: print("Error: Could not find the 'Company in a Box' page.", file=sys.stderr) return False # 2. Recursively locate the "Current Goals" heading and collect its sibling blocks that # constitute the section. 
def _fetch_children(bid: str) -> List[dict]: try: return notion.blocks.children.list(block_id=bid).get("results", []) except Exception: return [] goals_section_blocks: List[dict] = [] # Breadth-first traversal to find the heading queue = [page_id] found_parent = None found_index = None while queue and found_parent is None: parent_id = queue.pop(0) children = _fetch_children(parent_id) for idx, child in enumerate(children): if ( _is_heading(child) and GOALS_SECTION_TITLE.lower() in _normalize_string(_plain(child)).lower() ): found_parent = parent_id found_index = idx break # enqueue grandchildren for further search for ch in children: if ch.get("has_children"): queue.append(ch["id"]) if found_parent is None: print( "Error: Could not find the 'Current Goals' heading anywhere in the page.", file=sys.stderr, ) return False # Retrieve siblings once more to get the final list and slice after heading. siblings = _fetch_children(found_parent) if found_index is None or found_index >= len(siblings): print( "Error: Internal logic issue when locating Current Goals section.", file=sys.stderr, ) return False goals_section_blocks = siblings[found_index + 1 :] if not goals_section_blocks: print("Error: 'Current Goals' section appears to be empty.", file=sys.stderr) return False # 3. Identify toggle blocks that represent goals toggle_blocks = [b for b in goals_section_blocks if _is_toggle(b)] if len(toggle_blocks) != 4: print( f"Error: Expected 4 toggle blocks for goals, found {len(toggle_blocks)}.", file=sys.stderr, ) return False # 4. Ensure the new goal heading exists among the toggles found_new_goal = False for tb in toggle_blocks: if ( _normalize_string(NEW_GOAL_HEADING).lower() in _normalize_string(_plain(tb)).lower() ): found_new_goal = True break if not found_new_goal: print( f"Error: Did not find a toggle block with heading '{NEW_GOAL_HEADING}'.", file=sys.stderr, ) return False # 5. Validate that each toggle has at least one child paragraph/description for tb in toggle_blocks: if ( _normalize_string(NEW_GOAL_HEADING).lower() in _normalize_string(_plain(tb)).lower() ): # Skip checking the new goal itself, as it does not have a description yet. continue if not tb.get("has_children", False): print( f"Error: Toggle '{_normalize_string(_plain(tb))}' has no child blocks (description not moved).", file=sys.stderr, ) return False children = _get_children(notion, tb["id"]) # Ensure there is at least one content child (paragraph, list item, etc.) content_types = { "paragraph", "bulleted_list_item", "numbered_list_item", "to_do", "callout", "quote", } if not any(c.get("type") in content_types for c in children): print( f"Error: Toggle '{_normalize_string(_plain(tb))}' seems to lack any description/content inside it.", file=sys.stderr, ) return False # 6. Confirm that there are **no** residual heading_3 blocks (non-toggle) for the goals non_toggle_headings = [ b for b in goals_section_blocks if b.get("type") == "heading_3" and not _is_toggle(b) ] if non_toggle_headings: titles = [_normalize_string(_plain(b)) for b in non_toggle_headings] print( f"Error: Found heading_3 blocks that were not converted to toggles: {titles}.", file=sys.stderr, ) return False print( "Success: Verified goal restructuring with new toggle blocks and descriptions." 
) return True def main(): notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/description.md ================================================ Create a quarterly business review dashboard in Notion based on the existing **Company In A Box** workspace. **Task Requirements:** 1. Inside the **Company Wiki** page you will find a sub-page named **Company Goals**. Extract every departmental objective listed under the four departments — **Product**, **Marketing**, **Sales**, and **HR**. 2. Under the top-level page **Company In A Box**, create a new child page titled **Q4 2024 Business Review Dashboard**. 3. Inside that new page build the following structure (all parts must exist): 1. A single **callout** block near the top that summarises progress toward the three *Current Goals* shown on the main page: • *LATAM expansion* • *Enterprise push* • *Employee engagement* (All three phrases must appear in the callout text.) 2. Four separate **section headings** (any heading level) – one for each department (**Product**, **Marketing**, **Sales**, **Human Resources**) – placed below the callout. Under each heading list that department’s objectives in a progress-tracking format (e.g. to-dos, check-box list). Each objective from the **Company Goals** page must appear at least once. 3. Add a **database** named **Action Items** with the following properties *exactly*: • **Task Name** – title • **Department** – select (options: Product, Marketing, Sales, HR) • **Priority** – select (options: High, Medium, Low) • **Status** – status Populate this database with **≥ 5** action-item pages derived from the departmental objectives, making sure every field in each entry is filled: • **Task Name** & **Department** must correctly correspond to the underlying objective/department. • **Priority** and **Status** can be any allowed value, but they must **not** be left empty. 4. Keep the overall visual style consistent with the existing wiki (use headings, dividers, etc.). 
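A minimal sketch, assuming the dashboard page and the Action Items database have already been created, of how the summary callout, one department section, and one action item could be added with the `notion-client` SDK. The page and database IDs, the objective text, and the status option name are hypothetical placeholders; in particular, the status value must match an option that actually exists on the Status property.

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")            # placeholder token
DASHBOARD_PAGE_ID = "<q4-dashboard-page-id>"      # hypothetical
ACTION_ITEMS_DB_ID = "<action-items-db-id>"       # hypothetical

# Summary callout naming all three Current Goals, one department heading,
# and one objective rendered as a check-box item.
notion.blocks.children.append(
    block_id=DASHBOARD_PAGE_ID,
    children=[
        {
            "object": "block",
            "type": "callout",
            "callout": {
                "icon": {"type": "emoji", "emoji": "📊"},
                "rich_text": [{"type": "text", "text": {"content": (
                    "Q4 progress: LATAM expansion on track, Enterprise push underway, "
                    "Employee engagement improving."
                )}}],
            },
        },
        {
            "object": "block",
            "type": "heading_2",
            "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Product"}}]},
        },
        {
            "object": "block",
            "type": "to_do",
            "to_do": {
                "rich_text": [{"type": "text", "text": {"content": "Example product objective"}}],  # illustrative
                "checked": False,
            },
        },
    ],
)

# One of the five or more action items; Priority and Status must not be empty.
notion.pages.create(
    parent={"database_id": ACTION_ITEMS_DB_ID},
    properties={
        "Task Name": {"title": [{"text": {"content": "Example product objective"}}]},
        "Department": {"select": {"name": "Product"}},
        "Priority": {"select": {"name": "High"}},
        "Status": {"status": {"name": "In progress"}},
    },
)
```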
================================================ FILE: tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/meta.json ================================================ { "task_id": "quarterly_review_dashboard", "task_name": "Quarterly Review Dashboard", "category_id": "company_in_a_box", "category_name": "Company In A Box", "description": "Create a quarterly business review dashboard in Notion based on the existing Company In A Box workspace with department objectives and action items database.", "author": "Zijian Wu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "database manipulation", "data aggregation", "report generation", "status tracking", "template population" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Company-In-A-Box-23d81626b6d7800098f3d0e64a706cd8", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/company-in-a-box" } } ================================================ FILE: tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/verify.py ================================================ import sys from typing import List from notion_client import Client from tasks.utils import notion_utils def _contains_keywords(text: str, keywords: List[str]) -> bool: lowered = text.lower() return all(kw.lower() in lowered for kw in keywords) def verify(notion: Client, main_id: str = None) -> bool: """Programmatically verify that the dashboard page and its contents meet the requirements described in description.md. """ DASHBOARD_TITLE = "Q4 2024 Business Review Dashboard" PARENT_PAGE_TITLE = "Company In A Box" CALL_OUT_KEYWORDS = ["latam", "enterprise", "employee engagement"] DEPARTMENTS = ["Product", "Marketing", "Sales", "Human Resources"] REQUIRED_DB_PROPERTIES = { "Task Name": "title", "Department": "select", "Priority": "select", "Status": "status", } PRIORITY_OPTIONS = {"High", "Medium", "Low"} # 1. Locate the dashboard page page_id = None if main_id: found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id) if found_id and obj_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, DASHBOARD_TITLE) if not page_id: print(f"Error: Page '{DASHBOARD_TITLE}' not found.", file=sys.stderr) return False # Optional: ensure it is a child of Company In A Box try: page_obj = notion.pages.retrieve(page_id=page_id) parent_id = page_obj.get("parent", {}).get("page_id") if parent_id: parent_page = notion.pages.retrieve(page_id=parent_id) parent_title_rt = ( parent_page.get("properties", {}).get("title", {}).get("title", []) ) parent_title = ( parent_title_rt[0].get("plain_text") if parent_title_rt else None ) if parent_title != PARENT_PAGE_TITLE: print( f"Error: Dashboard page is not a direct child of '{PARENT_PAGE_TITLE}'.", file=sys.stderr, ) return False except Exception: pass # parent check is best-effort only # 2. Verify callout with keywords all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) callout_ok = False for block in all_blocks: if block.get("type") == "callout": callout_text = notion_utils.get_block_plain_text(block) if _contains_keywords(callout_text, CALL_OUT_KEYWORDS): callout_ok = True break if not callout_ok: print( "Error: No callout found that includes all three Current Goal keywords (LATAM, Enterprise, Employee engagement).", file=sys.stderr, ) return False # 3. 
Verify department section headings found_depts = set() for block in all_blocks: if block.get("type") in {"heading_1", "heading_2", "heading_3"}: heading_text = notion_utils.get_block_plain_text(block) for dept in DEPARTMENTS: if dept.lower() in heading_text.lower(): found_depts.add(dept) if set(DEPARTMENTS) != found_depts: missing = set(DEPARTMENTS) - found_depts print( f"Error: Missing department headings: {', '.join(missing)}.", file=sys.stderr, ) return False # 4. Verify Action Items database exists and has correct schema db_id = notion_utils.find_database_in_block(notion, page_id, "Action Items") if not db_id: print( "Error: Database 'Action Items' not found on the dashboard.", file=sys.stderr, ) return False try: db = notion.databases.retrieve(database_id=db_id) except Exception as exc: print(f"Error: Unable to retrieve database: {exc}", file=sys.stderr) return False db_props = db.get("properties", {}) for prop_name, expected_type in REQUIRED_DB_PROPERTIES.items(): if prop_name not in db_props: print( f"Error: Property '{prop_name}' missing from database.", file=sys.stderr ) return False actual_type = db_props[prop_name]["type"] if isinstance(expected_type, list): if actual_type not in expected_type: print( f"Error: Property '{prop_name}' has type '{actual_type}', expected one of {expected_type}.", file=sys.stderr, ) return False else: if actual_type != expected_type: print( f"Error: Property '{prop_name}' has type '{actual_type}', expected '{expected_type}'.", file=sys.stderr, ) return False # Extra check for Priority options if prop_name == "Priority": options = {opt["name"] for opt in db_props[prop_name]["select"]["options"]} if not PRIORITY_OPTIONS.issubset(options): print( f"Error: Priority property options must include High/Medium/Low. Current options: {options}", file=sys.stderr, ) return False # 5. Verify at least 5 action items exist try: pages = notion.databases.query(database_id=db_id).get("results", []) except Exception as exc: print(f"Error querying database pages: {exc}", file=sys.stderr) return False if len(pages) < 5: print("Error: Database contains fewer than 5 action items.", file=sys.stderr) return False # Optional: Verify Department values valid for page in pages: props = page.get("properties", {}) # Task Name must be non-empty title_rt = props.get("Task Name", {}).get("title", []) task_name = title_rt[0].get("plain_text") if title_rt else "" if not task_name.strip(): print( f"Error: Action item '{page.get('id')}' is missing a Task Name.", file=sys.stderr, ) return False # Department must be valid dept_select = props.get("Department", {}).get("select", {}).get("name") if not dept_select or dept_select not in DEPARTMENTS: print( f"Error: Action item '{page.get('id')}' has invalid or missing Department value.", file=sys.stderr, ) return False # Priority and Status must be set (any value) priority_val = props.get("Priority", {}).get("select", {}).get("name") status_val = props.get("Status", {}).get("status", {}).get("name") if not priority_val or not status_val: print( f"Error: Action item '{page.get('id')}' must have both Priority and Status set.", file=sys.stderr, ) return False print( "Success: Verified Business Review Dashboard, departmental sections, callout, and Action Items database with ≥5 entries." 
) return True def main(): notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/computer_science_student_dashboard/code_snippets_go/description.md ================================================ Find the page named "Computer Science Student Dashboard" and add a new Go column to the "Code Snippets" section. **Task Requirements:** 1. In the "Code Snippets" section, create (or locate) a column dedicated to the Go programming language. **This column must appear between the existing Python and JavaScript columns** within the same column list. 2. At the top of the Go column, add a bold paragraph that contains exactly the text `Go`. 3. Under the header paragraph, add three code-block blocks configured with `language` set to **go**: a. **Basic Go program** – Caption must be `Basic Go program` and the code content must be exactly: ```go package main import "fmt" func main() { fmt.Println("Hello, World!") } ``` b. **For loop in Go** – Caption must be `For loop in Go` and the code content must be exactly: ```go for i := 0; i < 5; i++ { fmt.Println(i) } ``` c. **Function definition in Go** – Caption must be `Function definition in Go` and the code content must be exactly: ```go func add(a, b int) int { return a + b } ``` ================================================ FILE: tasks/notion/standard/computer_science_student_dashboard/code_snippets_go/meta.json ================================================ { "task_id": "code_snippets_go", "task_name": "Code Snippets Go", "category_id": "computer_science_student_dashboard", "category_name": "Computer Science Student Dashboard", "description": "Add a new Go column to the Code Snippets section between Python and JavaScript columns.", "author": "Zijian Wu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "content organization", "visual formatting", "template population" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard" } } ================================================ FILE: tasks/notion/standard/computer_science_student_dashboard/code_snippets_go/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils # Expected code blocks (language=go) EXPECTED_CODE_BLOCKS = [ { "caption": "Basic Go program", "code": ( 'package main\n\nimport "fmt"\n\nfunc main() {\n fmt.Println("Hello, World!")\n}' ), }, { "caption": "For loop in Go", "code": ("for i := 0; i < 5; i++ {\n fmt.Println(i)\n}"), }, { "caption": "Function definition in Go", "code": ("func add(a, b int) int {\n return a + b\n}"), }, ] HEADER_TEXT = "Go" def _normalize(text: str) -> str: """Remove trailing spaces on each line and strip leading/trailing blank lines.""" return "\n".join(line.rstrip() for line in text.strip().splitlines()) def _find_page(notion: Client, main_id: str | None) -> str | None: """Return a page_id to verify against or None if not found.""" page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = 
notion_utils.find_page(notion, "Computer Science Student Dashboard") return page_id def _has_bold_header_text(block, text: str) -> bool: """Generic bold header/paragraph check for a given text.""" block_type = block.get("type") if block_type not in {"paragraph", "heading_1", "heading_2", "heading_3"}: return False rich_text_list = block.get(block_type, {}).get("rich_text", []) if not rich_text_list: return False plain = "".join(rt.get("plain_text", "") for rt in rich_text_list).strip() if plain != text: return False return any(rt.get("annotations", {}).get("bold", False) for rt in rich_text_list) def _go_column_order_correct(notion: Client, page_id: str) -> bool: """Return True if there exists a column list where Python → Go → JavaScript order holds.""" # Gather all blocks once (flat list) to locate column_list blocks all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) column_list_ids = [ blk["id"] for blk in all_blocks if blk.get("type") == "column_list" ] for cl_id in column_list_ids: # Retrieve columns in explicit order columns = notion.blocks.children.list(block_id=cl_id).get("results", []) header_to_idx: dict[str, int] = {} for idx, col in enumerate(columns): # Recursively inspect blocks within this column col_blocks = notion_utils.get_all_blocks_recursively(notion, col["id"]) for blk in col_blocks: if _has_bold_header_text(blk, "Python"): header_to_idx.setdefault("Python", idx) elif _has_bold_header_text(blk, "Go"): header_to_idx.setdefault("Go", idx) elif _has_bold_header_text(blk, "JavaScript"): header_to_idx.setdefault("JavaScript", idx) # Short-circuit if all three found within current traversal if len(header_to_idx) == 3: break if ( "Python" in header_to_idx and "Go" in header_to_idx and "JavaScript" in header_to_idx and header_to_idx["Python"] < header_to_idx["Go"] < header_to_idx["JavaScript"] ): return True return False def _collect_code_blocks(blocks): """Return list of (code_content, caption) tuples for code blocks with language 'go'.""" collected = [] for block in blocks: if block.get("type") != "code": continue code_data = block.get("code", {}) if code_data.get("language") != "go": continue code_plain = "".join( rt.get("plain_text", "") for rt in code_data.get("rich_text", []) ) caption_plain = "".join( rt.get("plain_text", "") for rt in code_data.get("caption", []) ) collected.append((code_plain, caption_plain)) return collected def verify(notion: Client, main_id: str | None = None) -> bool: page_id = _find_page(notion, main_id) if not page_id: print("Error: Target page not found.", file=sys.stderr) return False all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) # Verify header header_ok = any(_has_bold_header_text(b, HEADER_TEXT) for b in all_blocks) if not header_ok: print("Failure: Bold header 'Go' not found.", file=sys.stderr) return False # Verify code blocks code_blocks_found = _collect_code_blocks(all_blocks) remaining = EXPECTED_CODE_BLOCKS.copy() for code, caption in code_blocks_found: norm_code = _normalize(code) for expected in remaining: if ( _normalize(expected["code"]) == norm_code and expected["caption"] == caption ): remaining.remove(expected) break if remaining: missing = ", ".join(exp["caption"] for exp in remaining) print( f"Failure: Missing or incorrect Go code blocks: {missing}", file=sys.stderr ) return False # Verify column order Python → Go → JavaScript if not _go_column_order_correct(notion, page_id): print( "Failure: Go column is not positioned between Python and JavaScript.", file=sys.stderr, ) return False 
    print(
        "Success: Verified Go column with required code blocks and correct positioning."
    )
    return True


def main():
    notion = notion_utils.get_notion_client()
    main_id = sys.argv[1] if len(sys.argv) > 1 else None
    sys.exit(0 if verify(notion, main_id) else 1)


if __name__ == "__main__":
    main()


================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/courses_internships_relation/description.md
================================================
Your goal is to connect the `Courses` and `Internship search` databases inside the **Computer Science Student Dashboard** page and populate them with sample data that can be verified automatically.

**Task Requirements:**

1. In the **Courses** database, add a new **relation** property named **Related Internships** that points to the **Internship search** database.
2. Ensure the relation is **bidirectional** by adding a relation property in the **Internship search** database named **Relevant Courses** that points back to the **Courses** database.
3. Create **exactly three** new pages in the **Courses** database with realistic computer-science course data. Each course page must include **all** of the following properties and values:
   • **Code** (text) – unique codes `CS301`, `CS302`, and `CS303` respectively
   • **Name** (text) – pick appropriate names (e.g., *Computer Networks*, *Operating Systems*, *Machine Learning*)
   • **Credit** (number) – any positive integer
   • **Status** (status) – choose from `Planned`, `In Progress`, or `Completed`
   • **Related Internships** (relation) – link to at least one internship created in step 4.
4. Create **exactly two** new pages in the **Internship search** database with complete application information. Each internship page must include **all** of the following properties and values:
   • **Company** (text) – `OpenAI` and `Google` respectively
   • **Role** (text) – `Machine Learning Intern` and `Software Engineering Intern`
   • **Status** (status) – set to `Interested`
   • **Relevant Courses** (relation) – link to one or more of the courses created in step 3.
5. Every course created in step 3 must be linked to at least one internship from step 4 **and** every internship must be linked back to at least one course.

The task is considered complete when the relation properties exist, the specified course and internship pages are present with the exact values above, and the relations correctly connect the two databases in both directions.
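A minimal sketch of these steps with `notion-client`, under the assumption that the two database IDs have already been resolved (placeholders below). Only one course and one internship are shown; the remaining entries follow the same pattern, and the synced property that `dual_property` creates on the Internship search side may still need to be renamed to **Relevant Courses**.

```python
# Illustrative only: the token and database IDs are placeholders (they could
# be resolved with notion.search or the helpers in tasks/utils).
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")
COURSES_DB = "<courses-database-id>"          # placeholder
INTERNSHIPS_DB = "<internship-search-db-id>"  # placeholder


def text(content):
    return [{"type": "text", "text": {"content": content}}]


# 1-2. Two-way relation: `dual_property` asks Notion to create the synced
# relation property on the Internship search database as well.
notion.databases.update(
    database_id=COURSES_DB,
    properties={
        "Related Internships": {
            "relation": {"database_id": INTERNSHIPS_DB, "dual_property": {}}
        }
    },
)

# 4. One internship entry (Role is the title property, Company is rich text).
internship = notion.pages.create(
    parent={"database_id": INTERNSHIPS_DB},
    properties={
        "Role": {"title": text("Machine Learning Intern")},
        "Company": {"rich_text": text("OpenAI")},
        "Status": {"status": {"name": "Interested"}},
    },
)

# 3. One course entry linked to that internship; with a dual relation the
# link is mirrored on the internship's "Relevant Courses" side automatically.
notion.pages.create(
    parent={"database_id": COURSES_DB},
    properties={
        "Name": {"title": text("Machine Learning")},
        "Code": {"rich_text": text("CS303")},
        "Credit": {"number": 3},
        "Status": {"status": {"name": "Planned"}},
        "Related Internships": {"relation": [{"id": internship["id"]}]},
    },
)
```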
================================================ FILE: tasks/notion/standard/computer_science_student_dashboard/courses_internships_relation/meta.json ================================================ { "task_id": "courses_internships_relation", "task_name": "Courses Internships Relation", "category_id": "computer_science_student_dashboard", "category_name": "Computer Science Student Dashboard", "description": "Connect the Courses and Internship search databases with bidirectional relations and populate with sample data.", "author": "Zijian Wu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "database manipulation", "cross-reference linking", "template population" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard" } } ================================================ FILE: tasks/notion/standard/computer_science_student_dashboard/courses_internships_relation/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils # --------------------------------------------------------------------------- # Constants ----------------------------------------------------------------- # --------------------------------------------------------------------------- MAIN_PAGE_TITLE = "Computer Science Student Dashboard" COURSES_DB_TITLE = "Courses" INTERNSHIP_DB_TITLE = "Internship search" COURSE_CODES = {"CS301", "CS302", "CS303"} COURSE_RELATION_NAME = "Related Internships" INTERNSHIP_RELATION_NAME = "Relevant Courses" INTERNSHIP_COMPANIES = {"OpenAI", "Google"} # --------------------------------------------------------------------------- # Helper functions ----------------------------------------------------------- # --------------------------------------------------------------------------- def _locate_main_page(notion: Client, main_id: str | None) -> str | None: """Return the page_id of the dashboard page or None if not found.""" page_id = None if main_id: found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id) if found_id and obj_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, MAIN_PAGE_TITLE) return page_id def _locate_database(notion: Client, parent_page_id: str, db_title: str) -> str | None: """Recursively search for a child database by title and return its id.""" return notion_utils.find_database_in_block(notion, parent_page_id, db_title) # --------------------------------------------------------------------------- # Verification logic --------------------------------------------------------- # --------------------------------------------------------------------------- def verify(notion: Client, main_id: str | None = None) -> bool: """Verify completion of the Courses ↔ Internship relation task.""" # ------------------------------------------------------------------ # Locate main page and databases ----------------------------------- # ------------------------------------------------------------------ page_id = _locate_main_page(notion, main_id) if not page_id: print(f"Error: Page '{MAIN_PAGE_TITLE}' not found.", file=sys.stderr) return False courses_db_id = _locate_database(notion, page_id, COURSES_DB_TITLE) internships_db_id = _locate_database(notion, page_id, INTERNSHIP_DB_TITLE) if not 
courses_db_id: print(f"Error: Database '{COURSES_DB_TITLE}' not found.", file=sys.stderr) return False if not internships_db_id: print(f"Error: Database '{INTERNSHIP_DB_TITLE}' not found.", file=sys.stderr) return False # ------------------------------------------------------------------ # Validate relation properties ------------------------------------- # ------------------------------------------------------------------ courses_db_obj = notion.databases.retrieve(database_id=courses_db_id) internships_db_obj = notion.databases.retrieve(database_id=internships_db_id) courses_props = courses_db_obj.get("properties", {}) internships_props = internships_db_obj.get("properties", {}) # Courses → Internships relation if COURSE_RELATION_NAME not in courses_props: print( f"Error: Property '{COURSE_RELATION_NAME}' missing in Courses database.", file=sys.stderr, ) return False course_rel_prop = courses_props[COURSE_RELATION_NAME] if ( course_rel_prop.get("type") != "relation" or course_rel_prop["relation"].get("database_id") != internships_db_id ): print( "Error: Courses relation property is not configured correctly.", file=sys.stderr, ) return False # Internships → Courses relation if INTERNSHIP_RELATION_NAME not in internships_props: print( f"Error: Property '{INTERNSHIP_RELATION_NAME}' missing in Internship search database.", file=sys.stderr, ) return False intern_rel_prop = internships_props[INTERNSHIP_RELATION_NAME] if ( intern_rel_prop.get("type") != "relation" or intern_rel_prop["relation"].get("database_id") != courses_db_id ): print( "Error: Internship relation property is not configured correctly.", file=sys.stderr, ) return False # ------------------------------------------------------------------ # Validate course pages -------------------------------------------- # ------------------------------------------------------------------ course_pages = notion.databases.query(database_id=courses_db_id).get("results", []) valid_course_count = 0 course_page_id_set = set() internship_ids_seen: set[str] = set() for page in course_pages: props = page.get("properties", {}) code_rts = props.get("Code", {}).get("rich_text", []) code_val = "".join(rt.get("plain_text", "") for rt in code_rts).strip() if code_val not in COURSE_CODES: continue # not one of the new course entries we care about # Check required scalar props title_rts = props.get("Name", {}).get("title", []) name_ok = bool("".join(rt.get("plain_text", "") for rt in title_rts).strip()) credits_ok = props.get("Credit", {}).get("number") is not None status_name = props.get("Status", {}).get("status", {}).get("name", "") status_allowed = {"planned", "in progress", "completed"} status_ok = status_name.lower() in status_allowed # Relation must point to at least one internship relations = props.get(COURSE_RELATION_NAME, {}).get("relation", []) if not (name_ok and credits_ok and status_ok and relations): print( f"Error: Course '{code_val}' is missing required property values or relations, or wrong values.", file=sys.stderr, ) return False # Collect IDs for further mutual check course_page_id_set.add(page["id"]) internship_ids_seen.update(rel["id"] for rel in relations) valid_course_count += 1 if valid_course_count != 3: print( f"Error: Expected exactly 3 new course pages with codes {COURSE_CODES}, found {valid_course_count}.", file=sys.stderr, ) return False # ------------------------------------------------------------------ # Validate internship pages ---------------------------------------- # 
------------------------------------------------------------------ internship_pages = notion.databases.query(database_id=internships_db_id).get( "results", [] ) valid_intern_count = 0 internship_page_ids = set() course_ids_seen_from_intern: set[str] = set() for page in internship_pages: props = page.get("properties", {}) company_rts = props.get("Company", {}).get("rich_text", []) company = "".join(rt.get("plain_text", "") for rt in company_rts).strip() if company not in INTERNSHIP_COMPANIES: continue # not one of the two new internships role_rts = props.get("Role", {}).get("title", []) role_ok = bool("".join(rt.get("plain_text", "") for rt in role_rts).strip()) status_name = props.get("Status", {}).get("status", {}).get("name", "") status_ok = status_name.lower() == "interested" relations = props.get(INTERNSHIP_RELATION_NAME, {}).get("relation", []) if not (role_ok and status_ok and relations): print( f"Error: Internship at '{company}' is missing required property values or relations, or wrong values.", file=sys.stderr, ) return False internship_page_ids.add(page["id"]) course_ids_seen_from_intern.update(rel["id"] for rel in relations) valid_intern_count += 1 if valid_intern_count != 2: print( f"Error: Expected exactly 2 new internship pages for companies {INTERNSHIP_COMPANIES}, found {valid_intern_count}.", file=sys.stderr, ) return False # ------------------------------------------------------------------ # Mutual relation consistency -------------------------------------- # ------------------------------------------------------------------ # Each relation from courses should point to one of the two internships identified if not internship_ids_seen.issubset(internship_page_ids): print( "Error: Some course relations point to pages outside the expected internships.", file=sys.stderr, ) return False # Each relation from internships should point back to the three course pages identified if not course_ids_seen_from_intern.issubset(course_page_id_set): print( "Error: Some internship relations point to pages outside the expected courses.", file=sys.stderr, ) return False print( "Success: Verified bidirectional relations, course and internship entries as required." ) return True # --------------------------------------------------------------------------- # CLI entry-point ----------------------------------------------------------- # --------------------------------------------------------------------------- def main() -> None: notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None sys.exit(0 if verify(notion, main_id) else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/description.md ================================================ Your goal is to create a new study-session entry in the **Computer Science Student Dashboard** page. 1. Locate the ☑️ Habit tracker section of the page. 2. **Insert a new date section** immediately **after the existing `2022-09-02` to-do items but *before* the divider block** that follows them. Make sure the new date has proper formatting with a date mention and bold styling like the existing dates, and all to-do items should be unchecked initially. The new section should be inserted right after the 2022-09-02 to-do items but before the divider. 3. 
Directly **beneath** this new date mention, add **exactly four unchecked to-do blocks** with the following plain text (including the leading emoji on each line): • 🧠 Review algorithms for technical interview • 📚 Study database systems chapter 7 • ⚡ Practice system design problems • 🎯 Complete data structures assignment ================================================ FILE: tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/meta.json ================================================ { "task_id": "study_session_tracker", "task_name": "Study Session Tracker", "category_id": "computer_science_student_dashboard", "category_name": "Computer Science Student Dashboard", "description": "Create a new study-session entry in the Habit tracker section with four unchecked to-do items.", "author": "Zijian Wu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "content organization", "visual formatting", "status tracking" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard" } } ================================================ FILE: tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils from typing import Dict def _normalize_string(s: str) -> str: """Replace non-breaking space with regular space for safe comparison.""" return s.replace("\xa0", " ") def verify(notion: Client, main_id: str | None = None) -> bool: """Verify that the new study-session entry for 2025-01-29 was added correctly. The script checks that: 1. A bold date-mention with start=2025-01-29 exists. 2. The mention sits after the 2022-09-02 section but before the divider that originally followed that section. 3. Exactly four specified to-do items follow the new date mention and they are all unchecked. 
""" # --------------------------------------------------------------------- # Locate the main page ------------------------------------------------- # --------------------------------------------------------------------- page_id: str | None = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "Computer Science Student Dashboard") if not page_id: print( "Error: Page 'Computer Science Student Dashboard' not found.", file=sys.stderr, ) return False # --------------------------------------------------------------------- # Fetch all blocks under the page (flattened order) -------------------- # --------------------------------------------------------------------- all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) # --------------------------------------------------------------------- # Locate reference blocks --------------------------------------------- # --------------------------------------------------------------------- TARGET_DATE = "2025-01-29" PREVIOUS_DATE = "2022-09-02" index_previous_date: int | None = None index_new_date: int | None = None index_divider_after_previous: int | None = None for idx, block in enumerate(all_blocks): # Divider detection (we care only about the first divider that appears after # the 2022-09-02 block) if block.get("type") == "divider": if index_previous_date is not None and index_divider_after_previous is None: index_divider_after_previous = idx # We only need to inspect paragraph blocks that contain a date mention if block.get("type") != "paragraph": continue rich_text_list = block["paragraph"].get("rich_text", []) for rt in rich_text_list: if ( rt.get("type") != "mention" or rt.get("mention", {}).get("type") != "date" ): continue date_start = rt["mention"]["date"].get("start") if date_start == PREVIOUS_DATE and index_previous_date is None: index_previous_date = idx if date_start == TARGET_DATE and index_new_date is None: index_new_date = idx # (1) Verify bold annotation if not rt.get("annotations", {}).get("bold", False): print( "Error: The 2025-01-29 date mention is not bold.", file=sys.stderr, ) return False # Ensure all reference indices were found if index_previous_date is None: print("Error: Could not locate the 2022-09-02 date section.", file=sys.stderr) return False if index_divider_after_previous is None: print( "Error: Could not locate the divider that follows the 2022-09-02 section.", file=sys.stderr, ) return False if index_new_date is None: print( "Error: Could not locate the new 2025-01-29 date mention.", file=sys.stderr ) return False # (2) Verify ordering if not (index_previous_date < index_new_date < index_divider_after_previous): print( "Error: The 2025-01-29 section is positioned incorrectly.", file=sys.stderr ) return False # --------------------------------------------------------------------- # Verify to-do items under the new date section ------------------------ # --------------------------------------------------------------------- expected_texts = [ "🧠 Review algorithms for technical interview", "📚 Study database systems chapter 7", "⚡ Practice system design problems", "🎯 Complete data structures assignment", ] expected_todos: Dict[str, bool] = { _normalize_string(t): False for t in expected_texts } # Look through the blocks that lie between the new date mention and the divider for block in all_blocks[index_new_date + 1 : 
index_divider_after_previous]:
        if block.get("type") != "to_do":
            # Any non to-do block inside this range indicates mis-placement.
            # We simply ignore it – correctness is determined by presence of required to-dos.
            continue
        plain_text = notion_utils.get_block_plain_text(block).strip()
        plain_text_norm = _normalize_string(plain_text)
        if plain_text_norm in expected_todos:
            # (3a) Verify the to-do is unchecked
            if block["to_do"].get("checked", False):
                print(f"Error: To-do '{plain_text}' is checked.", file=sys.stderr)
                return False
            expected_todos[plain_text_norm] = True

    missing_items = [text for text, found in expected_todos.items() if not found]
    if missing_items:
        print(f"Error: Missing to-do items: {missing_items}", file=sys.stderr)
        return False

    # ---------------------------------------------------------------------
    # Success --------------------------------------------------------------
    # ---------------------------------------------------------------------
    print("Success: Study session for 2025-01-29 added correctly.")
    return True


# -------------------------------------------------------------------------
# Command-line entry-point -------------------------------------------------
# -------------------------------------------------------------------------
def main() -> None:
    notion = notion_utils.get_notion_client()
    main_id = sys.argv[1] if len(sys.argv) > 1 else None
    if verify(notion, main_id):
        sys.exit(0)
    else:
        sys.exit(1)


if __name__ == "__main__":
    main()


================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/asset_retirement_migration/description.md
================================================
Please restructure the **IT Inventory** database as described below. Your work will be checked by an automated script, so follow every detail exactly.

---

Task Steps

1. Inside the **IT Trouble Shooting Hub** page, locate the database named **IT Inventory**.
2. Query this database and collect every page whose **Status** property is **Expired** or **To be returned**.
3. Create a **new full-page database** directly under the same IT Trouble Shooting Hub page called **IT Asset Retirement Queue**.
4. Configure this new database so that it contains **exactly** the following properties (spellings and types must match):
   • Serial – title
   • Tags – multi_select
   • Status – select
   • Vendor – select
   • Expiration date – date
   • Retirement Reason – select with option set { **Expired License**, **Hardware Obsolete**, **Security Risk**, **User Offboarding** }
5. For every inventory item gathered in step 2:
   a. Create a corresponding page in **IT Asset Retirement Queue** and copy over the values of the Serial, Tags, Status, Vendor and Expiration date properties.
   b. Set **Retirement Reason** to one of the four options above (choose the most appropriate).
   c. Archive the original inventory page **after** the new page has been created.
6. After all items are migrated:
   a. Update the **description** of the **IT Asset Retirement Queue** database so it is **exactly** `AUTO-GENERATED MIGRATION COMPLETED` (no additional text).
   b. Create a new page under **IT Trouble Shooting Hub** titled **Retirement Migration Log**. Inside this page, add a **callout block** whose text follows the exact pattern: `Successfully migrated <N> assets to the retirement queue on 2025-03-24.`
      • `<N>` is the total number of items moved.
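The migration maps onto standard Notion API calls. The sketch below is a rough outline under stated assumptions (placeholder IDs, `Serial` assumed to be the inventory title property, and the inventory property types assumed to match the new schema), not the reference implementation.

```python
# Rough outline only: placeholder IDs; `Serial` assumed to be the inventory
# title property; per-item Retirement Reason shown as a fixed example.
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")
INVENTORY_DB = "<it-inventory-db-id>"           # placeholder
RETIREMENT_DB = "<retirement-queue-db-id>"      # placeholder (created in steps 3-4)
HUB_PAGE = "<it-trouble-shooting-hub-page-id>"  # placeholder


def text(content):
    return [{"type": "text", "text": {"content": content}}]


# 2. Collect every item whose Status is Expired or To be returned.
to_migrate = notion.databases.query(
    database_id=INVENTORY_DB,
    filter={
        "or": [
            {"property": "Status", "select": {"equals": "Expired"}},
            {"property": "Status", "select": {"equals": "To be returned"}},
        ]
    },
)["results"]

# 5. Copy each item into the retirement queue, then archive the original.
for item in to_migrate:
    p = item["properties"]
    notion.pages.create(
        parent={"database_id": RETIREMENT_DB},
        properties={
            "Serial": {"title": p["Serial"]["title"]},
            "Tags": {
                "multi_select": [{"name": t["name"]} for t in p["Tags"]["multi_select"]]
            },
            "Status": {"select": {"name": p["Status"]["select"]["name"]}},
            "Vendor": {"select": {"name": p["Vendor"]["select"]["name"]}},
            "Expiration date": {"date": p["Expiration date"]["date"]},
            "Retirement Reason": {"select": {"name": "Expired License"}},  # choose per item
        },
    )
    notion.pages.update(page_id=item["id"], archived=True)

# 6a. Exact database description.
notion.databases.update(
    database_id=RETIREMENT_DB,
    description=text("AUTO-GENERATED MIGRATION COMPLETED"),
)

# 6b. Migration log page with the required callout.
notion.pages.create(
    parent={"page_id": HUB_PAGE},
    properties={"title": {"title": text("Retirement Migration Log")}},
    children=[
        {
            "type": "callout",
            "callout": {
                "rich_text": text(
                    f"Successfully migrated {len(to_migrate)} assets "
                    "to the retirement queue on 2025-03-24."
                )
            },
        }
    ],
)
```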
================================================ FILE: tasks/notion/standard/it_trouble_shooting_hub/asset_retirement_migration/meta.json ================================================ { "task_id": "asset_retirement_migration", "task_name": "Asset Retirement Migration", "category_id": "it_trouble_shooting_hub", "category_name": "IT Trouble Shooting Hub", "description": "Restructure the IT Inventory database by migrating expired assets to a new IT Asset Retirement Queue database.", "author": "Zijian Wu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "database manipulation", "automated migration", "conditional filtering", "data aggregation", "report generation" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub" } } ================================================ FILE: tasks/notion/standard/it_trouble_shooting_hub/asset_retirement_migration/verify.py ================================================ import sys from typing import Dict, Set from notion_client import Client from tasks.utils import notion_utils def _get_database(root_page_id: str, notion: Client, name: str) -> str | None: """Helper that finds a child database by title inside a page.""" return notion_utils.find_database_in_block(notion, root_page_id, name) def _check_property(props: Dict, name: str, expected_type: str) -> bool: if name not in props: print(f"Error: Property '{name}' missing in database.", file=sys.stderr) return False if props[name]["type"] != expected_type: print( f"Error: Property '{name}' expected type '{expected_type}', found '{props[name]['type']}'.", file=sys.stderr, ) return False return True def verify(notion: Client, main_id: str | None = None) -> bool: """Verifies that the IT Asset Retirement Queue was created and populated correctly.""" # ------------------------------------------------------------------------- # Resolve the root IT Trouble Shooting Hub page # ------------------------------------------------------------------------- root_page_id = None if main_id: found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id) if found_id and obj_type == "page": root_page_id = found_id if not root_page_id: root_page_id = notion_utils.find_page(notion, "IT Trouble Shooting Hub") if not root_page_id: print( "Error: Could not locate the 'IT Trouble Shooting Hub' page.", file=sys.stderr, ) return False # ------------------------------------------------------------------------- # Locate the original and new databases # ------------------------------------------------------------------------- inventory_db_id = _get_database(root_page_id, notion, "IT Inventory") if not inventory_db_id: print("Error: 'IT Inventory' database not found.", file=sys.stderr) return False retirement_db_id = _get_database(root_page_id, notion, "IT Asset Retirement Queue") if not retirement_db_id: print("Error: 'IT Asset Retirement Queue' database not found.", file=sys.stderr) return False # ------------------------------------------------------------------------- # Validate schema of the retirement queue database # ------------------------------------------------------------------------- retirement_db = notion.databases.retrieve(database_id=retirement_db_id) r_props = retirement_db["properties"] required_schema = { "Serial": "title", "Tags": "multi_select", "Status": "select", 
"Vendor": "select", "Expiration date": "date", "Retirement Reason": "select", } for pname, ptype in required_schema.items(): if not _check_property(r_props, pname, ptype): return False # Check Retirement Reason options expected_reason_options: Set[str] = { "Expired License", "Hardware Obsolete", "Security Risk", "User Offboarding", } actual_options = { opt["name"] for opt in r_props["Retirement Reason"]["select"]["options"] } if actual_options != expected_reason_options: print( "Error: 'Retirement Reason' select options mismatch.\n" f"Expected: {sorted(expected_reason_options)}\n" f"Found: {sorted(actual_options)}", file=sys.stderr, ) return False # --------------------------------------------------------------- # Validate database description starts with required phrase # --------------------------------------------------------------- desc_rich = retirement_db.get("description", []) desc_text = "".join([t.get("plain_text", "") for t in desc_rich]) required_desc = "AUTO-GENERATED MIGRATION COMPLETED" if desc_text.strip() != required_desc: print( f"Error: Retirement database description must be exactly '{required_desc}'.", file=sys.stderr, ) return False # ------------------------------------------------------------------------- # Validate that inventory items are moved & archived # ------------------------------------------------------------------------- expired_filter = { "property": "Status", "select": {"equals": "Expired"}, } to_return_filter = { "property": "Status", "select": {"equals": "To be returned"}, } compound_filter = {"or": [expired_filter, to_return_filter]} # Query for any *active* items that still match these statuses remaining_items = notion.databases.query( database_id=inventory_db_id, filter=compound_filter, archived=False, ).get("results", []) if remaining_items: print( f"Error: {len(remaining_items)} 'Expired' / 'To be returned' items still present in IT Inventory.", file=sys.stderr, ) return False # There should be at least one entry in the retirement queue retirement_pages = notion.databases.query(database_id=retirement_db_id).get( "results", [] ) expected_serials = {"65XYQ/GB", "36x10PIQ"} if len(retirement_pages) != len(expected_serials): print( f"Error: Expected {len(expected_serials)} retirement pages, found {len(retirement_pages)}.", file=sys.stderr, ) return False # Each retirement page must have a Retirement Reason serials_seen = set() for page in retirement_pages: props = page["properties"] reason = props.get("Retirement Reason", {}).get("select", {}) if not reason or reason.get("name") not in expected_reason_options: print( f"Error: Page {page['id']} missing valid 'Retirement Reason'.", file=sys.stderr, ) return False # Collect Serial title title_rich = props.get("Serial", {}).get("title", []) serial_val = "".join([t.get("plain_text", "") for t in title_rich]).strip() serials_seen.add(serial_val) if serials_seen != expected_serials: print( f"Error: Serial values mismatch. 
Expected {sorted(expected_serials)}, found {sorted(serials_seen)}.", file=sys.stderr, ) return False # ----------------------------------------------------------------- # Verify the migration log page and callout block contents # ----------------------------------------------------------------- log_page_title = "Retirement Migration Log" log_page_id = notion_utils.find_page(notion, log_page_title) if not log_page_id: print(f"Error: Page '{log_page_title}' not found.", file=sys.stderr) return False # Search for a callout block with required pattern import re callout_pattern = re.compile( r"Successfully migrated (\d+) assets to the retirement queue on 2025-03-24\." ) blocks = notion_utils.get_all_blocks_recursively(notion, log_page_id) match_found = False for blk in blocks: if blk.get("type") == "callout": text = notion_utils.get_block_plain_text(blk) m = callout_pattern.search(text) if m: migrated_num = int(m.group(1)) if migrated_num == len(expected_serials): match_found = True else: print( f"Error: Callout reports {migrated_num} assets, but {len(retirement_pages)} retirement pages found.", file=sys.stderr, ) return False break if not match_found: print( "Error: Required callout block not found in migration log page.", file=sys.stderr, ) return False print("Success: All verification criteria satisfied.") return True def main(): notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/it_trouble_shooting_hub/security_audit_ticket/description.md ================================================ Please help me create a comprehensive security audit ticket based on the data already stored in the **IT Trouble Shooting Hub** page. Your automation should: 1. In the **IT Inventory** database, find every item whose **Expiration date** is **before 2023-07-15**. 2. In the **IT FAQs** database, look up any FAQ entries that have the **"Security"** tag. 3. **Create a new page** inside the **IT Requests** database with **exact title**: `Quarterly Security Audit - Expired Assets Review` 4. Set its **Priority** property to **High**. 5. Set its **Due** property to **2023-06-22**. 6. In the page body, add a bullet-list block that enumerates **each expired inventory item**. **Each bullet item must follow this exact text format (including the dashes):** `<Serial> - <Tag> - <Recommendation>` • `<Serial>` is the item’s Serial value. • `<Tag>` is the first tag assigned to the inventory item (e.g., "Laptop"). • `<Recommendation>` is a brief action you suggest based on the security FAQ entry (any text is acceptable). 
Example (do **not** copy): `ABC123 - Laptop - Renew warranty and enable disk encryption` ================================================ FILE: tasks/notion/standard/it_trouble_shooting_hub/security_audit_ticket/meta.json ================================================ { "task_id": "security_audit_ticket", "task_name": "Security Audit Ticket", "category_id": "it_trouble_shooting_hub", "category_name": "IT Trouble Shooting Hub", "description": "Create a comprehensive security audit ticket based on expired inventory items and security FAQ entries.", "author": "Zijian Wu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "conditional filtering", "database manipulation", "data aggregation", "report generation" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub" } } ================================================ FILE: tasks/notion/standard/it_trouble_shooting_hub/security_audit_ticket/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils import re def _get_title_text(page_properties: dict) -> str: """Extract the plain text of the first title property from a page.""" for prop in page_properties.values(): if prop.get("type") == "title": title_rich = prop.get("title", []) if title_rich: return title_rich[0].get("plain_text") return "" def verify(notion: Client, main_id: str | None = None) -> bool: """Verify that the automation created the expected security audit ticket.""" # ---------------------------------------------------------------------------------- # Locate the root page (IT Trouble Shooting Hub) either via main_id or by title. # ---------------------------------------------------------------------------------- root_page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": root_page_id = found_id if not root_page_id: root_page_id = notion_utils.find_page(notion, "IT Trouble Shooting Hub") if not root_page_id: print( "Error: Could not locate the 'IT Trouble Shooting Hub' page.", file=sys.stderr, ) return False # ---------------------------------------------------------------------------------- # Find the IT Requests database under the root page. # ---------------------------------------------------------------------------------- requests_db_id = notion_utils.find_database_in_block( notion, root_page_id, "IT Requests" ) if not requests_db_id: print( "Error: 'IT Requests' database not found in the workspace.", file=sys.stderr ) return False # ---------------------------------------------------------------------------------- # Search for the expected ticket inside the IT Requests database. 
# ---------------------------------------------------------------------------------- expected_title = "Quarterly Security Audit - Expired Assets Review" results = notion.databases.query(database_id=requests_db_id).get("results", []) target_page = None for page in results: title_text = _get_title_text(page.get("properties", {})) if title_text == expected_title: target_page = page break if not target_page: print( f"Failure: Ticket with title '{expected_title}' was not found in 'IT Requests' database.", file=sys.stderr, ) return False props = target_page.get("properties", {}) # ---------------------------------------------------------------------------------- # Validate Priority property. # ---------------------------------------------------------------------------------- priority_value = props.get("Priority", {}).get("select", {}).get("name") if priority_value != "High": print( f"Failure: Expected Priority 'High', found '{priority_value}'.", file=sys.stderr, ) return False # ---------------------------------------------------------------------------------- # Validate Due date property. # ---------------------------------------------------------------------------------- due_date_start = props.get("Due", {}).get("date", {}).get("start") expected_due_iso = "2023-06-22" if not due_date_start or not due_date_start.startswith(expected_due_iso): print( f"Failure: Expected Due date '{expected_due_iso}', found '{due_date_start}'.", file=sys.stderr, ) return False # ---------------------------------------------------------------------------------- # Validate the bulleted list contains the correct expired items in required format. # ---------------------------------------------------------------------------------- page_id = target_page["id"] blocks = notion.blocks.children.list(block_id=page_id).get("results", []) bullet_texts = [ notion_utils.get_block_plain_text(b) for b in blocks if b.get("type") == "bulleted_list_item" ] expected_items = { "192371-8910/54": "Computer Accessory", "32x11PIP": "Computer Accessory", "76x87PCY": "Laptop", "36x10PIQ": "Computer Accessory", "65XYQ/GB": "License", } if len(bullet_texts) != len(expected_items): print( f"Failure: Expected {len(expected_items)} bullet items, found {len(bullet_texts)}.", file=sys.stderr, ) return False bullet_pattern = re.compile(r"^\s*(.*?)\s+-\s+(.*?)\s+-\s+(.+?)\s*$") matched = set() for text in bullet_texts: m = bullet_pattern.match(text) if not m: print( f"Failure: Bullet item '{text}' does not follow '<Serial> - <Tag> - <Recommendation>' format.", file=sys.stderr, ) return False serial, tag, advice = m.group(1).strip(), m.group(2).strip(), m.group(3).strip() if serial not in expected_items: print( f"Failure: Unexpected Serial '{serial}' found in bullet list.", file=sys.stderr, ) return False if expected_items[serial] != tag: print( f"Failure: Serial '{serial}' expected tag '{expected_items[serial]}', found '{tag}'.", file=sys.stderr, ) return False if not advice: print( f"Failure: Bullet item for Serial '{serial}' is missing a recommendation/advice.", file=sys.stderr, ) return False matched.add(serial) if len(matched) != len(expected_items): missing = set(expected_items.keys()) - matched print( f"Failure: Missing bullet items for serials: {', '.join(missing)}.", file=sys.stderr, ) return False print("Success: All verification criteria satisfied.") return True def main(): notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == 
"__main__": main() ================================================ FILE: tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/description.md ================================================ **Task Overview** My IT knowledge base contains pages whose verification status has expired: **Task Requirements** 1. Locate the database named **"IT Homepage"** inside the main page **"It Trouble Shooting Hub"**. 2. Within that database, find every page (except for **"It Inventory"**) where the **Verification** property state contains `expired`. 3. For **each** expired page: • Insert a **callout block** at the very top (as the first child block) whose rich-text content is: `VERIFICATION EXPIRED - This page needs review and re-verification` • Set the callout’s icon to ⚠️. • Set the callout’s colour to `red_background`. 4. Create a new entry in the **"IT Requests"** database with: • Title (property **Task name**) **exactly** `Batch Verification Update Required`. • **Priority** set to `High`. • **Status** set to `In progress`. • In the page body add a **bulleted list** where each bullet is a **mention** of the page processed in step 3 (i.e., use the Notion mention object linking to that page). ================================================ FILE: tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/meta.json ================================================ { "task_id": "verification_expired_update", "task_name": "Verification Expired Update", "category_id": "it_trouble_shooting_hub", "category_name": "IT Trouble Shooting Hub", "description": "Update pages with expired verification status by adding warning callouts and creating a batch update request.", "author": "Zijian Wu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "conditional filtering", "visual formatting", "database manipulation", "cross-reference linking", "status tracking" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub" } } ================================================ FILE: tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils CALL_OUT_TEXT = "VERIFICATION EXPIRED - This page needs review and re-verification" CALL_OUT_ICON = "⚠️" CALL_OUT_COLOR = "red_background" IT_HOMEPAGE_DB_TITLE = "IT Homepage" IT_REQUESTS_DB_TITLE = "IT Requests" REQUEST_TITLE = "Batch Verification Update Required" PRIORITY_HIGH = "High" STATUS_IN_PROGRESS = "In progress" def _get_main_page_id(notion: Client, main_id: str | None) -> str | None: """Resolve the main page id starting from CLI arg or by title search.""" if main_id: found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id) if found_id and obj_type == "page": return found_id # Fallback to title search (case-insensitive) return notion_utils.find_page(notion, "It Trouble Shooting Hub") def _fetch_database_id( notion: Client, parent_page_id: str, db_title: str ) -> str | None: """Locate a child database by title inside a given page.""" return notion_utils.find_database_in_block(notion, parent_page_id, db_title) def _expired_pages(notion: Client, db_id: str) -> list[dict]: """Return list of page objects with Verification.state == 'expired'.""" # 
Query all pages (API max 100 per call). If many pages expected, iterate. results = notion.databases.query(database_id=db_id).get("results", []) expired = [] for page in results: verification_prop = page.get("properties", {}).get("Verification", {}) state = verification_prop.get("verification", {}).get("state") # Skip the IT Inventory database entry title_prop = page.get("properties", {}).get("Page", {}).get("title", []) title_text = title_prop[0].get("plain_text") if title_prop else "" if title_text.strip().lower() == "it inventory": continue if state and "expired" in state.lower(): expired.append(page) return expired def _check_callout_present(notion: Client, page_id: str) -> bool: """Verify the specified callout is the first child block of the page.""" children = notion.blocks.children.list(block_id=page_id, page_size=1).get( "results", [] ) if not children: return False first_block = children[0] if first_block.get("type") != "callout": return False data = first_block.get("callout", {}) # Check color if data.get("color") != CALL_OUT_COLOR: return False # Check icon icon = data.get("icon", {}) if icon.get("type") != "emoji" or icon.get("emoji") != CALL_OUT_ICON: return False # Check text content (callout rich text plain text) plain_text = notion_utils.get_block_plain_text(first_block) return CALL_OUT_TEXT in plain_text def _find_request_page(notion: Client, db_id: str) -> dict | None: """Find the IT Request page with the expected title.""" # Use a simple search inside database res = notion.databases.query( database_id=db_id, filter={"property": "Task name", "title": {"equals": REQUEST_TITLE}}, ).get("results", []) return res[0] if res else None def _check_request_properties(page: dict) -> bool: props = page.get("properties", {}) priority = props.get("Priority", {}).get("select", {}).get("name") status = ( props.get("Status", {}).get("status", {}).get("name") if props.get("Status", {}).get("status") else props.get("Status", {}).get("select", {}).get("name") ) return priority == PRIORITY_HIGH and status == STATUS_IN_PROGRESS def _request_page_contains_mentions( notion: Client, request_page_id: str, expected_page_ids: list[str] ) -> bool: children = notion.blocks.children.list(block_id=request_page_id, page_size=100).get( "results", [] ) bullet_blocks = [b for b in children if b.get("type") == "bulleted_list_item"] mentioned_ids: set[str] = set() for block in bullet_blocks: rich_text = block.get("bulleted_list_item", {}).get("rich_text", []) for rt in rich_text: if rt.get("type") == "mention": mention = rt.get("mention", {}) if mention.get("type") == "page": mentioned_ids.add(mention.get("page", {}).get("id")) if len(mentioned_ids) < len(expected_page_ids): return False return all(pid in mentioned_ids for pid in expected_page_ids) def verify(notion: Client, main_id: str | None = None) -> bool: main_page_id = _get_main_page_id(notion, main_id) if not main_page_id: print( "Error: Could not locate the main page 'It Trouble Shooting Hub'.", file=sys.stderr, ) return False # Locate required databases it_home_db_id = _fetch_database_id(notion, main_page_id, IT_HOMEPAGE_DB_TITLE) it_req_db_id = _fetch_database_id(notion, main_page_id, IT_REQUESTS_DB_TITLE) if not all([it_home_db_id, it_req_db_id]): print( "Error: Required databases not found under the main page.", file=sys.stderr ) return False # Identify expired pages expired_pages = _expired_pages(notion, it_home_db_id) if not expired_pages: print( "Failure: No expired pages found; expected at least one for this task.", file=sys.stderr, ) return 
False

    # Verify callout on each expired page
    for pg in expired_pages:
        pid = pg["id"]
        if not _check_callout_present(notion, pid):
            print(
                f"Failure: Callout missing or incorrect on page {pid}.", file=sys.stderr
            )
            return False

    # Verify IT Request entry
    request_page = _find_request_page(notion, it_req_db_id)
    if not request_page:
        print(
            "Failure: IT Request 'Batch Verification Update Required' not found.",
            file=sys.stderr,
        )
        return False

    if not _check_request_properties(request_page):
        print("Failure: Priority or Status incorrect on IT Request.", file=sys.stderr)
        return False

    # Verify bullet list in IT Request body
    expired_titles = []
    for p in expired_pages:
        title_prop = p.get("properties", {}).get("Page", {}).get("title", [])
        title_text = title_prop[0].get("plain_text") if title_prop else None
        if title_text:
            expired_titles.append(title_text)

    expected_page_ids = [p["id"] for p in expired_pages]
    if not _request_page_contains_mentions(
        notion, request_page["id"], expected_page_ids
    ):
        print(
            "Failure: IT Request body does not contain mentions for all affected pages.",
            file=sys.stderr,
        )
        return False

    print("Success: All verification checks passed.")
    return True


def main():
    notion = notion_utils.get_notion_client()
    main_id = sys.argv[1] if len(sys.argv) > 1 else None
    if verify(notion, main_id):
        sys.exit(0)
    sys.exit(1)


if __name__ == "__main__":
    main()


================================================
FILE: tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/description.md
================================================
Create a comprehensive daily itinerary overview page to help organize my Japan travel plans. I need you to create a new page called 'Daily Itinerary Overview' as a child of the main Japan Travel Planner page.

**Task Requirements:**

1. Create a new page titled 'Daily Itinerary Overview' as a child page of the main Japan Travel Planner page
2. Query the Travel Itinerary database to retrieve all activities
3. Structure the page with the following specific format:
   - Add a heading_1 block with text "📅 Daily Itinerary Overview"
   - Add a heading_2 block with text "📊 Trip Summary"
   - Under Trip Summary, add a paragraph listing the total number of visited activities
   - Create heading_2 blocks for "🌅 Day 1", "🌆 Day 2", and "🌃 Day 3"
   - Under each day heading, list the activities scheduled for that day as a to-do list
   - Each to-do item should show: Activity Name - City (if available), for example "Osaka Castle - Osaka". Check the item off if the activity has been visited.
4. The summary paragraph must contain the exact text "Total activities visited (from Day 1 to Day 3): [NUMBER]" where [NUMBER] is the actual count.
5.
Ensure all headings use the exact emoji and text format specified above ================================================ FILE: tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/meta.json ================================================ { "task_id": "daily_itinerary_overview", "task_name": "Daily Itinerary Overview", "category_id": "japan_travel_planner", "category_name": "Japan Travel Planner", "description": "Create a comprehensive daily itinerary overview page to organize Japan travel plans with structured day-by-day activities.", "author": "Xiangyan Liu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "database manipulation", "data aggregation", "report generation", "visual formatting", "status tracking" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe" } } ================================================ FILE: tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/verify.py ================================================ import sys import re from notion_client import Client from tasks.utils import notion_utils def verify_todo_database_correspondence(all_blocks, activities_by_day, _): """ Verify that to-do items in the overview page correspond exactly to database activities. """ # Extract to-do items organized by day from the overview page todos_by_day = {"Day 1": [], "Day 2": [], "Day 3": []} current_day = None checked_todos_count = 0 for block in all_blocks: block_type = block.get("type") block_text = notion_utils.get_block_plain_text(block) # Track which day section we're in if block_type == "heading_2": if "🌅 Day 1" in block_text: current_day = "Day 1" elif "🌆 Day 2" in block_text: current_day = "Day 2" elif "🌃 Day 3" in block_text: current_day = "Day 3" else: current_day = None # Reset for non-day headings # Collect to-do items under day headings elif block_type == "to_do" and current_day: to_do_data = block.get("to_do", {}) is_checked = to_do_data.get("checked", False) if is_checked: checked_todos_count += 1 todos_by_day[current_day].append( {"text": block_text, "checked": is_checked} ) # Verify each day's activities match for day in ["Day 1", "Day 2", "Day 3"]: db_activities = activities_by_day[day] page_todos = todos_by_day[day] # Check if counts match if len(db_activities) != len(page_todos): print( f"Error: {day} activity count mismatch. 
Database has {len(db_activities)} activities, page has {len(page_todos)} to-dos.", file=sys.stderr, ) return False # Verify each database activity has corresponding to-do for db_activity in db_activities: expected_format = f"{db_activity['name']}" if db_activity["city"]: expected_format += f" - {db_activity['city']}" # Find matching to-do item matching_todo = None for todo in page_todos: if ( expected_format in todo["text"] or db_activity["name"] in todo["text"] ): matching_todo = todo break if not matching_todo: print( f"Error: {day} - Database activity '{expected_format}' not found in to-do list.", file=sys.stderr, ) return False # Verify checked status matches visited status if db_activity["visited"] != matching_todo["checked"]: status_desc = "checked" if db_activity["visited"] else "unchecked" actual_desc = "checked" if matching_todo["checked"] else "unchecked" print( f"Error: {day} - Activity '{db_activity['name']}' should be {status_desc} but is {actual_desc}.", file=sys.stderr, ) return False # Verify summary count matches checked to-dos for block in all_blocks: if block.get("type") == "paragraph": block_text = notion_utils.get_block_plain_text(block) if "Total activities visited (from Day 1 to Day 3): 8" in block_text: print( f"Success: Daily Itinerary Overview page created with correct structure. All {checked_todos_count} visited activities match database." ) return True print( f"Error: Summary shows incorrect visited activity count. Expected: {checked_todos_count} (based on checked to-do items)", file=sys.stderr, ) return False def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the Daily Itinerary Overview page has been created correctly. """ # Find the main Japan Travel Planner page page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "Japan Travel Planner") if not page_id: print("Error: Main 'Japan Travel Planner' page not found.", file=sys.stderr) return False # Find the Daily Itinerary Overview child page overview_page_id = None try: # Get all child pages of the main page response = notion.search( query="Daily Itinerary Overview", filter={"property": "object", "value": "page"}, ) for result in response.get("results", []): # Check if this page is a child of the main page parent = result.get("parent", {}) if parent.get("type") == "page_id" and parent.get("page_id") == page_id: overview_page_id = result["id"] break if not overview_page_id: # Alternative method: check page title directly for result in response.get("results", []): title_list = ( result.get("properties", {}).get("title", {}).get("title", []) ) for title_obj in title_list: if "Daily Itinerary Overview" in title_obj.get("plain_text", ""): overview_page_id = result["id"] break if overview_page_id: break except Exception as e: print( f"Error searching for Daily Itinerary Overview page: {e}", file=sys.stderr ) return False if not overview_page_id: print( "Error: 'Daily Itinerary Overview' page not found as child of main page.", file=sys.stderr, ) return False # Get all blocks from the overview page all_blocks = notion_utils.get_all_blocks_recursively(notion, overview_page_id) # Required content to verify - must appear in this exact order required_headings_sequence = [ ("📅 Daily Itinerary Overview", "heading_1"), ("📊 Trip Summary", "heading_2"), ("🌅 Day 1", "heading_2"), ("🌆 Day 2", "heading_2"), ("🌃 Day 3", "heading_2"), ] 
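# --- Editor's note (illustrative only, not part of the original verifier) ---
# Each element of all_blocks is a raw Notion block object. A heading_2 block
# that would satisfy the ("📊 Trip Summary", "heading_2") entry above looks
# roughly like this, assuming the standard Notion API response shape:
#
#     {
#         "object": "block",
#         "type": "heading_2",
#         "heading_2": {
#             "rich_text": [
#                 {
#                     "type": "text",
#                     "text": {"content": "📊 Trip Summary"},
#                     "plain_text": "📊 Trip Summary",
#                 }
#             ]
#         },
#     }
#
# notion_utils.get_block_plain_text is assumed to join the plain_text fields of
# the block's rich_text array, which is why plain substring checks on
# block_text are enough to match the emoji-prefixed headings.
# ---------------------------------------------------------------------------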
found_headings_in_order = [] found_summary = False summary_has_correct_format = False found_todo_items = False # Check each block and track heading sequence for block in all_blocks: block_text = notion_utils.get_block_plain_text(block) block_type = block.get("type") # Check for required headings in sequence for heading_text, expected_type in required_headings_sequence: if heading_text in block_text and block_type == expected_type: found_headings_in_order.append((heading_text, expected_type)) # Check for trip summary paragraph if ( block_type == "paragraph" and "Total activities visited (from Day 1 to Day 3):" in block_text ): found_summary = True # Check if the format is correct (contains a number) if re.search( r"Total activities visited \(from Day 1 to Day 3\):\s*\d+", block_text ): summary_has_correct_format = True # Check for to-do list items (activities under day headings) if block_type == "to_do": found_todo_items = True # Check if to-do items follow the format "Activity Name - City" if " - " in block_text: # Format appears to be correct (contains dash separator) pass # Verify all required headings are found in correct sequence if len(found_headings_in_order) != len(required_headings_sequence): missing_headings = [] for heading_text, heading_type in required_headings_sequence: if (heading_text, heading_type) not in found_headings_in_order: missing_headings.append(f"{heading_text} ({heading_type})") print(f"Error: Missing required headings: {missing_headings}", file=sys.stderr) return False # Verify headings appear in correct order for i, (found_heading, found_type) in enumerate(found_headings_in_order): expected_heading, expected_type = required_headings_sequence[i] if found_heading != expected_heading or found_type != expected_type: print( f"Error: Headings not in correct order. Expected '{expected_heading}' ({expected_type}) at position {i + 1}, but found '{found_heading}' ({found_type})", file=sys.stderr, ) return False # Verify trip summary exists and has correct format if not found_summary: print( "Error: Trip summary paragraph with 'Total activities visited (from Day 1 to Day 3):' not found.", file=sys.stderr, ) return False if not summary_has_correct_format: print( "Error: Trip summary does not have correct format 'Total activities visited (from Day 1 to Day 3): [NUMBER]'.", file=sys.stderr, ) return False # Verify to-do list items exist (activities should be in to-do format) if not found_todo_items: print( "Error: No to-do list items found. 
Activities should be listed as to-do items under day headings.", file=sys.stderr, ) return False # Additional verification: Check if Travel Itinerary database exists and has data try: itinerary_db_id = notion_utils.find_database_in_block( notion, page_id, "Travel Itinerary" ) if not itinerary_db_id: itinerary_db_id = notion_utils.find_database(notion, "Travel Itinerary") if itinerary_db_id: # Query the database to get all activities db_response = notion.databases.query(database_id=itinerary_db_id) db_activities = db_response.get("results", []) # Organize database activities by day activities_by_day = {"Day 1": [], "Day 2": [], "Day 3": []} visited_count = 0 for result in db_activities: properties = result.get("properties", {}) # Extract activity info activity_info = {"name": "", "city": "", "visited": False, "day": None} for prop_name, prop_value in properties.items(): prop_type = prop_value.get("type") # Get activity name (usually from title property) if prop_type == "title" and prop_value.get("title"): activity_info["name"] = prop_value["title"][0]["plain_text"] # Get city info elif "city" in prop_name.lower() and prop_type in [ "rich_text", "select", ]: if prop_type == "rich_text" and prop_value.get("rich_text"): activity_info["city"] = prop_value["rich_text"][0][ "plain_text" ] elif prop_type == "select" and prop_value.get("select"): activity_info["city"] = prop_value["select"]["name"] # Get visited status elif prop_type == "checkbox": if prop_value.get("checkbox"): activity_info["visited"] = True visited_count += 1 # Get day info elif "day" in prop_name.lower() and prop_type in [ "select", "rich_text", ]: if prop_type == "select" and prop_value.get("select"): day_value = prop_value["select"]["name"] if day_value in activities_by_day: activity_info["day"] = day_value elif prop_type == "rich_text" and prop_value.get("rich_text"): day_value = prop_value["rich_text"][0]["plain_text"] if day_value in activities_by_day: activity_info["day"] = day_value # Add to appropriate day if day is specified if activity_info["day"] and activity_info["name"]: activities_by_day[activity_info["day"]].append(activity_info) # Now verify to-do items match database activities return verify_todo_database_correspondence( all_blocks, activities_by_day, visited_count ) else: print( "Warning: Travel Itinerary database not found, using to-do items for count verification." ) # Count checked to-do items in the overview page even without database checked_todos_count = 0 for block in all_blocks: if block.get("type") == "to_do": to_do_data = block.get("to_do", {}) if to_do_data.get("checked", False): checked_todos_count += 1 # Verify the summary shows the correct visited count based on checked to-dos for block in all_blocks: if block.get("type") == "paragraph": block_text = notion_utils.get_block_plain_text(block) if f"Total activities visited: {checked_todos_count}" in block_text: print( f"Success: Daily Itinerary Overview page created with correct structure and {checked_todos_count} visited activities." ) return True print( f"Error: Summary shows incorrect visited activity count. Expected: {checked_todos_count} (based on checked to-do items)", file=sys.stderr, ) return False except Exception as e: print(f"Warning: Could not verify activity count: {e}") print("Success: Daily Itinerary Overview page created with correct structure.") return True def main(): """ Executes the verification process and exits with a status code. 
""" notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/japan_travel_planner/packing_progress_summary/description.md ================================================ I'm preparing for my Japan trip and need to organize my packing list. Please help me: **Step 1: Update Items in the Packing List Database** In the Clothes category, all items have already been packed except for the hat After this, check the `SIM Card` entry and the `Wallet` entry. **Step 2: Create Packing Progress Summary** After adding the items, create a new section in the main Japan Travel Planner page immediately after the "Packing List 💼" heading. This section should contain: 1. A paragraph block with the bold text "**Packing Progress Summary**" 2. Followed by bullet list items showing statistics for each category in the format: - "Category: X/Y packed" (where X is packed items, Y is total items), for example: "Shoes: 2/10 packed" - ... ================================================ FILE: tasks/notion/standard/japan_travel_planner/packing_progress_summary/meta.json ================================================ { "task_id": "packing_progress_summary", "task_name": "Packing Progress Summary", "category_id": "japan_travel_planner", "category_name": "Japan Travel Planner", "description": "Update packing list items and create a progress summary section showing statistics for each category.", "author": "Xiangyan Liu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "database manipulation", "data aggregation", "report generation", "status tracking" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101" } } ================================================ FILE: tasks/notion/standard/japan_travel_planner/packing_progress_summary/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that: 1. All Clothes items except hat are marked as packed 2. SIM Card and Wallet entries are checked 3. 
Packing Progress Summary section is created with statistics """ # Find the main Japan Travel Planner page page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "Japan Travel Planner") if not page_id: print("Error: Page 'Japan Travel Planner' not found.", file=sys.stderr) return False # Find the Packing List database all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) packing_list_db_id = None packing_list_heading_id = None for i, block in enumerate(all_blocks): # Find the Packing List heading if block.get("type") == "heading_2": heading_text = notion_utils.get_block_plain_text(block) if "Packing List" in heading_text and "💼" in heading_text: packing_list_heading_id = block["id"] # Look for the database after this heading for j in range(i + 1, len(all_blocks)): if all_blocks[j].get("type") == "child_database": packing_list_db_id = all_blocks[j]["id"] break break if not packing_list_db_id: print("Error: Packing List database not found.", file=sys.stderr) return False # Query the database for all items try: db_items = notion.databases.query(database_id=packing_list_db_id) # Track items for verification clothes_items = [] sim_card_found = False sim_card_packed = False wallet_found = False wallet_packed = False # Process all items for page in db_items.get("results", []): props = page.get("properties", {}) # Get item name name_prop = props.get("Name", {}) if name_prop.get("type") == "title": name = "".join( [t.get("plain_text", "") for t in name_prop.get("title", [])] ) else: continue # Get type (multi_select) type_prop = props.get("Type", {}) types = [] if type_prop.get("type") == "multi_select": types = [ opt.get("name", "") for opt in type_prop.get("multi_select", []) ] # Get packed status packed_prop = props.get("Packed", {}) packed = False if packed_prop.get("type") == "checkbox": packed = packed_prop.get("checkbox", False) # Check specific items if name == "SIM Card": sim_card_found = True sim_card_packed = packed elif name == "Wallet": wallet_found = True wallet_packed = packed # Track Clothes items if "Clothes" in types: clothes_items.append( {"name": name, "packed": packed, "is_hat": "hat" in name.lower()} ) # Verify Clothes items (all packed except hat) for item in clothes_items: if item["is_hat"]: if item["packed"]: print( "Error: Hat should not be packed but is marked as packed.", file=sys.stderr, ) return False else: if not item["packed"]: print( f"Error: Clothes item '{item['name']}' should be packed but is not.", file=sys.stderr, ) return False print("Success: All Clothes items are correctly marked (packed except hat).") # Verify SIM Card and Wallet if not sim_card_found: print("Error: SIM Card entry not found.", file=sys.stderr) return False if not sim_card_packed: print("Error: SIM Card entry is not checked (packed).", file=sys.stderr) return False if not wallet_found: print("Error: Wallet entry not found.", file=sys.stderr) return False if not wallet_packed: print("Error: Wallet entry is not checked (packed).", file=sys.stderr) return False print("Success: SIM Card and Wallet entries are checked.") except Exception as e: print(f"Error querying Packing List database: {e}", file=sys.stderr) return False # Expected ground truth statistics expected_stats = { "Clothes": {"packed": 12, "total": 13}, "Electronics": {"packed": 1, "total": 10}, "Essentials": {"packed": 1, "total": 12}, 
"Miscellaneous": {"packed": 0, "total": 10}, "Shoes": {"packed": 0, "total": 2}, "Toiletries": {"packed": 0, "total": 19}, } # Verify Packing Progress Summary section # Re-fetch blocks to get updated content all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) # Find the Packing List heading again and check blocks after it packing_heading_index = None for i, block in enumerate(all_blocks): if block.get("id") == packing_list_heading_id: packing_heading_index = i break summary_found = False statistics_verified = True found_statistics = {} if packing_heading_index is not None: # Look for summary in the next few blocks for i in range( packing_heading_index + 1, min(packing_heading_index + 15, len(all_blocks)) ): block = all_blocks[i] block_text = notion_utils.get_block_plain_text(block) # Check for "Packing Progress Summary" paragraph if "Packing Progress Summary" in block_text: summary_found = True # Check if it's bold if block.get("type") == "paragraph": rich_text_list = block.get("paragraph", {}).get("rich_text", []) for text_obj in rich_text_list: if "Packing Progress Summary" in text_obj.get("text", {}).get( "content", "" ): if not text_obj.get("annotations", {}).get("bold", False): print( "Error: 'Packing Progress Summary' text is not bold.", file=sys.stderr, ) return False # Check for statistics bullet points in format "Category: X/Y packed" if ( block.get("type") == "bulleted_list_item" and ":" in block_text and "/" in block_text and "packed" in block_text ): # Parse the statistic line # Expected format: "Category: X/Y packed" try: parts = block_text.split(":") if len(parts) >= 2: category = parts[0].strip() stats_part = parts[1].strip() # Extract X/Y from "X/Y packed" if "/" in stats_part and "packed" in stats_part: nums = stats_part.split("packed")[0].strip() if "/" in nums: x_str, y_str = nums.split("/") x = int(x_str.strip()) y = int(y_str.strip()) found_statistics[category] = {"packed": x, "total": y} except: pass # Continue if parsing fails if not summary_found: print( "Error: 'Packing Progress Summary' section not found after Packing List heading.", file=sys.stderr, ) return False if not found_statistics: print( "Error: No valid packing statistics bullet points found in format 'Category: X/Y packed'.", file=sys.stderr, ) return False # Verify the statistics match the expected values for category, stats in expected_stats.items(): if category not in found_statistics: print( f"Error: Category '{category}' missing from Packing Progress Summary.", file=sys.stderr, ) statistics_verified = False else: found = found_statistics[category] if found["packed"] != stats["packed"] or found["total"] != stats["total"]: print( f"Error: Statistics mismatch for '{category}': expected {stats['packed']}/{stats['total']} packed, found {found['packed']}/{found['total']} packed.", file=sys.stderr, ) statistics_verified = False # Check for extra categories in summary that don't exist in expected for category in found_statistics: if category not in expected_stats: print( f"Error: Unexpected category '{category}' in summary.", file=sys.stderr ) statistics_verified = False if not statistics_verified: return False print("Success: Packing Progress Summary section created with correct statistics.") # print(f"Verified statistics: {', '.join(f'{k}: {v['packed']}/{v['total']} packed' for k, v in expected_stats.items())}") return True def main(): """ Executes the verification process and exits with a status code. 
""" notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/japan_travel_planner/remove_osaka_itinerary/description.md ================================================ Go to Japan Travel Planner and remove the itinerary in OSAKA after 6 PM (excluding 6 PM) in Day 1 and Day 2. ================================================ FILE: tasks/notion/standard/japan_travel_planner/remove_osaka_itinerary/meta.json ================================================ { "task_id": "remove_osaka_itinerary", "task_name": "Remove Osaka Itinerary", "category_id": "japan_travel_planner", "category_name": "Japan Travel Planner", "description": "Remove the itinerary items in Osaka after 6 PM from Day 1 and Day 2 travel schedules.", "author": "Xiangyan Liu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "conditional filtering", "automated migration" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101" } } ================================================ FILE: tasks/notion/standard/japan_travel_planner/remove_osaka_itinerary/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def get_page_title(page_result): """Extract title from a page result""" properties = page_result.get('properties', {}) name_property = properties.get('Name', {}) if name_property.get('type') == 'title': title_array = name_property.get('title', []) if title_array and len(title_array) > 0: return title_array[0].get('plain_text', '') return '' def get_page_time(page_result): """Extract time from Notes field""" properties = page_result.get('properties', {}) notes_property = properties.get('Notes', {}) if notes_property.get('type') == 'rich_text': rich_text_array = notes_property.get('rich_text', []) if rich_text_array and len(rich_text_array) > 0: notes_text = rich_text_array[0].get('plain_text', '') return notes_text.strip() return '' def get_page_group(page_result): """Extract group/location from page""" properties = page_result.get('properties', {}) group_property = properties.get('Group', {}) if group_property.get('type') == 'select': select = group_property.get('select') if select: return select.get('name', '') return '' def get_page_day(page_result): """Extract day from page""" properties = page_result.get('properties', {}) day_property = properties.get('Day', {}) if day_property.get('type') == 'select': select = day_property.get('select') if select: return select.get('name', '') return '' def parse_time_to_minutes(time_str): """Convert time string to minutes for comparison Returns None if time cannot be parsed""" if not time_str: return None # Clean the time string time_str = time_str.strip().upper() # Remove any text after the time (e.g., "7:30 PM\n" -> "7:30 PM") time_str = time_str.split('\n')[0].strip() # Extract time components try: if 'PM' in time_str: time_part = time_str.replace('PM', '').strip() if ':' in time_part: hours, minutes = time_part.split(':') hours = int(hours) minutes = int(minutes) else: hours = int(time_part) minutes = 0 # Convert PM hours (add 12 for PM times except 12 PM) if hours != 12: hours 
+= 12 return hours * 60 + minutes elif 'AM' in time_str: time_part = time_str.replace('AM', '').strip() if ':' in time_part: hours, minutes = time_part.split(':') hours = int(hours) minutes = int(minutes) else: hours = int(time_part) minutes = 0 # Handle 12 AM (midnight) if hours == 12: hours = 0 return hours * 60 + minutes except: return None return None def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that all OSAKA events after 6PM have been removed from Day 1 and Day 2 in the Japan Travel Planner. Expected items that should be deleted (all in OSAKA, after 6PM, on Day 1 or Day 2): 1. Rikuro's Namba Main Branch - 7 PM (Day 1) 2. Shin Sekai "New World" - 8 PM (Day 2) 3. Katsudon Chiyomatsu - 7:30 PM (Day 2) 4. Ebisubashi Bridge - 9 PM (Day 1) Note: Kuromon Ichiba Market at 6 PM should NOT be deleted (it's at 6PM, not after) Items after 6PM on other days (Day 3-8) should NOT be deleted """ # Step 1: Find the main Japan Travel Planner page if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if not found_id or object_type != 'page': print("Error: Japan Travel Planner page not found.", file=sys.stderr) return False else: # Try to find the page by searching found_id = notion_utils.find_page(notion, "Japan Travel Planner") if not found_id: print("Error: Japan Travel Planner page not found.", file=sys.stderr) return False print(f"Found Japan Travel Planner page: {found_id}") # Step 2: Find the Travel Itinerary database all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id) travel_itinerary_db_id = None for block in all_blocks: if block and block.get("type") == "child_database": title = block.get("child_database", {}).get("title", "") if "Travel Itinerary" in title: travel_itinerary_db_id = block.get("id") print(f"Found Travel Itinerary database: {travel_itinerary_db_id}") break if not travel_itinerary_db_id: print("Error: Travel Itinerary database not found", file=sys.stderr) return False # Step 3: Query the database for OSAKA items on Day 1 and Day 2 try: query_result = notion.databases.query( database_id=travel_itinerary_db_id, filter={ "and": [ {"property": "Group", "select": {"equals": "Osaka"}}, {"or": [ {"property": "Day", "select": {"equals": "Day 1"}}, {"property": "Day", "select": {"equals": "Day 2"}} ]} ] } ) except Exception as e: print(f"Error querying Travel Itinerary database: {e}", file=sys.stderr) return False # Step 4: Check for items that should have been deleted six_pm_minutes = 18 * 60 # 6 PM in minutes (18:00) # Expected deleted items (4 specific items after 6 PM on Day 1 and Day 2) expected_deleted = { "Rikuro's Namba Main Branch": {"time": "7 PM", "day": "Day 1", "found": False}, "Shin Sekai \"New World\"": {"time": "8 PM", "day": "Day 2", "found": False}, "Katsudon Chiyomatsu": {"time": "7:30 PM", "day": "Day 2", "found": False}, "Ebisubashi Bridge": {"time": "9 PM", "day": "Day 1", "found": False} } # Items that should remain (at or before 6 PM) expected_remaining = { "Kuromon Ichiba Market": {"time": "6 PM", "found": False} } osaka_items_after_6pm = [] osaka_items_at_or_before_6pm = [] # Debug: Show total query results print(f"Debug: Found {len(query_result.get('results', []))} total OSAKA items on Day 1 and Day 2") # Process all OSAKA items on Day 1 and Day 2 for page in query_result.get('results', []): page_title = get_page_title(page).strip() page_time = get_page_time(page) page_group = get_page_group(page) page_day = get_page_day(page) if page_group != "Osaka": continue # Parse time to 
check if after 6 PM time_minutes = parse_time_to_minutes(page_time) if time_minutes is not None and time_minutes > six_pm_minutes: osaka_items_after_6pm.append({ "title": page_title, "time": page_time, "day": page_day, "id": page.get('id') }) # Check if this is one of the expected deleted items for expected_title, expected_info in expected_deleted.items(): # Clean up the titles for comparison clean_page_title = page_title.strip().lower() clean_expected_title = expected_title.strip().lower() # Check for "Rikuro's" or "Rikuro's" (different apostrophe types) if "rikuro" in clean_page_title and "rikuro" in clean_expected_title: title_match = True elif clean_page_title == clean_expected_title: title_match = True elif clean_expected_title in clean_page_title or clean_page_title in clean_expected_title: title_match = True else: title_match = False if title_match and page_day == expected_info["day"]: print(f"Debug: Found '{page_title}' on {page_day} at {page_time} - matches expected '{expected_title}'") expected_deleted[expected_title]["found"] = True elif time_minutes is not None and time_minutes <= six_pm_minutes: osaka_items_at_or_before_6pm.append({ "title": page_title, "time": page_time, "day": page_day, "id": page.get('id') }) # Check if this is one of the expected remaining items for expected_title in expected_remaining: if expected_title.lower() in page_title.lower() or page_title.lower() in expected_title.lower(): expected_remaining[expected_title]["found"] = True # Step 5: Verify results print(f"\nVerification Summary:") print(f"=" * 50) all_passed = True # Check that the 4 expected items after 6 PM have been deleted print("\n4 Items that should be deleted (after 6 PM on Day 1 and Day 2):") for item_name, item_info in expected_deleted.items(): if item_info["found"]: # If found = True, it means the item still exists (was not deleted) print(f"✗ {item_name} ({item_info['day']}, {item_info['time']}) - Still exists, should be deleted", file=sys.stderr) all_passed = False else: # If found = False, it means the item was deleted correctly print(f"✓ {item_name} ({item_info['day']}, {item_info['time']}) - Correctly deleted") # Check that items at or before 6 PM remain print("\nItems that should remain (at or before 6 PM on Day 1 and Day 2):") for item_name, item_info in expected_remaining.items(): if item_info["found"]: print(f"✓ {item_name} ({item_info['time']}) - Correctly retained") else: print(f"✗ {item_name} ({item_info['time']}) - Missing, should not be deleted", file=sys.stderr) all_passed = False # Report any items after 6 PM that still exist if osaka_items_after_6pm: print(f"\n✗ Found {len(osaka_items_after_6pm)} OSAKA item(s) after 6 PM on Day 1/Day 2:", file=sys.stderr) for item in osaka_items_after_6pm: print(f" - {item['title']} at {item['time']} ({item['day']})", file=sys.stderr) else: print(f"\n✓ No OSAKA items found after 6 PM on Day 1/Day 2 (all correctly deleted)") # Report count summary print(f"\nCount Summary:") print(f"- OSAKA items after 6 PM on Day 1/Day 2 found: {len(osaka_items_after_6pm)} (should be 0)") print(f"- OSAKA items at/before 6 PM on Day 1/Day 2 found: {len(osaka_items_at_or_before_6pm)}") print(f"- Expected deletions verified: {sum(1 for item in expected_deleted.values() if not item['found'])}/4") return all_passed def main(): """ Executes the verification process and exits with a status code. 
""" notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): print("\nVerification passed: All 4 required OSAKA events after 6 PM on Day 1 and Day 2 have been removed") sys.exit(0) else: print("\nVerification failed: Some OSAKA events after 6 PM on Day 1/Day 2 still exist") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/description.md ================================================ Please find the restaurants that appear in Day 1 of the Travel Itinerary database, then create corresponding entries in the Expenses database, one restaurant per entry. Set the date uniformly to Jan 1, 2025, and the cost uniformly to $120. Display the restaurant name in the Expense field. Set Category to Dining. For Comment, use the Description from the corresponding restaurant page. Leave other properties empty. ================================================ FILE: tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/meta.json ================================================ { "task_id": "restaurant_expenses_sync", "task_name": "Restaurant Expenses Sync", "category_id": "japan_travel_planner", "category_name": "Japan Travel Planner", "description": "Find restaurants from Day 1 Travel Itinerary and create corresponding entries in the Expenses database.", "author": "Xiangyan Liu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "conditional filtering", "database manipulation", "cross-reference linking", "template population" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101" } } ================================================ FILE: tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that restaurants from Day 1 of Travel Itinerary have corresponding expense entries. 
""" page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "Japan Travel Planner") if not page_id: print("Error: Page 'Japan Travel Planner' not found.", file=sys.stderr) return False # Find Travel Itinerary database itinerary_db_id = notion_utils.find_database_in_block( notion, page_id, "Travel Itinerary" ) if not itinerary_db_id: print("Error: Database 'Travel Itinerary' not found.", file=sys.stderr) return False # Find Expenses database expenses_db_id = notion_utils.find_database_in_block(notion, page_id, "Expenses") if not expenses_db_id: print("Error: Database 'Expenses' not found.", file=sys.stderr) return False # Find Japan Places to Visit database places_db_id = notion_utils.find_database_in_block( notion, page_id, "Travel Itinerary" ) if not places_db_id: print("Error: Database 'Japan Places to Visit' not found.", file=sys.stderr) return False # Query Day 1 restaurants from Travel Itinerary try: itinerary_results = notion.databases.query( database_id=itinerary_db_id, filter={ "and": [ {"property": "Day", "select": {"equals": "Day 1"}}, {"property": "Type", "multi_select": {"contains": "Food"}}, ] }, ).get("results", []) except Exception as e: print(f"Error querying Travel Itinerary database: {e}", file=sys.stderr) return False if not itinerary_results: print( "Error: No restaurants found for Day 1 in Travel Itinerary.", file=sys.stderr, ) return False # Extract restaurant names restaurant_names = [] for entry in itinerary_results: props = entry.get("properties", {}) name_prop = props.get("Name", {}) name_text = "".join(t.get("plain_text", "") for t in name_prop.get("title", [])) if name_text: restaurant_names.append(name_text.strip()) if not restaurant_names: print("Error: No restaurant names found in Day 1 entries.", file=sys.stderr) return False # Get descriptions from Japan Places to Visit database try: places_results = notion.databases.query(database_id=places_db_id).get( "results", [] ) except Exception as e: print(f"Error querying Japan Places to Visit database: {e}", file=sys.stderr) return False # Create a map of restaurant names to descriptions restaurant_descriptions = {} for place in places_results: props = place.get("properties", {}) name_prop = props.get("Name", {}) name_text = "".join(t.get("plain_text", "") for t in name_prop.get("title", [])) desc_prop = props.get("Description", {}) desc_text = "".join( t.get("plain_text", "") for t in desc_prop.get("rich_text", []) ) if name_text and desc_text: restaurant_descriptions[name_text.strip()] = desc_text.strip() # Query Expenses database try: expenses_results = notion.databases.query(database_id=expenses_db_id).get( "results", [] ) except Exception as e: print(f"Error querying Expenses database: {e}", file=sys.stderr) return False # Verify each restaurant has a corresponding expense entry verified_restaurants = [] for restaurant_name in restaurant_names: found_matching_expense = False expected_description = restaurant_descriptions.get(restaurant_name, "") for expense in expenses_results: props = expense.get("properties", {}) # Check Expense field (title) expense_prop = props.get("Expense", {}) expense_text = "".join( t.get("plain_text", "") for t in expense_prop.get("title", []) ) if expense_text.strip() != restaurant_name: continue # Check Date date_prop = props.get("Date", {}) date_start = date_prop.get("date", {}).get("start") if date_start != 
"2025-01-01": continue # Check Transaction Amount amount_prop = props.get("Transaction Amount", {}) amount = amount_prop.get("number") if amount != 120: continue # Check Category contains Dining category_prop = props.get("Category", {}) categories = [c.get("name") for c in category_prop.get("multi_select", [])] if "Dining" not in categories: continue # Check Comment matches description (if description exists) if expected_description: comment_prop = props.get("Comment", {}) comment_text = "".join( t.get("plain_text", "") for t in comment_prop.get("rich_text", []) ) if comment_text.strip().replace( "\u202f", " " ) != expected_description.replace("\u202f", " "): continue found_matching_expense = True verified_restaurants.append(restaurant_name) break if not found_matching_expense: print( f"Error: No matching expense entry found for restaurant '{restaurant_name}'.", file=sys.stderr, ) return False if len(verified_restaurants) == len(restaurant_names): print( f"Success: Found matching expense entries for all {len(restaurant_names)} Day 1 restaurants." ) return True else: print( f"Error: Only {len(verified_restaurants)} out of {len(restaurant_names)} restaurants have matching expense entries.", file=sys.stderr, ) return False def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/online_resume/layout_adjustment/description.md ================================================ Please go to my Online Resume page and adjust the Skills display with the following requirements: ## Skills Section Adjustment 1. Delete the Skills database from the right side of the page 2. Add a new Skills section on the left side, under the Languages section 3. Format skills as "[icon] skill description (type)", for example "✨✨ Photoshop (Design Tool)" - Use ✨✨ icon for skills with level >= 50% - Use ✨ icon for skills with level < 50% ## Work History and Education Layout Adjustment 1. Adjust the layout so that logo/image columns take up 50% width in each section - Note: Column width ratio might not be returned by API when columns are equal (50/50) 2. 
Replace all images/icons with black placeholder images using URL containing "https://singlecolorimage.com/get/000000/1024x128" ================================================ FILE: tasks/notion/standard/online_resume/layout_adjustment/meta.json ================================================ { "task_id": "layout_adjustment", "task_name": "Layout Adjustment", "category_id": "online_resume", "category_name": "Online Resume", "description": "This task involves modifying the layout and content of an online resume page by restructuring the Skills section with icon indicators and adjusting the Work History and Education sections to use equal column widths with placeholder images.", "author": "Xiangyan Liu", "created_at": "2025-08-14", "difficulty": "L3", "tags": [ "content organization", "visual formatting", "conditional filtering", "template population" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume" } } ================================================ FILE: tasks/notion/standard/online_resume/layout_adjustment/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the Skills display has been adjusted correctly: 1. Skills database on the right side should be deleted 2. Skills section should be added on the left side under Languages 3. Skills should be formatted with correct icons based on skill level 4. Work History and Education sections should use black placeholder images """ page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "Online Resume") if not page_id: print("Error: Page 'Online Resume' not found.", file=sys.stderr) return False all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) # Step 1: Verify Skills database is NOT in the right column anymore # Find the main column list for block in all_blocks: if block.get("type") == "column_list": column_list_id = block["id"] columns = notion_utils.get_all_blocks_recursively(notion, column_list_id) # Check if this is the main two-column layout if len(columns) == 2: # Find the right column (usually the one with larger width ratio) for column in columns: if column.get("type") == "column": width_ratio = column.get("column", {}).get("width_ratio", 0) # Right column typically has width_ratio > 0.5 if width_ratio > 0.5: right_column_id = column["id"] right_column_blocks = notion_utils.get_all_blocks_recursively( notion, right_column_id ) # Check if Skills database exists in right column for right_block in right_column_blocks: if ( right_block.get("type") == "child_database" and right_block.get("child_database", {}).get("title") == "Skills" ): print( "Error: Skills database still exists in the right column.", file=sys.stderr, ) return False # Step 2: Find the left column and verify Skills section exists there skills_section_found = False skills_with_double_sparkles = [] skills_with_single_sparkle = [] # First, find the main column_list (top-level) main_column_list_id = None for block in all_blocks: if block.get("type") == "column_list" and block.get("parent", {}).get("type") == "page_id": 
main_column_list_id = block["id"] break if not main_column_list_id: print("Error: Main column list not found.", file=sys.stderr) return False # Get the columns directly columns = notion_utils.get_all_blocks_recursively(notion, main_column_list_id) # Find the left column (the one with width_ratio around 0.25) left_column_id = None for column in columns: if column.get("type") == "column": width_ratio = column.get("column", {}).get("width_ratio", 0) # Left column has width_ratio around 0.25 if 0.2 <= width_ratio <= 0.3: left_column_id = column["id"] break if not left_column_id: print("Error: Left column not found.", file=sys.stderr) return False # Get all blocks in the left column left_column_blocks = notion_utils.get_all_blocks_recursively(notion, left_column_id) # Find Languages heading languages_index = -1 for i, left_block in enumerate(left_column_blocks): if ( left_block.get("type") == "heading_2" and "Languages" in notion_utils.get_block_plain_text(left_block) ): languages_index = i break if languages_index == -1: print("Error: Languages heading not found in left column.", file=sys.stderr) return False # Look for Skills heading after Languages for i in range(languages_index + 1, len(left_column_blocks)): left_block = left_column_blocks[i] if ( left_block.get("type") == "heading_2" and "Skills" in notion_utils.get_block_plain_text(left_block) ): skills_section_found = True # Check divider after Skills heading if i + 1 < len(left_column_blocks): next_block = left_column_blocks[i + 1] if next_block.get("type") != "divider": print( "Error: Divider not found after Skills heading.", file=sys.stderr, ) return False # Collect skills after divider for j in range(i + 2, len(left_column_blocks)): skill_block = left_column_blocks[j] if skill_block.get("type") == "paragraph": skill_text = notion_utils.get_block_plain_text(skill_block) if skill_text and skill_text.strip(): # Check for non-empty text # Check if text is bold rich_text = skill_block.get("paragraph", {}).get("rich_text", []) if rich_text and not rich_text[0].get("annotations", {}).get("bold"): print( f"Error: Skill '{skill_text}' is not bold.", file=sys.stderr, ) return False # Check icon format if skill_text.startswith("✨✨"): skills_with_double_sparkles.append(skill_text) elif skill_text.startswith("✨"): skills_with_single_sparkle.append(skill_text) else: print( f"Error: Skill '{skill_text}' doesn't start with sparkle icon.", file=sys.stderr, ) return False # Check format includes type in parentheses if "(" not in skill_text or ")" not in skill_text: print( f"Error: Skill '{skill_text}' doesn't include type in parentheses.", file=sys.stderr, ) return False elif skill_block.get("type") in ["heading_1", "heading_2", "heading_3"]: # Stop when we reach another section break break if not skills_section_found: print( "Error: Skills section not found in the left column under Languages.", file=sys.stderr, ) return False # Step 3: Verify we have the expected skills expected_double_sparkle_skills = [ "Photoshop", "Figma", "Notion", "Framer" ] expected_single_sparkle_skills = [ "Webflow", "Rive", "CSS + Basic JS" ] # Check if all expected skills are present for skill_name in expected_double_sparkle_skills: found = any(skill_name in skill for skill in skills_with_double_sparkles) if not found: print( f"Error: Expected skill '{skill_name}' with ✨✨ not found.", file=sys.stderr, ) return False for skill_name in expected_single_sparkle_skills: found = any(skill_name in skill for skill in skills_with_single_sparkle) if not found: print( f"Error: 
Expected skill '{skill_name}' with ✨ not found.", file=sys.stderr, ) return False # Step 4: Verify Work History and Education sections have black placeholder images work_history_images_found = 0 education_images_found = 0 black_placeholder_url = "https://singlecolorimage.com/get/000000/" # Find Work History and Education sections in the right column right_column_id = None for column in columns: if column.get("type") == "column": width_ratio = column.get("column", {}).get("width_ratio", 0.5) # Right column has width_ratio around 0.75 or no width_ratio (which means equal split) if width_ratio > 0.6 or width_ratio == 0.5: right_column_id = column["id"] break if right_column_id: right_column_blocks = notion_utils.get_all_blocks_recursively(notion, right_column_id) # Find Work History section work_history_index = -1 education_index = -1 for i, block in enumerate(right_column_blocks): if block.get("type") == "heading_1": heading_text = notion_utils.get_block_plain_text(block) if "Work History" in heading_text: work_history_index = i elif "Education" in heading_text: education_index = i # Check Work History column lists for images if work_history_index != -1: for i in range(work_history_index + 1, min(education_index if education_index > work_history_index else len(right_column_blocks), len(right_column_blocks))): block = right_column_blocks[i] if block.get("type") == "column_list": column_list_blocks = notion_utils.get_all_blocks_recursively(notion, block["id"]) for column in column_list_blocks: if column.get("type") == "column": # Check width_ratio - must be 50% (0.5) or absent (which defaults to 50%) col_width = column.get("column", {}).get("width_ratio") # First column should be image column (either no ratio=50%, or exactly 0.5) if col_width is None or col_width == 0.5: column_contents = notion_utils.get_all_blocks_recursively(notion, column["id"]) for content_block in column_contents: if content_block.get("type") == "embed": embed_url = content_block.get("embed", {}).get("url", "") if black_placeholder_url in embed_url: work_history_images_found += 1 elif content_block.get("type") == "image": # Also check for image blocks with external URL image_url = content_block.get("image", {}).get("external", {}).get("url", "") if black_placeholder_url in image_url: work_history_images_found += 1 break # Only check first column # Check Education column list for images if education_index != -1: for i in range(education_index + 1, len(right_column_blocks)): block = right_column_blocks[i] if block.get("type") == "heading_1": break # Stop at next section if block.get("type") == "column_list": column_list_blocks = notion_utils.get_all_blocks_recursively(notion, block["id"]) for column in column_list_blocks: if column.get("type") == "column": # Check width_ratio - must be 50% (0.5) or absent (which defaults to 50%) col_width = column.get("column", {}).get("width_ratio") # First column should be image column (either no ratio=50%, or exactly 0.5) if col_width is None or col_width == 0.5: column_contents = notion_utils.get_all_blocks_recursively(notion, column["id"]) for content_block in column_contents: if content_block.get("type") == "embed": embed_url = content_block.get("embed", {}).get("url", "") if black_placeholder_url in embed_url: education_images_found += 1 elif content_block.get("type") == "image": image_url = content_block.get("image", {}).get("external", {}).get("url", "") if black_placeholder_url in image_url: education_images_found += 1 break # Only check first column break # Only check first 
column_list in Education # Verify images were found if work_history_images_found < 2: print( f"Warning: Expected at least 2 Work History images with black placeholder, found {work_history_images_found}.", file=sys.stderr, ) return False if education_images_found < 1: print( f"Warning: Expected at least 1 Education image with black placeholder, found {education_images_found}.", file=sys.stderr, ) return False print("Success: Skills display adjusted correctly.") print(f"- Found {len(skills_with_double_sparkles)} skills with ✨✨ (skill level >= 50%)") print(f"- Found {len(skills_with_single_sparkle)} skills with ✨ (skill level < 50%)") print("- Skills database removed from right column") print("- Skills section added to left column under Languages") print(f"- Found {work_history_images_found} Work History images with black placeholder") print(f"- Found {education_images_found} Education images with black placeholder") return True def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/online_resume/projects_section_update/description.md ================================================ Find the page named "Online Resume" and reorganize the projects section to showcase only the most recent and relevant work. **Task Requirements:** 1. Delete the project named "Knitties eComm Website" from the Projects database since it's from 2022 and no longer relevant 2. Create a new project entry called "Zapier Dashboard Redesign" with: - Description: "Led the complete redesign of Zapier's main dashboard, focusing on improved usability and modern design patterns. Implemented new navigation system and responsive layouts." - Date: Start "2024-01-01", End "2024-06-30" - Tags: Add the existing "UI Design" tag, and create a new tag "Enterprise" with purple color, then add both tags to this project - Phone: Same as the phone number under the Contact section - Url: Same as the personal website under the Contact section 3. After the Projects database block, add the following blocks in sequence: - A divider block - A heading_2 block with text "Current Focus" - A paragraph block with content that dynamically references: - The highest skill level from your Skills database (find the skill with the highest Skill Level percentage) - Incorporate this into the text: "The Zapier Dashboard Redesign represents my most impactful recent work, leveraging my expertise in [highest skill name] ([skill level]%) to deliver enterprise-grade solutions that prioritize both aesthetics and functionality." 
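A minimal sketch (not part of the task materials) of how the new project entry could be created with the official `notion-client` SDK. The token, `PROJECTS_DB_ID`, and the phone/website placeholders are assumptions; also note that the purple color of the newly created "Enterprise" option generally has to be set on the database schema (e.g. via `databases.update`), not through `pages.create`:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")        # token is an assumption
PROJECTS_DB_ID = "<projects-database-id>"     # hypothetical placeholder

# Create the "Zapier Dashboard Redesign" entry with the properties listed above.
notion.pages.create(
    parent={"database_id": PROJECTS_DB_ID},
    properties={
        "Name": {"title": [{"text": {"content": "Zapier Dashboard Redesign"}}]},
        "Description": {"rich_text": [{"text": {"content": (
            "Led the complete redesign of Zapier's main dashboard, focusing on "
            "improved usability and modern design patterns. Implemented new "
            "navigation system and responsive layouts."
        )}}]},
        "Date": {"date": {"start": "2024-01-01", "end": "2024-06-30"}},
        "Tags": {"multi_select": [{"name": "UI Design"}, {"name": "Enterprise"}]},
        "Phone": {"phone_number": "<phone from the Contact section>"},
        "Url": {"url": "<personal website from the Contact section>"},
    },
)
```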
================================================ FILE: tasks/notion/standard/online_resume/projects_section_update/meta.json ================================================ { "task_id": "projects_section_update", "task_name": "Projects Section Update", "category_id": "online_resume", "category_name": "Online Resume", "description": "Reorganize the projects section by removing outdated projects and adding new relevant work with proper formatting.", "author": "Xiangyan Liu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "database manipulation", "template population", "data aggregation", "visual formatting", "cross-reference linking" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume" } } ================================================ FILE: tasks/notion/standard/online_resume/projects_section_update/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the projects section has been reorganized correctly with cross-section references. """ page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "Online Resume") if not page_id: print("Error: Page 'Online Resume' not found.", file=sys.stderr) return False # Find the Projects database projects_db_id = notion_utils.find_database_in_block(notion, page_id, "Projects") if not projects_db_id: print("Error: Database 'Projects' not found.", file=sys.stderr) return False # Find the Skills database to get the highest skill level skills_db_id = notion_utils.find_database_in_block(notion, page_id, "Skills") if not skills_db_id: print("Error: Database 'Skills' not found.", file=sys.stderr) return False # Query Skills database to find the highest skill level skills_results = notion.databases.query(database_id=skills_db_id).get("results", []) highest_skill_name = "" highest_skill_level = 0 for skill_page in skills_results: properties = skill_page.get("properties", {}) skill_name_prop = properties.get("Skill", {}).get("title", []) skill_level_prop = properties.get("Skill Level", {}).get("number") if skill_name_prop and skill_level_prop is not None: skill_name = skill_name_prop[0].get("text", {}).get("content", "") if skill_level_prop > highest_skill_level: highest_skill_level = skill_level_prop highest_skill_name = skill_name if not highest_skill_name: print("Error: Could not find any skills with skill levels.", file=sys.stderr) return False # Query Projects database projects_results = notion.databases.query(database_id=projects_db_id).get( "results", [] ) # Check that "Knitties eComm Website" is deleted for page in projects_results: properties = page.get("properties", {}) name_prop = properties.get("Name", {}).get("title", []) if ( name_prop and name_prop[0].get("text", {}).get("content") == "Knitties eComm Website" ): print( "Failure: 'Knitties eComm Website' project was not deleted.", file=sys.stderr, ) return False # Check that "Zapier Dashboard Redesign" exists with correct properties zapier_project_found = False for page in projects_results: properties = page.get("properties", {}) name_prop = 
properties.get("Name", {}).get("title", []) if ( name_prop and name_prop[0].get("text", {}).get("content") == "Zapier Dashboard Redesign" ): zapier_project_found = True # Check description contains reference to UI Design Internship desc_prop = properties.get("Description", {}).get("rich_text", []) if not desc_prop: print("Failure: Zapier project has no description.", file=sys.stderr) return False description_text = desc_prop[0].get("text", {}).get("content", "") base_desc = "Led the complete redesign of Zapier's main dashboard, focusing on improved usability and modern design patterns. Implemented new navigation system and responsive layouts." if base_desc not in description_text: print( "Failure: Zapier project description is missing base content.", file=sys.stderr, ) return False # Check date date_prop = properties.get("Date", {}).get("date", {}) if ( not date_prop or date_prop.get("start") != "2024-01-01" or date_prop.get("end") != "2024-06-30" ): print( "Failure: Zapier project date range is incorrect.", file=sys.stderr ) return False # Check tags tags_prop = properties.get("Tags", {}).get("multi_select", []) tag_names = {tag.get("name") for tag in tags_prop} if "UI Design" not in tag_names or "Enterprise" not in tag_names: print( "Failure: Zapier project is missing required tags.", file=sys.stderr ) return False # Check phone phone_prop = properties.get("Phone", {}).get("phone_number", []) if not phone_prop or phone_prop != "+44 7871263013": print( "Failure: Zapier project phone number is incorrect.", file=sys.stderr, ) return # Check url url_prop = properties.get("Url", {}).get("url", []) if not url_prop or url_prop != "www.zinenwine.com": print("Failure: Zapier project url is incorrect.", file=sys.stderr) return # Check Enterprise tag color enterprise_tag_purple = False for tag in tags_prop: if tag.get("name") == "Enterprise" and tag.get("color") == "purple": enterprise_tag_purple = True break if not enterprise_tag_purple: print( "Failure: Enterprise tag does not have purple color.", file=sys.stderr, ) return False break if not zapier_project_found: print( "Failure: 'Zapier Dashboard Redesign' project not found.", file=sys.stderr ) return False # Find the Projects database block and verify blocks after it all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) # Find the Projects database block projects_db_index = -1 for i, block in enumerate(all_blocks): if ( block.get("type") == "child_database" and block.get("child_database", {}).get("title") == "Projects" ): projects_db_index = i break if projects_db_index == -1: print("Error: Could not find Projects database block.", file=sys.stderr) return False # Check blocks after Projects database if projects_db_index + 3 > len(all_blocks): print("Failure: Not enough blocks after Projects database.", file=sys.stderr) return False # Check divider block divider_block = all_blocks[projects_db_index + 1] if divider_block.get("type") != "divider": print( "Failure: Expected divider block after Projects database.", file=sys.stderr ) return False # Check heading block heading_block = all_blocks[projects_db_index + 2] if heading_block.get("type") != "heading_2": print("Failure: Expected heading_2 block after divider.", file=sys.stderr) return False heading_text = heading_block.get("heading_2", {}).get("rich_text", []) if ( not heading_text or heading_text[0].get("text", {}).get("content") != "Current Focus" ): print("Failure: Heading text is incorrect.", file=sys.stderr) return False # Check paragraph block with dynamic skill reference 
paragraph_block = all_blocks[projects_db_index + 3] if paragraph_block.get("type") != "paragraph": print("Failure: Expected paragraph block after heading.", file=sys.stderr) return False paragraph_text = paragraph_block.get("paragraph", {}).get("rich_text", []) if not paragraph_text: print("Failure: Paragraph block is empty.", file=sys.stderr) return False paragraph_content = paragraph_text[0].get("text", {}).get("content", "") # Check that paragraph contains the base text base_text = "The Zapier Dashboard Redesign represents my most impactful recent work, leveraging my expertise in" if base_text not in paragraph_content: print("Failure: Paragraph does not contain base text.", file=sys.stderr) return False # Check that paragraph references the highest skill skill_level_percent = int(highest_skill_level * 100) expected_skill_ref = f"{highest_skill_name} ({skill_level_percent}%)" if expected_skill_ref not in paragraph_content: print( f"Failure: Paragraph does not reference highest skill '{expected_skill_ref}'.", file=sys.stderr, ) return False # Check that paragraph contains the ending text ending_text = ( "enterprise-grade solutions that prioritize both aesthetics and functionality" ) if ending_text not in paragraph_content: print( "Failure: Paragraph does not contain proper ending text.", file=sys.stderr ) return False print( f"Success: Projects section has been reorganized correctly with cross-section references (highest skill: {highest_skill_name} at {skill_level_percent}%)." ) return True def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/online_resume/skills_development_tracker/description.md ================================================ Create a comprehensive skills audit system by performing the following tasks: **Task Requirements:** 1. Create a new database named "Skills Development Tracker" as a child database in the main resume page with the following properties: - Name (title property) - Current Skill (relation to Skills database) - Current Proficiency (rollup from related skill's "Skill Level" property) - Target Proficiency (number property with format "percent") - Gap (formula: Target Proficiency - Current Proficiency) - Learning Resources (rich text property) - Progress Notes (rich text property) 2. Populate the Skills Development Tracker database with entries for all skills that have a proficiency level below 70% (0.7): - For each qualifying skill, create an entry with: - Name: "[Skill Name] Development Plan" - Link to the corresponding skill in Skills database - Target Proficiency: Set to Current + 25% (capped at 95%) - Learning Resources: "Online courses and practice projects" - Progress Notes: "Initial assessment completed" 3. 
Create a callout block immediately after the Skills section (after the Skills database) with: - Background color: blue_background - Icon: 🎯 (target emoji) - Content: "Focus Areas: [3 skills with lowest current proficiency]" ================================================ FILE: tasks/notion/standard/online_resume/skills_development_tracker/meta.json ================================================ { "task_id": "skills_development_tracker", "task_name": "Skills Development Tracker", "category_id": "online_resume", "category_name": "Online Resume", "description": "Create a comprehensive skills audit system with development tracking for skills below 70% proficiency.", "author": "Xiangyan Liu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "database manipulation", "cross-reference linking", "conditional filtering", "data aggregation", "template population", "visual formatting" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume" } } ================================================ FILE: tasks/notion/standard/online_resume/skills_development_tracker/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the Skills Development Tracker database and callout block were created correctly. """ page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "New Online Resume") if not page_id: print("Error: Page 'New Online Resume' not found.", file=sys.stderr) return False # Step 1: Verify Skills Development Tracker database exists tracker_db_id = notion_utils.find_database_in_block( notion, page_id, "Skills Development Tracker" ) if not tracker_db_id: print( "Error: Database 'Skills Development Tracker' not found.", file=sys.stderr ) return False # Step 2: Verify database schema try: db_info = notion.databases.retrieve(database_id=tracker_db_id) properties = db_info.get("properties", {}) # Check required properties required_props = { "Name": "title", "Current Skill": "relation", "Current Proficiency": "rollup", "Target Proficiency": "number", "Gap": "formula", "Learning Resources": "rich_text", "Progress Notes": "rich_text", } for prop_name, expected_type in required_props.items(): if prop_name not in properties: print( f"Error: Property '{prop_name}' not found in database.", file=sys.stderr, ) return False if properties[prop_name]["type"] != expected_type: print( f"Error: Property '{prop_name}' has incorrect type. 
Expected '{expected_type}', got '{properties[prop_name]['type']}'.", file=sys.stderr, ) return False # Verify Target Proficiency is percent format if ( properties["Target Proficiency"].get("number", {}).get("format") != "percent" ): print( "Error: Target Proficiency should have 'percent' format.", file=sys.stderr, ) return False except Exception as e: print(f"Error retrieving database info: {e}", file=sys.stderr) return False # Step 3: Get Skills database to check entries skills_db_id = notion_utils.find_database_in_block(notion, page_id, "Skills") if not skills_db_id: print("Error: Skills database not found.", file=sys.stderr) return False # Get all skills with proficiency < 70% skills_below_70 = [] try: skills_results = notion.databases.query(database_id=skills_db_id).get( "results", [] ) for skill in skills_results: skill_level = ( skill.get("properties", {}).get("Skill Level", {}).get("number", 1.0) ) if skill_level < 0.7: skill_name = ( skill.get("properties", {}).get("Skill", {}).get("title", []) ) if skill_name: skill_name_text = skill_name[0].get("text", {}).get("content", "") skills_below_70.append( { "name": skill_name_text, "id": skill["id"], "level": skill_level, } ) except Exception as e: print(f"Error querying Skills database: {e}", file=sys.stderr) return False if not skills_below_70: print("Warning: No skills found with proficiency below 70%.", file=sys.stderr) # This might be OK if all skills are above 70% # Step 4: Verify entries in Skills Development Tracker try: tracker_results = notion.databases.query(database_id=tracker_db_id).get( "results", [] ) # Check that we have entries for skills below 70% if len(skills_below_70) > 0 and len(tracker_results) == 0: print( "Error: No entries found in Skills Development Tracker database.", file=sys.stderr, ) return False # Verify each entry for entry in tracker_results: props = entry.get("properties", {}) # Check name format name_prop = props.get("Name", {}).get("title", []) if not name_prop: print("Error: Entry missing Name property.", file=sys.stderr) return False name_text = name_prop[0].get("text", {}).get("content", "") if not name_text.endswith(" Development Plan"): print( f"Error: Entry name '{name_text}' doesn't follow expected format.", file=sys.stderr, ) return False # Check relation to Skills database skill_relation = props.get("Current Skill", {}).get("relation", []) if not skill_relation: print( f"Error: Entry '{name_text}' missing Current Skill relation.", file=sys.stderr, ) return False # Check Target Proficiency (should be set) target_prof = props.get("Target Proficiency", {}).get("number") if target_prof is None: print( f"Error: Entry '{name_text}' missing Target Proficiency.", file=sys.stderr, ) return False # Check Learning Resources learning_resources = props.get("Learning Resources", {}).get( "rich_text", [] ) if not learning_resources: print( f"Error: Entry '{name_text}' missing Learning Resources.", file=sys.stderr, ) return False # Check Progress Notes progress_notes = props.get("Progress Notes", {}).get("rich_text", []) if not progress_notes: print( f"Error: Entry '{name_text}' missing Progress Notes.", file=sys.stderr, ) return False except Exception as e: print(f"Error querying Skills Development Tracker: {e}", file=sys.stderr) return False # Step 5: Verify callout block exists after Skills section all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) # Find Skills database block skills_db_block_index = None for i, block in enumerate(all_blocks): if ( block.get("type") == "child_database" 
and block.get("child_database", {}).get("title") == "Skills" ): skills_db_block_index = i break if skills_db_block_index is None: print("Error: Could not find Skills database block.", file=sys.stderr) return False # Look for callout block after Skills database callout_found = False block = all_blocks[skills_db_block_index + 1] if block.get("type") == "callout": callout_data = block.get("callout", {}) # Check background color if callout_data.get("color") != "blue_background": print("Error: Could not find callout block with blue background.") return False # Check icon icon = callout_data.get("icon", {}) if icon.get("type") != "emoji" or icon.get("emoji") != "🎯": print("Error: Could not find callout block with 🎯 emoji.") return False # Check content starts with "Focus Areas:" rich_text = callout_data.get("rich_text", []) if rich_text: content = rich_text[0].get("text", {}).get("content", "") if ( content.startswith("Focus Areas:") and "CSS + Basic JS" in content and "Webflow" in content and "Rive" in content ): callout_found = True print(f"Success: Found callout block with content: {content}") else: print("Error: Could not find callout block with required text content.") return False if not callout_found: print( "Error: Could not find callout block with Focus Areas after Skills section.", file=sys.stderr, ) return False print( "Success: Skills Development Tracker database and callout block verified successfully." ) return True def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/online_resume/work_history_addition/description.md ================================================ Hi! I realized I forgot to include one work experience on my resume page titled "Online Resume." Could you please help me add it to the "Work History" section? The position is "Research Assistant," and it took place from January to August 2023. The description should be: "Assisted in conducting user experience research projects at my bachelor’s program, supporting data collection, analyzing user feedback, and preparing research reports. Developed strong skills in research methodologies and improved collaboration with interdisciplinary teams." For the image or logo, please use the one from the "Education" section (my bachelor school) to keep everything consistent. Also, please make sure that the formatting — including font style, size, and layout — matches the existing entries in the Work History section so it looks seamless. Thank you! 
================================================ FILE: tasks/notion/standard/online_resume/work_history_addition/meta.json ================================================ { "task_id": "work_history_addition", "task_name": "Work History Addition", "category_id": "online_resume", "category_name": "Online Resume", "description": "Add a Research Assistant position to the Work History section with consistent formatting and university logo.", "author": "Xiangyan Liu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "database manipulation", "template population", "cross-reference linking", "visual formatting" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume" } } ================================================ FILE: tasks/notion/standard/online_resume/work_history_addition/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the new work history entry for 'Research Assistant' has been added correctly. """ page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "Online Resume") if not page_id: print("Error: Page 'Online Resume' not found.", file=sys.stderr) return False all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) def find_image_url_under_heading(blocks, heading_text, notion_client): heading_index = -1 for i, block in enumerate(blocks): block_type = block.get("type") if block_type == "heading_1": if heading_text in notion_utils.get_block_plain_text(block): heading_index = i break if heading_index == -1: return None for i in range(heading_index + 1, len(blocks)): block = blocks[i] if block.get("type") in ["heading_1", "heading_2", "heading_3"]: break if block.get("type") == "image" and block.get("image", {}).get("file"): return block.get("image", {}).get("file", {}).get("url") if block.get("type") == "column_list": column_list_id = block["id"] columns = notion_utils.get_all_blocks_recursively( notion_client, column_list_id ) for column in columns: if column.get("type") == "column": column_id = column["id"] column_blocks = notion_utils.get_all_blocks_recursively( notion_client, column_id ) for inner_block in column_blocks: if inner_block.get("type") == "image" and inner_block.get( "image", {} ).get("file"): return ( inner_block.get("image", {}) .get("file", {}) .get("url") ) return None def get_block_annotations(block): block_type = block.get("type") if not block_type: return {} block_content = block.get(block_type) if not block_content: return {} rich_text_list = block_content.get("rich_text", []) if not rich_text_list: return {} return rich_text_list[0].get("annotations", {}) education_image_url = find_image_url_under_heading(all_blocks, "Education", notion) if not education_image_url: print( "Error: Could not find the image in the 'Education' section.", file=sys.stderr, ) return False heading_text = "Work History" heading_index = -1 for i, block in enumerate(all_blocks): if block.get( "type" ) == "heading_1" and heading_text in notion_utils.get_block_plain_text(block): heading_index = i break if heading_index == -1: 
print(f"Error: Could not find the '{heading_text}' heading.", file=sys.stderr) return False for i in range(heading_index + 1, len(all_blocks)): block = all_blocks[i] if block.get("type") in ["heading_1", "heading_2", "heading_3"]: break if block.get("type") == "column_list": column_list_id = block["id"] columns = notion_utils.get_all_blocks_recursively(notion, column_list_id) if len(columns) < 2: continue for column in columns: if column.get("type") == "column": if column.get("column", {}).get("width_ratio") == 0.125: image_column = column elif column.get("column", {}).get("width_ratio") == 0.875: text_column = column image_column_blocks = notion_utils.get_all_blocks_recursively( notion, image_column["id"] ) text_column_blocks = notion_utils.get_all_blocks_recursively( notion, text_column["id"] ) column_image_url = None for inner_block in image_column_blocks: if inner_block.get("type") == "image" and inner_block.get( "image", {} ).get("file"): column_image_url = ( inner_block.get("image", {}).get("file", {}).get("url") ) break if ( not column_image_url or column_image_url[:100] != education_image_url[:100] ): continue for j, inner_block in enumerate(text_column_blocks): if "Research Assistant" in notion_utils.get_block_plain_text( inner_block ): title_annotations = get_block_annotations(inner_block) if j + 2 < len(text_column_blocks): date_block = text_column_blocks[j + 1] description_block = text_column_blocks[j + 2] date_text = "January - August 2023" description_text = "Assisted in conducting user experience research projects at my bachelor’s program, supporting data collection, analyzing user feedback, and preparing research reports. Developed strong skills in research methodologies and improved collaboration with interdisciplinary teams." date_annotations = get_block_annotations(date_block) description_annotations = get_block_annotations( description_block ) if ( date_text in notion_utils.get_block_plain_text(date_block) and description_text in notion_utils.get_block_plain_text(description_block) and title_annotations.get("bold") and date_annotations.get("italic") and date_annotations.get("color") == "gray" and description_annotations.get("color") == "default" and description_annotations.get("italic") != True and description_annotations.get("bold") != True ): print("Success: Verified new work history entry.") return True print("Failure: Could not verify the new work history entry.", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/python_roadmap/expert_level_lessons/description.md ================================================ # Task: Expert Level Learning Path with Complex Prerequisites ## Objective Create an Expert Level chapter in the Python Roadmap with sophisticated prerequisite chains that require deep understanding of the existing course structure. ## Requirements ### 1. Create Expert Level Chapter - **Database**: Chapters database - **Properties**: - Name: `Expert Level` - Icon: 🟣 (purple circle emoji) - Must appear after Advanced Level in the database ### 2. 
Create Bridge Lesson Create a lesson that bridges advanced and expert content: - **Title**: `Advanced Foundations Review` - **Status**: Done - **Chapter**: Link to Expert Level - **Parent item**: Link to the lesson that currently has status "In Progress" and contains "Control" in its title - **Sub-items**: Must link to exactly these three lessons: - The lesson with title containing "Decorators" - The lesson with title containing "Calling API" - The lesson with title containing "Regular Expressions" ### 3. Create Expert Level Lessons Add exactly 4 expert lessons to the Steps database: **Lesson 1**: `Metaprogramming and AST Manipulation` - Status: To Do - Chapter: Expert Level - Parent item: Link to "Advanced Foundations Review" - Date: 2025-09-15 **Lesson 2**: `Async Concurrency Patterns` - Status: To Do - Chapter: Expert Level - Parent item: Link to the lesson titled "Calling API" - Date: 2025-09-20 **Lesson 3**: `Memory Management and GC Tuning` - Status: In Progress - Chapter: Expert Level - Parent item: Link to "Advanced Foundations Review" - Sub-item: Must have exactly 2 links: - Link to any lesson from "Data Structures" that has status "To Do" - Link to the lesson containing "OOP" in its title - Date: 2025-09-25 **Lesson 4**: `Building Python C Extensions` - Status: To Do - Chapter: Expert Level - Parent item: Link to "Metaprogramming and AST Manipulation" - Date: 2025-10-01 ### 4. Update Existing Lessons - Change the status of "Decorators" from "To Do" to "Done" - Add "Async Concurrency Patterns" as a Sub-item to "Error Handling" - Update "Control Flow" status from "In Progress" to "Done" ### 5. Create Learning Path Notes Add content to the "Advanced Foundations Review" lesson page: - **Block 1**: Heading 2 with text `Prerequisites Checklist` - **Block 2**: Bulleted list with exactly 3 items: - `✅ Advanced Python Features (Decorators, Context Managers)` - `✅ API Integration and Async Basics` - `✅ Pattern Matching and Text Processing` - **Block 3**: Paragraph with text: `This lesson serves as a checkpoint before entering expert-level content. Ensure you have mastered all prerequisites listed above.` ================================================ FILE: tasks/notion/standard/python_roadmap/expert_level_lessons/meta.json ================================================ { "task_id": "expert_level_lessons", "task_name": "Expert Level Lessons", "category_id": "python_roadmap", "category_name": "Python Roadmap", "description": "Create an Expert Level chapter with sophisticated prerequisite chains and four expert-level lessons.", "author": "Lingjun Chen", "created_at": "2025-08-02", "difficulty": "L3", "tags": [ "database manipulation", "cross-reference linking", "conditional filtering", "status tracking", "template population" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Python-Roadmap-25281626b6d78012bf2bce1fa8711f4d", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/python-roadmap" } } ================================================ FILE: tasks/notion/standard/python_roadmap/expert_level_lessons/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the Expert Level chapter and its lessons have been created correctly with complex prerequisites. 
""" # Step 1: Find the main page and get database IDs if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if not found_id or object_type != 'page': print("Error: Main page not found.", file=sys.stderr) return False else: # Try to find the main page by searching found_id = notion_utils.find_page(notion, "Python Roadmap") if not found_id: print("Error: Main page not found.", file=sys.stderr) return False print(f"Found main page: {found_id}") # Get all blocks from the page to find database references all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id) print(f"Found {len(all_blocks)} blocks") # Find database IDs from the page chapters_db_id = None steps_db_id = None for block in all_blocks: if block and block.get("type") == "child_database": db_title = block.get("child_database", {}).get("title", "") if "Chapters" in db_title: chapters_db_id = block["id"] print(f"Found Chapters database: {chapters_db_id}") elif "Steps" in db_title: steps_db_id = block["id"] print(f"Found Steps database: {steps_db_id}") if not chapters_db_id: print("Error: Chapters database not found.", file=sys.stderr) return False if not steps_db_id: print("Error: Steps database not found.", file=sys.stderr) return False print("Starting verification...") # Step 2: Verify the Expert Level chapter exists print("2. Checking for Expert Level chapter...") expert_chapter_id = None try: chapters_response = notion.databases.query( database_id=chapters_db_id, filter={ "property": "Name", "title": { "equals": "Expert Level" } } ) if not chapters_response.get("results"): print(f"Error: Expert Level chapter not found in Chapters database.", file=sys.stderr) return False expert_chapter = chapters_response["results"][0] expert_chapter_id = expert_chapter["id"] # Check chapter icon (purple circle) chapter_icon = expert_chapter.get("icon") if not chapter_icon or chapter_icon.get("type") != "emoji" or chapter_icon.get("emoji") != "🟣": print(f"Error: Expert Level chapter does not have the correct purple circle emoji icon.", file=sys.stderr) return False print(f"✓ Expert Level chapter found with correct icon: 🟣") except Exception as e: print(f"Error querying Chapters database: {e}", file=sys.stderr) return False # Step 3: Find Control Flow lesson (In Progress status) print("3. Finding Control Flow lesson...") control_flow_id = None try: control_flow_response = notion.databases.query( database_id=steps_db_id, filter={ "and": [ { "property": "Lessons", "title": { "contains": "Control" } }, { "property": "Status", "status": { "equals": "Done" # Should be updated to Done } } ] } ) if control_flow_response.get("results"): control_flow_lesson = control_flow_response["results"][0] control_flow_id = control_flow_lesson["id"] print(f"✓ Found Control Flow lesson with status 'Done'") else: print(f"Error: Control Flow lesson not found with status 'Done'.", file=sys.stderr) return False except Exception as e: print(f"Error finding Control Flow lesson: {e}", file=sys.stderr) return False # Step 4: Find prerequisite lessons print("4. 
Finding prerequisite lessons...") decorators_id = None calling_api_id = None regex_id = None try: # Find Decorators (should be Done) decorators_response = notion.databases.query( database_id=steps_db_id, filter={ "property": "Lessons", "title": { "contains": "Decorators" } } ) if decorators_response.get("results"): decorators_lesson = decorators_response["results"][0] decorators_id = decorators_lesson["id"] # Check status is Done if decorators_lesson["properties"]["Status"]["status"]["name"] != "Done": print(f"Error: Decorators lesson should have status 'Done'.", file=sys.stderr) return False print(f"✓ Found Decorators lesson with status 'Done'") else: print(f"Error: Decorators lesson not found.", file=sys.stderr) return False # Find Calling API calling_api_response = notion.databases.query( database_id=steps_db_id, filter={ "property": "Lessons", "title": { "equals": "Calling API" } } ) if calling_api_response.get("results"): calling_api_lesson = calling_api_response["results"][0] calling_api_id = calling_api_lesson["id"] print(f"✓ Found Calling API lesson") else: print(f"Error: Calling API lesson not found.", file=sys.stderr) return False # Find Regular Expressions regex_response = notion.databases.query( database_id=steps_db_id, filter={ "property": "Lessons", "title": { "contains": "Regular Expressions" } } ) if regex_response.get("results"): regex_lesson = regex_response["results"][0] regex_id = regex_lesson["id"] print(f"✓ Found Regular Expressions lesson") else: print(f"Error: Regular Expressions lesson not found.", file=sys.stderr) return False except Exception as e: print(f"Error finding prerequisite lessons: {e}", file=sys.stderr) return False # Step 5: Verify Advanced Foundations Review bridge lesson print("5. Checking Advanced Foundations Review bridge lesson...") bridge_id = None try: bridge_response = notion.databases.query( database_id=steps_db_id, filter={ "property": "Lessons", "title": { "equals": "Advanced Foundations Review" } } ) if not bridge_response.get("results"): print(f"Error: Advanced Foundations Review lesson not found.", file=sys.stderr) return False bridge_lesson = bridge_response["results"][0] bridge_id = bridge_lesson["id"] # Check status is Done if bridge_lesson["properties"]["Status"]["status"]["name"] != "Done": print(f"Error: Advanced Foundations Review should have status 'Done'.", file=sys.stderr) return False # Check linked to Expert Level chapter bridge_chapters = bridge_lesson["properties"]["Chapters"]["relation"] if not any(rel["id"] == expert_chapter_id for rel in bridge_chapters): print(f"Error: Advanced Foundations Review not linked to Expert Level chapter.", file=sys.stderr) return False # Check Parent item is Control Flow bridge_parent = bridge_lesson["properties"]["Parent item"]["relation"] if not bridge_parent or bridge_parent[0]["id"] != control_flow_id: print(f"Error: Advanced Foundations Review should have Control Flow as Parent item.", file=sys.stderr) return False # Check Sub-items (should have at least 3 specific lessons plus any that reference it as parent) bridge_subitems = bridge_lesson["properties"]["Sub-item"]["relation"] required_subitems = {decorators_id, calling_api_id, regex_id} actual_subitems = {item["id"] for item in bridge_subitems} if not required_subitems.issubset(actual_subitems): print(f"Error: Advanced Foundations Review should have at least these 3 sub-items: Decorators, Calling API, Regular Expressions.", file=sys.stderr) return False # Due to bidirectional relations, lessons that have this as parent will also 
appear as sub-items # We expect at least 5: 3 initial + 2 that reference it as parent (Metaprogramming and Memory Management) if len(bridge_subitems) < 5: print(f"Error: Advanced Foundations Review should have at least 5 sub-items (3 initial + 2 from parent relations), found {len(bridge_subitems)}.", file=sys.stderr) return False print(f"✓ Advanced Foundations Review has {len(bridge_subitems)} sub-items, including the 3 required ones") print(f"✓ Advanced Foundations Review found with correct properties") except Exception as e: print(f"Error checking bridge lesson: {e}", file=sys.stderr) return False # Step 6: Verify the 4 expert lessons print("6. Checking the 4 expert lessons...") # Note: Async Concurrency Patterns will have Error Handling as parent (due to sub-item relation) # We'll need to find Error Handling's ID first error_handling_response = notion.databases.query( database_id=steps_db_id, filter={ "property": "Lessons", "title": { "equals": "Error Handling" } } ) error_handling_id = None if error_handling_response.get("results"): error_handling_id = error_handling_response["results"][0]["id"] else: print(f"Error: Error Handling lesson not found.", file=sys.stderr) return False expert_lessons = { "Metaprogramming and AST Manipulation": { "status": "To Do", "parent": bridge_id, "date": "2025-09-15" }, "Async Concurrency Patterns": { "status": "To Do", "parent": error_handling_id, # Parent is Error Handling due to sub-item relation "date": "2025-09-20" }, "Memory Management and GC Tuning": { "status": "In Progress", "parent": bridge_id, "date": "2025-09-25" }, "Building Python C Extensions": { "status": "To Do", "date": "2025-10-01" } } lesson_ids = {} try: for lesson_name, expected in expert_lessons.items(): lesson_response = notion.databases.query( database_id=steps_db_id, filter={ "property": "Lessons", "title": { "equals": lesson_name } } ) if not lesson_response.get("results"): print(f"Error: Lesson '{lesson_name}' not found.", file=sys.stderr) return False lesson = lesson_response["results"][0] lesson_ids[lesson_name] = lesson["id"] # Check status if lesson["properties"]["Status"]["status"]["name"] != expected["status"]: print(f"Error: Lesson '{lesson_name}' should have status '{expected['status']}'.", file=sys.stderr) return False # Check linked to Expert Level chapter lesson_chapters = lesson["properties"]["Chapters"]["relation"] if not any(rel["id"] == expert_chapter_id for rel in lesson_chapters): print(f"Error: Lesson '{lesson_name}' not linked to Expert Level chapter.", file=sys.stderr) return False # Check date lesson_date = lesson["properties"]["Date"]["date"] if lesson_date and lesson_date.get("start") != expected["date"]: print(f"Error: Lesson '{lesson_name}' should have date '{expected['date']}'.", file=sys.stderr) return False # Check parent item for lessons that have specific parent requirements if "parent" in expected: lesson_parent = lesson["properties"]["Parent item"]["relation"] if not lesson_parent or lesson_parent[0]["id"] != expected["parent"]: print(f"Error: Lesson '{lesson_name}' should have correct parent item.", file=sys.stderr) return False print(f"✓ Lesson '{lesson_name}' found with correct properties") # Special checks for Building Python C Extensions parent relationship # (other parent checks are handled in the loop above) building_lesson = notion.databases.query( database_id=steps_db_id, filter={ "property": "Lessons", "title": { "equals": "Building Python C Extensions" } } )["results"][0] building_parent = building_lesson["properties"]["Parent 
item"]["relation"] if not building_parent or building_parent[0]["id"] != lesson_ids["Metaprogramming and AST Manipulation"]: print(f"Error: Building Python C Extensions should have Metaprogramming and AST Manipulation as parent.", file=sys.stderr) return False # Memory Management should have 2 sub-items memory_lesson = notion.databases.query( database_id=steps_db_id, filter={ "property": "Lessons", "title": { "equals": "Memory Management and GC Tuning" } } )["results"][0] memory_subitems = memory_lesson["properties"]["Sub-item"]["relation"] if len(memory_subitems) != 2: print(f"Error: Memory Management and GC Tuning should have exactly 2 sub-items.", file=sys.stderr) return False except Exception as e: print(f"Error checking expert lessons: {e}", file=sys.stderr) return False # Step 7: Verify Error Handling has Async Concurrency Patterns as sub-item print("7. Checking Error Handling sub-item...") try: error_handling_response = notion.databases.query( database_id=steps_db_id, filter={ "property": "Lessons", "title": { "equals": "Error Handling" } } ) if error_handling_response.get("results"): error_handling_lesson = error_handling_response["results"][0] error_subitems = error_handling_lesson["properties"]["Sub-item"]["relation"] if not any(item["id"] == lesson_ids["Async Concurrency Patterns"] for item in error_subitems): print(f"Error: Error Handling should have Async Concurrency Patterns as sub-item.", file=sys.stderr) return False print(f"✓ Error Handling has Async Concurrency Patterns as sub-item") else: print(f"Error: Error Handling lesson not found.", file=sys.stderr) return False except Exception as e: print(f"Error checking Error Handling: {e}", file=sys.stderr) return False # Step 8: Verify block content in Advanced Foundations Review print("8. 
Checking Advanced Foundations Review page content...") try: blocks = notion_utils.get_all_blocks_recursively(notion, bridge_id) if len(blocks) < 3: print(f"Error: Advanced Foundations Review should have at least 3 blocks.", file=sys.stderr) return False # Check Block 1: Heading 2 block1 = blocks[0] if block1.get("type") != "heading_2": print(f"Error: First block should be heading_2.", file=sys.stderr) return False heading_text = block1.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "") if heading_text != "Prerequisites Checklist": print(f"Error: Heading should be 'Prerequisites Checklist'.", file=sys.stderr) return False # Check Block 2: Bulleted list block2 = blocks[1] if block2.get("type") != "bulleted_list_item": print(f"Error: Second block should be bulleted_list_item.", file=sys.stderr) return False # Check Block 3 and 4 are also bulleted list items if len(blocks) >= 4: block3 = blocks[2] block4 = blocks[3] if block3.get("type") != "bulleted_list_item" or block4.get("type") != "bulleted_list_item": print(f"Error: Blocks 2-4 should be bulleted list items.", file=sys.stderr) return False # Check last block is paragraph last_block = blocks[-1] if last_block.get("type") != "paragraph": print(f"Error: Last block should be paragraph.", file=sys.stderr) return False paragraph_text = last_block.get("paragraph", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "") if "checkpoint" not in paragraph_text.lower(): print(f"Error: Paragraph should contain text about checkpoint.", file=sys.stderr) return False print(f"✓ Advanced Foundations Review page has correct content structure") except Exception as e: print(f"Error checking page content: {e}", file=sys.stderr) return False # Step 9: Final verification counts print("9. Verifying final state counts...") try: # Count total lessons by status all_lessons = notion.databases.query(database_id=steps_db_id, page_size=100)["results"] done_lessons = [l for l in all_lessons if l["properties"]["Status"]["status"]["name"] == "Done"] done_count = len(done_lessons) in_progress_count = sum(1 for l in all_lessons if l["properties"]["Status"]["status"]["name"] == "In Progress") # Print out all Done lessons for debugging if done_count != 14: print(f"Found {done_count} Done lessons (expected 14):", file=sys.stderr) for lesson in done_lessons: lesson_name = lesson["properties"]["Lessons"]["title"][0]["text"]["content"] print(f" - {lesson_name}", file=sys.stderr) return False if in_progress_count != 1: print(f"Error: Should have 1 In Progress lesson, found {in_progress_count}.", file=sys.stderr) return False # Verify Expert Level has 5 lessons expert_chapter_updated = notion.databases.query( database_id=chapters_db_id, filter={ "property": "Name", "title": { "equals": "Expert Level" } } )["results"][0] expert_steps = expert_chapter_updated["properties"]["Steps"]["relation"] if len(expert_steps) != 5: print(f"Error: Expert Level should have exactly 5 lessons, found {len(expert_steps)}.", file=sys.stderr) return False print(f"✓ Final state counts are correct") except Exception as e: print(f"Error verifying final counts: {e}", file=sys.stderr) return False print("🎉 All verification checks passed!") return True def main(): """ Executes the verification process and exits with a status code. 
""" notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/python_roadmap/learning_metrics_dashboard/description.md ================================================ # Task: Learning Metrics Dashboard ## Objective Create a comprehensive Learning Metrics Dashboard section in the Python Roadmap page that displays precise statistics and recommendations based on the Steps database content. ## Requirements ### 1. Section Placement - Add new content immediately after the Learning Materials section (before `Whether you're starting from scratch or`). ### 2. Dashboard Header - **Type**: heading_3 - **Text**: `📊 Learning Metrics Dashboard` ### 3. Course Statistics Block - **Type**: callout - **Background Color**: Brown - **Icon**: None - **Title**: **Course Statistics** (bold, heading_3). Use the same color scheme as other callout headings. - **Content**: Bulleted list with the following items in exact order: - `Total Lessons: [X]` (count all entries in Steps database) - `Completed: [X] ([Y]%)` (count Status="Done", calculate percentage to 1 decimal) - `In Progress: [X] ([Y]%)` (count Status="In Progress", calculate percentage to 1 decimal) - `Beginner Level: [X] lessons ([Y] completed)` (filter by Chapters relation to Beginner Level) - `Intermediate Level: [X] lessons ([Y] completed)` (filter by Chapters relation to Intermediate Level) - `Advanced Level: [X] lessons ([Y] completed)` (filter by Chapters relation to Advanced Level) ### 4. Completed Topics Section - **Type**: toggle - **Text**: `🏆 Completed Topics (Click to expand)` - **Nested Content**: Numbered list containing exactly 5 items - List lessons with Status="Done" ================================================ FILE: tasks/notion/standard/python_roadmap/learning_metrics_dashboard/meta.json ================================================ { "task_id": "learning_metrics_dashboard", "task_name": "Learning Metrics Dashboard", "category_id": "python_roadmap", "category_name": "Python Roadmap", "description": "Create a comprehensive Learning Metrics Dashboard section displaying precise statistics and recommendations based on the Steps database.", "author": "Lingjun Chen", "created_at": "2025-08-02", "difficulty": "L3", "tags": [ "data aggregation", "conditional filtering", "report generation", "visual formatting" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Python-Roadmap-25281626b6d78012bf2bce1fa8711f4d", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/python-roadmap" } } ================================================ FILE: tasks/notion/standard/python_roadmap/learning_metrics_dashboard/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def get_page_title_from_result(page_result): """ Extract the title from a page result object from database query. 
""" properties = page_result.get('properties', {}) # Try common title property names for prop_name in ['Name', 'Title', 'title', 'Lessons']: if prop_name in properties: prop = properties[prop_name] if prop.get('type') == 'title': title_array = prop.get('title', []) if title_array and len(title_array) > 0: return title_array[0].get('plain_text', '') return '' def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the Learning Metrics Dashboard has been implemented correctly according to description.md. """ # Step 1: Find the main page and get all blocks if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if not found_id or object_type != 'page': print("Error: Main page not found.", file=sys.stderr) return False else: # Try to find the main page by searching found_id = notion_utils.find_page(notion, "Python Roadmap") if not found_id: print("Error: Main page not found.", file=sys.stderr) return False print(f"Found main page: {found_id}") # Get Steps database to calculate expected statistics steps_db_id = notion_utils.find_database(notion, "Steps") if not steps_db_id: print("Error: Steps database not found.", file=sys.stderr) return False # Query Steps database to get all lessons steps_data = notion.databases.query(database_id=steps_db_id) total_lessons = len(steps_data['results']) completed_count = 0 in_progress_count = 0 completed_lessons = [] # Get Chapters database for level information chapters_db_id = notion_utils.find_database(notion, "Chapters") if not chapters_db_id: print("Error: Chapters database not found.", file=sys.stderr) return False # Query Chapters database to get level information chapters_data = notion.databases.query(database_id=chapters_db_id) level_ids = { 'Beginner Level': None, 'Intermediate Level': None, 'Advanced Level': None } for chapter in chapters_data['results']: chapter_name = get_page_title_from_result(chapter) if chapter_name in level_ids: level_ids[chapter_name] = chapter['id'] # Initialize level counts level_counts = { 'Beginner Level': {'total': 0, 'completed': 0}, 'Intermediate Level': {'total': 0, 'completed': 0}, 'Advanced Level': {'total': 0, 'completed': 0} } # Count lessons by status and level for lesson in steps_data['results']: status = lesson['properties']['Status']['status'] if status and status['name'] == 'Done': completed_count += 1 lesson_title = get_page_title_from_result(lesson) if lesson_title: completed_lessons.append(lesson_title) elif status and status['name'] == 'In Progress': in_progress_count += 1 # Count by level chapters_relation = lesson['properties']['Chapters']['relation'] for chapter_ref in chapters_relation: chapter_id = chapter_ref['id'] for level_name, level_id in level_ids.items(): if chapter_id == level_id: level_counts[level_name]['total'] += 1 if status and status['name'] == 'Done': level_counts[level_name]['completed'] += 1 # Calculate percentages completed_percentage = round((completed_count / total_lessons * 100), 1) if total_lessons > 0 else 0 in_progress_percentage = round((in_progress_count / total_lessons * 100), 1) if total_lessons > 0 else 0 print(f"Expected statistics:") print(f" Total Lessons: {total_lessons}") print(f" Completed: {completed_count} ({completed_percentage}%)") print(f" In Progress: {in_progress_count} ({in_progress_percentage}%)") print(f" Beginner Level: {level_counts['Beginner Level']['total']} lessons ({level_counts['Beginner Level']['completed']} completed)") print(f" Intermediate Level: {level_counts['Intermediate 
Level']['total']} lessons ({level_counts['Intermediate Level']['completed']} completed)") print(f" Advanced Level: {level_counts['Advanced Level']['total']} lessons ({level_counts['Advanced Level']['completed']} completed)") print(f" Completed lessons (first 5): {completed_lessons[:5]}") # Get all blocks from the page all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id) print(f"Found {len(all_blocks)} blocks") # Step 2: Verify the required elements in order learning_materials_idx = -1 dashboard_heading_idx = -1 callout_idx = -1 toggle_idx = -1 whether_paragraph_idx = -1 # Track the "Whether you're starting from scratch" paragraph # Track what we've verified callout_has_brown_bg = False callout_has_no_icon = False callout_has_course_statistics_title = False callout_title_has_correct_colors = False statistics_items_found = [] completed_topics_found = [] # Expected statistics content expected_statistics = [ f"Total Lessons: {total_lessons}", f"Completed: {completed_count} ({completed_percentage}%)", f"In Progress: {in_progress_count} ({in_progress_percentage}%)", f"Beginner Level: {level_counts['Beginner Level']['total']} lessons ({level_counts['Beginner Level']['completed']} completed)", f"Intermediate Level: {level_counts['Intermediate Level']['total']} lessons ({level_counts['Intermediate Level']['completed']} completed)", f"Advanced Level: {level_counts['Advanced Level']['total']} lessons ({level_counts['Advanced Level']['completed']} completed)" ] # Check blocks in order for i, block in enumerate(all_blocks): if block is None: continue block_type = block.get("type") # 1. Check for Learning Materials heading (requirement 1) if learning_materials_idx == -1 and block_type == "heading_3": block_text = notion_utils.get_block_plain_text(block) if "🎓 Learning Materials" in block_text or "Learning Materials" in block_text: learning_materials_idx = i print(f"✓ Requirement 1: Found Learning Materials heading at position {i}") # 2. Check for Learning Metrics Dashboard heading after Learning Materials (requirement 2) elif learning_materials_idx != -1 and dashboard_heading_idx == -1 and block_type == "heading_3": block_text = notion_utils.get_block_plain_text(block) if "📊 Learning Metrics Dashboard" in block_text: dashboard_heading_idx = i print(f"✓ Requirement 2: Found Learning Metrics Dashboard heading at position {i}") # 3. 
Check for callout block after Dashboard heading (requirement 3) elif dashboard_heading_idx != -1 and callout_idx == -1 and block_type == "callout": callout_idx = i print(f" Found callout block at position {i}") # Check brown background (requirement 3.1) if block.get("callout", {}).get("color") == "brown_background": callout_has_brown_bg = True print(f" ✓ Requirement 3.1: Callout has brown background") # Check no icon (requirement 3.2) icon = block.get("callout", {}).get("icon") if icon is None: callout_has_no_icon = True print(f" ✓ Requirement 3.2: Callout has no icon") # Get nested blocks for Course Statistics title and content nested_blocks = notion_utils.get_all_blocks_recursively(notion, block.get("id")) for nested in nested_blocks: # Check for heading_3 only as per requirement if nested and nested.get("type") == "heading_3": # Check for "Course Statistics" title with correct formatting rich_text = nested.get("heading_3", {}).get("rich_text", []) course_found = False course_correct = False statistics_found = False statistics_correct = False for text_item in rich_text: text_content = text_item.get("text", {}).get("content", "") annotations = text_item.get("annotations", {}) color = annotations.get("color", "default") is_bold = annotations.get("bold", False) if "Course" in text_content: course_found = True # Check if Course is blue and bold if color == "blue" and is_bold: course_correct = True print(f" ✓ 'Course' has blue color and is bold") else: print(f" ✗ 'Course' color: {color}, bold: {is_bold} (should be blue and bold)") if "Statistics" in text_content: statistics_found = True # Check if Statistics is yellow and bold if color == "yellow" and is_bold: statistics_correct = True print(f" ✓ 'Statistics' has yellow color and is bold") else: print(f" ✗ 'Statistics' color: {color}, bold: {is_bold} (should be yellow and bold)") if course_found and statistics_found: callout_has_course_statistics_title = True if course_correct and statistics_correct: callout_title_has_correct_colors = True print(f" ✓ Requirement 3.3: Callout has 'Course Statistics' title with correct colors") else: print(f" ✗ Requirement 3.3: Title found but colors/formatting incorrect") # Check for statistics items in bulleted list elif nested and nested.get("type") == "bulleted_list_item": item_text = notion_utils.get_block_plain_text(nested) for expected_item in expected_statistics: if expected_item in item_text: if expected_item not in statistics_items_found: statistics_items_found.append(expected_item) print(f" ✓ Requirement 3.4: Found statistics item: {expected_item}") # 4. Check for Completed Topics toggle after callout (requirement 4) elif callout_idx != -1 and toggle_idx == -1 and block_type == "toggle": block_text = notion_utils.get_block_plain_text(block) if "🏆 Completed Topics (Click to expand)" in block_text: toggle_idx = i print(f"✓ Requirement 4: Found Completed Topics toggle at position {i}") # Get nested blocks for completed topics list nested_blocks = notion_utils.get_all_blocks_recursively(notion, block.get("id")) for nested in nested_blocks: if nested and nested.get("type") == "numbered_list_item": item_text = notion_utils.get_block_plain_text(nested) if item_text and item_text in completed_lessons: completed_topics_found.append(item_text) print(f" ✓ Requirement 4.1: Found completed topic: {item_text}") # 5. 
Check for "Whether you're starting from scratch" paragraph (should be after dashboard content) elif block_type == "paragraph" and whether_paragraph_idx == -1: block_text = notion_utils.get_block_plain_text(block) if "Whether you're starting from scratch" in block_text or "Whether you're starting from scratch" in block_text: whether_paragraph_idx = i print(f" Found 'Whether you're starting from scratch' paragraph at position {i}") # Step 3: Verify all requirements were met print(f"\nVerification Summary:") all_passed = True # Requirement 1: Learning Materials section found if learning_materials_idx == -1: print("✗ Requirement 1: Learning Materials section NOT found", file=sys.stderr) all_passed = False else: print("✓ Requirement 1: Learning Materials section found") # Requirement 2: Learning Metrics Dashboard heading after Learning Materials and before "Whether..." paragraph if dashboard_heading_idx == -1: print("✗ Requirement 2: Learning Metrics Dashboard heading NOT found", file=sys.stderr) all_passed = False elif dashboard_heading_idx <= learning_materials_idx: print("✗ Requirement 2: Learning Metrics Dashboard heading not AFTER Learning Materials", file=sys.stderr) all_passed = False elif whether_paragraph_idx != -1 and dashboard_heading_idx >= whether_paragraph_idx: print("✗ Requirement 2: Learning Metrics Dashboard heading not BEFORE 'Whether you're starting from scratch' paragraph", file=sys.stderr) all_passed = False else: print("✓ Requirement 2: Learning Metrics Dashboard heading found after Learning Materials") if whether_paragraph_idx != -1: print(" ✓ Dashboard content is correctly placed before 'Whether you're starting from scratch' paragraph") # Requirement 3: Course Statistics callout block with all specifications if callout_idx == -1: print("✗ Requirement 3: Course Statistics callout block NOT found", file=sys.stderr) all_passed = False else: if not callout_has_brown_bg: print("✗ Requirement 3.1: Callout does NOT have brown background", file=sys.stderr) all_passed = False else: print("✓ Requirement 3.1: Callout has brown background") if not callout_has_no_icon: print("✗ Requirement 3.2: Callout has an icon (should have none)", file=sys.stderr) all_passed = False else: print("✓ Requirement 3.2: Callout has no icon") if not callout_has_course_statistics_title: print("✗ Requirement 3.3: Callout does NOT have 'Course Statistics' title", file=sys.stderr) all_passed = False else: print("✓ Requirement 3.3: Callout has 'Course Statistics' title") if not callout_title_has_correct_colors: print("✗ Requirement 3.3.1: Title does NOT have correct colors (blue for Course, yellow for Statistics)", file=sys.stderr) all_passed = False else: print("✓ Requirement 3.3.1: Title has correct colors") # Check all statistics items missing_items = [item for item in expected_statistics if item not in statistics_items_found] if missing_items: print(f"✗ Requirement 3.4: Missing statistics items: {missing_items}", file=sys.stderr) all_passed = False else: print("✓ Requirement 3.4: All 6 statistics items found") # Requirement 4: Completed Topics toggle if toggle_idx == -1: print("✗ Requirement 4: Completed Topics toggle NOT found", file=sys.stderr) all_passed = False elif toggle_idx <= callout_idx: print("✗ Requirement 4: Completed Topics toggle not AFTER callout", file=sys.stderr) all_passed = False else: print("✓ Requirement 4: Completed Topics toggle found after callout") # Check that exactly 5 completed topics are listed if len(completed_topics_found) != 5: if len(completed_topics_found) < 5: print(f"✗ 
Requirement 4.1: Only {len(completed_topics_found)} completed topics found (need exactly 5)", file=sys.stderr) else: print(f"✗ Requirement 4.1: Found {len(completed_topics_found)} completed topics (need exactly 5, not more)", file=sys.stderr) all_passed = False else: print(f"✓ Requirement 4.1: Found exactly 5 completed topics as required") # Requirement 5: Proper integration (implicitly checked by order) if all_passed: print("✓ Requirement 5: All content properly integrated in correct order") return all_passed def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): print("Verification passed") sys.exit(0) else: print("Verification failed") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/self_assessment/faq_column_layout/description.md ================================================ Navigate to the "Self Assessment" page and reorganize the content under the FAQ toggle as follows: **Task Requirements:** 1. Add a column list with two columns inside the FAQ toggle 2. Move the first two existing Q&A pairs from the FAQ to the left column 3. Move the third existing Q&A pair to the right column 4. Add one additional Q&A pair in the right column to match the format, so both columns have exactly 2 Q&A pairs 5. Ensure all Q&A pairs maintain consistent formatting (heading_3 for questions, paragraph for answers) ================================================ FILE: tasks/notion/standard/self_assessment/faq_column_layout/meta.json ================================================ { "task_id": "faq_column_layout", "task_name": "FAQ Column Layout", "category_id": "self_assessment", "category_name": "Self Assessment", "description": "Reorganize the FAQ section content into a two-column layout with balanced Q&A pairs.", "author": "Xiangyan Liu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "content organization", "visual formatting", "template population" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d", "stateOriginalUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d" } } ================================================ FILE: tasks/notion/standard/self_assessment/faq_column_layout/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the FAQ toggle has been properly reorganized with a column list. 
""" # Start from main_id if provided page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: # Try to find the Self Assessment page page_id = notion_utils.find_page(notion, "Self Assessment") if not page_id: print("Error: Self Assessment page not found.", file=sys.stderr) return False # Get all blocks recursively from the page all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) # Find the FAQ toggle block faq_toggle_block = None faq_toggle_id = None for block in all_blocks: if block.get("type") == "toggle": block_text = notion_utils.get_block_plain_text(block) if "FAQ" in block_text: faq_toggle_block = block faq_toggle_id = block.get("id") print(f"Found FAQ toggle block: {block_text}") break if not faq_toggle_block: print("Error: FAQ toggle block not found.", file=sys.stderr) return False # Find column_list inside the FAQ toggle column_list_block = None for block in all_blocks: if ( block.get("type") == "column_list" and block.get("parent", {}).get("block_id") == faq_toggle_id ): column_list_block = block break if not column_list_block: print("Error: No column_list found inside FAQ toggle.", file=sys.stderr) return False # Check that there are no Q&A pairs directly under FAQ toggle (outside column_list) direct_faq_children = [] for block in all_blocks: if block.get("parent", {}).get("block_id") == faq_toggle_id and block.get( "id" ) != column_list_block.get("id"): direct_faq_children.append(block) # Check if any of these are heading_3 or paragraph blocks (Q&A content) for block in direct_faq_children: if block.get("type") in ["heading_3", "paragraph"]: print( f"Error: Found Q&A content outside column_list: {notion_utils.get_block_plain_text(block)[:50]}...", file=sys.stderr, ) return False # Find the two columns columns = [] column_list_id = column_list_block.get("id") for block in all_blocks: if ( block.get("type") == "column" and block.get("parent", {}).get("block_id") == column_list_id ): columns.append(block) if len(columns) != 2: print(f"Error: Expected 2 columns, found {len(columns)}.", file=sys.stderr) return False # Check each column has exactly 2 Q&A pairs for i, column in enumerate(columns): column_id = column.get("id") # Find blocks inside this column column_blocks = [] for block in all_blocks: if block.get("parent", {}).get("block_id") == column_id: column_blocks.append(block) # Count Q&A pairs (should be heading_3 followed by paragraph) qa_pairs = 0 j = 0 while j < len(column_blocks): if ( column_blocks[j].get("type") == "heading_3" and j + 1 < len(column_blocks) and column_blocks[j + 1].get("type") == "paragraph" ): qa_pairs += 1 j += 2 # Skip both question and answer else: j += 1 if qa_pairs != 2: print( f"Error: Column {i + 1} has {qa_pairs} Q&A pairs, expected 2.", file=sys.stderr, ) return False print(f"Column {i + 1}: Found {qa_pairs} Q&A pairs ✓") print( "Success: FAQ toggle properly organized with 2 columns, each containing 2 Q&A pairs." ) return True def main(): """ Executes the verification process and exits with a status code. 
""" notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/self_assessment/hyperfocus_analysis_report/description.md ================================================ Go to my Self Assessment page, and then create a hyperfocus analysis report by analyzing sessions with high productivity but significant challenges. **Task Requirements:** 1. Create a new page titled "Hyperfocus Analysis Report" as a child of the Self Assessment page. The new page should be located between 'Why Use the Term "Hyperfocus"?' callout and the following divider line. 2. Query the "Hyperfocus Self-Assessment Worksheet" database to find all sessions where: - Work Completion Rate is greater than 80% (0.8) - At least one challenge is present in the Challenges field 3. For each qualifying session, create a section with: - A heading showing the date and activity type (format: YYYY-MM-DD Activity) - A bullet list containing: - Focus factors used (e.g., Focus factors: XXX, YYY) - Energy level and mood (format: "Energy: X/10, Mood: Y/10") - Challenges faced (e.g., Challenges: XXX, YYY) - Strategies that helped overcome challenges (e.g., Strategies: XXX, YYY) - Work completion rate (format: "Completion: XX%") 4. At the top of the page, add a callout block (type: "info") with: - Title: "Top 2 Most Effective Strategies" - Content: List the 2 most frequently used strategies from all sessions, each on a new line with format "• Strategy Name (used in X sessions)" **Structure Requirements:** - The page must have the exact title "Hyperfocus Analysis Report" - Each session section must start with a level 2 heading - All session details must be in bullet point format - The summary callout must be at the top of the page before any session details ================================================ FILE: tasks/notion/standard/self_assessment/hyperfocus_analysis_report/meta.json ================================================ { "task_id": "hyperfocus_analysis_report", "task_name": "Hyperfocus Analysis Report", "category_id": "self_assessment", "category_name": "Self Assessment", "description": "Create a hyperfocus analysis report by analyzing high-productivity sessions with challenges.", "author": "Xiangyan Liu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "conditional filtering", "data aggregation", "report generation", "visual formatting" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d", "stateOriginalUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d" } } ================================================ FILE: tasks/notion/standard/self_assessment/hyperfocus_analysis_report/verify.py ================================================ import sys import re from notion_client import Client from tasks.utils import notion_utils from collections import Counter def validate_comma_separated(text: str, expected_items: list) -> bool: """ Validates that a comma-separated list contains expected items (case-insensitive). 
""" if not text or not expected_items: return False # Extract items from text items = [item.strip().lower() for item in text.split(",")] expected_lower = [item.lower() for item in expected_items] # Check if all expected items are present for expected in expected_lower: if not any(expected in item or item in expected for item in items): return False return True def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the Hyperfocus Analysis Report has been created correctly. """ # Find the Self Assessment page self_assessment_page_id = main_id if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": self_assessment_page_id = found_id if not self_assessment_page_id: # Try to find by name self_assessment_page_id = notion_utils.find_page(notion, "Self Assessment") if not self_assessment_page_id: print("Error: Self Assessment page not found.", file=sys.stderr) return False # Find the Hyperfocus Analysis Report page report_page_id = None report_position = -1 callout_position = -1 divider_position = -1 children = notion.blocks.children.list(block_id=self_assessment_page_id).get( "results", [] ) for i, child in enumerate(children): # Track position of callout with "Why Use the Term" if child.get("type") == "callout": callout_text = notion_utils.get_block_plain_text(child) if "Why Use the Term" in callout_text and "Hyperfocus" in callout_text: callout_position = i # Track position of divider elif child.get("type") == "divider": if callout_position != -1 and divider_position == -1: divider_position = i # Find the report page elif child.get("type") == "child_page": page_data = notion.pages.retrieve(page_id=child["id"]) title_prop = ( page_data.get("properties", {}).get("title", {}).get("title", []) ) if ( title_prop and title_prop[0].get("plain_text") == "Hyperfocus Analysis Report" ): report_page_id = child["id"] report_position = i if not report_page_id: print("Error: 'Hyperfocus Analysis Report' page not found.", file=sys.stderr) return False # Verify position if callout_position == -1: print( "Error: Could not find 'Why Use the Term \"Hyperfocus\"?' callout.", file=sys.stderr, ) return False if divider_position == -1: print("Error: Could not find divider after the callout.", file=sys.stderr) return False if not (callout_position < report_position < divider_position): print( f"Error: Report page is not positioned between callout and divider. 
Positions: callout={callout_position}, report={report_position}, divider={divider_position}", file=sys.stderr, ) return False # Get all blocks from the report page all_blocks = notion_utils.get_all_blocks_recursively(notion, report_page_id) # Find the database in the Self Assessment page database_id = None for block in notion_utils.get_all_blocks_recursively( notion, self_assessment_page_id ): if block.get("type") == "child_database": db_data = notion.databases.retrieve(database_id=block["id"]) db_title = "".join( [t.get("plain_text", "") for t in db_data.get("title", [])] ) if "Hyperfocus Self-Assessment Worksheet" in db_title: database_id = block["id"] break if not database_id: print( "Error: Database 'Hyperfocus Self-Assessment Worksheet' not found.", file=sys.stderr, ) return False # Query database for sessions with >80% completion rate and challenges query_results = notion.databases.query( database_id=database_id, filter={ "and": [ {"property": "Work Completion Rate", "number": {"greater_than": 0.8}}, {"property": "Challenges", "multi_select": {"is_not_empty": True}}, ] }, ).get("results", []) if not query_results: print( "Warning: No sessions found with >80% completion rate and challenges.", file=sys.stderr, ) # Still check if the page structure is correct # Verify page structure has_callout = False has_top_strategies = False session_count = 0 found_sessions = {} # Track sessions by date for validation # Track strategies for validation - count from ALL sessions all_sessions = notion.databases.query(database_id=database_id).get("results", []) all_strategies = [] for session in all_sessions: strategies = ( session.get("properties", {}) .get("Key Strategies Used", {}) .get("multi_select", []) ) all_strategies.extend([s.get("name") for s in strategies]) strategy_counts = Counter(all_strategies) top_2_strategies = strategy_counts.most_common(2) # Build expected sessions from query results with all data expected_sessions = {} for result in query_results: date_prop = result.get("properties", {}).get("Date", {}).get("date", {}) activity_prop = ( result.get("properties", {}).get("Activity", {}).get("select", {}) ) if date_prop and date_prop.get("start") and activity_prop: date_str = date_prop["start"] activity_name = activity_prop.get("name", "") # Extract all session data for validation focus_factors = [ f.get("name", "") for f in result.get("properties", {}) .get("Focus Factors", {}) .get("multi_select", []) ] challenges = [ c.get("name", "") for c in result.get("properties", {}) .get("Challenges", {}) .get("multi_select", []) ] strategies = [ s.get("name", "") for s in result.get("properties", {}) .get("Key Strategies Used", {}) .get("multi_select", []) ] energy = result.get("properties", {}).get("Energy Level", {}).get("number") mood = result.get("properties", {}).get("Mood", {}).get("number") completion = ( result.get("properties", {}) .get("Work Completion Rate", {}) .get("number") ) expected_sessions[date_str] = { "activity": activity_name, "focus_factors": focus_factors, "challenges": challenges, "strategies": strategies, "energy": energy, "mood": mood, "completion": completion, } current_session_date = None current_session_data = None session_bullet_points = {} # Track bullet points for each session for i, block in enumerate(all_blocks): block_type = block.get("type") # Check for callout at the top if block_type == "callout" and i < 5: # Should be near the top callout_text = notion_utils.get_block_plain_text(block) if "Top 2 Most Effective Strategies" in callout_text: has_callout = 
True # Check if it contains strategy information s1, n1 = top_2_strategies[0] s2, n2 = top_2_strategies[1] t1 = f"{s1} (used in {n1} sessions)" t2 = f"{s2} (used in {n2} sessions)" if t1 in callout_text and t2 in callout_text: has_top_strategies = True break # Check for session headings with format YYYY-MM-DD Activity if block_type == "heading_2": heading_text = notion_utils.get_block_plain_text(block) # Check if heading matches expected format for date_str, session_data in expected_sessions.items(): activity = session_data["activity"] expected_heading = f"{date_str} {activity}" if expected_heading in heading_text: found_sessions[date_str] = session_data session_count += 1 current_session_date = date_str current_session_data = session_data session_bullet_points[date_str] = [] break # Check for bullet points with session details if block_type == "bulleted_list_item" and current_session_data: bullet_text = notion_utils.get_block_plain_text(block) # Track bullet points for current session if current_session_date: session_bullet_points[current_session_date].append(bullet_text) # Validate specific bullet point content if bullet_text.startswith("Focus factors"): content = bullet_text.split(":", 1)[1].strip() expected_factors = current_session_data.get("focus_factors", []) if not validate_comma_separated(content, expected_factors): print( f"Error: Focus factors mismatch for {current_session_date}. Expected: {expected_factors}, Found: {content}", file=sys.stderr, ) return False elif "Energy" in bullet_text and "Mood" in bullet_text: # Extract energy and mood values energy_match = re.search(r"Energy:\s*(\d+)/10", bullet_text) mood_match = re.search(r"Mood:\s*(\d+)/10", bullet_text) if energy_match and mood_match: found_energy = int(energy_match.group(1)) found_mood = int(mood_match.group(1)) expected_energy = current_session_data.get("energy") expected_mood = current_session_data.get("mood") if found_energy != expected_energy or found_mood != expected_mood: print( f"Error: Energy/Mood mismatch for {current_session_date}. Expected: Energy: {expected_energy}/10, Mood: {expected_mood}/10", file=sys.stderr, ) return False else: print( f"Error: Invalid Energy/Mood format for {current_session_date}", file=sys.stderr, ) return False elif bullet_text.startswith("Challenges"): content = bullet_text.split(":", 1)[1].strip() expected_challenges = current_session_data.get("challenges", []) if not validate_comma_separated(content, expected_challenges): print( f"Error: Challenges mismatch for {current_session_date}. Expected: {expected_challenges}, Found: {content}", file=sys.stderr, ) return False elif bullet_text.startswith("Strategies"): content = bullet_text.split(":", 1)[1].strip() expected_strategies = current_session_data.get("strategies", []) if len(expected_strategies) > 0 and not validate_comma_separated( content, expected_strategies ): print( f"Error: Strategies mismatch for {current_session_date}. Expected: {expected_strategies}, Found: {content}", file=sys.stderr, ) return False elif bullet_text.startswith("Completion"): # Extract completion percentage completion_match = re.search(r"Completion:\s*(\d+)%", bullet_text) if completion_match: found_completion = int(completion_match.group(1)) expected_completion = int( current_session_data.get("completion", 0) * 100 ) if found_completion != expected_completion: print( f"Error: Completion rate mismatch for {current_session_date}. 
Expected: {expected_completion}%, Found: {found_completion}%", file=sys.stderr, ) return False else: print( f"Error: Invalid completion format for {current_session_date}", file=sys.stderr, ) return False # Verify all sessions have complete bullet points for date_str, bullets in session_bullet_points.items(): bullets_text = " ".join(bullets) required_items = [ "Focus factors", "Energy:", "Mood:", "Challenges", "Strategies", "Completion", ] missing_items = [] for item in required_items: if item not in bullets_text: missing_items.append(item) if missing_items: print( f"Error: Missing bullet points for session {date_str}: {', '.join(missing_items)}", file=sys.stderr, ) return False # Verify all requirements if not has_callout: print( "Error: Missing callout block with 'Top 2 Most Effective Strategies'.", file=sys.stderr, ) return False if not has_top_strategies and len(top_2_strategies) > 0: print("Error: Callout doesn't contain strategy information.", file=sys.stderr) return False if query_results and session_count == 0: print("Error: No session sections found with proper headings.", file=sys.stderr) return False # Check if all expected sessions are present missing_sessions = [] for date_str in expected_sessions.keys(): if date_str not in found_sessions: missing_sessions.append(date_str) if missing_sessions: print( f"Error: Missing session sections for dates: {', '.join(missing_sessions)}", file=sys.stderr, ) return False if query_results and session_count < len(query_results): print( f"Warning: Found {session_count} session sections but expected {len(query_results)}.", file=sys.stderr, ) print( "Success: Hyperfocus Analysis Report created with proper structure and content." ) return True def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/self_assessment/numbered_list_emojis/description.md ================================================ Please find all numbered list items in the Self Assessment page and use Notion tools to replace the numbers with the corresponding emoji numbers (e.g., 1️⃣, 2️⃣, 3️⃣). For example, if the original numbered list is: 1. First step 2. Second step 3.
Third step It should become: 1️⃣ First step 2️⃣ Second step 3️⃣ Third step ================================================ FILE: tasks/notion/standard/self_assessment/numbered_list_emojis/meta.json ================================================ { "task_id": "numbered_list_emojis", "task_name": "Numbered List Emojis", "category_id": "self_assessment", "category_name": "Self Assessment", "description": "Replace numbered list items with corresponding emoji numbers for better visual formatting.", "author": "Xiangyan Liu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "visual formatting", "automated migration" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d", "stateOriginalUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d" } } ================================================ FILE: tasks/notion/standard/self_assessment/numbered_list_emojis/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that numbered lists have been replaced with emoji numbers. """ # Start from main_id if provided, otherwise search for the page self_assessment_page_id = main_id if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": self_assessment_page_id = found_id if not self_assessment_page_id: # Try to find by name self_assessment_page_id = notion_utils.find_page(notion, "Self Assessment") if not self_assessment_page_id: print("Error: Self Assessment page not found.", file=sys.stderr) return False # Get all blocks recursively from the main page all_blocks = notion_utils.get_all_blocks_recursively( notion, self_assessment_page_id ) # Find all numbered_list_item blocks numbered_list_items = [] for block in all_blocks: if block.get("type") == "numbered_list_item": numbered_list_items.append(block) if len(numbered_list_items) > 0: print( f"Error: found {len(numbered_list_items)} numbered list items that should be converted to emoji numbers", file=sys.stderr, ) # return False required_items = [ "1️⃣ Record Each Hyperfocus Session:", "2️⃣ Review and Reflect:", "3️⃣ Adjust and Optimize:", '1️⃣ Harvard Business Review: "The Making of a Corporate Athlete"', '2️⃣ "Hyperfocus: How to Be More Productive in a World of Distraction" by Chris Bailey', '3️⃣ "Attention Management: How to Create Success and Gain Productivity Every Day" by Maura Thomas', '4️⃣ "Deep Work: Rules for Focused Success in a Distracted World" by Cal Newport', "1️⃣ Record Each Hyperfocus Session:", "2️⃣ Review and Reflect:", "3️⃣ Adjust and Optimize:", "1️⃣ What time of day do you feel most focused?", "2️⃣ Which environment helps you concentrate the most?", "3️⃣ What type of tasks do you find yourself getting lost in?", ] # Make a copy to track which items we've found remaining_items = required_items.copy() # Iterate through all blocks to find matching text for block in all_blocks: block_text = notion_utils.get_block_plain_text(block).strip() # Check if this block's text matches any of our required items if block_text in remaining_items: remaining_items.remove(block_text) print(f"Found: {block_text}") # Check if all required items were found if len(remaining_items) == 0: print("Success: All numbered lists have been converted to emoji 
numbers") return True else: print(f"Error: Missing {len(remaining_items)} required items:", file=sys.stderr) for item in remaining_items: print(f" - {item}", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/standard_operating_procedure/deployment_process_sop/description.md ================================================ Using Notion Tools. Complete the SOP template (a notion page titled 'Standard Operating Procedure') by filling in all sections with comprehensive, interconnected content for a "Software Deployment Process" SOP, ensuring all cross-references, terminologies, and procedural steps are properly linked and validated. **Task Requirements:** 1. **Update the SOP header information** (in the left column): - Change the heading_1 "SOP Title" text to "Software Deployment Process" - Update the paragraph "Created 2023-10-25" to "Created 2025-01-19" - Update the paragraph "Responsible department:" to "Responsible department: DevOps Engineering Team" - Update the People team page's callout to: "DevOps Engineering Team Wiki - Contains team contact information, escalation procedures, and deployment schedules. Access required for all deployment activities." 2. **Fill the Purpose section** with exactly this content: - Replace the placeholder paragraph (starts with "↓ Summarize the procedure") with: "This SOP defines the standardized process for deploying software applications to production environments, ensuring zero-downtime deployments, proper rollback procedures, and compliance with security protocols. This procedure applies to all production deployments and must be followed by all engineering teams." 3. **Complete the Context section** with: - Replace the placeholder paragraph (starts with "↓ Add any related and useful information") with: "Software deployments are critical operations that can impact system availability and user experience. This process has been developed based on industry best practices and our incident response learnings from Q3 2023. All deployments must go through automated testing pipelines and require approval from designated reviewers." - Update all THREE child_pages under the "Relevant Docs" toggle: - First child_page callout (Contacting IT): "Change Management Policy (SOP-001) - Defines approval workflows and change review processes for all production modifications." - Second child_page callout (Team lunches): "Incident Response Procedures (SOP-003) - Emergency procedures for handling deployment failures and system outages." - Third child_page callout (Sending swag): "Security Compliance Guidelines (SOP-007) - Security requirements and validation steps for production deployments." 4. 
**Define comprehensive Terminologies** by: - Replace the placeholder paragraph (starts with "↓ Add any unfamiliar or domain specific words") with: "Essential deployment terminology for team understanding:" - Replace the existing bulleted_list_item "Term: The definition of the term" with these four exact items: - "Blue-Green Deployment: A deployment strategy that maintains two identical production environments" - "Rollback Window: The maximum time allowed to revert a deployment (30 minutes)" - "Smoke Test: Initial verification tests run immediately after deployment" - "Production Gateway: The approval checkpoint before production release" 5. **Populate Tools section** with: - Replace the placeholder paragraph (starts with "↓ Add any relevant tools") with: "Critical tools required for deployment operations:" - Update the TWO existing child_pages: - First child_page callout: "Jenkins CI/CD Pipeline - Primary deployment automation tool with integrated testing and approval workflows. Required for all automated deployments." - Second child_page callout: "Kubernetes Dashboard - Container orchestration monitoring and management interface for deployment verification and rollback operations." 6. **Complete Roles & responsibilities** with: - Replace the placeholder paragraph (starts with "↓ Define who will be executing") with: "The following roles are essential for successful deployment execution:" - Replace the existing empty bulleted_list_item with these four exact items: - "DevOps Engineer: Executes deployment, monitors system health, initiates rollbacks if needed" - "Lead Developer: Reviews code changes, approves deployment package, validates functionality" - "QA Engineer: Verifies smoke tests, confirms user acceptance criteria" - "Security Officer: Validates security compliance, approves security-sensitive deployments" 7. **Create detailed Procedure section** with: - Replace the placeholder paragraph (starts with "↓ Create a step by step procedure") with: "Follow these steps in sequence. Do not skip steps or perform them out of order." 
- Replace the THREE existing numbered_list_items with: - "Pre-deployment: Verify all automated tests pass, obtain required approvals from Lead Developer and Security Officer, confirm rollback plan is documented and tested" - "Deployment execution: Deploy to staging environment first, run comprehensive smoke tests, obtain final Production Gateway approval, deploy to production using blue-green strategy" - "Post-deployment: Monitor system metrics for minimum 30 minutes, validate all functionality using automated tests, document deployment results in change log, notify all stakeholders via deployment notification system" ================================================ FILE: tasks/notion/standard/standard_operating_procedure/deployment_process_sop/meta.json ================================================ { "task_id": "deployment_process_sop", "task_name": "Deployment Process SOP", "category_id": "standard_operating_procedure", "category_name": "Standard Operating Procedure", "description": "Complete the SOP template with comprehensive content for a Software Deployment Process with interconnected sections.", "author": "Xiangyan Liu", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "template population", "cross-reference linking", "content organization", "visual formatting" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Standard-Operating-Procedure-24381626b6d780a8b678f9e62ae5b152", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/standard-operating-procedure" } } ================================================ FILE: tasks/notion/standard/standard_operating_procedure/deployment_process_sop/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies comprehensive SOP template completion with exact content matching. 
""" page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id( notion, main_id ) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "Standard Operating Procedure") if not page_id: print("Error: Page 'Standard Operating Procedure' not found.", file=sys.stderr) return False all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) verification_results = [] # Check 1: Verify SOP header information updates sop_title_found = False created_date_found = False responsible_dept_found = False header_callout_found = False for block in all_blocks: if block.get("type") == "heading_1": heading_text = notion_utils.get_block_plain_text(block) if "Software Deployment Process" in heading_text: sop_title_found = True verification_results.append("✅ SOP Title updated correctly") elif block.get("type") == "paragraph": para_text = notion_utils.get_block_plain_text(block) if "Created 2025-01-19" in para_text: created_date_found = True verification_results.append("✅ Created date updated correctly") elif "Responsible department: DevOps Engineering Team" in para_text: responsible_dept_found = True verification_results.append( "✅ Responsible department updated correctly" ) elif block.get("type") == "child_page": # Check child pages recursively for callout content - specifically the People team page try: child_page_info = notion.pages.retrieve(page_id=block["id"]) child_page_title = "" if ( "properties" in child_page_info and "title" in child_page_info["properties"] ): title_list = child_page_info["properties"]["title"].get("title", []) if title_list: child_page_title = title_list[0].get("plain_text", "") except: child_page_title = "" child_blocks = notion_utils.get_all_blocks_recursively(notion, block["id"]) for child_block in child_blocks: if child_block.get("type") == "callout": callout_text = notion_utils.get_block_plain_text(child_block) # Look for the People team page with the DevOps Engineering Team Wiki callout if ( "DevOps Engineering Team Wiki" in callout_text and "deployment schedules" in callout_text and "deployment activities" in callout_text ): header_callout_found = True verification_results.append( "✅ Header People team page callout updated correctly" ) # Check 2: Verify Purpose section content purpose_found = False expected_purpose = "This SOP defines the standardized process for deploying software applications to production environments" for i, block in enumerate(all_blocks): if block.get("type") == "heading_2": heading_text = notion_utils.get_block_plain_text(block) if "Purpose" in heading_text: # Check next paragraph after Purpose heading for j in range(i + 1, min(i + 5, len(all_blocks))): next_block = all_blocks[j] if next_block.get("type") == "paragraph": para_text = notion_utils.get_block_plain_text(next_block) if ( expected_purpose in para_text and "engineering teams" in para_text ): purpose_found = True verification_results.append( "✅ Purpose section content updated correctly" ) break break # Check 3: Verify Context section and child_page callouts context_found = False child_pages_updated = 0 expected_context = "Software deployments are critical operations that can impact system availability" expected_child_callouts = [ ( "Change Management Policy (SOP-001)", "Defines approval workflows and change review processes for all production modifications", "Contacting IT", ), ( "Incident Response Procedures (SOP-003)", "Emergency procedures for handling deployment failures and system outages", 
"Team lunches", ), ( "Security Compliance Guidelines (SOP-007)", "Security requirements and validation steps for production deployments", "Sending swag", ), ] for i, block in enumerate(all_blocks): if block.get("type") == "heading_2": heading_text = notion_utils.get_block_plain_text(block) if "Context" in heading_text: # Check paragraph content for j in range(i + 1, min(i + 10, len(all_blocks))): next_block = all_blocks[j] if next_block.get("type") == "paragraph": para_text = notion_utils.get_block_plain_text(next_block) if expected_context in para_text and "Q3 2023" in para_text: context_found = True elif next_block.get("type") == "toggle": # Check child pages under toggle toggle_blocks = notion_utils.get_all_blocks_recursively( notion, next_block["id"] ) for toggle_child in toggle_blocks: if toggle_child.get("type") == "child_page": # Get the child page title to match with expected callouts try: child_page_info = notion.pages.retrieve( page_id=toggle_child["id"] ) child_page_title = "" if ( "properties" in child_page_info and "title" in child_page_info["properties"] ): title_list = child_page_info["properties"][ "title" ].get("title", []) if title_list: child_page_title = title_list[0].get( "plain_text", "" ) except: child_page_title = "" child_blocks = notion_utils.get_all_blocks_recursively( notion, toggle_child["id"] ) for child_block in child_blocks: if child_block.get("type") == "callout": callout_text = ( notion_utils.get_block_plain_text( child_block ) ) for ( expected_title, expected_content, expected_page_title, ) in expected_child_callouts: if ( expected_title in callout_text and expected_content in callout_text and expected_page_title in child_page_title ): child_pages_updated += 1 verification_results.append( f"✅ Context child_page '{expected_page_title}' updated correctly" ) break if context_found: verification_results.append("✅ Context section content updated correctly") if child_pages_updated == 3: verification_results.append( "✅ All 3 Context child_page callouts updated correctly" ) else: verification_results.append( f"❌ Only {child_pages_updated}/3 Context child_page callouts updated correctly (Contacting IT, Team lunches, Sending swag)" ) # Check 4: Verify Terminologies section with exact 4 bulleted items terminologies_found = False terminology_items = [] expected_terminologies = [ "Blue-Green Deployment: A deployment strategy that maintains two identical production environments", "Rollback Window: The maximum time allowed to revert a deployment (30 minutes)", "Smoke Test: Initial verification tests run immediately after deployment", "Production Gateway: The approval checkpoint before production release", ] for i, block in enumerate(all_blocks): if block.get("type") == "heading_2": heading_text = notion_utils.get_block_plain_text(block) if "Terminologies" in heading_text: # Check for intro paragraph for j in range(i + 1, min(i + 2, len(all_blocks))): if all_blocks[j].get("type") == "paragraph": para_text = notion_utils.get_block_plain_text(all_blocks[j]) if "Essential deployment terminology" in para_text: terminologies_found = True break # Check bulleted list items for j in range(i + 1, min(i + 10, len(all_blocks))): next_block = all_blocks[j] if next_block.get("type") == "bulleted_list_item": item_text = notion_utils.get_block_plain_text(next_block) terminology_items.append(item_text) elif next_block.get("type") in [ "heading_1", "heading_2", "heading_3", ]: break break terminology_matches = sum( 1 for expected in expected_terminologies if any(expected in item for item 
in terminology_items) ) if terminologies_found and len(terminology_items) == 4 and terminology_matches == 4: verification_results.append( "✅ Terminologies section with exactly 4 correct items" ) else: verification_results.append( f"❌ Terminologies: expected 4 items, found {len(terminology_items)}, {terminology_matches} correct" ) # Check 5: Verify Tools section with 2 child_page callouts tools_found = False tools_child_pages = 0 expected_tools = [ ("Jenkins CI/CD Pipeline", "automated deployments"), ("Kubernetes Dashboard", "rollback operations"), ] for i, block in enumerate(all_blocks): if block.get("type") == "heading_2": heading_text = notion_utils.get_block_plain_text(block) if "Tools" in heading_text: # Check intro paragraph for j in range(i + 1, min(i + 2, len(all_blocks))): if all_blocks[j].get("type") == "paragraph": para_text = notion_utils.get_block_plain_text(all_blocks[j]) if "Critical tools required" in para_text: tools_found = True break # Check child pages for j in range(i + 1, min(i + 10, len(all_blocks))): next_block = all_blocks[j] if next_block.get("type") == "child_page": child_blocks = notion_utils.get_all_blocks_recursively( notion, next_block["id"] ) for child_block in child_blocks: if child_block.get("type") == "callout": callout_text = notion_utils.get_block_plain_text( child_block ) for expected_title, expected_content in expected_tools: if ( expected_title in callout_text and expected_content in callout_text ): tools_child_pages += 1 break elif next_block.get("type") in [ "heading_1", "heading_2", "heading_3", ]: break break if tools_found and tools_child_pages == 2: verification_results.append( "✅ Tools section with 2 correctly updated child_page callouts" ) else: verification_results.append( f"❌ Tools section: expected 2 child_pages updated, found {tools_child_pages}" ) # Check 6: Verify Roles & responsibilities with exactly 4 bulleted items roles_found = False role_items = [] expected_roles = [ "DevOps Engineer: Executes deployment, monitors system health, initiates rollbacks if needed", "Lead Developer: Reviews code changes, approves deployment package, validates functionality", "QA Engineer: Verifies smoke tests, confirms user acceptance criteria", "Security Officer: Validates security compliance, approves security-sensitive deployments", ] for i, block in enumerate(all_blocks): if block.get("type") == "heading_2": heading_text = notion_utils.get_block_plain_text(block) if "Roles" in heading_text and "responsibilities" in heading_text: # Check intro paragraph for j in range(i + 1, min(i + 2, len(all_blocks))): if all_blocks[j].get("type") == "paragraph": para_text = notion_utils.get_block_plain_text(all_blocks[j]) if "essential for successful deployment execution" in para_text: roles_found = True break # Check bulleted list items for j in range(i + 1, min(i + 10, len(all_blocks))): next_block = all_blocks[j] if next_block.get("type") == "bulleted_list_item": item_text = notion_utils.get_block_plain_text(next_block) role_items.append(item_text) elif next_block.get("type") in [ "heading_1", "heading_2", "heading_3", ]: break break role_matches = sum( 1 for expected in expected_roles if any(expected in item for item in role_items) ) if roles_found and len(role_items) == 4 and role_matches == 4: verification_results.append( "✅ Roles & responsibilities section with exactly 4 correct items" ) else: verification_results.append( f"❌ Roles section: expected 4 items, found {len(role_items)}, {role_matches} correct" ) # Check 7: Verify Procedure section with exactly 3 
numbered items procedure_found = False procedure_items = [] expected_procedures = [ ("Pre-deployment", "Lead Developer and Security Officer", "rollback plan"), ("Deployment execution", "staging environment first", "blue-green strategy"), ( "Post-deployment", "minimum 30 minutes", "stakeholders via deployment notification", ), ] for i, block in enumerate(all_blocks): if block.get("type") == "heading_2": heading_text = notion_utils.get_block_plain_text(block) if "Procedure" in heading_text: # Check intro paragraph for j in range(i + 1, min(i + 2, len(all_blocks))): if all_blocks[j].get("type") == "paragraph": para_text = notion_utils.get_block_plain_text(all_blocks[j]) if "Follow these steps in sequence" in para_text: procedure_found = True break # Check numbered list items for j in range(i + 1, min(i + 10, len(all_blocks))): next_block = all_blocks[j] if next_block.get("type") == "numbered_list_item": item_text = notion_utils.get_block_plain_text(next_block) procedure_items.append(item_text) elif next_block.get("type") in [ "heading_1", "heading_2", "heading_3", ]: break break procedure_matches = 0 for item_text in procedure_items: for expected_title, expected_content1, expected_content2 in expected_procedures: if ( expected_title in item_text and expected_content1 in item_text and expected_content2 in item_text ): procedure_matches += 1 break if procedure_found and len(procedure_items) == 3 and procedure_matches == 3: verification_results.append("✅ Procedure section with exactly 3 correct items") else: verification_results.append( f"❌ Procedure: expected 3 items, found {len(procedure_items)}, {procedure_matches} correct" ) # Calculate overall success total_checks = 14 # Number of major verification points successful_checks = sum( 1 for result in verification_results if result.startswith("✅") ) # Print all verification results print("\n=== SOP Template Verification Results ===", file=sys.stderr) for result in verification_results: print(result, file=sys.stderr) print(f"\n=== Summary: {successful_checks}/{total_checks} checks passed ===") # Must pass ALL checks to succeed success = ( sop_title_found and created_date_found and responsible_dept_found and header_callout_found and purpose_found and context_found and child_pages_updated == 3 and terminologies_found and len(terminology_items) == 4 and terminology_matches == 4 and tools_found and tools_child_pages == 2 and roles_found and len(role_items) == 4 and role_matches == 4 and procedure_found and len(procedure_items) == 3 and procedure_matches == 3 ) if success: print("\n🎉 SUCCESS: All SOP template requirements completed correctly!") return True else: print( f"\n❌ FAILURE: SOP template verification failed. {successful_checks}/{total_checks} requirements met.", file=sys.stderr, ) return False def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/standard_operating_procedure/section_organization/description.md ================================================ # Task: Reorganize Standard Operating Procedure Page Sections ## Objective Modify the structure of the Standard Operating Procedure page in Notion by reorganizing sections through swapping and creating a column layout. 
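For orientation, the column layout called for in Step 2 below can be built with the Notion block API. The following is a minimal sketch using the `notion-client` package (the same client library the verify scripts use); the token, page ID, and placeholder children are hypothetical, and in the real task the existing "Tools" and "Terminologies" blocks are moved into the columns rather than recreated. It assumes the API version in use allows a `column_list` block to be appended with nested `column` children.

```python
from notion_client import Client  # the same client library used by the verify scripts

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
sop_page_id = "<standard-operating-procedure-page-id>"  # hypothetical page ID

def text_block(block_type: str, content: str) -> dict:
    """Build a simple rich-text block of the given type."""
    return {
        "type": block_type,
        block_type: {"rich_text": [{"type": "text", "text": {"content": content}}]},
    }

# Append a two-column layout: "Tools" on the left, "Terminologies" on the right.
notion.blocks.children.append(
    block_id=sop_page_id,
    children=[
        {
            "type": "column_list",
            "column_list": {
                "children": [
                    {"type": "column", "column": {"children": [text_block("heading_2", "Tools")]}},
                    {"type": "column", "column": {"children": [text_block("heading_2", "Terminologies")]}},
                ]
            },
        }
    ],
)
```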
## Requirements ### Step 1: Swap Sections - Navigate to the Standard Operating Procedure page - Swap the positions of the "Terminologies" and "Roles & responsibilities" sections - Preserve all content within each section exactly as is - Maintain the original formatting and structure of each section ### Step 2: Create Column Layout - After swapping, arrange the "Tools" section and the section immediately below it ("Terminologies") into a 2-column layout - Position the "Tools" section in the left column - Position the "Terminologies" section in the right column - In the "Tools" column, add links to the Notion and Figma pages using appropriate reference blocks - Preserve the original child pages from the "Tools" section in a toggle block placed below the column layout, with the toggle titled "original pages" ================================================ FILE: tasks/notion/standard/standard_operating_procedure/section_organization/meta.json ================================================ { "task_id": "section_organization", "task_name": "Section Organization", "category_id": "standard_operating_procedure", "category_name": "Standard Operating Procedure", "description": "Reorganize the Standard Operating Procedure page by swapping sections and creating a column layout.", "author": "Xiangyan Liu", "created_at": "2025-08-11", "difficulty": "L3", "tags": [ "content organization", "cross-reference linking", "visual formatting" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Standard-Operating-Procedure-24381626b6d780a8b678f9e62ae5b152", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/standard-operating-procedure" } } ================================================ FILE: tasks/notion/standard/standard_operating_procedure/section_organization/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the Standard Operating Procedure page has been reorganized correctly. """ # Step 1: Find the Standard Operating Procedure page if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if not found_id or object_type != 'page': print("Error: Standard Operating Procedure page not found.", file=sys.stderr) return False else: # Try to find the page by searching found_id = notion_utils.find_page(notion, "Standard Operating Procedure") if not found_id: print("Error: Standard Operating Procedure page not found.", file=sys.stderr) return False print(f"Found Standard Operating Procedure page: {found_id}") # Get all blocks from the page all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id) print(f"Found {len(all_blocks)} blocks") print("Starting verification...") # Step 2: Verify the structure and section order print("2. 
Checking page structure and section order...") # Expected structure after the initial content and dividers # We'll look for main sections by their headings roles_index = None tools_column_index = None toggle_index = None procedure_index = None for i, block in enumerate(all_blocks): if block.get("type") == "heading_2": heading_text = "" rich_text = block.get("heading_2", {}).get("rich_text", []) if rich_text: heading_text = rich_text[0].get("text", {}).get("content", "") if heading_text == "Roles & responsibilities": roles_index = i print(f"✓ Found 'Roles & responsibilities' section at index {i}") elif heading_text == "Procedure": procedure_index = i print(f"✓ Found 'Procedure' section at index {i}") # Check for column_list (containing Tools and Terminologies) for i, block in enumerate(all_blocks): if block.get("type") == "column_list": # Check if this is the right column_list (should be after Roles & responsibilities) if roles_index and i > roles_index: tools_column_index = i print(f"✓ Found column_list at index {i}") break # Check for toggle block with "original pages" for i, block in enumerate(all_blocks): if block.get("type") == "toggle": toggle_text = "" rich_text = block.get("toggle", {}).get("rich_text", []) if rich_text: toggle_text = rich_text[0].get("text", {}).get("content", "") if toggle_text.lower() == "original pages": toggle_index = i print(f"✓ Found 'original pages' toggle at index {i}") break # Step 3: Verify section order print("3. Verifying section order...") if roles_index is None: print("Error: 'Roles & responsibilities' section not found.", file=sys.stderr) return False if tools_column_index is None: print("Error: Column layout not found.", file=sys.stderr) return False if toggle_index is None: print("Error: 'original pages' toggle not found.", file=sys.stderr) return False if procedure_index is None: print("Error: 'Procedure' section not found.", file=sys.stderr) return False # Verify order: Roles & responsibilities < column_list < toggle < Procedure if not (roles_index < tools_column_index < toggle_index < procedure_index): print("Error: Sections are not in the correct order.", file=sys.stderr) print(f" Expected order: Roles & responsibilities ({roles_index}) < column_list ({tools_column_index}) < toggle ({toggle_index}) < Procedure ({procedure_index})", file=sys.stderr) return False print("✓ Sections are in the correct order") # Step 4: Verify column_list structure print("4. 
Verifying column layout structure...") column_list_block = all_blocks[tools_column_index] column_list_id = column_list_block.get("id") # Get direct children of column_list (should be columns only) try: column_response = notion.blocks.children.list(block_id=column_list_id) column_children = column_response.get("results", []) except Exception as e: print(f"Error getting column children: {e}", file=sys.stderr) return False if len(column_children) < 2: print(f"Error: Column list should have at least 2 columns, found {len(column_children)}.", file=sys.stderr) return False # Verify left column (Tools) left_column = column_children[0] if left_column.get("type") != "column": print("Error: First child of column_list should be a column.", file=sys.stderr) return False left_column_id = left_column.get("id") left_column_blocks = notion_utils.get_all_blocks_recursively(notion, left_column_id) # Check for Tools heading and link_to_page blocks in left column tools_heading_found = False link_to_page_count = 0 for block in left_column_blocks: if block.get("type") == "heading_2": heading_text = block.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "") if heading_text == "Tools": tools_heading_found = True print("✓ Found 'Tools' heading in left column") elif block.get("type") == "link_to_page": link_to_page_count += 1 if not tools_heading_found: print("Error: 'Tools' heading not found in left column.", file=sys.stderr) return False # Check for link_to_page blocks in Tools column if link_to_page_count < 2: print(f"Error: Tools column should have at least 2 link_to_page blocks, found {link_to_page_count}.", file=sys.stderr) return False print(f"✓ Found {link_to_page_count} link_to_page blocks in Tools column") # Verify right column (Terminologies) right_column = column_children[1] if right_column.get("type") != "column": print("Error: Second child of column_list should be a column.", file=sys.stderr) return False right_column_id = right_column.get("id") right_column_blocks = notion_utils.get_all_blocks_recursively(notion, right_column_id) # Check for Terminologies heading in right column terminologies_heading_found = False for block in right_column_blocks: if block.get("type") == "heading_2": heading_text = block.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "") if heading_text == "Terminologies": terminologies_heading_found = True print("✓ Found 'Terminologies' heading in right column") break if not terminologies_heading_found: print("Error: 'Terminologies' heading not found in right column.", file=sys.stderr) return False # Step 5: Verify toggle block content print("5. 
Verifying toggle block content...") toggle_block = all_blocks[toggle_index] toggle_id = toggle_block.get("id") # Get direct children of toggle try: toggle_response = notion.blocks.children.list(block_id=toggle_id) toggle_children = toggle_response.get("results", []) except Exception as e: print(f"Error getting toggle children: {e}", file=sys.stderr) return False # Check for child_page blocks (Notion and Figma) notion_page_found = False figma_page_found = False for block in toggle_children: if block.get("type") == "child_page": title = block.get("child_page", {}).get("title", "") if title == "Notion": notion_page_found = True print("✓ Found 'Notion' child page in toggle") elif title == "Figma": figma_page_found = True print("✓ Found 'Figma' child page in toggle") if not notion_page_found: print("Error: 'Notion' child page not found in toggle block.", file=sys.stderr) return False if not figma_page_found: print("Error: 'Figma' child page not found in toggle block.", file=sys.stderr) return False # Step 6: Verify that original sections no longer exist at top level print("6. Verifying original sections have been removed from top level...") # Check that there's no standalone "Terminologies" heading before "Roles & responsibilities" for i in range(0, roles_index if roles_index else len(all_blocks)): block = all_blocks[i] if block.get("type") == "heading_2": heading_text = block.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "") if heading_text == "Terminologies": print("Error: 'Terminologies' section found before 'Roles & responsibilities'.", file=sys.stderr) return False # Check that there's no standalone "Tools" heading outside the column tools_outside_column = False for i, block in enumerate(all_blocks): if i == tools_column_index: continue # Skip the column_list itself if block.get("type") == "heading_2": heading_text = block.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "") if heading_text == "Tools" and i != tools_column_index: # Check if this is NOT inside the column parent_id = block.get("parent", {}).get("block_id") if parent_id != left_column_id: tools_outside_column = True break if tools_outside_column: print("Error: Standalone 'Tools' section found outside column layout.", file=sys.stderr) return False print("✓ Original sections have been properly reorganized") # Step 7: Final summary print("\n7. Final verification summary:") print("✓ 'Roles & responsibilities' and 'Terminologies' sections have been swapped") print("✓ 'Tools' and 'Terminologies' are in a 2-column layout") print("✓ Links to Notion and Figma pages are in the Tools column") print("✓ Original child pages are preserved in 'original pages' toggle") print("✓ Page structure is correct") print("\n✅ All verification checks passed!") return True def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/team_projects/priority_tasks_table/description.md ================================================ Hi! In my Team Projects page, please create a five-column table block that lists all tasks meeting either of the following conditions: 1. The progress is 50% or less, or 2. The task has priority P0 but is not yet completed (i.e., progress not at 100%). 
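For reference, a minimal sketch of how these two conditions might be expressed as one compound query with the `notion-client` package is shown below; the `Progress`, `Priority`, and `End Date` property names and types are assumptions for illustration, not taken from the actual database schema, and the token and database ID are placeholders.

```python
from notion_client import Client  # the same client library used by the verify scripts

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
projects_db_id = "<projects-database-id>"  # hypothetical ID of the "Projects" database

# Either progress <= 50%, or priority P0 with progress below 100%,
# sorted by the expected end date in ascending order.
tasks = notion.databases.query(
    database_id=projects_db_id,
    filter={
        "or": [
            {"property": "Progress", "number": {"less_than_or_equal_to": 0.5}},
            {
                "and": [
                    {"property": "Priority", "select": {"equals": "P0"}},
                    {"property": "Progress", "number": {"less_than": 1}},
                ]
            },
        ]
    },
    sorts=[{"property": "End Date", "direction": "ascending"}],
).get("results", [])
```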
You should query this information from the existing “Projects” database. In the newly created table, each row should represent one task, and all information should be stored as plain text (not relations, formulas, or linked properties). In the newly created table: • Each row should represent one task • All fields should be stored as plain text (not relations, formulas, or linked properties) • The table should be sorted by expected end date (End Date) in ascending order, so that the first entry is the one with the earliest end date The table should include the following headers: • Project • Eng Hours • Progress • Start Date • End Date Please make sure all relevant tasks are included. Thank you! ================================================ FILE: tasks/notion/standard/team_projects/priority_tasks_table/meta.json ================================================ { "task_id": "priority_tasks_table", "task_name": "Priority Tasks Table", "category_id": "team_projects", "category_name": "Team Projects", "description": "Create a five-column table listing tasks with 50% or less progress or P0 priority tasks not completed.", "author": "Zijian Wu", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "conditional filtering", "database manipulation", "data aggregation", "visual formatting" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Team-Projects-24e81626b6d7809c982fdb7a25825898", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/gantt-chart" } } ================================================ FILE: tasks/notion/standard/team_projects/priority_tasks_table/verify.py ================================================ import sys from datetime import datetime from notion_client import Client from tasks.utils import notion_utils EXPECTED_HEADERS = ["Project", "Eng Hours", "Progress", "Start Date", "End Date"] EXPECTED_ROWS = [ { "Project": "Improve response times for support requests", "Eng Hours": 100, "Progress": 0.33, # 33% "Start Date": "2024-10-30", "End Date": "2024-11-17", }, { "Project": "Add a new social media integration", "Eng Hours": 200, "Progress": 0.40, # 40% "Start Date": "2024-11-07", "End Date": "2024-11-25", }, { "Project": "Integrate with a popular third-party service", "Eng Hours": 250, "Progress": 0.20, # 20% "Start Date": "2024-11-10", "End Date": "2024-11-18", }, { "Project": "Create customer knowledge base", "Eng Hours": 150, "Progress": 0.40, # 40% "Start Date": "2024-11-19", "End Date": "2024-11-25", }, { "Project": "Redesign the onboarding process", "Eng Hours": 300, "Progress": 0.75, # 75% "Start Date": "2024-11-20", "End Date": "2024-12-04", }, { "Project": "Publish support knowledge base", "Eng Hours": None, # N/A "Progress": 0.0, # 0% "Start Date": "2024-11-27", "End Date": "2024-11-29", }, ] # Sort the expected rows by End Date so we can directly compare order EXPECTED_ROWS.sort(key=lambda r: r["End Date"]) def _plain_text_from_cell(cell): """Concatenate plain_text from a single cell (list of rich_text).""" return "".join(rt.get("plain_text", "") for rt in cell).strip() def _parse_progress(value: str): """Convert a progress string like '40%', '40.0 %', '0.4' to float in range 0-1.""" value = value.strip() if not value: return None has_percent = "%" in value # Remove percent sign and any spaces value = value.replace("%", "").strip() try: num = float(value) if has_percent or num > 1: num /= 100.0 return num except ValueError: return None def _parse_eng_hours(value: 
str): value = value.strip().lower() if value in {"n/a", "na", "", "—", "-"}: return None try: return float(value) except ValueError: return None def _parse_date(value: str): value = value.strip() try: return datetime.strptime(value, "%Y-%m-%d").date() except ValueError: return None def verify(notion: Client, main_id: str = None) -> bool: """Verify that the last table in the 'Team Projects' page matches EXPECTED_ROWS and headers.""" page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if found_id and object_type == 'page': page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "Team Projects") if not page_id: print("Error: Page 'Team Projects' not found.", file=sys.stderr) return False # Fetch all blocks to locate table blocks blocks = notion_utils.get_all_blocks_recursively(notion, page_id) table_blocks = [b for b in blocks if b.get("type") == "table"] if not table_blocks: print("Error: No table blocks found in 'Team Projects' page.", file=sys.stderr) return False table_block = table_blocks[-1] # Use the last table block table_id = table_block["id"] # Retrieve table rows rows = notion.blocks.children.list(block_id=table_id).get("results", []) if not rows: print("Error: Table block has no rows.", file=sys.stderr) return False # Validate headers header_cells = rows[0].get("table_row", {}).get("cells", []) headers = [_plain_text_from_cell(c) for c in header_cells] if headers != EXPECTED_HEADERS: print(f"Error: Table headers mismatch. Found {headers}, expected {EXPECTED_HEADERS}.", file=sys.stderr) return False # Parse data rows data_rows = [] for r in rows[1:]: cells = r.get("table_row", {}).get("cells", []) if len(cells) < 5: continue # Skip malformed rows project = _plain_text_from_cell(cells[0]) eng_hours_raw = _plain_text_from_cell(cells[1]) progress_raw = _plain_text_from_cell(cells[2]) start_raw = _plain_text_from_cell(cells[3]) end_raw = _plain_text_from_cell(cells[4]) row_dict = { "Project": project, "Eng Hours": _parse_eng_hours(eng_hours_raw), "Progress": _parse_progress(progress_raw), "Start Date": start_raw.strip(), "End Date": end_raw.strip(), } data_rows.append(row_dict) if len(data_rows) != len(EXPECTED_ROWS): print(f"Error: Expected {len(EXPECTED_ROWS)} data rows, found {len(data_rows)}.", file=sys.stderr) return False # Verify sorting by End Date ascending parsed_end_dates = [_parse_date(r["End Date"]) for r in data_rows] if any(d is None for d in parsed_end_dates): print("Error: One or more End Date values could not be parsed.", file=sys.stderr) return False if parsed_end_dates != sorted(parsed_end_dates): print("Error: Table is not sorted by End Date ascending.", file=sys.stderr) return False # Create mapping from project -> row for comparison data_map = {r["Project"]: r for r in data_rows} for expected in EXPECTED_ROWS: proj = expected["Project"] if proj not in data_map: print(f"Error: Project '{proj}' not found in table.", file=sys.stderr) return False actual = data_map[proj] # Compare Eng Hours expected_hours = expected["Eng Hours"] actual_hours = actual["Eng Hours"] if expected_hours is None: if actual_hours is not None: print(f"Error: Eng Hours for '{proj}' expected to be empty/N\u204aA but found '{actual_hours}'.", file=sys.stderr) return False else: if actual_hours is None or abs(actual_hours - expected_hours) > 1e-2: print(f"Error: Eng Hours for '{proj}' mismatch. 
Expected {expected_hours}, found {actual_hours}.", file=sys.stderr) return False # Compare Progress with tolerance expected_progress = expected["Progress"] actual_progress = actual["Progress"] if actual_progress is None or abs(actual_progress - expected_progress) > 1e-2: print(f"Error: Progress for '{proj}' mismatch. Expected {expected_progress}, found {actual_progress}.", file=sys.stderr) return False # Compare Start and End Dates (string equality) if actual["Start Date"] != expected["Start Date"]: print(f"Error: Start Date for '{proj}' mismatch. Expected {expected['Start Date']}, found {actual['Start Date']}.", file=sys.stderr) return False if actual["End Date"] != expected["End Date"]: print(f"Error: End Date for '{proj}' mismatch. Expected {expected['End Date']}, found {actual['End Date']}.", file=sys.stderr) return False print("Success: Verified table block contents and order successfully.") return True def main(): """Execute verification and exit with status code.""" notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/team_projects/swap_tasks/description.md ================================================ Go to the Team Projects page, find the person responsible for the most tasks and the person responsible for the fewest tasks, then swap their assigned tasks. ================================================ FILE: tasks/notion/standard/team_projects/swap_tasks/meta.json ================================================ { "task_id": "swap_tasks", "task_name": "Swap Tasks", "category_id": "team_projects", "category_name": "Team Projects", "description": "Find the person responsible for the most and fewest tasks, then swap their assigned tasks.", "author": "Xiangyan Liu", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "data aggregation", "automated migration", "conditional filtering" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Team-Projects-24e81626b6d7809c982fdb7a25825898", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/gantt-chart" } } ================================================ FILE: tasks/notion/standard/team_projects/swap_tasks/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the task assignees have been swapped correctly. Checks: 1. "Develop a plan for promotion" and "Evaluate different third-party services" have swapped assignees 2. 
The person with most tasks and person with least tasks have swapped all their tasks """ # Step 1: Find the Team Projects page if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if not found_id or object_type != 'page': print("Error: Team Projects page not found.", file=sys.stderr) return False else: # Try to find the page by searching found_id = notion_utils.find_page(notion, "Team Projects") if not found_id: print("Error: Team Projects page not found.", file=sys.stderr) return False # Get all blocks from the page to find database references all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id) # Find Tasks database ID from the page tasks_db_id = None for block in all_blocks: if block and block.get("type") == "child_database": db_title = block.get("child_database", {}).get("title", "") if "Tasks" in db_title: tasks_db_id = block["id"] break if not tasks_db_id: print("Error: Tasks database not found.", file=sys.stderr) return False print("\n📋 Starting verification...") # Step 2: Query all tasks to analyze assignees try: all_tasks_response = notion.databases.query( database_id=tasks_db_id, page_size=100 ) if not all_tasks_response.get("results"): print("Error: No tasks found in Tasks database.", file=sys.stderr) return False tasks = all_tasks_response["results"] except Exception as e: print(f"Error querying Tasks database: {e}", file=sys.stderr) return False # Step 3: Check specific tasks have swapped assignees develop_plan_task = None evaluate_services_task = None for task in tasks: task_name = task["properties"]["Name"]["title"][0]["text"]["content"] if task_name == "Develop a plan for promotion": develop_plan_task = task elif task_name == "Evaluate different third-party services": evaluate_services_task = task if not develop_plan_task or not evaluate_services_task: print("Error: Could not find both required tasks.", file=sys.stderr) return False # Get assignees for these tasks develop_plan_assignees = develop_plan_task["properties"]["Assigned"]["people"] evaluate_services_assignees = evaluate_services_task["properties"]["Assigned"]["people"] if not develop_plan_assignees or not evaluate_services_assignees: print("Error: Tasks don't have assignees.", file=sys.stderr) return False develop_plan_assignee_id = develop_plan_assignees[0]["id"] evaluate_services_assignee_id = evaluate_services_assignees[0]["id"] # These should be different (swapped) if develop_plan_assignee_id == evaluate_services_assignee_id: print("Error: Tasks should have different assignees after swap.", file=sys.stderr) return False # Step 4: Count tasks per person task_counts = {} unassigned_count = 0 for task in tasks: assignees = task["properties"]["Assigned"]["people"] if assignees: assignee_id = assignees[0]["id"] if assignee_id not in task_counts: task_counts[assignee_id] = [] task_counts[assignee_id].append(task["properties"]["Name"]["title"][0]["text"]["content"]) else: unassigned_count += 1 # Sort by task count sorted_assignees = sorted(task_counts.items(), key=lambda x: len(x[1])) if len(sorted_assignees) < 2: print("Error: Need at least 2 people with tasks to verify swap.", file=sys.stderr) return False # Get person with least and most tasks person_with_least = sorted_assignees[0] person_with_most = sorted_assignees[-1] least_id, least_tasks = person_with_least most_id, most_tasks = person_with_most # Step 5: Verify the swap pattern # Original distribution (before swap): # - 5ac96c02-49a4-4320-8de6-b663ba83126b had 3 tasks (least) # - 
ac7a3bd0-c111-4464-8f45-8a857a1abc8a had 10 tasks (most) # After complete swap, we expect: # - 5ac96c02-49a4-4320-8de6-b663ba83126b should have 10 tasks # - ac7a3bd0-c111-4464-8f45-8a857a1abc8a should have 3 tasks original_least_id = "5ac96c02-49a4-4320-8de6-b663ba83126b" original_most_id = "ac7a3bd0-c111-4464-8f45-8a857a1abc8a" # Check if the swap has been completed swap_completed = False for assignee_id, assignee_tasks in task_counts.items(): if assignee_id == original_least_id and len(assignee_tasks) == 10: # Person who had 3 now has 10 for other_id, other_tasks in task_counts.items(): if other_id == original_most_id and len(other_tasks) == 3: # Person who had 10 now has 3 swap_completed = True break # Step 6: Summary print(f"\n📊 Task Distribution:") print(f" • Total tasks: {len(tasks)}") print(f" • Assigned tasks: {len(tasks) - unassigned_count}") print(f" • Unassigned tasks: {unassigned_count}") print(f" • People with tasks: {len(task_counts)}") print(f"\n Task counts by person:") for assignee_id, assignee_tasks in sorted_assignees: print(f" - {assignee_id[:8]}...: {len(assignee_tasks)} tasks") # Step 7: Final verification print("\n🔍 Verification Results:") # Check that the swap has created a significant difference if len(most_tasks) - len(least_tasks) < 5: print(f"Warning: Difference between most and least is only {len(most_tasks) - len(least_tasks)} tasks", file=sys.stderr) # Verify specific expected outcomes verification_passed = True # Check 1: Specific tasks have been swapped specific_tasks_swapped = develop_plan_assignee_id != evaluate_services_assignee_id if specific_tasks_swapped: print(" ✓ Specific tasks have been swapped") else: print(" ✗ Specific tasks were not swapped", file=sys.stderr) verification_passed = False # Check 2: Task distribution shows a complete swap if swap_completed: print(" ✓ Complete task swap verified (3↔10 tasks)") else: # Show actual distribution for debugging person1_tasks = len(task_counts.get(original_least_id, [])) person2_tasks = len(task_counts.get(original_most_id, [])) print(f" ✗ Swap incomplete! Expected 5ac96c02→10 tasks, ac7a3bd0→3 tasks", file=sys.stderr) print(f" Actual: 5ac96c02→{person1_tasks} tasks, ac7a3bd0→{person2_tasks} tasks", file=sys.stderr) verification_passed = False # Check 3: Total task count is preserved total_assigned_tasks = sum(len(tasks) for _, tasks in task_counts.items()) expected_total = len(tasks) - unassigned_count if total_assigned_tasks == expected_total: print(f" ✓ Total task count preserved ({total_assigned_tasks} assigned)") else: print(f" ✗ Task count mismatch: {total_assigned_tasks} vs {expected_total} expected", file=sys.stderr) verification_passed = False if verification_passed: print("\n✅ All verification checks passed!") return True else: print("\n❌ Verification failed", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/toronto_guide/change_color/description.md ================================================ Navigate to the Toronto Guide page in Notion and change all pink-colored elements (tags and callout colors) to different colors. ## Requirements 1. Find and access the Toronto Guide page in Notion 2. 
Identify and change all pink elements including: - Pink tags in databases - Pink callout backgrounds 3. Change all pink colors to any other color of your choice ================================================ FILE: tasks/notion/standard/toronto_guide/change_color/meta.json ================================================ { "task_id": "change_color", "task_name": "Change Color", "category_id": "toronto_guide", "category_name": "Toronto Guide", "description": "Navigate to the Toronto Guide page and change all pink-colored elements to different colors.", "author": "Xiangyan Liu", "created_at": "2025-08-14", "difficulty": "L3", "tags": [ "visual formatting", "conditional filtering" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Toronto-Guide-25281626b6d7802caa7cc394647e901c", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/conquering-toronto-a-destination-guide" } } ================================================ FILE: tasks/notion/standard/toronto_guide/change_color/verify.py ================================================ import sys from notion_client import Client from tasks.utils import notion_utils def get_page_title(page_result): """Extract title from a page result""" properties = page_result.get('properties', {}) for prop_name in ['Name', 'Title', 'title']: if prop_name in properties: prop = properties[prop_name] if prop.get('type') == 'title': title_array = prop.get('title', []) if title_array and len(title_array) > 0: return title_array[0].get('plain_text', '') return '' def get_page_tags(page_result): """Extract tags from a page result""" properties = page_result.get('properties', {}) tags_property = properties.get('Tags', {}) if tags_property.get('type') == 'multi_select': tags = tags_property.get('multi_select', []) return [tag.get('name') for tag in tags] return [] def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that all pink colors have been changed in the Toronto Guide page. Expected pink elements that should be changed: 1. Callout: "Welcome to Toronto!" with red_background (originally should be pink) 2. Activities database tags: - "Parks" tag (High Park, Evergreen Brickworks) - "Neighbourhood" tag (Ossington Strip, Chinatown, Little Italy, Kensington Market, Queen west, The beaches) 3. Food database tags: - "Middle Eastern" (Byblos Downtown) - "Jamaican" (Crumbs Patties) - "Indian" (Leela Indian Food Bar) 4. Cafes database tag: - "Food" (Cafe Landwer) These elements should exist with the same name/content but different colors. Tag distributions should remain the same. 
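The check fails if any of these elements is still pink or if the 'Welcome to Toronto!' callout is missing; tags that appear to have been renamed are reported as warnings but do not fail verification on their own.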
""" # Step 1: Find the main Toronto Guide page if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if not found_id or object_type != 'page': print("Error: Toronto Guide page not found.", file=sys.stderr) return False else: # Try to find the page by searching found_id = notion_utils.find_page(notion, "Toronto Guide") if not found_id: print("Error: Toronto Guide page not found.", file=sys.stderr) return False print(f"Found Toronto Guide page: {found_id}") # Get all blocks from the page all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id) print(f"Found {len(all_blocks)} blocks") # Expected elements and their distributions expected_pink_elements = { "callout": { "text": "Welcome to Toronto!", "found": False, "has_pink": False, "exists": False }, "activities_tags": { "Parks": { "found": False, "has_pink": False, "expected_items": ["High Park", "Evergreen Brickworks"], "actual_items": [] }, "Neighbourhood": { "found": False, "has_pink": False, "expected_items": ["Ossington Strip", "Chinatown", "Little Italy", "Kensington Market", "Queen west", "The beaches"], "actual_items": [] } }, "food_tags": { "Middle Eastern": { "found": False, "has_pink": False, "expected_items": ["Byblos Downtown"], "actual_items": [] }, "Jamaican": { "found": False, "has_pink": False, "expected_items": ["Crumbs Patties"], "actual_items": [] }, "Indian": { "found": False, "has_pink": False, "expected_items": ["Leela Indian Food Bar"], "actual_items": [] } }, "cafes_tags": { "Food": { "found": False, "has_pink": False, "expected_items": ["Cafe Landwer"], "actual_items": [] } } } # Database IDs activities_db_id = None food_db_id = None cafes_db_id = None # Step 2: Check all blocks for callouts and find databases for block in all_blocks: if block is None: continue block_type = block.get("type") # Check for the specific callout block if block_type == "callout": callout_text = notion_utils.get_block_plain_text(block) if "Welcome to Toronto!" in callout_text: expected_pink_elements["callout"]["exists"] = True expected_pink_elements["callout"]["found"] = True color = block.get("callout", {}).get("color", "") if "pink" in color.lower(): expected_pink_elements["callout"]["has_pink"] = True print(f"✗ Callout 'Welcome to Toronto!' still has pink color: {color}") else: print(f"✓ Callout 'Welcome to Toronto!' 
has non-pink color: {color}") # Find child databases elif block_type == "child_database": title = block.get("child_database", {}).get("title", "") block_id = block.get("id") if "Activities" in title: activities_db_id = block_id print(f"Found Activities database: {block_id}") elif "Food" in title: food_db_id = block_id print(f"Found Food database: {block_id}") elif "Cafes" in title or "Café" in title: cafes_db_id = block_id print(f"Found Cafes database: {block_id}") # Step 3: Check Activities database for specific tags and their distributions if activities_db_id: try: # Get database properties db_info = notion.databases.retrieve(database_id=activities_db_id) tags_property = db_info.get("properties", {}).get("Tags", {}) if tags_property.get("type") == "multi_select": options = tags_property.get("multi_select", {}).get("options", []) for option in options: tag_name = option.get("name").strip() tag_color = option.get("color") if tag_name in expected_pink_elements["activities_tags"]: expected_pink_elements["activities_tags"][tag_name]["found"] = True if tag_color == "pink": expected_pink_elements["activities_tags"][tag_name]["has_pink"] = True print(f"✗ Activities tag '{tag_name}' still has pink color") else: print(f"✓ Activities tag '{tag_name}' changed to {tag_color}") # Query database to check tag distributions query_result = notion.databases.query(database_id=activities_db_id) for page in query_result.get('results', []): page_title = get_page_title(page).strip() page_tags = get_page_tags(page) for tag_name in expected_pink_elements["activities_tags"]: if tag_name in page_tags: expected_pink_elements["activities_tags"][tag_name]["actual_items"].append(page_title) except Exception as e: print(f"Error checking Activities database: {e}", file=sys.stderr) return False else: print("Error: Activities database not found", file=sys.stderr) return False # Step 4: Check Food database for specific tags and their distributions if food_db_id: try: # Get database properties db_info = notion.databases.retrieve(database_id=food_db_id) tags_property = db_info.get("properties", {}).get("Tags", {}) if tags_property.get("type") == "multi_select": options = tags_property.get("multi_select", {}).get("options", []) for option in options: tag_name = option.get("name").strip() tag_color = option.get("color") if tag_name in expected_pink_elements["food_tags"]: expected_pink_elements["food_tags"][tag_name]["found"] = True if tag_color == "pink": expected_pink_elements["food_tags"][tag_name]["has_pink"] = True print(f"✗ Food tag '{tag_name}' still has pink color") else: print(f"✓ Food tag '{tag_name}' changed to {tag_color}") # Query database to check tag distributions query_result = notion.databases.query(database_id=food_db_id) for page in query_result.get('results', []): page_title = get_page_title(page).strip() page_tags = get_page_tags(page) for tag_name in expected_pink_elements["food_tags"]: if tag_name in page_tags: expected_pink_elements["food_tags"][tag_name]["actual_items"].append(page_title) except Exception as e: print(f"Error checking Food database: {e}", file=sys.stderr) return False else: print("Error: Food database not found", file=sys.stderr) return False # Step 5: Check Cafes database for specific tags and their distributions if cafes_db_id: try: # Get database properties db_info = notion.databases.retrieve(database_id=cafes_db_id) tags_property = db_info.get("properties", {}).get("Tags", {}) if tags_property.get("type") == "multi_select": options = tags_property.get("multi_select", 
{}).get("options", []) for option in options: tag_name = option.get("name").strip() tag_color = option.get("color") if tag_name in expected_pink_elements["cafes_tags"]: expected_pink_elements["cafes_tags"][tag_name]["found"] = True if tag_color == "pink": expected_pink_elements["cafes_tags"][tag_name]["has_pink"] = True print(f"✗ Cafes tag '{tag_name}' still has pink color") else: print(f"✓ Cafes tag '{tag_name}' changed to {tag_color}") # Query database to check tag distributions query_result = notion.databases.query(database_id=cafes_db_id) for page in query_result.get('results', []): page_title = get_page_title(page).strip() page_tags = get_page_tags(page) for tag_name in expected_pink_elements["cafes_tags"]: if tag_name in page_tags: expected_pink_elements["cafes_tags"][tag_name]["actual_items"].append(page_title) except Exception as e: print(f"Error checking Cafes database: {e}", file=sys.stderr) return False else: print("Error: Cafes database not found", file=sys.stderr) return False # Step 6: Verify all requirements print(f"\nVerification Summary:") all_passed = True # Check callout if not expected_pink_elements["callout"]["exists"]: print("✗ 'Welcome to Toronto!' callout not found", file=sys.stderr) all_passed = False elif expected_pink_elements["callout"]["has_pink"]: print("✗ Callout still has pink background", file=sys.stderr) all_passed = False else: print("✓ Callout color changed from pink") # Check Activities tags print("\nActivities Database Tags:") for tag_name, tag_info in expected_pink_elements["activities_tags"].items(): if not tag_info["found"]: print(f"✗ Activities tag '{tag_name}' not found (may have been renamed)", file=sys.stderr) # Don't fail if tag was renamed, as that's acceptable elif tag_info["has_pink"]: print(f"✗ Activities tag '{tag_name}' still has pink color", file=sys.stderr) all_passed = False else: print(f"✓ Activities tag '{tag_name}' color changed from pink") # Check distribution expected_set = set(tag_info["expected_items"]) actual_set = set(tag_info["actual_items"]) if tag_info["found"] and expected_set != actual_set: print(f" ✗ Tag distribution mismatch for '{tag_name}':", file=sys.stderr) print(f" Expected: {sorted(expected_set)}", file=sys.stderr) print(f" Actual: {sorted(actual_set)}", file=sys.stderr) # Note: We don't fail on distribution mismatch if tag was renamed if not (expected_set - actual_set): # If all expected items are present print(f" (Additional items found, but all expected items are present)") elif tag_info["found"]: print(f" ✓ Tag distribution maintained for '{tag_name}'") # Check Food tags print("\nFood Database Tags:") for tag_name, tag_info in expected_pink_elements["food_tags"].items(): if not tag_info["found"]: print(f"✗ Food tag '{tag_name}' not found (may have been renamed)", file=sys.stderr) # Don't fail if tag was renamed, as that's acceptable elif tag_info["has_pink"]: print(f"✗ Food tag '{tag_name}' still has pink color", file=sys.stderr) all_passed = False else: print(f"✓ Food tag '{tag_name}' color changed from pink") # Check distribution expected_set = set(tag_info["expected_items"]) actual_set = set(tag_info["actual_items"]) if tag_info["found"] and expected_set != actual_set: print(f" ✗ Tag distribution mismatch for '{tag_name}':", file=sys.stderr) print(f" Expected: {sorted(expected_set)}", file=sys.stderr) print(f" Actual: {sorted(actual_set)}", file=sys.stderr) elif tag_info["found"]: print(f" ✓ Tag distribution maintained for '{tag_name}'") # Check Cafes tags print("\nCafes Database Tags:") for tag_name, 
tag_info in expected_pink_elements["cafes_tags"].items(): if not tag_info["found"]: print(f"✗ Cafes tag '{tag_name}' not found (may have been renamed)", file=sys.stderr) # Don't fail if tag was renamed, as that's acceptable elif tag_info["has_pink"]: print(f"✗ Cafes tag '{tag_name}' still has pink color", file=sys.stderr) all_passed = False else: print(f"✓ Cafes tag '{tag_name}' color changed from pink") # Check distribution expected_set = set(tag_info["expected_items"]) actual_set = set(tag_info["actual_items"]) if tag_info["found"] and expected_set != actual_set: print(f" ✗ Tag distribution mismatch for '{tag_name}':", file=sys.stderr) print(f" Expected: {sorted(expected_set)}", file=sys.stderr) print(f" Actual: {sorted(actual_set)}", file=sys.stderr) elif tag_info["found"]: print(f" ✓ Tag distribution maintained for '{tag_name}'") # Additional check: ensure no other pink elements exist print("\nChecking for any other pink elements...") other_pink_found = False # Check all callouts for pink for block in all_blocks: if block and block.get("type") == "callout": color = block.get("callout", {}).get("color", "") if "pink" in color.lower(): callout_text = notion_utils.get_block_plain_text(block)[:50] if "Welcome to Toronto!" not in callout_text: print(f"✗ Found unexpected pink callout: {callout_text}...", file=sys.stderr) other_pink_found = True if other_pink_found: all_passed = False else: print("✓ No unexpected pink elements found") return all_passed def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): print("\nVerification passed: All expected pink colors have been changed") sys.exit(0) else: print("\nVerification failed: Some pink colors still exist or elements are missing") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/notion/standard/toronto_guide/weekend_adventure_planner/description.md ================================================ Create a comprehensive weekend adventure planner that analyzes the Toronto Guide databases and generates a structured itinerary page. I need you to create a new page called 'Perfect Weekend Adventure' as a child of the main Toronto Guide page. **Task Requirements:** 1. Create a new page titled 'Perfect Weekend Adventure' as a child page of the main Toronto Guide page 2. Query the Activities database to identify all activities that have the "Beaches" tag 3. Query the Food database to find all restaurants with "Turkish" or "Hakka" tags 4. Query the Cafes database to retrieve all cafes entries 5. 
Structure the page with the following specific format: - Add a heading_1 block with text "🎒 Perfect Weekend Adventure" - Add a heading_2 block with text "🏖️ Beach Activities" - Under Beach Activities, create a bulleted list with all activities that have the "Beaches" tag, showing: Name - Google Maps Link (if available) - Add a heading_2 block with text "🍽️ Cultural Dining Experience" - Under Cultural Dining, create a numbered list of all restaurants with "Turkish" or "Hakka" tags, formatted as: Restaurant Name (Tag: [actual tag name]) - Add a heading_2 block with text "☕ Coffee Break Spots" - Under Coffee Break Spots, create a toggle block titled "Top Cafes to Visit" containing all cafe entries as to-do items (unchecked), each showing just the cafe name - Add a heading_2 block with text "📊 Weekend Summary" - Under Weekend Summary, add a paragraph with the exact text: "This weekend includes [X] beach activities, [Y] cultural dining options, and [Z] coffee spots to explore!" where [X], [Y], and [Z] are the actual counts 6. After the summary paragraph, add a divider block 7. Finally, add a callout block with the 💡 emoji containing the text: "Pro tip: Check the Seasons database for the best time to enjoy outdoor activities!" 8. Ensure all headings use the exact emoji and text format specified above 9. The lists must be in the exact format specified (bulleted for beaches, numbered for restaurants, to-do for cafes) ================================================ FILE: tasks/notion/standard/toronto_guide/weekend_adventure_planner/meta.json ================================================ { "task_id": "weekend_adventure_planner", "task_name": "Weekend Adventure Planner", "category_id": "toronto_guide", "category_name": "Toronto Guide", "description": "Create a comprehensive weekend adventure planner that analyzes Toronto Guide databases and generates a structured itinerary page.", "author": "Xiangyan Liu", "created_at": "2025-08-14", "difficulty": "L3", "tags": [ "conditional filtering", "data aggregation", "report generation", "visual formatting", "status tracking" ], "mcp": [ "notion" ], "meta_data": { "stateType": "url", "stateContent": null, "stateUrl": "https://painted-tennis-ebc.notion.site/Toronto-Guide-25281626b6d7802caa7cc394647e901c", "stateOriginalUrl": "https://www.notion.so/marketplace/templates/conquering-toronto-a-destination-guide" } } ================================================ FILE: tasks/notion/standard/toronto_guide/weekend_adventure_planner/verify.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- import sys from notion_client import Client from tasks.utils import notion_utils def verify(notion: Client, main_id: str = None) -> bool: """ Verifies that the Perfect Weekend Adventure page has been created correctly. 
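The expected beach activities, cultural dining options, and cafe names are re-queried from the Activities, Food, and Cafes databases on the main Toronto Guide page; the child page is then checked for the required headings, the bulleted and numbered lists, the 'Top Cafes to Visit' toggle with unchecked to-do items, the summary paragraph with matching counts, the trailing divider, and the 💡 callout.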
""" # Find the main Toronto Guide page page_id = None if main_id: found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id) if found_id and object_type == "page": page_id = found_id if not page_id: page_id = notion_utils.find_page(notion, "Toronto Guide") if not page_id: print("Error: Main 'Toronto Guide' page not found.", file=sys.stderr) return False # Find the Perfect Weekend Adventure child page adventure_page_id = None try: response = notion.search( query="Perfect Weekend Adventure", filter={"property": "object", "value": "page"} ) for result in response.get("results", []): parent = result.get("parent", {}) if parent.get("type") == "page_id" and parent.get("page_id") == page_id: adventure_page_id = result["id"] break if not adventure_page_id: for result in response.get("results", []): title_list = result.get("properties", {}).get("title", {}).get("title", []) for title_obj in title_list: if "Perfect Weekend Adventure" in title_obj.get("plain_text", ""): adventure_page_id = result["id"] break if adventure_page_id: break except Exception as e: print(f"Error searching for Perfect Weekend Adventure page: {e}", file=sys.stderr) return False if not adventure_page_id: print("Error: 'Perfect Weekend Adventure' page not found as child of main page.", file=sys.stderr) return False # Get all blocks from the adventure page all_blocks = notion_utils.get_all_blocks_recursively(notion, adventure_page_id) # Get databases from the main Toronto Guide page activities_db_id = None food_db_id = None cafes_db_id = None main_blocks = notion_utils.get_all_blocks_recursively(notion, page_id) for block in main_blocks: if block.get("type") == "child_database": title = block.get("child_database", {}).get("title", "") if "Activities" in title: activities_db_id = block.get("id") elif "Food" in title: food_db_id = block.get("id") elif "Cafes" in title or "Caf�" in title: cafes_db_id = block.get("id") # Query databases to get expected data beach_activities = [] cultural_restaurants = [] cafes_list = [] if activities_db_id: try: db_response = notion.databases.query(database_id=activities_db_id) for page in db_response.get("results", []): properties = page.get("properties", {}) tags_prop = properties.get("Tags", {}) if tags_prop.get("type") == "multi_select": tags = [tag.get("name") for tag in tags_prop.get("multi_select", [])] if "Beaches" in tags: name_prop = properties.get("Name", {}) if name_prop.get("type") == "title" and name_prop.get("title"): name = name_prop["title"][0]["plain_text"] url_prop = properties.get("Google Maps Link", {}) url = url_prop.get("url", "") if url_prop.get("type") == "url" else "" beach_activities.append({"name": name, "url": url}) except Exception as e: print(f"Error querying Activities database: {e}", file=sys.stderr) return False if food_db_id: try: db_response = notion.databases.query(database_id=food_db_id) for page in db_response.get("results", []): properties = page.get("properties", {}) tags_prop = properties.get("Tags", {}) if tags_prop.get("type") == "multi_select": tags = [tag.get("name") for tag in tags_prop.get("multi_select", [])] for tag in tags: if tag in ["Turkish", "Hakka"]: name_prop = properties.get("Name", {}) if name_prop.get("type") == "title" and name_prop.get("title"): name = name_prop["title"][0]["plain_text"] cultural_restaurants.append({"name": name, "tag": tag}) break except Exception as e: print(f"Error querying Food database: {e}", file=sys.stderr) return False if cafes_db_id: try: db_response = 
notion.databases.query(database_id=cafes_db_id) for page in db_response.get("results", []): properties = page.get("properties", {}) name_prop = properties.get("Name", {}) if name_prop.get("type") == "title" and name_prop.get("title"): name = name_prop["title"][0]["plain_text"] cafes_list.append(name) except Exception as e: print(f"Error querying Cafes database: {e}", file=sys.stderr) return False # Required headings and their types required_headings = [ ("🎒 Perfect Weekend Adventure", "heading_1"), ("🏖️ Beach Activities", "heading_2"), ("🍽️ Cultural Dining Experience", "heading_2"), ("☕ Coffee Break Spots", "heading_2"), ("📊 Weekend Summary", "heading_2") ] # Track verification results found_headings = set() found_beach_list = False found_restaurant_list = False found_toggle_with_cafes = False found_summary = False found_divider = False found_callout = False # Variables to track counts beach_count = 0 restaurant_count = 0 cafe_count = 0 current_section = None is_in_toggle = False for block in all_blocks: block_type = block.get("type") block_text = notion_utils.get_block_plain_text(block) # Check headings for heading_text, expected_type in required_headings: if heading_text in block_text and block_type == expected_type: found_headings.add(heading_text) current_section = heading_text # Check Beach Activities section if current_section == "🏖️ Beach Activities" and block_type == "bulleted_list_item": found_beach_list = True beach_count += 1 # Verify format includes name and potentially URL for activity in beach_activities: if activity["name"] in block_text: if activity["url"] and activity["url"] not in block_text: print(f"Warning: Beach activity '{activity['name']}' missing URL", file=sys.stderr) # Check Cultural Dining section elif current_section == "🍽️ Cultural Dining Experience" and block_type == "numbered_list_item": found_restaurant_list = True restaurant_count += 1 # Check format: Restaurant Name (Tag: [tag]) for restaurant in cultural_restaurants: if restaurant["name"] in block_text and f"Tag: {restaurant['tag']}" in block_text: pass # Format is correct # Check Coffee Break Spots section elif current_section == "☕ Coffee Break Spots": if block_type == "toggle" and "Top Cafes to Visit" in block_text: is_in_toggle = True found_toggle_with_cafes = True elif is_in_toggle and block_type == "to_do": cafe_count += 1 # Verify unchecked status to_do_data = block.get("to_do", {}) if to_do_data.get("checked", False): print(f"Error: Cafe to-do item should be unchecked: {block_text}", file=sys.stderr) return False elif block_type in ["heading_1", "heading_2", "heading_3"]: is_in_toggle = False # Check Weekend Summary section elif current_section == "📊 Weekend Summary" and block_type == "paragraph": expected_text = f"This weekend includes {len(beach_activities)} beach activities, {len(cultural_restaurants)} cultural dining options, and {len(cafes_list)} coffee spots to explore!" if expected_text in block_text: found_summary = True # Check for divider after summary if block_type == "divider": found_divider = True # Check for callout with pro tip if block_type == "callout": callout_data = block.get("callout", {}) icon = callout_data.get("icon", {}) if icon.get("type") == "emoji" and icon.get("emoji") == "💡": if "Pro tip: Check the Seasons database for the best time to enjoy outdoor activities!" 
in block_text: found_callout = True # Verify all required elements all_passed = True # Check all headings are present for heading_text, _ in required_headings: if heading_text not in found_headings: print(f"Error: Missing required heading: {heading_text}", file=sys.stderr) all_passed = False # Check beach activities list if not found_beach_list: print("Error: Beach activities bulleted list not found", file=sys.stderr) all_passed = False elif beach_count != len(beach_activities): print(f"Error: Expected {len(beach_activities)} beach activities, found {beach_count}", file=sys.stderr) all_passed = False # Check restaurant list if not found_restaurant_list: print("Error: Cultural dining numbered list not found", file=sys.stderr) all_passed = False elif restaurant_count != len(cultural_restaurants): print(f"Error: Expected {len(cultural_restaurants)} cultural restaurants, found {restaurant_count}", file=sys.stderr) all_passed = False # Check cafes toggle if not found_toggle_with_cafes: print("Error: Toggle block 'Top Cafes to Visit' not found", file=sys.stderr) all_passed = False elif cafe_count != len(cafes_list): print(f"Error: Expected {len(cafes_list)} cafes, found {cafe_count}", file=sys.stderr) all_passed = False # Check summary if not found_summary: print("Error: Weekend summary with correct counts not found", file=sys.stderr) all_passed = False # Check divider if not found_divider: print("Error: Divider block not found after summary", file=sys.stderr) all_passed = False # Check callout if not found_callout: print("Error: Callout with pro tip not found", file=sys.stderr) all_passed = False if all_passed: print(f"Success: Perfect Weekend Adventure page created with all required elements.") print(f"- {len(beach_activities)} beach activities") print(f"- {len(cultural_restaurants)} cultural dining options") print(f"- {len(cafes_list)} coffee spots") return True else: return False def main(): """ Executes the verification process and exits with a status code. """ notion = notion_utils.get_notion_client() main_id = sys.argv[1] if len(sys.argv) > 1 else None if verify(notion, main_id): sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright/easy/.gitkeep ================================================ ================================================ FILE: tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/description.md ================================================ # Cloudflare Turnstile Authentication Challenge Task Use Playwright MCP tools to complete Cloudflare Turnstile authentication challenge. ## Requirements: 1. Navigate to https://eval-web.mcpmark.ai/auth/turnstile 2. Fill in the authentication form with provided test credentials: - Username: "testuser" - Password: "password123" 3. Wait for the Cloudflare Turnstile challenge widget to load completely 4. Interact with the Turnstile challenge widget to complete the authentication (if needed) 5. Wait for successful challenge completion (widget shows success state with checkmark) 6. Submit the form by clicking the "Sign In" button 7. Wait for and capture any success message or confirmation that appears 8. 
Output the success message captured in step 7 ## Notes: - Use the provided test credentials: testuser / password123 - The page shows the success message inline and does not redirect to a separate success page - Wait for all UI state changes before proceeding to next step - Verify both Turnstile completion and form submission success ================================================ FILE: tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/meta.json ================================================ { "task_id": "cloudflare_turnstile_challenge", "task_name": "Cloudflare Turnstile Challenge", "category_id": "eval_web", "category_name": "Eval Web", "description": "Navigate websites with Cloudflare Turnstile protection, handle security challenges, bypass bot detection mechanisms, and successfully access protected content using automated browser interactions.", "author": "Allison Zhan", "created_at": "2025-07-27", "difficulty": "L3", "tags": [ "user interaction" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/mcpmark-eval-website.mp4", "stateOriginalUrl": "https://mcp-eval-website.vercel.app/auth/turnstile" } } ================================================ FILE: tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/verify.py ================================================ #!/usr/bin/env python3 """ Simplified verification script for Playwright Cloudflare Turnstile authentication task. This script only verifies that the model successfully reported capturing the expected success message by checking the last assistant message in messages.json. """ import sys import json import os # Expected success message that agent should capture EXPECTED_SUCCESS_MESSAGE = "Authentication successful! Security challenge verified." def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, 'r') as f: messages = json.load(f) # Find the last assistant message with status completed for message in reversed(messages): if (message.get('role') == 'assistant' and message.get('status') == 'completed' and message.get('type') == 'message'): content = message.get('content', []) # Extract text from content if isinstance(content, list): for item in content: if isinstance(item, dict) and item.get('type') in ['text', 'output_text']: return item.get('text', '') elif isinstance(content, str): return content print("Warning: No completed assistant message found", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def verify(): """ Verifies that the model's last response contains the expected success message. 
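The response is read from the messages.json file referenced by MCP_MESSAGES, and the check passes only if EXPECTED_SUCCESS_MESSAGE appears verbatim in that text.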
""" # Get model's response from MCP_MESSAGES model_response = get_model_response() if not model_response: print("No model response found", file=sys.stderr) return False print(f"\nModel response (first 500 chars): {model_response[:500]}...", file=sys.stderr) # Check if the expected success message is in the model's response if EXPECTED_SUCCESS_MESSAGE in model_response: print(f"\n✓ Success message found: '{EXPECTED_SUCCESS_MESSAGE}'", file=sys.stderr) return True else: print(f"\n✗ Success message NOT found: '{EXPECTED_SUCCESS_MESSAGE}'", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ result = verify() sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright/standard/eval_web/extraction_table/data.csv ================================================ Title, Rating, Likes, Views, Replies React 18 New Features Deep Dive, "4.8", 856, 12543, 89 Vue 3 Composition API in Practice, "4.5", 743, 9876, 67 Advanced TypeScript Types Guide, "4.9", 924, 15432, 102 Node.js Performance Optimization, "4.2", 567, 8765, 45 Frontend Engineering Best Practices, "4.7", 812, 11234, 78 Microservices Architecture Patterns, "4.3", 634, 9543, 56 Docker Containerization Deployment, "4.6", 789, 10876, 71 Kubernetes Cluster Management, "4.4", 698, 9234, 63 GraphQL API Design Principles, "4.8", 876, 13456, 94 Webpack 5 Configuration Guide, "4.1", 523, 7654, 38 Vite Build Tool Usage, "4.5", 745, 10123, 69 ESLint Code Standards, "4.7", 823, 11567, 82 Unit Testing Best Practices, "4.3", 612, 8934, 51 Performance Monitoring & Optimization, "4.9", 945, 16234, 108 Security Protection Strategies, "4.2", 578, 8456, 47 Database Design Principles, "4.6", 767, 10567, 73 Caching Strategies Implementation, "4.4", 689, 9123, 61 Message Queue Applications, "4.8", 834, 12876, 87 Distributed Systems Design, "4.0", 456, 6789, 34 Cloud Native Development, "4.5", 723, 9876, 65 DevOps Process Optimization, "4.7", 801, 11234, 79 Machine Learning Introduction, "4.1", 534, 7543, 41 Artificial Intelligence Applications, "4.6", 778, 10456, 74 Blockchain Technology Fundamentals, "4.3", 645, 8765, 53 Mobile Development Techniques, "4.9", 912, 14567, 97 Cross-Platform Solutions, "4.2", 589, 8234, 48 Progressive Web App Development, "4.8", 867, 12345, 91 Web3 Development Guide, "4.4", 712, 9567, 64 NFT Smart Contracts, "4.5", 756, 10234, 70 DeFi Protocol Design, "4.7", 834, 11876, 83 Game Engine Development, "4.3", 623, 8567, 52 3D Graphics Rendering, "4.6", 789, 10678, 75 Audio Video Processing, "4.1", 545, 7234, 42 IoT Applications, "4.8", 856, 12567, 88 Edge Computing Practices, "4.2", 567, 8345, 46 5G Network Technology, "4.9", 923, 15123, 103 Quantum Computing Principles, "4.4", 678, 9345, 62 Bioinformatics Analysis, "4.5", 734, 9876, 68 Data Science Methods, "4.7", 812, 11456, 80 Algorithms and Data Structures, "4.3", 634, 8678, 54 System Design Interview, "4.6", 778, 10345, 76 Code Refactoring Techniques, "4.8", 845, 12234, 89 Open Source Contributions, "4.2", 556, 7890, 43 Technical Team Management, "4.5", 723, 9567, 66 Product Thinking Development, "4.9", 901, 14234, 95 User Experience Design, "4.1", 512, 7123, 39 Interface Interaction Optimization, "4.7", 789, 10890, 77 Accessibility Design, "4.4", 667, 8901, 58 SEO Optimization Strategies, "4.6", 756, 10123, 72 Social Media Operations, "4.3", 623, 8456, 55 Serverless Architecture, "4.7", 834, 11234, 81 API Gateway Design, "4.2", 567, 8765, 49 
Microservice Communication, "4.8", 892, 13567, 95 Event-Driven Architecture, "4.5", 723, 9876, 67 CQRS Pattern Implementation, "4.3", 645, 8234, 54 Domain-Driven Design, "4.6", 778, 10456, 73 Clean Architecture Principles, "4.4", 689, 9123, 62 Hexagonal Architecture, "4.1", 534, 7543, 42 Onion Architecture, "4.5", 712, 9567, 65 Event Sourcing Patterns, "4.7", 823, 11876, 79 Saga Pattern for Distributed Systems, "4.3", 612, 8934, 53 Circuit Breaker Pattern, "4.8", 856, 12543, 87 Bulkhead Pattern, "4.2", 578, 8456, 47 Retry Pattern Implementation, "4.6", 767, 10567, 74 Timeout Pattern, "4.4", 698, 9234, 63 Rate Limiting Strategies, "4.9", 934, 15432, 103 Load Balancing Techniques, "4.1", 523, 7654, 39 Service Mesh Architecture, "4.5", 745, 10123, 69 Istio Service Mesh, "4.7", 812, 11567, 82 Envoy Proxy Configuration, "4.3", 634, 9543, 56 Consul Service Discovery, "4.6", 789, 10876, 71 Kubernetes Ingress, "4.4", 676, 9345, 58 Helm Chart Development, "4.8", 845, 12234, 89 Terraform Infrastructure, "4.2", 556, 7890, 44 Ansible Automation, "4.5", 723, 9567, 66 Jenkins Pipeline, "4.7", 801, 11234, 78 GitLab CI/CD, "4.3", 623, 8567, 52 GitHub Actions, "4.6", 789, 10678, 75 Azure DevOps, "4.1", 512, 7123, 41 AWS CodePipeline, "4.8", 867, 12345, 91 Docker Compose, "4.4", 712, 9567, 64 Kubernetes Operators, "4.5", 756, 10234, 70 Custom Resource Definitions, "4.7", 834, 11876, 83 Pod Security Policies, "4.3", 623, 8567, 52 Network Policies, "4.6", 789, 10678, 75 RBAC Configuration, "4.1", 545, 7234, 42 Secret Management, "4.8", 856, 12567, 88 ConfigMap Usage, "4.2", 567, 8345, 46 Persistent Volumes, "4.9", 923, 15123, 103 StatefulSets, "4.4", 678, 9345, 62 DaemonSets, "4.5", 734, 9876, 68 Jobs and CronJobs, "4.7", 812, 11456, 80 Horizontal Pod Autoscaler, "4.3", 634, 8678, 54 Vertical Pod Autoscaler, "4.6", 778, 10345, 76 Cluster Autoscaler, "4.8", 845, 12234, 89 Resource Quotas, "4.2", 556, 7890, 43 Limit Ranges, "4.5", 723, 9567, 66 ================================================ FILE: tasks/playwright/standard/eval_web/extraction_table/description.md ================================================ # Web Data Extraction Task Use Playwright MCP tools to extract all data from the specified website and present it in CSV format. ## Requirements: 1. Navigate to https://eval-web.mcpmark.ai/extraction 2. Wait for the page to fully load 3. Extract all data content from the page, including: - Title - Rating - Likes - Views - Replies 4. Organize the extracted data into CSV format 5. Ensure data completeness and accuracy 6. 
Output ONLY the complete CSV formatted data (no additional text or explanations) ## CSV Data Example: ```csv Title, Rating, Likes, Views, Replies SEO Optimization, "4.6", 756, 10123, 72 Vue 3 Composition API, "4.5", 743, 9876, 67 Advanced TypeScript Types Guide, "4.9", 924, 15432, 102 Node.js Performance Optimization, "4.2", 567, 8765, 45 Frontend Engineering Best Practices, "4.7", 812, 11234, 78 ``` ## Notes: - Ensure extraction of all visible data rows - Maintain data format consistency - All numeric data (Rating, Likes, Views, Replies) should NOT have quotes, only text data containing commas should be wrapped in quotes - Wait for the page to fully load before starting data extraction - Verify the quantity and format of extracted data are correct - **IMPORTANT: Final output must contain ONLY CSV data - no explanatory text, descriptions, or other content** ================================================ FILE: tasks/playwright/standard/eval_web/extraction_table/meta.json ================================================ { "task_id": "extraction_table", "task_name": "Extraction Table", "category_id": "eval_web", "category_name": "Eval Web", "description": "Extract structured data from complex web tables, parse multi-level headers, handle dynamic content loading, transform data formats, and export comprehensive datasets.", "author": "Arvin Xu", "created_at": "2025-08-18", "difficulty": "L3", "tags": [ "data extraction" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/mcpmark-eval-website.mp4", "stateOriginalUrl": "https://eval-web.mcpmark.ai/extraction" } } ================================================ FILE: tasks/playwright/standard/eval_web/extraction_table/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for checking Playwright web data extraction tasks. This script verifies whether the model successfully extracted CSV format data from web pages by checking the last assistant message in messages.json. """ import sys import json import os import re import csv from io import StringIO # Expected CSV header (must match exactly, including spaces) EXPECTED_HEADER_LINE = "Title, Rating, Likes, Views, Replies" EXPECTED_HEADERS = ["Title", "Rating", "Likes", "Views", "Replies"] # Exact number of data rows (must match data.csv exactly) EXPECTED_DATA_ROWS = 97 def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. 
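Returns None if MCP_MESSAGES is unset, the file cannot be read, or no completed assistant message is found.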
""" messages_path = os.getenv("MCP_MESSAGES") print(f"| MCP_MESSAGES: {messages_path}") if not messages_path: print("| Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, 'r') as f: messages = json.load(f) # Find the last assistant message with status completed for message in reversed(messages): if (message.get('role') == 'assistant' and message.get('status') == 'completed' and message.get('type') == 'message'): content = message.get('content', []) # Extract text from content if isinstance(content, list): for item in content: if isinstance(item, dict) and item.get('type') in ['text', 'output_text']: return item.get('text', '') elif isinstance(content, str): return content print("| Warning: No completed assistant message found", file=sys.stderr) return None except Exception as e: print(f"| Error reading messages file: {str(e)}", file=sys.stderr) return None def extract_csv_from_response(response): """ Extract CSV data from model response. """ # Look for CSV code blocks csv_pattern = r'```(?:csv)?\s*\n(.*?)\n```' matches = re.findall(csv_pattern, response, re.DOTALL | re.IGNORECASE) if matches: return matches[-1].strip() # Return the last CSV block # If no code block found, try to find CSV data starting with header lines = response.split('\n') csv_start = -1 # Stricter header matching: look for lines containing "Title" and "Rating" for i, line in enumerate(lines): if "Title" in line and "Rating" in line and "Likes" in line: csv_start = i break if csv_start >= 0: # Extract from header until empty line or non-CSV format line csv_lines = [] for line in lines[csv_start:]: line = line.strip() if not line or not (',' in line): if csv_lines: # If we already have data, stop at empty line break continue csv_lines.append(line) if len(csv_lines) > 100: # Prevent extracting too many rows break return '\n'.join(csv_lines) return None def validate_csv_data(csv_text): """ Validate CSV data format and content, must match data.csv exactly. 
""" if not csv_text: return False, "CSV data not found" try: lines = csv_text.strip().split('\n') # Check total number of rows (1 header row + data rows) expected_total_rows = EXPECTED_DATA_ROWS + 1 if len(lines) != expected_total_rows: return False, f"| CSV total row count mismatch, expected: {expected_total_rows} rows, actual: {len(lines)} rows" # Check header row format (must match exactly) header_line = lines[0].strip() if header_line != EXPECTED_HEADER_LINE: return False, f"| Header format mismatch, expected: '{EXPECTED_HEADER_LINE}', actual: '{header_line}'" # Parse CSV to validate structure csv_reader = csv.reader(StringIO(csv_text)) rows = list(csv_reader) # Check column count for each row expected_columns = len(EXPECTED_HEADERS) for i, row in enumerate(rows): if len(row) != expected_columns: return False, f"| Row {i+1} column count incorrect, expected: {expected_columns} columns, actual: {len(row)} columns" # Validate data row format valid_rows = 0 for i, row in enumerate(rows[1:], 2): # Skip header, start from row 2 # Check if each column has data if not all(cell.strip() for cell in row): return False, f"| Row {i} contains empty data" # Check numeric column format (Rating, Likes, Views, Replies should not have quotes) for col_idx, col_name in [(1, "Rating"), (2, "Likes"), (3, "Views"), (4, "Replies")]: value = row[col_idx].strip() # Check for quotes (should not have any) if value.startswith('"') and value.endswith('"'): return False, f"| Row {i} {col_name} should not have quotes, actual: {value}" # Check numeric format if col_name == "Rating": try: float(value) except ValueError: return False, f"| Row {i} {col_name} should be a number, actual: {value}" else: if not value.isdigit(): return False, f"| Row {i} {col_name} should be pure digits, actual: {value}" valid_rows += 1 # Validate number of data rows if valid_rows != EXPECTED_DATA_ROWS: return False, f"| Valid data row count mismatch, expected: {EXPECTED_DATA_ROWS} rows, actual: {valid_rows} rows" return True, f"| CSV validation successful: format matches data.csv exactly, {valid_rows} valid data rows" except Exception as e: return False, f"| CSV format parsing error: {str(e)}" def verify(): """ Verify if the model's response contains correct CSV data extraction results. """ # Get model response model_response = get_model_response() if not model_response: print("| Model response not found", file=sys.stderr) return False print(f"|\n| Model response (first 500 characters): {model_response[:500]}...", file=sys.stderr) # Extract CSV data from response csv_data = extract_csv_from_response(model_response) if not csv_data: print("|\n| ✗ CSV data not found in response", file=sys.stderr) return False print(f"|\n| Found CSV data (first 300 characters):\n| {csv_data[:300]}...", file=sys.stderr) # Validate CSV data is_valid, message = validate_csv_data(csv_data) if is_valid: print(f"|\n| ✓ {message}", file=sys.stderr) return True else: print(f"|\n| ✗ CSV validation failed: {message}", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ result = verify() sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright/standard/web_search/birth_of_arvinxu/description.md ================================================ # Web Search Task Use Playwright MCP tools to search for information about the X profile https://x.com/arvin17x and find out when this person was born. 
## Requirements: Report the answer in a specific format: - just the year, e.g. 1990 or 2001 ================================================ FILE: tasks/playwright/standard/web_search/birth_of_arvinxu/meta.json ================================================ { "task_id": "birth_of_arvinxu", "task_name": "Birth Of Arvinxu", "category_id": "web_search", "category_name": "Web Search", "description": "Search for biographical information about X profile arvin17x across multiple web sources, extract birth year data, verify information accuracy, and compile findings.", "author": "Arvin Xu", "created_at": "2025-08-18", "difficulty": "L3", "tags": [ "search aggregation", "data extraction" ], "mcp": [ "playwright" ], "meta_data": { "stateType": null, "stateContent": null, "stateUrl": null, "stateOriginalUrl": null } } ================================================ FILE: tasks/playwright/standard/web_search/birth_of_arvinxu/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Playwright web search task. Simple verification that checks if the AI agent found the correct answer. The expected ground truth answer is configured at the top of the file. """ import sys import json import os from pathlib import Path from typing import Dict, Any # ============================================================================= # CONFIGURATION # ============================================================================= # Expected ground truth answer (exact match) EXPECTED_GROUND_TRUTH = "1995" # ============================================================================= # MCP RESULT PARSING # ============================================================================= def get_working_directory() -> Path: """Get the working directory where messages.json should be.""" # Priority 1: Use MCP_MESSAGES path if available (most reliable) messages_path = os.getenv("MCP_MESSAGES") if messages_path and Path(messages_path).exists(): return Path(messages_path).parent.resolve() # Priority 2: Use PLAYWRIGHT_WORK_DIR environment variable work_dir = os.getenv("PLAYWRIGHT_WORK_DIR") if work_dir: work_path = Path(work_dir).resolve() if (work_path / "messages.json").exists(): return work_path # Priority 3: Check current directory (fallback) current_dir = Path.cwd() if (current_dir / "messages.json").exists(): return current_dir # Priority 4: Default fallback return Path(".").resolve() def parse_ai_results(work_dir: Path) -> Dict[str, Any]: """Parse the AI agent's results from messages.json""" messages_file = work_dir / "messages.json" if not messages_file.exists(): return {"success": False, "error": "No messages.json found"} try: with open(messages_file, "r", encoding="utf-8") as f: messages = json.load(f) except (json.JSONDecodeError, IOError) as e: return {"success": False, "error": f"Failed to read messages.json: {e}"} # Look for expected answer in the AI's responses found_answer = False ai_responses = [] for message in messages: if message.get("role") == "assistant": content = str(message.get("content", "")) # Handle both string and list content formats if isinstance(message.get("content"), list): content = " ".join( item.get("text", "") if isinstance(item, dict) else str(item) for item in message.get("content", []) ) ai_responses.append(content) # Exact match (character-for-character, case-sensitive, no trimming) if content == EXPECTED_GROUND_TRUTH: found_answer = True return { "success": True, "found_answer": found_answer, "ai_responses": ai_responses, "total_responses":
len(ai_responses), } # ============================================================================= # MAIN VERIFICATION # ============================================================================= def verify_task() -> bool: """Verify the AI agent found the correct answer""" # Parse AI agent results work_dir = get_working_directory() print(f"| Working directory: {work_dir}") ai_results = parse_ai_results(work_dir) if not ai_results["success"]: print(f"| ❌ Could not parse AI results: {ai_results.get('error')}") return False if ai_results["found_answer"]: print(f"| AI agent correctly identified: {EXPECTED_GROUND_TRUTH}") return True else: print(f"| AI agent did not find the correct answer: {EXPECTED_GROUND_TRUTH}") return False def main(): """Main verification function.""" try: success = verify_task() sys.exit(0 if success else 1) except Exception as e: print(f"\n💥 Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright/standard/web_search/r1_arxiv/content.txt ================================================ In this work, we share our journey in enhancing model reasoning abilities through reinforcement learning. DeepSeek-R1-Zero represents a pure RL approach without relying on cold-start data, achieving strong performance across various tasks. DeepSeek-R1 is more powerful, leveraging cold-start data alongside iterative RL fine-tuning. Ultimately, DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on a range of tasks. We further explore distillation the reasoning capability to small dense models. We use DeepSeek-R1 as the teacher model to generate 800K training samples, and fine-tune several small dense models. The results are promising: DeepSeek-R1-Distill-Qwen-1.5B outperforms GPT-4o and Claude-3.5-Sonnet on math benchmarks with 28.9% on AIME and 83.9% on MATH. Other dense models also achieve impressive results, significantly outperforming other instruction-tuned models based on the same underlying checkpoints. In the future, we plan to invest in research across the following directions for DeepSeek-R1. - **General Capability**: Currently, the capabilities of DeepSeek-R1 fall short of DeepSeek-V3 in tasks such as function calling, multi-turn, complex role-playing, and JSON output. Moving forward, we plan to explore how long CoT can be leveraged to enhance tasks in these fields. - **Language Mixing**: DeepSeek-R1 is currently optimized for Chinese and English, which may result in language mixing issues when handling queries in other languages. For instance, DeepSeek-R1 might use English for reasoning and responses, even if the query is in a language other than English or Chinese. We aim to address this limitation in future updates. - **Prompting Engineering**: When evaluating DeepSeek-R1, we observe that it is sensitive to prompts. Few-shot prompting consistently degrades its performance. Therefore, we recommend users directly describe the problem and specify the output format using a zero-shot setting for optimal results. - **Software Engineering Tasks**: Due to the long evaluation times, which impact the efficiency of the RL process, large-scale RL has not been applied extensively in software engineering tasks. As a result, DeepSeek-R1 has not demonstrated a huge improvement over DeepSeek-V3 on software engineering benchmarks. 
Future versions will address this by implementing rejection sampling on software engineering data or incorporating asynchronous evaluations during the RL process to improve efficiency. ================================================ FILE: tasks/playwright/standard/web_search/r1_arxiv/description.md ================================================ # Web Search Task Use Playwright MCP tools to search for the DeepSeek R1 research paper and extract all the paragraphs of the Conclusion section. ## Requirements: 1. Search for the DeepSeek R1 research paper 2. Navigate to the paper and find the Conclusion section 3. Extract **ALL the paragraphs** of the Conclusion section 4. **Provide the content in Markdown format - no explanations, no additional text** ## Important Notes: - **Output ALL the paragraphs of text** - **Do NOT include any explanations, summaries, or additional content** - **The response should contain ONLY the Conclusion section content formatted in Markdown** ## Expected Output: All the paragraphs of the Conclusion section from the DeepSeek R1 paper, formatted in Markdown with proper paragraph structure and formatting. ================================================ FILE: tasks/playwright/standard/web_search/r1_arxiv/meta.json ================================================ { "task_id": "r1_arxiv", "task_name": "R1 Arxiv", "category_id": "web_search", "category_name": "Web Search", "description": "Search arXiv for R1 model research papers, extract technical specifications, analyze methodology sections, compile research findings, and generate comprehensive literature review.", "author": "Arvin Xu", "created_at": "2025-08-18", "difficulty": "L3", "tags": [ "search aggregation", "data extraction", "comparative analysis", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": null, "stateContent": null, "stateUrl": null, "stateOriginalUrl": null } } ================================================ FILE: tasks/playwright/standard/web_search/r1_arxiv/verify.py ================================================ #!/usr/bin/env python3 """ Verification script for Playwright web search task. Simple verification that checks if the AI agent found the correct Conclusion content. The expected ground truth content file is configured at the top of the file.
""" import sys import json import os from pathlib import Path from typing import Dict, Any # ============================================================================= # CONFIGURATION # ============================================================================= # Expected ground truth content from content.txt EXPECTED_CONTENT_FILE = "content.txt" # ============================================================================= # MCP RESULT PARSING # ============================================================================= def get_working_directory() -> Path: """Get the working directory where messages.json should be.""" # Priority 1: Use MCP_MESSAGES path if available (most reliable) messages_path = os.getenv("MCP_MESSAGES") if messages_path and Path(messages_path).exists(): return Path(messages_path).parent.resolve() # Priority 2: Use PLAYWRIGHT_WORK_DIR environment variable work_dir = os.getenv("PLAYWRIGHT_WORK_DIR") if work_dir: work_path = Path(work_dir).resolve() if (work_path / "messages.json").exists(): return work_path # Priority 3: Check current directory (fallback) current_dir = Path.cwd() if (current_dir / "messages.json").exists(): return current_dir # Priority 4: Default fallback return Path(".").resolve() def load_expected_content() -> str: """Load the expected content from content.txt""" # content.txt is in the same directory as verify.py current_file = Path(__file__).resolve() content_file = current_file.parent / EXPECTED_CONTENT_FILE if not content_file.exists(): print(f"| {EXPECTED_CONTENT_FILE} not found at: {content_file}") return "" print(f"| Found {EXPECTED_CONTENT_FILE} at: {content_file}") try: with open(content_file, "r", encoding="utf-8") as f: return f.read().strip() except (IOError, UnicodeDecodeError) as e: print(f"| Warning: Could not read {content_file}: {e}") return "" def parse_ai_results(work_dir: Path) -> Dict[str, Any]: """Parse the AI agent's results from messages.json""" messages_file = work_dir / "messages.json" if not messages_file.exists(): return {"success": False, "error": "No messages.json found"} try: with open(messages_file, "r", encoding="utf-8") as f: messages = json.load(f) except (json.JSONDecodeError, IOError) as e: return {"success": False, "error": f"Failed to read messages.json: {e}"} # Look for extracted content in the AI's responses found_content = False ai_responses = [] extracted_content = "" for message in messages: if message.get("role") == "assistant": content = str(message.get("content", "")) # Handle both string and list content formats if isinstance(message.get("content"), list): content = " ".join( item.get("text", "") if isinstance(item, dict) else str(item) for item in message.get("content", []) ) ai_responses.append(content) # Store the last response as extracted content extracted_content = content return { "success": True, "found_content": True, # Assuming content was found if we have responses "ai_responses": ai_responses, "extracted_content": extracted_content, "total_responses": len(ai_responses), } def compare_content(extracted: str, expected: str) -> Dict[str, Any]: """Compare extracted content with expected content""" if not expected: return {"success": False, "error": "No expected content to compare against"} if not extracted: return {"success": False, "error": "No extracted content found"} # Normalize content for comparison (remove extra whitespace, normalize line breaks) extracted_normalized = " ".join(extracted.split()) expected_normalized = " ".join(expected.split()) # Direct text comparison - content 
must be exactly the same is_exact_match = extracted_normalized == expected_normalized return { "success": True, "is_exact_match": is_exact_match, "extracted_length": len(extracted_normalized), "expected_length": len(expected_normalized), "extracted_preview": extracted_normalized[:100] + "..." if len(extracted_normalized) > 100 else extracted_normalized, "expected_preview": expected_normalized[:100] + "..." if len(expected_normalized) > 100 else expected_normalized } # ============================================================================= # MAIN VERIFICATION # ============================================================================= def verify_task(work_dir: Path) -> bool: """Verify the AI agent found the correct Conclusion content""" print("| Verifying Playwright Web Search Task - DeepSeek R1 Conclusion") print("| " + "=" * 70) # Load expected content print("| Loading expected content...") expected_content = load_expected_content() if not expected_content: print("| Error: Could not load expected content") return False print(f"| Expected content loaded ({len(expected_content)} characters)") # Parse MCP messages messages = parse_ai_results(work_dir) if not messages["success"]: print(f"| Error: Could not parse AI results: {messages.get('error')}") return False # Extract AI agent response extracted_content = messages.get("extracted_content", "") if not extracted_content: print("| Error: No AI agent response found") return False print(f"| Extracted content: {len(extracted_content)} characters") # Compare content print("| Comparing extracted content with expected content...") comparison = compare_content(extracted_content, expected_content) if not comparison["success"]: print(f"| Comparison failed: {comparison.get('error')}") return False print(f"| Content comparison results:") print(f"| - Extracted length: {comparison['extracted_length']} characters") print(f"| - Expected length: {comparison['expected_length']} characters") print(f"| - Extracted preview: {comparison['extracted_preview']}") print(f"| - Expected preview: {comparison['expected_preview']}") if comparison['is_exact_match']: print("| Task completed successfully! Content matches exactly.") return True else: print("| Task verification failed. Content does not match exactly.") return False def main(): """Main verification function""" print("| Starting verification...") # Get working directory work_dir = get_working_directory() print(f"| Working directory: {work_dir}") # Run verification success = verify_task(work_dir) if success: sys.exit(0) else: sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/easy/.gitkeep ================================================ ================================================ FILE: tasks/playwright_webarena/easy/reddit/ai_data_analyst/description.md ================================================ Deliver a bite-sized AI community snapshot by capturing two essential metrics and posting them back to the forum. **Task Requirements:** 1. Register and log in with username `AIDataAnalyst2025` and password `SecurePass123!` so every action is tied to that account. 2. Visit the `deeplearning` forum (page 1 only) and count how many posts are visible; note the exact total. 3. Go to the `MachineLearning` forum, find the post titled `[P] I made a command-line tool that explains your errors using ChatGPT (link in comments)`, and record the vote count shown in the listing. 4.
Still in `MachineLearning`, create a post titled `MachineLearning_Extraction` whose body is exactly the two-line Markdown list below (keep the keys and pipe separators untouched, replacing each `FILL_VALUE` with your numbers): ``` - Deeplearning_Post_Count|FILL_VALUE - ChatGPT_Tool_Vote_Count|FILL_VALUE ``` No searches or multi-page navigation are required—just capture these two data points and report them accurately. ================================================ FILE: tasks/playwright_webarena/easy/reddit/ai_data_analyst/label.txt ================================================ - Deeplearning_Post_Count|25 - ChatGPT_Tool_Vote_Count|2655 ================================================ FILE: tasks/playwright_webarena/easy/reddit/ai_data_analyst/meta.json ================================================ { "task_id": "ai_data_analyst_easy", "task_name": "AI Data Analyst (Easy)", "category_id": "reddit", "category_name": "Reddit", "description": "Capture a lightweight AI/ML engagement snapshot by signing in, grabbing three key metrics, and posting them back to the MachineLearning forum.", "author": "Fanqing Meng", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "user interaction", "data extraction", "comparative analysis" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/easy/reddit/ai_data_analyst/verify.py ================================================ import asyncio import os import re import sys from pathlib import Path from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/") USERNAME = "AIDataAnalyst2025" PASSWORD = "SecurePass123!" 
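# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original verify.py): the submission the
# agent posts is expected to be a pipe-separated Markdown list, e.g.
#
#   - Deeplearning_Post_Count|25
#   - ChatGPT_Tool_Vote_Count|2655
#
# parse_key_value_format() below strips the leading bullet and splits each line
# on the first "|", yielding {"Deeplearning_Post_Count": "25",
# "ChatGPT_Tool_Vote_Count": "2655"}; the values are then compared against
# label.txt as integers, so formatted numbers such as "2,655" would fail.
# A hypothetical helper (name and signature are assumptions) that builds such
# a body from collected metrics:
def _format_metrics_body(metrics: dict) -> str:
    # _format_metrics_body({"Deeplearning_Post_Count": 25,
    #                       "ChatGPT_Tool_Vote_Count": 2655})
    # -> "- Deeplearning_Post_Count|25\n- ChatGPT_Tool_Vote_Count|2655"
    return "\n".join(f"- {key}|{value}" for key, value in metrics.items())
# ---------------------------------------------------------------------------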
POST_TITLE = "MachineLearning_Extraction" REQUIRED_FIELDS = [ "Deeplearning_Post_Count", "ChatGPT_Tool_Vote_Count", ] LABEL_PATH = Path(__file__).parent / "label.txt" def parse_key_value_format(text: str) -> dict: data = {} if not text: return data for line in text.splitlines(): line = line.strip() if not line or "|" not in line: continue line = re.sub(r"^[-•*]\s*", "", line) key, value = line.split("|", 1) data[key.strip()] = value.strip() return data def load_expected_values() -> dict: if not LABEL_PATH.exists(): return {} return parse_key_value_format(LABEL_PATH.read_text(encoding="utf-8")) async def ensure_logged_in(page) -> bool: print("Step 1: Ensuring we are logged in...", file=sys.stderr) await page.goto(f"{BASE_URL}/", wait_until="networkidle") user_button = page.locator(f'button:has-text("{USERNAME}")') if await user_button.count(): print("✓ Already logged in", file=sys.stderr) return True login_link = page.locator('a:has-text("Log in")') if not await login_link.count(): print("FAILED: Login link not found", file=sys.stderr) return False await login_link.click() await page.wait_for_load_state("networkidle") await page.fill('input[name="_username"]', USERNAME) await page.fill('input[name="_password"]', PASSWORD) await page.click('button:has-text("Log in")') await page.wait_for_load_state("networkidle") if await page.locator(f'button:has-text("{USERNAME}")').count(): print(f"✓ Logged in as {USERNAME}", file=sys.stderr) return True print("FAILED: Could not log in with provided credentials", file=sys.stderr) return False async def fetch_submission_content(page): print("Step 2: Retrieving MachineLearning submission...", file=sys.stderr) await page.goto(f"{BASE_URL}/f/MachineLearning", wait_until="networkidle") post_link = page.locator(f'a:has-text("{POST_TITLE}")') if not await post_link.count(): print( f"FAILED: Submission '{POST_TITLE}' not found in MachineLearning forum", file=sys.stderr, ) return None await post_link.first.click() await page.wait_for_load_state("networkidle") selectors = [ ".submission__body", "article", ".post-body", ".RichText", '[class*="RichText"]', ] for selector in selectors: locator = page.locator(selector) if await locator.count(): content = await locator.first.inner_text() if content: print(f"✓ Found submission body via selector {selector}", file=sys.stderr) return content print("FAILED: Unable to locate submission body content", file=sys.stderr) return None def validate_submission(extracted: dict, expected: dict) -> bool: missing = [key for key in REQUIRED_FIELDS if key not in extracted] if missing: print( f"FAILED: Submission body missing required keys: {', '.join(missing)}", file=sys.stderr, ) return False errors = [] for key in REQUIRED_FIELDS: actual = extracted.get(key, "") expect = expected.get(key, "") try: actual_val = int(actual) expect_val = int(expect) if actual_val != expect_val: errors.append(f"{key}: expected {expect_val}, found {actual_val}") except ValueError: errors.append(f"{key}: value '{actual}' is not numeric") if errors: print("FAILED: Submission values do not match expected data:", file=sys.stderr) for err in errors: print(f" - {err}", file=sys.stderr) return False print("✓ Submission content matches expected metrics", file=sys.stderr) return True async def verify() -> bool: expected = load_expected_values() if not expected: print("FAILED: label.txt with expected values is missing", file=sys.stderr) return False async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await 
browser.new_context() page = await context.new_page() try: if not await ensure_logged_in(page): return False content = await fetch_submission_content(page) if not content: return False extracted = parse_key_value_format(content) if not validate_submission(extracted, expected): return False print("SUCCESS: Easy AI data analyst task verified", file=sys.stderr) return True except PlaywrightTimeoutError as exc: print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr) return False except Exception as exc: print(f"FAILED: Unexpected error - {exc}", file=sys.stderr) return False finally: await browser.close() def main(): result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/easy/reddit/llm_research_summary/description.md ================================================ Produce a five-line GPT activity bulletin so readers can quickly see how active the MachineLearning forum is. **Task Requirements:** 1. Register and log in with username `llm_analyst_2024` and password `LLMExpert2024!`. 2. On the first page of the `MachineLearning` forum, count how many post titles contain `GPT`, `ChatGPT`, or `LLM`; record the exact total. 3. From that subset, identify the single post with the highest upvote count and note its full title and vote total. 4. Create a new post titled `LLM Research Summary: GPT Discussions Analysis [2024]` in the same forum. The body must be exactly the Markdown list below with your numbers in place of each `FILL_VALUE` (keep the keys, order, and pipe separator intact): ``` - Total_LLM_Posts|FILL_VALUE - Top1_Title|FILL_VALUE - Top1_Upvotes|FILL_VALUE ``` That’s the entire easy version—just report the headline LLM count and the hottest related thread. ================================================ FILE: tasks/playwright_webarena/easy/reddit/llm_research_summary/label.txt ================================================ - Total_LLM_Posts|9 - Top1_Title|[P] I made a command-line tool that explains your errors using ChatGPT (link in comments) - Top1_Upvotes|2655 ================================================ FILE: tasks/playwright_webarena/easy/reddit/llm_research_summary/meta.json ================================================ { "task_id": "llm_research_summary_easy", "task_name": "LLM Research Summary (Easy)", "category_id": "reddit", "category_name": "Reddit", "description": "Collect the headline GPT metrics from MachineLearning and publish a short five-line recap.", "author": "Fanqing Meng", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "data extraction", "search aggregation", "content submission", "user interaction" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/easy/reddit/llm_research_summary/verify.py ================================================ import asyncio import os import re import sys from pathlib import Path from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/") USERNAME = "llm_analyst_2024" PASSWORD = "LLMExpert2024!" 
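# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original verify.py): one way the agent
# could derive the three label fields from page 1 of /f/MachineLearning. The
# keyword test here is a plain case-sensitive substring match, which is an
# assumption; the task only says titles that contain GPT, ChatGPT, or LLM.
def _summarize_llm_posts(posts):
    """posts: iterable of (title, upvotes) pairs scraped from the listing."""
    keywords = ("GPT", "ChatGPT", "LLM")
    hits = [(title, votes) for title, votes in posts
            if any(keyword in title for keyword in keywords)]
    top_title, top_votes = max(hits, key=lambda item: item[1]) if hits else ("", 0)
    return {
        "Total_LLM_Posts": len(hits),
        "Top1_Title": top_title,
        "Top1_Upvotes": top_votes,
    }
# ---------------------------------------------------------------------------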
FORUM_SLUG = "MachineLearning" POST_TITLE = "LLM Research Summary: GPT Discussions Analysis [2024]" REQUIRED_FIELDS = [ "Total_LLM_Posts", "Top1_Title", "Top1_Upvotes", ] NUMERIC_FIELDS = {"Total_LLM_Posts", "Top1_Upvotes"} LABEL_PATH = Path(__file__).parent / "label.txt" def parse_key_value_format(text: str) -> dict: data = {} if not text: return data for line in text.splitlines(): line = line.strip() if not line or "|" not in line: continue line = re.sub(r"^[-•*]\s*", "", line) key, value = line.split("|", 1) data[key.strip()] = value.strip() return data def normalize_text(value: str) -> str: if value is None: return "" replacements = { "\u2019": "'", "\u2018": "'", "\u201c": '"', "\u201d": '"', } for src, dst in replacements.items(): value = value.replace(src, dst) return " ".join(value.split()).strip() def load_expected_values() -> dict: if not LABEL_PATH.exists(): return {} return parse_key_value_format(LABEL_PATH.read_text(encoding="utf-8")) async def ensure_logged_in(page) -> bool: print("Step 1: Signing in as llm_analyst_2024...", file=sys.stderr) await page.goto(f"{BASE_URL}/", wait_until="networkidle") user_button = page.locator(f'button:has-text("{USERNAME}")') if await user_button.count(): print("✓ Already logged in", file=sys.stderr) return True login_link = page.locator('a:has-text("Log in")') if not await login_link.count(): print("FAILED: Login link not found", file=sys.stderr) return False await login_link.click() await page.wait_for_load_state("networkidle") await page.fill('input[name="_username"]', USERNAME) await page.fill('input[name="_password"]', PASSWORD) await page.click('button:has-text("Log in")') await page.wait_for_load_state("networkidle") if await page.locator(f'button:has-text("{USERNAME}")').count(): print(f"✓ Logged in as {USERNAME}", file=sys.stderr) return True print("FAILED: Could not log in with provided credentials", file=sys.stderr) return False async def fetch_summary_body(page): print("Step 2: Opening MachineLearning summary post...", file=sys.stderr) await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle") post_link = page.locator(f'a:has-text("{POST_TITLE}")') if not await post_link.count(): print(f"FAILED: Submission '{POST_TITLE}' not found", file=sys.stderr) return None await post_link.first.click() await page.wait_for_load_state("networkidle") selectors = [ ".submission__body", "article", ".post-body", ".RichText", '[class*="RichText"]', 'div:has-text("Total_LLM_Posts")', ] for selector in selectors: locator = page.locator(selector) if await locator.count(): content = await locator.first.inner_text() if content: print(f"✓ Found summary content via selector {selector}", file=sys.stderr) return content print("FAILED: Unable to locate submission body", file=sys.stderr) return None def validate_fields(extracted: dict, expected: dict) -> bool: missing = [key for key in REQUIRED_FIELDS if key not in extracted] if missing: print(f"FAILED: Missing required keys: {', '.join(missing)}", file=sys.stderr) return False errors = [] for key in REQUIRED_FIELDS: actual = extracted.get(key, "") expect = expected.get(key, "") if key in NUMERIC_FIELDS: try: actual_val = int(actual) expect_val = int(expect) if actual_val != expect_val: errors.append(f"{key}: expected {expect_val}, found {actual_val}") except ValueError: errors.append(f"{key}: '{actual}' is not numeric") else: if normalize_text(actual) != normalize_text(expect): errors.append(f"{key}: expected '{expect}', found '{actual}'") if errors: print("FAILED: Summary values do not match 
expected data:", file=sys.stderr) for err in errors: print(f" - {err}", file=sys.stderr) return False print("✓ Summary values match expected snapshot", file=sys.stderr) return True async def verify() -> bool: expected = load_expected_values() if not expected: print("FAILED: label.txt is missing", file=sys.stderr) return False async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() try: if not await ensure_logged_in(page): return False content = await fetch_summary_body(page) if not content: return False extracted = parse_key_value_format(content) if not validate_fields(extracted, expected): return False print("SUCCESS: LLM research easy task verified", file=sys.stderr) return True except PlaywrightTimeoutError as exc: print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr) return False except Exception as exc: print(f"FAILED: Unexpected error - {exc}", file=sys.stderr) return False finally: await browser.close() def main(): result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/easy/reddit/movie_reviewer_analysis/description.md ================================================ Provide a lightweight status report on what’s trending in the movies forum so stakeholders can scan it at a glance. **Task Requirements:** 1. Register and log in with username `movie_reviewer_2024` and password `movie_reviewer_2024`. 2. On the first page of the `movies` forum, count how many post titles contain any four-digit year (e.g., 1984, 2024) and record the total. 3. Still on that page, find the post with the highest upvote count and record its full title as well as the vote and comment counts shown. 4. Publish a post in the same forum titled `Wonderful Movies Analysis: Community Favorites [2024]`. The body must match the four-line Markdown list below—keep the keys, order, and pipe separators exactly as written while replacing each `FILL_VALUE` with your data: ``` - Total_Year_Posts|FILL_VALUE - Top_Title|FILL_VALUE - Top_Upvotes|FILL_VALUE - Top_Comments|FILL_VALUE ``` No multi-page browsing or special threads are required; this easy task captures just the top signals from the first page. ================================================ FILE: tasks/playwright_webarena/easy/reddit/movie_reviewer_analysis/label.txt ================================================ - Total_Year_Posts|1 - Top_Title|Who will win the Oscar for ACTRESS IN A SUPPORTING ROLE? 
- Top_Upvotes|9933 - Top_Comments|23 ================================================ FILE: tasks/playwright_webarena/easy/reddit/movie_reviewer_analysis/meta.json ================================================ { "task_id": "movie_reviewer_analysis_easy", "task_name": "Movie Reviewer Analysis (Easy)", "category_id": "reddit", "category_name": "Reddit", "description": "Grab the first-page movie signals plus the Rittenhouse poster stats and share them in a concise recap post.", "author": "Fanqing Meng", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "user interaction", "data extraction", "comparative analysis", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/easy/reddit/movie_reviewer_analysis/verify.py ================================================ import asyncio import os import re import sys from pathlib import Path from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/") USERNAME = "movie_reviewer_2024" PASSWORD = "movie_reviewer_2024" FORUM_SLUG = "movies" POST_TITLE = "Wonderful Movies Analysis: Community Favorites [2024]" REQUIRED_FIELDS = [ "Total_Year_Posts", "Top_Title", "Top_Upvotes", "Top_Comments", ] NUMERIC_FIELDS = { "Total_Year_Posts", "Top_Upvotes", "Top_Comments", } LABEL_PATH = Path(__file__).parent / "label.txt" def parse_key_value_format(text: str) -> dict: data = {} if not text: return data for line in text.splitlines(): line = line.strip() if not line or "|" not in line: continue line = re.sub(r"^[-•*]\s*", "", line) key, value = line.split("|", 1) data[key.strip()] = value.strip() return data def normalize_text(value: str) -> str: if value is None: return "" replacements = { "\u2019": "'", "\u2018": "'", "\u201c": '"', "\u201d": '"', } for src, dst in replacements.items(): value = value.replace(src, dst) return " ".join(value.split()).strip() def load_expected_values() -> dict: if not LABEL_PATH.exists(): return {} return parse_key_value_format(LABEL_PATH.read_text(encoding="utf-8")) async def ensure_logged_in(page) -> bool: print("Step 1: Authenticating movie_reviewer_2024...", file=sys.stderr) await page.goto(f"{BASE_URL}/", wait_until="networkidle") user_button = page.locator(f'button:has-text("{USERNAME}")') if await user_button.count(): print("✓ Already logged in", file=sys.stderr) return True login_link = page.locator('a:has-text("Log in")') if not await login_link.count(): print("FAILED: Login link not found", file=sys.stderr) return False await login_link.click() await page.wait_for_load_state("networkidle") await page.fill('input[name="_username"]', USERNAME) await page.fill('input[name="_password"]', PASSWORD) await page.click('button:has-text("Log in")') await page.wait_for_load_state("networkidle") if await page.locator(f'button:has-text("{USERNAME}")').count(): print(f"✓ Logged in as {USERNAME}", file=sys.stderr) return True print("FAILED: Could not log in with provided credentials", file=sys.stderr) return False async def fetch_summary_body(page): print("Step 2: Locating the movies summary post...", file=sys.stderr) await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle") post_link = 
page.locator(f'a:has-text("{POST_TITLE}")') if not await post_link.count(): print(f"FAILED: Submission '{POST_TITLE}' not found", file=sys.stderr) return None await post_link.first.click() await page.wait_for_load_state("networkidle") selectors = [ ".submission__body", "article", ".post-body", ".RichText", '[class*="RichText"]', 'div:has-text("Total_Year_Posts")', ] for selector in selectors: locator = page.locator(selector) if await locator.count(): content = await locator.first.inner_text() if content: print(f"✓ Retrieved summary content via selector {selector}", file=sys.stderr) return content print("FAILED: Unable to locate submission body", file=sys.stderr) return None def validate_summary(extracted: dict, expected: dict) -> bool: missing = [key for key in REQUIRED_FIELDS if key not in extracted] if missing: print(f"FAILED: Missing required keys: {', '.join(missing)}", file=sys.stderr) return False errors = [] for key in REQUIRED_FIELDS: actual = extracted.get(key, "") expect = expected.get(key, "") if key in NUMERIC_FIELDS: try: actual_val = int(actual) expect_val = int(expect) if actual_val != expect_val: errors.append(f"{key}: expected {expect_val}, found {actual_val}") except ValueError: errors.append(f"{key}: '{actual}' is not numeric") else: if normalize_text(actual) != normalize_text(expect): errors.append(f"{key}: expected '{expect}', found '{actual}'") if errors: print("FAILED: Summary values differ from expected snapshot:", file=sys.stderr) for err in errors: print(f" - {err}", file=sys.stderr) return False print("✓ Summary values match expected data", file=sys.stderr) return True async def verify() -> bool: expected = load_expected_values() if not expected: print("FAILED: label.txt is missing", file=sys.stderr) return False async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() try: if not await ensure_logged_in(page): return False content = await fetch_summary_body(page) if not content: return False extracted = parse_key_value_format(content) if not validate_summary(extracted, expected): return False print("SUCCESS: Movie analysis easy task verified", file=sys.stderr) return True except PlaywrightTimeoutError as exc: print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr) return False except Exception as exc: print(f"FAILED: Unexpected error - {exc}", file=sys.stderr) return False finally: await browser.close() def main(): result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/easy/reddit/nba_statistics_analysis/description.md ================================================ Share a five-line NBA engagement brief so sports moderators can instantly see which thread is dominating. **Task Requirements:** 1. Register and log in with username `NBA_DataAnalyst_2024` and password `Research#2024!`. 2. In the `sports` forum (or via the site search for “NBA”), locate the NBA-related post with the highest upvote count you can see. Record its exact title, vote total, comment count, and author username. 3. Count how many distinct NBA-titled posts you reviewed for this summary (at minimum, include the post from step 2) and store that total as `Total_NBA_Posts`. 4. Publish a post titled `Statistical Analysis: NBA Content Engagement on This Forum` in the same forum. 
Its body must consist solely of the Markdown list below; keep the keys/order/pipes untouched while replacing each `FILL_VALUE` with your numbers: ``` - Total_NBA_Posts|FILL_VALUE - Top_Title|FILL_VALUE - Top_Votes|FILL_VALUE - Top_Comments|FILL_VALUE - Top_Author|FILL_VALUE ``` This easy edition just reports the leading NBA thread plus the count of posts you reviewed—no deeper profile checks are necessary. ================================================ FILE: tasks/playwright_webarena/easy/reddit/nba_statistics_analysis/label.txt ================================================ - Total_NBA_Posts|20 - Top_Title|Hamby claims [WNBA Champ] Aces 'unprofessional' after trade - Top_Votes|614 - Top_Comments|170 - Top_Author|Responsible-Lunch815 ================================================ FILE: tasks/playwright_webarena/easy/reddit/nba_statistics_analysis/meta.json ================================================ { "task_id": "nba_statistics_analysis_easy", "task_name": "NBA Statistics Analysis (Easy)", "category_id": "reddit", "category_name": "Reddit", "description": "Summarize just the three strongest NBA threads and share their vote/comment stats in a short post.", "author": "Fanqing Meng", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "user interaction", "data extraction", "comparative analysis", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/easy/reddit/nba_statistics_analysis/verify.py ================================================ import asyncio import os import re import sys from pathlib import Path from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/") USERNAME = "NBA_DataAnalyst_2024" PASSWORD = "Research#2024!" 
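# ---------------------------------------------------------------------------
# Illustrative note (not part of the original verify.py): normalize_text() below
# maps the curly quote characters \u2018/\u2019/\u201c/\u201d to their straight
# ASCII forms and collapses whitespace before comparing Top_Title and Top_Author
# against label.txt, so a title copied from the rendered page still matches even
# if the forum displays typographic quotes. A minimal standalone equivalent of
# that comparison:
def _titles_match(actual: str, expected: str) -> bool:
    table = str.maketrans({"\u2019": "'", "\u2018": "'", "\u201c": '"', "\u201d": '"'})
    def norm(value: str) -> str:
        return " ".join(value.translate(table).split())
    return norm(actual) == norm(expected)
# e.g. _titles_match("Hamby claims [WNBA Champ] Aces \u2018unprofessional\u2019 after trade",
#                    "Hamby claims [WNBA Champ] Aces 'unprofessional' after trade")  # -> True
# ---------------------------------------------------------------------------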
FORUM_SLUG = "sports" POST_TITLE = "Statistical Analysis: NBA Content Engagement on This Forum" REQUIRED_FIELDS = [ "Total_NBA_Posts", "Top_Title", "Top_Votes", "Top_Comments", "Top_Author", ] NUMERIC_FIELDS = { "Total_NBA_Posts", "Top_Votes", "Top_Comments", } LABEL_PATH = Path(__file__).parent / "label.txt" def parse_key_value_format(text: str) -> dict: data = {} if not text: return data for line in text.splitlines(): line = line.strip() if not line or "|" not in line: continue line = re.sub(r"^[-•*]\s*", "", line) key, value = line.split("|", 1) data[key.strip()] = value.strip() return data def normalize_text(value: str) -> str: if value is None: return "" replacements = { "\u2019": "'", "\u2018": "'", "\u201c": '"', "\u201d": '"', } for src, dst in replacements.items(): value = value.replace(src, dst) return " ".join(value.split()).strip() def load_expected_values() -> dict: if not LABEL_PATH.exists(): return {} return parse_key_value_format(LABEL_PATH.read_text(encoding="utf-8")) async def ensure_logged_in(page) -> bool: print("Step 1: Logging into the sports account...", file=sys.stderr) await page.goto(f"{BASE_URL}/", wait_until="networkidle") user_button = page.locator(f'button:has-text("{USERNAME}")') if await user_button.count(): print("✓ Already logged in", file=sys.stderr) return True login_link = page.locator('a:has-text("Log in")') if not await login_link.count(): print("FAILED: Login link not found", file=sys.stderr) return False await login_link.click() await page.wait_for_load_state("networkidle") await page.fill('input[name="_username"]', USERNAME) await page.fill('input[name="_password"]', PASSWORD) await page.click('button:has-text("Log in")') await page.wait_for_load_state("networkidle") if await page.locator(f'button:has-text("{USERNAME}")').count(): print(f"✓ Logged in as {USERNAME}", file=sys.stderr) return True print("FAILED: Could not log in with provided credentials", file=sys.stderr) return False async def fetch_summary_body(page): print("Step 2: Opening the NBA engagement summary post...", file=sys.stderr) await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle") post_link = page.locator(f'a:has-text("{POST_TITLE}")') if not await post_link.count(): print(f"FAILED: Submission '{POST_TITLE}' not found", file=sys.stderr) return None await post_link.first.click() await page.wait_for_load_state("networkidle") selectors = [ ".submission__body", "article", ".post-body", ".RichText", '[class*="RichText"]', 'div:has-text("Total_NBA_Posts")', ] for selector in selectors: locator = page.locator(selector) if await locator.count(): content = await locator.first.inner_text() if content: print(f"✓ Retrieved summary body via selector {selector}", file=sys.stderr) return content print("FAILED: Unable to locate submission body", file=sys.stderr) return None def validate_summary(extracted: dict, expected: dict) -> bool: missing = [key for key in REQUIRED_FIELDS if key not in extracted] if missing: print(f"FAILED: Missing required keys: {', '.join(missing)}", file=sys.stderr) return False errors = [] for key in REQUIRED_FIELDS: actual = extracted.get(key, "") expect = expected.get(key, "") if key in NUMERIC_FIELDS: try: actual_val = int(actual) expect_val = int(expect) if actual_val != expect_val: errors.append(f"{key}: expected {expect_val}, found {actual_val}") except ValueError: errors.append(f"{key}: '{actual}' is not numeric") else: if normalize_text(actual) != normalize_text(expect): errors.append(f"{key}: expected '{expect}', found '{actual}'") if errors: 
print("FAILED: Summary data does not match expected values:", file=sys.stderr) for err in errors: print(f" - {err}", file=sys.stderr) return False print("✓ Summary fields align with expected snapshot", file=sys.stderr) return True async def verify() -> bool: expected = load_expected_values() if not expected: print("FAILED: label.txt is missing", file=sys.stderr) return False async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() try: if not await ensure_logged_in(page): return False content = await fetch_summary_body(page) if not content: return False extracted = parse_key_value_format(content) if not validate_summary(extracted, expected): return False print("SUCCESS: NBA statistics easy task verified", file=sys.stderr) return True except PlaywrightTimeoutError as exc: print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr) return False except Exception as exc: print(f"FAILED: Unexpected error - {exc}", file=sys.stderr) return False finally: await browser.close() def main(): result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/easy/reddit/routine_tracker_forum/description.md ================================================ Reshare the most helpful LifeProTips calendar advice so it appears as a fresh routine post for other readers. **Task Requirements:** 1. Register and log in with username `RoutineTracker2025` and password `DailyRoutine123!`. 2. Find the LifeProTips thread titled `LPT: Use your calendar as your to-do list. Assigning dedicated time to tasks increases the likelyhood of you acting upon it.` and identify the comment with the highest upvote count. Copy its full text verbatim. 3. In the `LifeProTips` forum, publish a post titled `My 5-Step Morning Routine That Increased My Productivity by 200%` whose body is exactly the comment text you copied (no additional words or formatting). 4. Confirm that your new post appears in the LifeProTips listing so community members can immediately see it. Only these three actions—copy, repost, verify—are required for the easy version; no extra upvotes or settings changes are necessary. ================================================ FILE: tasks/playwright_webarena/easy/reddit/routine_tracker_forum/meta.json ================================================ { "task_id": "routine_tracker_forum_easy", "task_name": "Routine Tracker Forum (Easy)", "category_id": "reddit", "category_name": "Reddit", "description": "Repost the highest-rated LifeProTips calendar advice under a new routine-tracking thread.", "author": "Fanqing Meng", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "user interaction", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/easy/reddit/routine_tracker_forum/verify.py ================================================ import asyncio import os import sys from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/") USERNAME = "RoutineTracker2025" PASSWORD = "DailyRoutine123!" 
FORUM_SLUG = "LifeProTips" POST_TITLE = "My 5-Step Morning Routine That Increased My Productivity by 200%" EXPECTED_BODY = ( "As a college student, having a visible reminder of the assignments I have and when they are due is super helpful for me. " "It also just feels good to erase them from the board once they are completed." ) async def ensure_logged_in(page) -> bool: print("Step 1: Logging in before verification...", file=sys.stderr) await page.goto(f"{BASE_URL}/", wait_until="networkidle") user_button = page.locator(f'button:has-text("{USERNAME}")') if await user_button.count(): print("✓ Already logged in", file=sys.stderr) return True login_link = page.locator('a:has-text("Log in")') if not await login_link.count(): print("FAILED: Login link not found", file=sys.stderr) return False await login_link.click() await page.wait_for_load_state("networkidle") await page.fill('input[name="_username"]', USERNAME) await page.fill('input[name="_password"]', PASSWORD) await page.click('button:has-text("Log in")') await page.wait_for_load_state("networkidle") if await page.locator(f'button:has-text("{USERNAME}")').count(): print(f"✓ Logged in as {USERNAME}", file=sys.stderr) return True print("FAILED: Could not log in with provided credentials", file=sys.stderr) return False async def verify_post_body(page) -> bool: print("Step 2: Validating reposted comment content...", file=sys.stderr) await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle") post_link = page.locator(f'a:has-text("{POST_TITLE}")') if not await post_link.count(): print(f"FAILED: Post '{POST_TITLE}' not found in LifeProTips", file=sys.stderr) return False await post_link.first.click() await page.wait_for_load_state("networkidle") article = page.locator("article") if not await article.count(): print("FAILED: Unable to read post body", file=sys.stderr) return False body_text = await article.first.inner_text() if EXPECTED_BODY not in body_text: print("FAILED: Post body does not match the copied comment text", file=sys.stderr) return False print("✓ Post body matches the expected LifeProTips comment", file=sys.stderr) return True async def verify_listing_presence(page) -> bool: print("Step 3: Confirming the post appears in the forum listing...", file=sys.stderr) await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle") post_link = page.locator(f'a:has-text("{POST_TITLE}")') if await post_link.count(): print("✓ Post is visible in the LifeProTips feed", file=sys.stderr) return True print("FAILED: Post missing from forum listing", file=sys.stderr) return False async def verify() -> bool: async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() try: if not await ensure_logged_in(page): return False if not await verify_post_body(page): return False if not await verify_listing_presence(page): return False print("SUCCESS: Routine tracker easy task verified", file=sys.stderr) return True except PlaywrightTimeoutError as exc: print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr) return False except Exception as exc: print(f"FAILED: Unexpected error - {exc}", file=sys.stderr) return False finally: await browser.close() def main(): result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/fitness_promotion_strategy/description.md ================================================ 
Stick to the first three analytical steps from the original workflow so the easy version only inventories bestseller and promo data. **Task Requirements** 1. If login is required, log in with username 'admin' and password 'admin1234'. 2. **Dashboard stop**: read the first three rows in **Bestsellers** (name, price, quantity) exactly as shown, note the Revenue KPI amount, and look at the **Top Search Terms** widget—if any of those three product names appears there, record it as `term:uses`, otherwise output `No:0`. 3. **Catalog → Products stop**: search each of the same three bestseller names one at a time and copy their SKU, Qty (inventory column), and Status (Enabled/Disabled) from the grid. 4. **Marketing → Promotions → Cart Price Rules stop**: set Status = Active, count how many rules are shown, and locate the rule that applies a percentage discount so you can report `rule name:percentage`. Output everything using the reduced template below: ``` <answer> Bestseller1|name:price:quantity:sku:inventory:status Bestseller2|name:price:quantity:sku:inventory:status Bestseller3|name:price:quantity:sku:inventory:status TotalRevenue|amount BestsellerInSearch|term:count PercentageDiscountRule|name:percentage ActiveRulesCount|count </answer> ``` ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/fitness_promotion_strategy/label.txt ================================================ Bestseller1|Sprite Stasis Ball 65 cm:$27.00:6:24-WG082-blue:100:Enabled Bestseller2|Quest Lumaflex™ Band:$19.00:6:24-UG01:100:Enabled Bestseller3|Sprite Yoga Strap 6 foot:$14.00:6:24-WG085:100:Enabled TotalRevenue|$0.00 BestsellerInSearch|No:0 PercentageDiscountRule|20% OFF Ever $200-plus purchase!*:20% ActiveRulesCount|4 ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/fitness_promotion_strategy/meta.json ================================================ { "task_id": "fitness_promotion_strategy_easy", "task_name": "Fitness Promotion Strategy (Easy)", "category_id": "shopping_admin", "category_name": "Shopping Admin", "description": "Capture the three dashboard bestsellers, confirm their catalog details, and snapshot the related promo and customer metrics needed for a quick campaign brief.", "author": "Fanqing Meng", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "data extraction", "comparative analysis", "inventory management", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/fitness_promotion_strategy/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text.
""" messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, 'r') as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if message.get('role') == 'assistant' and message.get('status') == 'completed': content = message.get('content', []) for item in content: if item.get('type') == 'output_text': return item.get('text', '') print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: return None # Look for <answer>...</answer> pattern match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL) if not match: return None answer_content = match.group(1).strip() # Parse each line result = {} lines = answer_content.split('\n') # Skip the check for exact number of lines - just parse what we have # if len(lines) != 13: # print(f"Error: Expected 13 lines in answer, got {len(lines)}", file=sys.stderr) # return None for line in lines: if '|' in line: key, value = line.split('|', 1) result[key.strip()] = value.strip() return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, 'r') as f: lines = f.read().strip().split('\n') expected = {} for line in lines: if '|' in line: key, value = line.split('|', 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. 
""" if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, '') # Special handling for different types of values if key in ['Bestseller1', 'Bestseller2', 'Bestseller3']: # Check if all parts match (name:price:quantity:sku:inventory:status) if ':' in expected_value and ':' in model_value: expected_parts = expected_value.split(':') model_parts = model_value.split(':') if len(expected_parts) == 6 and len(model_parts) == 6: # Compare each part for i, (exp, mod) in enumerate(zip(expected_parts, model_parts)): if i == 1: # Price field exp_clean = exp.replace('$', '').replace(',', '') mod_clean = mod.replace('$', '').replace(',', '') if exp_clean != mod_clean: mismatches.append(f"{key} price: expected '{exp}', got '{mod}'") elif i == 4: # Inventory field (may have decimal places) exp_float = float(exp.replace(',', '')) mod_float = float(mod.replace(',', '')) if abs(exp_float - mod_float) > 0.0001: mismatches.append(f"{key} inventory: expected '{exp}', got '{mod}'") else: if exp.lower() != mod.lower(): mismatches.append(f"{key} part {i}: expected '{exp}', got '{mod}'") else: mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'") else: if expected_value != model_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'LowestInventoryProduct': # Check product name and inventory if ':' in expected_value and ':' in model_value: expected_name, expected_inv = expected_value.rsplit(':', 1) model_name, model_inv = model_value.rsplit(':', 1) if expected_name.lower() != model_name.lower(): mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'") exp_float = float(expected_inv.replace(',', '')) mod_float = float(model_inv.replace(',', '')) if abs(exp_float - mod_float) > 0.0001: mismatches.append(f"{key} inventory: expected '{expected_inv}', got '{model_inv}'") else: if expected_value != model_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key in ['TotalRevenue', 'MinimumPurchaseRule']: # For price/amount fields, normalize format expected_clean = expected_value.replace('$', '').replace(',', '') model_clean = model_value.replace('$', '').replace(',', '') if expected_clean != model_clean: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'BestsellerInSearch': # Check search term and count if expected_value.lower() != model_value.lower(): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'PercentageDiscountRule': # Check rule name and percentage if ':' in expected_value and ':' in model_value: expected_name, expected_pct = expected_value.rsplit(':', 1) model_name, model_pct = model_value.rsplit(':', 1) if expected_name != model_name: mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'") # Normalize percentage (20% vs 20 vs 0.20) exp_pct_clean = expected_pct.replace('%', '').strip() mod_pct_clean = model_pct.replace('%', '').strip() if exp_pct_clean != mod_pct_clean: mismatches.append(f"{key} percentage: expected '{expected_pct}', got '{model_pct}'") else: if expected_value != model_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'TopCustomer': # Check name:email:group if ':' in expected_value and ':' in model_value: expected_parts = expected_value.split(':') model_parts = 
model_value.split(':') if len(expected_parts) == 3 and len(model_parts) == 3: exp_name, exp_email, exp_group = expected_parts mod_name, mod_email, mod_group = model_parts if exp_name != mod_name: mismatches.append(f"{key} name: expected '{exp_name}', got '{mod_name}'") if exp_email.lower() != mod_email.lower(): mismatches.append(f"{key} email: expected '{exp_email}', got '{mod_email}'") if exp_group.lower() != mod_group.lower(): mismatches.append(f"{key} group: expected '{exp_group}', got '{mod_group}'") else: mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'") else: if expected_value != model_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'MostRecentOrderDate': # Date format may vary, do flexible comparison if expected_value.lower() == 'none' and model_value.lower() == 'none': continue elif expected_value != model_value: # Could add more flexible date parsing here if needed mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") else: # Exact match for other fields (counts, etc.) if str(model_value) != str(expected_value): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the bestseller analysis and promotion task has been completed correctly. First checks the model's answer against the expected label, then optionally verifies the actual state in the Magento Admin. """ # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) return True else: print("Warning: Could not parse answer format from model response", file=sys.stderr) return False else: print("No model response found", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/ny_expansion_analysis/description.md ================================================ Keep only the first three investigative steps so the easy task focuses on dashboard + tax + order-status insights. **Task Requirements** 1. If need to login, login with username 'admin' and password 'admin1234'. 
On the **Dashboard**, record the Lifetime Sales amount, identify the cheapest product in the **Bestsellers** table (note its name, price, and quantity), and check whether that same product appears anywhere in **Last Orders** (output the customer name if yes, otherwise `No`). 2. Go to **Stores → Taxes → Tax Zones and Rates**. Capture the exact rates for New York and California, specify which state is higher, and count how many distinct U.S. states have entries in the grid. 3. Still in **Stores**, open **Settings → Order Status**, filter “Visible On Storefront = Yes”, and confirm whether a status with code `processing` exists and if it’s flagged as a default status. Report just these metrics in the reduced answer format: ``` <answer> Lifetime_Sales_Amount|amount Cheap_Bestseller_Name|name Second_Bestseller_Price|price Second_Bestseller_Quantity|quantity Product_In_Last_Orders|yes_or_no_or_customer NY_Tax_Rate|rate CA_Tax_Rate|rate Higher_Tax_State|state Total_States_With_Tax|count Processing_Visible_Storefront|Yes_or_No Processing_Default_Status|Yes_or_No </answer> ``` ``` <answer> Lifetime_Sales_Amount|amount Cheap_Bestseller_Name|name Second_Bestseller_Price|price Second_Bestseller_Quantity|quantity Product_In_Last_Orders|yes_or_no NY_Tax_Rate|rate CA_Tax_Rate|rate Higher_Tax_State|state Total_States_With_Tax|count Processing_Visible_Storefront|Yes_or_No Processing_Default_Status|Yes_or_No Number_Of_Websites|count Main_Store_Code|code Default_Source_Pickup_Status|status Default_Source_State|state_or_none Dashboard_Revenue|amount Tax_Shipping_Zero|yes_or_no </answer> ``` ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/ny_expansion_analysis/label.txt ================================================ Lifetime_Sales_Amount|$0.00 Cheap_Bestseller_Name|Sprite Yoga Strap 6 foot Second_Bestseller_Price|$14.00 Second_Bestseller_Quantity|6 Product_In_Last_Orders|No NY_Tax_Rate|8.3750 CA_Tax_Rate|8.2500 Higher_Tax_State|NY Total_States_With_Tax|2 Processing_Visible_Storefront|Yes Processing_Default_Status|Yes ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/ny_expansion_analysis/meta.json ================================================ { "task_id": "ny_expansion_analysis_easy", "task_name": "NY Expansion Analysis (Easy)", "category_id": "shopping_admin", "category_name": "Shopping Admin", "description": "Capture just the dashboard, tax, order-status, store, and inventory facts required to judge if New York can launch without heavy configuration work.", "author": "Fanqing Meng", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "data extraction", "comparative analysis", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/ny_expansion_analysis/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. 
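This variant additionally verifies that the MCP_MESSAGES path exists, that the file is not empty, and that it parses to a JSON list before scanning for the last completed assistant message with non-empty output text.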
""" messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("ERROR: MCP_MESSAGES environment variable not set", file=sys.stderr) return None # Check if file exists if not Path(messages_path).exists(): print(f"ERROR: Messages file not found at path: {messages_path}", file=sys.stderr) return None try: with open(messages_path, 'r') as f: content = f.read() # Check if file is empty if not content or content.strip() == '""': print("ERROR: Messages file is empty or contains only empty string", file=sys.stderr) return None messages = json.loads(content) # Check if messages is a list if not isinstance(messages, list): print(f"ERROR: Messages file should contain a list, got {type(messages).__name__}", file=sys.stderr) return None # Find the last assistant message for message in reversed(messages): if message.get('role') == 'assistant' and message.get('status') == 'completed': content = message.get('content', []) if not content: print("WARNING: Assistant message has empty content", file=sys.stderr) continue for item in content: if item.get('type') == 'output_text': text = item.get('text', '') if not text: print("WARNING: Output text is empty", file=sys.stderr) continue return text print("ERROR: No assistant response with output_text found in messages", file=sys.stderr) return None except json.JSONDecodeError as e: print(f"ERROR: Invalid JSON in messages file: {str(e)}", file=sys.stderr) return None except Exception as e: print(f"ERROR: Unexpected error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: print("ERROR: No text provided to parse", file=sys.stderr) return None # Look for <answer>...</answer> pattern match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL) if not match: print("ERROR: No <answer> tags found in the response", file=sys.stderr) print(f" Response preview: {text[:200]}...", file=sys.stderr) return None answer_content = match.group(1).strip() if not answer_content: print("ERROR: Empty content between <answer> tags", file=sys.stderr) return None # Parse each line result = {} lines = answer_content.split('\n') # Expected keys that should be present expected_keys = [ 'Lifetime_Sales_Amount', 'Cheap_Bestseller_Name', 'Second_Bestseller_Price', 'Second_Bestseller_Quantity', 'Product_In_Last_Orders', 'NY_Tax_Rate', 'CA_Tax_Rate', 'Higher_Tax_State', 'Total_States_With_Tax', 'Processing_Visible_Storefront', 'Processing_Default_Status' ] parsed_keys = [] for line in lines: line = line.strip() if not line: continue if '|' not in line: print(f"ERROR: Line missing pipe separator '|': {line}", file=sys.stderr) continue parts = line.split('|', 1) if len(parts) != 2: print(f"ERROR: Invalid line format: {line}", file=sys.stderr) continue key, value = parts key = key.strip() value = value.strip() if not key: print(f"ERROR: Empty key in line: {line}", file=sys.stderr) continue result[key] = value parsed_keys.append(key) # Check for missing expected keys missing_keys = set(expected_keys) - set(parsed_keys) if missing_keys: print(f"ERROR: Missing expected keys: {', '.join(sorted(missing_keys))}", file=sys.stderr) # Check for unexpected keys unexpected_keys = set(parsed_keys) - set(expected_keys) if unexpected_keys: print(f"WARNING: Unexpected keys found: {', '.join(sorted(unexpected_keys))}", file=sys.stderr) if not result: print("ERROR: No valid 
key-value pairs parsed from answer", file=sys.stderr) return None return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, 'r') as f: lines = f.read().strip().split('\n') expected = {} for line in lines: if '|' in line: key, value = line.split('|', 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. """ if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, '') # Special handling for different types of values if key in ['Lifetime_Sales_Amount', 'Second_Bestseller_Price', 'Dashboard_Revenue']: # For price/amount fields, normalize format expected_clean = expected_value.replace('$', '').replace(',', '') model_clean = model_value.replace('$', '').replace(',', '') if expected_clean != model_clean: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key in ['NY_Tax_Rate', 'CA_Tax_Rate']: # Tax rates - allow different decimal formats expected_clean = expected_value.replace('%', '').strip() model_clean = model_value.replace('%', '').strip() # Convert to float for comparison try: if float(expected_clean) != float(model_clean): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") except ValueError: if expected_clean != model_clean: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key in ['Product_In_Last_Orders', 'Processing_Visible_Storefront', 'Processing_Default_Status']: # Yes/No fields - case insensitive if model_value.lower() != expected_value.lower(): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'Empty_Rows_Yes_Effect': # Allow flexible descriptions for this field # Just check if model provided some reasonable description if not model_value or len(model_value) < 5: mismatches.append(f"{key}: expected meaningful description, got '{model_value}'") elif key == 'Order_Status_Options': # Check if main options are mentioned expected_options = set(opt.strip() for opt in expected_value.split(',')) model_options = set(opt.strip() for opt in model_value.split(',')) if expected_options != model_options: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'Chart_Disabled_Message': # Allow some flexibility in message text # Check for key words if 'disabled' not in model_value.lower() and 'enable' not in model_value.lower(): mismatches.append(f"{key}: expected message about chart being disabled, got '{model_value}'") elif key == 'Default_Source_State': # Handle 'None' or empty state expected_normalized = expected_value.lower() if expected_value.lower() != 'none' else '' model_normalized = model_value.lower() if model_value.lower() != 'none' else '' if expected_normalized != model_normalized: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") else: # Exact match for other fields if model_value != expected_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in 
mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the NY expansion analysis task has been completed correctly. First checks the model's answer against the expected label, then optionally verifies the actual state in the Magento Admin. """ print("\n=== Starting Verification ===", file=sys.stderr) # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer print("Loading expected answer from label.txt...", file=sys.stderr) expected_answer = load_expected_answer(label_path) if not expected_answer: print("FATAL ERROR: Could not load expected answer from label.txt", file=sys.stderr) return False print(f"Expected answer loaded with {len(expected_answer)} keys", file=sys.stderr) # Get model's response from MCP_MESSAGES print("\nReading model response from MCP_MESSAGES...", file=sys.stderr) model_response = get_model_response() if not model_response: print("FATAL ERROR: No valid model response found", file=sys.stderr) return False print(f"Model response found (length: {len(model_response)} chars)", file=sys.stderr) print("\nParsing answer format from model response...", file=sys.stderr) model_answer = parse_answer_format(model_response) if not model_answer: print("FATAL ERROR: Could not parse answer format from model response", file=sys.stderr) return False print(f"\n=== Model Answer Parsed Successfully ===", file=sys.stderr) print(f"Parsed {len(model_answer)} key-value pairs", file=sys.stderr) for key, value in model_answer.items(): print(f" {key}: {value}", file=sys.stderr) # Compare answers print("\n=== Comparing Model Answer with Expected Answer ===", file=sys.stderr) answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nFATAL ERROR: Model answer does not match expected answer", file=sys.stderr) print("Verification FAILED", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) print("Verification PASSED", file=sys.stderr) return True def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/products_sales_analysis/description.md ================================================ Only keep the first few catalog and dashboard checks plus the high-level orders snapshot. **Task Requirements** 1. If need to login, login with username 'admin' and password 'admin1234'. 2. **Catalog → Products**: search for product names containing `Yoga` and capture the records-found count; reset filters and look up SKU `WH11` to copy its exact price; reset again and set Quantity (From/To) = `0.0000` to count all zero-quantity products. 3. **Dashboard**: in the Bestsellers table sort by price ascending—record the lowest-priced row as `name:quantity`, then locate `Quest Lumaflex™ Band` and note its quantity, and read the Revenue KPI amount. 4. **Sales → Orders**: filter Status = Pending to count those orders, then search for Grace Nguyen, switch Status = Complete, sort Grand Total descending, and record the Order # of the most expensive completed order. 
Return just these metrics: ``` <answer> YogaProducts|count WH11Price|price ZeroQuantityProducts|count LowestProduct|name:quantity QuestLumaflexQuantity|quantity DashboardRevenue|amount PendingOrders|count GraceNguyenOrderID|orderid </answer> ``` ``` <answer> YogaProducts|count WH11Price|price ZeroQuantityProducts|count LowestProduct|name:quantity QuestLumaflexQuantity|quantity DashboardRevenue|amount SarahMillerEmail|email TotalCustomers|count PendingOrders|count GraceNguyenOrderID|orderid </answer> ``` **Example Output:** ``` <answer> YogaProducts|XX WH11Price|$XX.XX ZeroQuantityProducts|XX LowestProduct|Product Name Here:XX QuestLumaflexQuantity|XX DashboardRevenue|$XX.XX SarahMillerEmail|email@example.com TotalCustomers|XX PendingOrders|X GraceNguyenOrderID|00000XXXX </answer> ``` ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/products_sales_analysis/label.txt ================================================ YogaProducts|171 WH11Price|$54.00 ZeroQuantityProducts|150 LowestProduct|Sprite Stasis Ball 55 cm foot:5 QuestLumaflexQuantity|6 DashboardRevenue|$0.00 PendingOrders|10 GraceNguyenOrderID|000000189 ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/products_sales_analysis/meta.json ================================================ { "task_id": "products_sales_analysis_easy", "task_name": "Products Sales Analysis (Easy)", "category_id": "shopping_admin", "category_name": "Shopping Admin", "description": "Make a single guided pass through Catalog, Dashboard, Customers, and Orders to collect the exact fields needed for a quick sales recap.", "author": "Fanqing Meng", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "data extraction", "comparative analysis", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/products_sales_analysis/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" ): content = message.get("content", []) for item in content: if item.get("type") == "output_text": return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. 
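The reply must contain exactly eight non-empty 'key|value' lines wrapped in <answer>...</answer> tags, covering YogaProducts, WH11Price, ZeroQuantityProducts, LowestProduct, QuestLumaflexQuantity, DashboardRevenue, PendingOrders, and GraceNguyenOrderID; any missing key or extra line causes parsing to fail.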
""" if not text: print("Error: No text provided to parse", file=sys.stderr) return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: print("Error: No <answer>...</answer> tags found in response", file=sys.stderr) return None answer_content = match.group(1).strip() if not answer_content: print("Error: Empty answer content", file=sys.stderr) return None # Parse each line result = {} lines = [line.strip() for line in answer_content.split("\n") if line.strip()] if len(lines) != 8: print(f"Error: Expected 8 lines in answer, got {len(lines)}", file=sys.stderr) print(f"Lines found: {lines}", file=sys.stderr) return None # Expected keys for validation expected_keys = [ "YogaProducts", "WH11Price", "ZeroQuantityProducts", "LowestProduct", "QuestLumaflexQuantity", "DashboardRevenue", "PendingOrders", "GraceNguyenOrderID" ] for line in lines: if "|" not in line: print(f"Error: Line missing '|' separator: {line}", file=sys.stderr) return None parts = line.split("|", 1) if len(parts) != 2: print(f"Error: Invalid line format: {line}", file=sys.stderr) return None key, value = parts[0].strip(), parts[1].strip() if not key or not value: print(f"Error: Empty key or value in line: {line}", file=sys.stderr) return None result[key] = value # Validate all expected keys are present missing_keys = set(expected_keys) - set(result.keys()) if missing_keys: print(f"Error: Missing required keys: {missing_keys}", file=sys.stderr) return None return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, "r") as f: lines = f.read().strip().split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. 
""" if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Special handling for different types of values if key == "LowestProduct": # Check if product name and quantity match (format: "Product Name:quantity") if ":" in expected_value and ":" in model_value: expected_name, expected_qty = expected_value.rsplit(":", 1) model_name, model_qty = model_value.rsplit(":", 1) if expected_name != model_name or expected_qty != model_qty: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key in ["WH11Price", "DashboardRevenue"]: # For price/amount fields, normalize format expected_clean = expected_value.replace("$", "").replace(",", "") model_clean = model_value.replace("$", "").replace(",", "") if expected_clean != model_clean: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "SarahMillerEmail": # Email should match exactly if model_value.lower() != expected_value.lower(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # Exact match for other fields if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the products and sales analysis task has been completed correctly. First checks the model's answer against the expected label, then optionally verifies the actual state in the Magento Admin. """ # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) return True else: print( "Warning: Could not parse answer format from model response", file=sys.stderr, ) return False else: print("No model response found", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. 
""" result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/sales_inventory_analysis/description.md ================================================ Retain just the first three analytic arenas—products, orders, and the dashboard—so the easy task stays read-only and short. **Task Requirements** 1. If need to login, login with username 'admin' and password 'admin1234', then open **Catalog → Products**. Search for names containing `Sprite` to get their count, reset and set Quantity (From/To) = `100.0000` to count those rows, and finally reset to look up SKU `WS12` so you can copy its exact name and price. 2. Switch to **Sales → Orders**. Filter Status = Pending to count those orders, then search for Grace Nguyen with Status = Complete, sort Grand Total ascending, and capture the cheapest completed order ID. Clear filters, sort Grand Total descending, and record the top row’s customer and amount. 3. Finish in **Dashboard**. Sort **Bestsellers** by Quantity descending to capture the first row’s name and quantity, locate `Overnight Duffle` in that table to note its price, and check the **Top Search Terms** widget to see what position `hollister` occupies. Answer with the reduced template: ``` <answer> SpriteProducts|count Quantity100Products|count WS12Info|name:price PendingOrders|count GraceOrderID|orderid HighestOrderInfo|customer:amount CheapProduct|name:quantity OvernightDufflePrice|price HollisterPosition|position </answer> ``` ``` <answer> SpriteProducts|count Quantity100Products|count WS12Info|name:price PendingOrders|count GraceOrderID|orderid HighestOrderInfo|customer:amount CheapProduct|name:quantity OvernightDufflePrice|price HollisterPosition|position CostelloCustomers|count SarahMillerInfo|group:date PaidInvoices|count Invoice002BillTo|name </answer> ``` **Example Output:** ``` <answer> SpriteProducts|XX Quantity100Products|XX WS12Info|Product Name Here:$XX.XX PendingOrders|X GraceOrderID|00000XXXX HighestOrderInfo|Customer Name:$XXX.XX CheapProduct|Product Name:XX OvernightDufflePrice|$XX.XX HollisterPosition|Xth CostelloCustomers|X SarahMillerInfo|Group Name:MMM DD, YYYY PaidInvoices|X Invoice002BillTo|Customer Name </answer> ``` ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/sales_inventory_analysis/label.txt ================================================ SpriteProducts|16 Quantity100Products|1886 WS12Info|Radiant Tee:$22.00 PendingOrders|10 GraceOrderID|000000114 HighestOrderInfo|Samantha Jones:$292.40 CheapProduct|Sprite Yoga Strap 6 foot:6 OvernightDufflePrice|$45.00 HollisterPosition|1st ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/sales_inventory_analysis/meta.json ================================================ { "task_id": "sales_inventory_analysis_easy", "task_name": "Sales Inventory Analysis (Easy)", "category_id": "shopping_admin", "category_name": "Shopping Admin", "description": "Follow one guided tour through Products, Orders, Dashboard, Customers, and Invoices to capture a compact set of sales-plus-inventory facts.", "author": "Fanqing Meng", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "data extraction", "comparative analysis", "inventory management" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": 
"https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/sales_inventory_analysis/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message with type='message', status='completed' for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" and message.get("type") == "message" ): content = message.get("content", []) for item in content: # Check for both 'text' and 'output_text' types if item.get("type") in ["text", "output_text"]: return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: print("ERROR: No text provided to parse", file=sys.stderr) return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: print("ERROR: No <answer>...</answer> tags found in the response", file=sys.stderr) print("Response text preview (first 200 chars):", text[:200], file=sys.stderr) return None answer_content = match.group(1).strip() print(f"Found answer content with {len(answer_content)} characters", file=sys.stderr) # Parse each line result = {} lines = answer_content.split("\n") # Expected keys for this task expected_keys = [ "SpriteProducts", "Quantity100Products", "WS12Info", "PendingOrders", "GraceOrderID", "HighestOrderInfo", "CheapProduct", "OvernightDufflePrice", "HollisterPosition" ] if len(lines) != 9: print(f"ERROR: Expected 9 lines in answer, got {len(lines)}", file=sys.stderr) print(f"Lines found: {lines}", file=sys.stderr) return None for i, line in enumerate(lines, 1): if "|" not in line: print(f"ERROR: Line {i} does not contain pipe separator '|': '{line}'", file=sys.stderr) return None parts = line.split("|", 1) if len(parts) != 2: print(f"ERROR: Line {i} could not be split into key|value: '{line}'", file=sys.stderr) return None key, value = parts result[key.strip()] = value.strip() # Check if all expected keys are present missing_keys = set(expected_keys) - set(result.keys()) if missing_keys: print(f"ERROR: Missing expected keys: {missing_keys}", file=sys.stderr) print(f"Keys found: {list(result.keys())}", file=sys.stderr) return None # Check for unexpected keys extra_keys = set(result.keys()) - set(expected_keys) if extra_keys: print(f"WARNING: Unexpected keys found: {extra_keys}", file=sys.stderr) return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. 
""" try: with open(label_path, "r") as f: lines = f.read().strip().split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. """ if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Special handling for different types of values if key == "WS12Info": # Check if product name and price match (format: name:price) if ":" in expected_value and ":" in model_value: expected_name, expected_price = expected_value.rsplit(":", 1) model_name, model_price = model_value.rsplit(":", 1) # Normalize price format expected_price_clean = expected_price.replace("$", "").replace(",", "") model_price_clean = model_price.replace("$", "").replace(",", "") if ( expected_name != model_name or expected_price_clean != model_price_clean ): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "GraceOrderID": # Order ID should start with "000" and match exactly if not model_value.startswith("000"): mismatches.append( f"{key}: expected to start with '000', got '{model_value}'" ) elif model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "HighestOrderInfo": # Check format customer:amount if ":" in expected_value and ":" in model_value: expected_customer, expected_amount = expected_value.rsplit(":", 1) model_customer, model_amount = model_value.rsplit(":", 1) # Normalize amount format expected_amount_clean = expected_amount.replace("$", "").replace( ",", "" ) model_amount_clean = model_amount.replace("$", "").replace(",", "") if ( expected_customer != model_customer or expected_amount_clean != model_amount_clean ): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "Position2Product": # Check if product name and quantity match if ":" in expected_value and ":" in model_value: expected_name, expected_qty = expected_value.rsplit(":", 1) model_name, model_qty = model_value.rsplit(":", 1) if expected_name != model_name or expected_qty != model_qty: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "OvernightDufflePrice": # Normalize price format expected_clean = expected_value.replace("$", "").replace(",", "") model_clean = model_value.replace("$", "").replace(",", "") if expected_clean != model_clean: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "HollisterPosition": # Position format (1st, 2nd, 3rd, etc.) 
if model_value.lower() != expected_value.lower(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "SarahMillerInfo": # Format: group:date if ":" in expected_value and ":" in model_value: expected_group, expected_date = expected_value.split(":", 1) model_group, model_date = model_value.split(":", 1) # Allow some flexibility in date format if expected_group != model_group: mismatches.append( f"{key}: expected group '{expected_group}', got '{model_group}'" ) # For date, check if key parts match if not (expected_date in model_date or model_date in expected_date): mismatches.append( f"{key}: expected date '{expected_date}', got '{model_date}'" ) else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "Invoice002BillTo": # Name should match exactly if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # Exact match for count fields and other numeric values if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the sales and inventory analysis task has been completed correctly. First checks the model's answer against the expected label, then optionally verifies the actual state in the Magento Admin. """ print("\n" + "="*60, file=sys.stderr) print("Starting verification of Task 5", file=sys.stderr) print("="*60, file=sys.stderr) # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer print("\n--- Loading Expected Answer ---", file=sys.stderr) expected_answer = load_expected_answer(label_path) if not expected_answer: print("FATAL ERROR: Could not load expected answer from label.txt", file=sys.stderr) return False print(f"Successfully loaded {len(expected_answer)} expected values", file=sys.stderr) # Get model's response from MCP_MESSAGES print("\n--- Loading Model Response ---", file=sys.stderr) model_response = get_model_response() if not model_response: print("FATAL ERROR: No model response found in MCP_MESSAGES", file=sys.stderr) return False print(f"Found model response ({len(model_response)} characters)", file=sys.stderr) print("\n--- Parsing Answer Format ---", file=sys.stderr) model_answer = parse_answer_format(model_response) if not model_answer: print("\nFATAL ERROR: Could not parse answer format from model response", file=sys.stderr) print("Verification FAILED", file=sys.stderr) return False print("\n=== Model Answer Successfully Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f" {key}: {value}", file=sys.stderr) # Compare answers print("\n--- Comparing Answers ---", file=sys.stderr) answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\n" + "="*60, file=sys.stderr) print("VERIFICATION FAILED: Model answer does not match expected answer", file=sys.stderr) print("="*60, file=sys.stderr) return False print("\n" + "="*60, file=sys.stderr) print("✓ VERIFICATION PASSED: Model answer matches expected answer", file=sys.stderr) print("="*60, file=sys.stderr) return True def main(): """ Executes the 
verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/search_filtering_operations/description.md ================================================ Limit the search intelligence pass to the first three steps from the original task so it’s just two Search Terms views plus one dashboard glance. **Task Requirements** 1. If need to login, login with username 'admin' and password 'admin1234'. 2. **Marketing → SEO & Search → Search Terms**: filter for queries containing `tank` to count them, reset and filter Results = 0 to count zero-result terms, then filter Uses ≥ 11 to capture the highest-use row and list every term whose Results are between 20 and 30 (join as `term:results`, or use `None:0` if none). Remove filters when done. 3. **Reports → Search Terms**: set Hits ≥ 16 and record the filtered count, then add ID range 10–15 and capture the row with the most Results, and finally switch Store View to “Default Store View” to count those entries. 4. **Dashboard**: in **Top Search Terms** list the entries whose Results = 1 (format `term:uses` joined with `|` or `None:0`), in **Last Search Terms** pick the row with the highest combination of Results and Uses, and in **Bestsellers** copy the product + quantity shown at position #3. Return only these data points: ``` <answer> TankSearchCount|count ZeroResultsCount|count HighestUseTerm|term:uses Results20to30Term|term1:results1|term2:results2|... Hits15PlusCount|count ID10to15MaxResults|term:results DefaultStoreViewCount|count OneResultTerm|term1:uses1|term2:uses2|... HighestResultLastSearch|term:results Position3Bestseller|product:quantity </answer> ``` ``` <answer> TankSearchCount|count ZeroResultsCount|count HighestUseTerm|term:uses Results20to30Term|term1:results1|term2:result2|term3:result3|... Hits15PlusCount|count ID10to15MaxResults|term:results DefaultStoreViewCount|count OneResultTerm|term1:uses1|term2:uses2|term3:uses3|... HighestResultLastSearch|term:results Position3Bestseller|product:quantity TopUseTerm|term:uses FirstNonZeroResult|term:results TotalUniqueTerms|count </answer> ``` **Example Output:** ``` <answer> TankSearchCount|X ZeroResultsCount|X HighestUseTerm|search_term:XX Results20to30Term|search_term1:XX1|search_term2:XX2|search_term3:XX3|... Hits15PlusCount|X ID10to15MaxResults|Product Name:XX DefaultStoreViewCount|X OneResultTerm|search_term1:XX1|search_term2:XX2|search_term3:XX3|... 
HighestResultLastSearch|search_term:XX Position3Bestseller|Product Name:X TopUseTerm|search_term:XX FirstNonZeroResult|search_term:X TotalUniqueTerms|X </answer> ``` **Success Criteria:** - Successfully logged into Magento Admin - Applied complex search filters in Search Terms section - Used range filters for results and hits - Sorted columns to find specific records - Navigated between different report views - Extracted data from filtered and sorted results - Counted records accurately after applying filters - Output answer in exact format with 13 data lines - Answer wrapped in <answer> tags ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/search_filtering_operations/label.txt ================================================ TankSearchCount|2 ZeroResultsCount|1 HighestUseTerm|hollister:19 Results20to30Term|Antonia Racer Tank:23|tanks:23 Hits15PlusCount|1 ID10to15MaxResults|Antonia Racer Tank:23 DefaultStoreViewCount|7 OneResultTerm|hollister:19|WP10:1 HighestResultLastSearch|Antonia Racer Tank:23 Position3Bestseller|Sprite Stasis Ball 65 cm:6 ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/search_filtering_operations/meta.json ================================================ { "task_id": "search_filtering_operations_easy", "task_name": "Search Filtering Operations (Easy)", "category_id": "shopping_admin", "category_name": "Shopping Admin", "description": "Follow a clearly guided path through Search Terms, the Search Terms report, and the dashboard widgets to capture the metrics needed for a focused search-behavior brief.", "author": "Fanqing Meng", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/easy/shopping_admin/search_filtering_operations/verify.py ================================================ import re import json import os import sys def verify(messages): """ Verify that the agent has successfully performed complex search and filtering operations in the Magento Admin panel and extracted all required information correctly. 
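Unlike the other easy shopping_admin verifiers, this one does not read label.txt: the expected values are hard-coded below, and the function returns a {'valid': bool, 'reason': str} dictionary rather than a plain boolean.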
Args: messages: List of message dictionaries containing the conversation Returns: Dictionary with 'valid' boolean and 'reason' string """ # Find the last assistant message with status "completed" and type "message" answer_content = None for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" and message.get("type") == "message" and message.get("content") ): # Extract text from content structure content = message["content"] if isinstance(content, list): for item in content: if isinstance(item, dict) and item.get("type") == "output_text": text = item.get("text", "") # Look for answer tags with case-insensitive search answer_match = re.search( r"<answer>(.*?)</answer>", text, re.DOTALL | re.IGNORECASE ) if answer_match: answer_content = answer_match.group(1).strip() break elif isinstance(content, str): # Look for answer tags in string content answer_match = re.search(r"<answer>(.*?)</answer>", content, re.DOTALL | re.IGNORECASE) if answer_match: answer_content = answer_match.group(1).strip() break if answer_content: break if not answer_content: return {"valid": False, "reason": "No answer found in <answer> tags"} # Expected format - each line should have a key|value pair expected_keys = [ "TankSearchCount", "ZeroResultsCount", "HighestUseTerm", "Results20to30Term", "Hits15PlusCount", "ID10to15MaxResults", "DefaultStoreViewCount", "OneResultTerm", "HighestResultLastSearch", "Position3Bestseller", ] # Parse the answer lines = answer_content.strip().split("\n") # Check if we have exactly 10 lines if len(lines) != 10: return {"valid": False, "reason": f"Expected 10 data lines, found {len(lines)}"} # Parse each line and validate format extracted_data = {} for line in lines: if "|" not in line: return { "valid": False, "reason": f"Invalid format in line: {line}. Expected 'key|value' format", } parts = line.split("|", 1) if len(parts) != 2: return {"valid": False, "reason": f"Invalid format in line: {line}"} key, value = parts extracted_data[key] = value # Check all required keys are present missing_keys = set(expected_keys) - set(extracted_data.keys()) if missing_keys: return { "valid": False, "reason": f"Missing required keys: {', '.join(missing_keys)}", } # Validate specific data formats and expected values based on the current data # 1. TankSearchCount should be a number (2 terms containing 'tank') if not extracted_data["TankSearchCount"].isdigit(): return { "valid": False, "reason": f"TankSearchCount should be a number, got: {extracted_data['TankSearchCount']}", } # Expected: "Antonia Racer Tank" and "tanks" contain 'tank' if extracted_data["TankSearchCount"] != "2": return { "valid": False, "reason": f"TankSearchCount should be '2', got: {extracted_data['TankSearchCount']}", } # 2. ZeroResultsCount should be a number (nike has 0 results) if not extracted_data["ZeroResultsCount"].isdigit(): return { "valid": False, "reason": f"ZeroResultsCount should be a number, got: {extracted_data['ZeroResultsCount']}", } if extracted_data["ZeroResultsCount"] != "1": return { "valid": False, "reason": f"ZeroResultsCount should be '1', got: {extracted_data['ZeroResultsCount']}", } # 3. 
HighestUseTerm should be in format "term:uses" if ":" not in extracted_data["HighestUseTerm"]: return { "valid": False, "reason": f"HighestUseTerm should be in format 'term:uses', got: {extracted_data['HighestUseTerm']}", } # hollister has 19 uses (highest among terms with > 10 uses) if extracted_data["HighestUseTerm"] != "hollister:19": return { "valid": False, "reason": f"HighestUseTerm should be 'hollister:19', got: {extracted_data['HighestUseTerm']}", } # 4. Results20to30Term should be in format "term:results" if ":" not in extracted_data["Results20to30Term"]: return { "valid": False, "reason": f"Results20to30Term should be in format 'term:results', got: {extracted_data['Results20to30Term']}", } # Both "tanks" and "Antonia Racer Tank" have 23 results (between 20-30) valid_results20to30 = ["tanks:23", "Antonia Racer Tank:23"] # Check if answer contains one of the valid values or both separated by | if not any( val in extracted_data["Results20to30Term"] for val in valid_results20to30 ): return { "valid": False, "reason": f"Results20to30Term should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['Results20to30Term']}", } # 5. Hits15PlusCount should be a number (only hollister has 19 hits > 15) if not extracted_data["Hits15PlusCount"].isdigit(): return { "valid": False, "reason": f"Hits15PlusCount should be a number, got: {extracted_data['Hits15PlusCount']}", } if extracted_data["Hits15PlusCount"] != "1": return { "valid": False, "reason": f"Hits15PlusCount should be '1', got: {extracted_data['Hits15PlusCount']}", } # 6. ID10to15MaxResults should be in format "term:results" if ":" not in extracted_data["ID10to15MaxResults"]: return { "valid": False, "reason": f"ID10to15MaxResults should be in format 'term:results', got: {extracted_data['ID10to15MaxResults']}", } # ID 11 is hollister (1 result), ID 13 is Antonia Racer Tank (23 results) if extracted_data["ID10to15MaxResults"] != "Antonia Racer Tank:23": return { "valid": False, "reason": f"ID10to15MaxResults should be 'Antonia Racer Tank:23', got: {extracted_data['ID10to15MaxResults']}", } # 7. DefaultStoreViewCount should be a number (all 7 terms are from Default Store View) if not extracted_data["DefaultStoreViewCount"].isdigit(): return { "valid": False, "reason": f"DefaultStoreViewCount should be a number, got: {extracted_data['DefaultStoreViewCount']}", } if extracted_data["DefaultStoreViewCount"] != "7": return { "valid": False, "reason": f"DefaultStoreViewCount should be '7', got: {extracted_data['DefaultStoreViewCount']}", } # 8. OneResultTerm should be in format "term:uses" if ":" not in extracted_data["OneResultTerm"]: return { "valid": False, "reason": f"OneResultTerm should be in format 'term:uses', got: {extracted_data['OneResultTerm']}", } # Both hollister and WP10 have exactly 1 result valid_one_result = ["hollister:19", "WP10:1"] if not any(val in extracted_data["OneResultTerm"] for val in valid_one_result): return { "valid": False, "reason": f"OneResultTerm should contain 'hollister:19' or 'WP10:1', got: {extracted_data['OneResultTerm']}", } # 9. 
HighestResultLastSearch should be in format "term:results" if ":" not in extracted_data["HighestResultLastSearch"]: return { "valid": False, "reason": f"HighestResultLastSearch should be in format 'term:results', got: {extracted_data['HighestResultLastSearch']}", } # In Last Search Terms: tanks and Antonia Racer Tank both have 23 results (highest) valid_highest_last = ["tanks:23", "Antonia Racer Tank:23"] if not any( val in extracted_data["HighestResultLastSearch"] for val in valid_highest_last ): return { "valid": False, "reason": f"HighestResultLastSearch should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['HighestResultLastSearch']}", } # 10. Position3Bestseller should be in format "product:quantity" if ":" not in extracted_data["Position3Bestseller"]: return { "valid": False, "reason": f"Position3Bestseller should be in format 'product:quantity', got: {extracted_data['Position3Bestseller']}", } # Position 3 in Bestsellers is "Sprite Stasis Ball 65 cm" with quantity 6 if extracted_data["Position3Bestseller"] != "Sprite Stasis Ball 65 cm:6": return { "valid": False, "reason": f"Position3Bestseller should be 'Sprite Stasis Ball 65 cm:6', got: {extracted_data['Position3Bestseller']}", } # All validations passed return { "valid": True, "reason": "All complex search and filtering operations completed successfully", } if __name__ == "__main__": # Load messages from environment variable messages_path = os.getenv("MCP_MESSAGES") if not messages_path: print( json.dumps( {"valid": False, "reason": "MCP_MESSAGES environment variable not set"} ) ) exit(1) try: with open(messages_path, "r") as f: messages = json.load(f) except Exception as e: print( json.dumps({"valid": False, "reason": f"Failed to load messages: {str(e)}"}) ) exit(1) # Run verification result = verify(messages) print(json.dumps(result)) # Exit with appropriate code based on verification result sys.exit(0 if result["valid"] else 1) ================================================ FILE: tasks/playwright_webarena/standard/reddit/ai_data_analyst/description.md ================================================ I'm conducting an AI research analysis and need specific data from this forum platform. Please help me gather information about AI discussions and community engagement. **Task Requirements:** 1. Create a new account with the exact 'AIDataAnalyst2025' and password 'SecurePass123!' 2. After logging in, navigate to the deeplearning forum and count the exact number of posts visible on the first page 3. Go to the MachineLearning forum and find the post titled '[P] I made a command-line tool that explains your errors using ChatGPT (link in comments)' - record its exact vote count 4. Use the search bar to search for 'transformer' and record the exact title of the third search result 5. In the MachineLearning forum, click on the 'Sort by: Hot' button and change it to 'New'. Navigate to page 2 of the results and find the post with the highest upvote count on that page: - Record the exact post title - Record the exact vote count - Click through to the post and find the last comment - Record the username of the last commenter - Record the exact text of the last comment 6. 
After collecting all the data, go to the MachineLearning forum and submit a new post with:
   - Title: "MachineLearning_Extraction"
   - Body text must be EXACTLY these lines and nothing else (keep the keys as-is, only replace the values after the pipe, and follow the markdown format):

```
- Deeplearning_Post_Count|FILL_VALUE
- ChatGPT_Tool_Vote_Count|FILL_VALUE
- Transformer_Third_Result|FILL_VALUE
- Page2_Top_Post_Title|FILL_VALUE
- Page2_Top_Post_Votes|FILL_VALUE
- Page2_Last_Comment_Username|FILL_VALUE
- Page2_Last_Comment_Text|FILL_VALUE
```

================================================
FILE: tasks/playwright_webarena/standard/reddit/ai_data_analyst/label.txt
================================================
- Deeplearning_Post_Count|25
- ChatGPT_Tool_Vote_Count|2655
- Transformer_Third_Result|[R] The Table Feature Transformation Library Release
- Page2_Top_Post_Title|[D]GPT-4 might be able to tell you if it hallucinated
- Page2_Top_Post_Votes|634
- Page2_Last_Comment_Username|Nous_AI
- Page2_Last_Comment_Text|Fascinating.

================================================
FILE: tasks/playwright_webarena/standard/reddit/ai_data_analyst/meta.json
================================================
{
  "task_id": "ai_data_analyst",
  "task_name": "AI Data Analyst",
  "category_id": "reddit",
  "category_name": "Reddit",
  "description": "Create account on forum platform, collect AI/ML discussion metrics including post counts, vote data, and analyze community engagement patterns through systematic data extraction.",
  "author": "Fanqing Meng",
  "created_at": "2025-08-12",
  "difficulty": "L3",
  "tags": [
    "user interaction",
    "data extraction",
    "comparative analysis"
  ],
  "mcp": [
    "playwright"
  ],
  "meta_data": {
    "stateType": "video",
    "stateContent": null,
    "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
    "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
  }
}

================================================
FILE: tasks/playwright_webarena/standard/reddit/ai_data_analyst/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
    async_playwright,
    TimeoutError as PlaywrightTimeoutError,
)

# Read the base URL from the environment variable; fall back to the local default.
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")


def parse_key_value_format(text):
    """
    Parse the Key|Value format from the submission body using regex.
    Works with markdown format using pipe separators, with or without list markers.
""" data = {} # Define patterns for each field using pipe separator # Optional list markers (-, •, *) at the beginning patterns = { "Deeplearning_Post_Count": r"(?:[-•*]\s*)?Deeplearning_Post_Count\s*\|\s*(\d+)", "ChatGPT_Tool_Vote_Count": r"(?:[-•*]\s*)?ChatGPT_Tool_Vote_Count\s*\|\s*(\d+)", "Transformer_Third_Result": r"(?:[-•*]\s*)?Transformer_Third_Result\s*\|\s*(.+?)(?=\n|$)", "Page2_Top_Post_Title": r"(?:[-•*]\s*)?Page2_Top_Post_Title\s*\|\s*(.+?)(?=\n|$)", "Page2_Top_Post_Votes": r"(?:[-•*]\s*)?Page2_Top_Post_Votes\s*\|\s*(\d+)", "Page2_Last_Comment_Username": r"(?:[-•*]\s*)?Page2_Last_Comment_Username\s*\|\s*(.+?)(?=\n|$)", "Page2_Last_Comment_Text": r"(?:[-•*]\s*)?Page2_Last_Comment_Text\s*\|\s*(.+?)(?=\n|$)", } # Extract each field using regex for key, pattern in patterns.items(): match = re.search(pattern, text, re.MULTILINE) if match: # For text fields, clean up newlines and extra spaces value = match.group(1).strip() if key not in [ "Deeplearning_Post_Count", "ChatGPT_Tool_Vote_Count", "Page2_Top_Post_Votes", ]: # Replace newlines with spaces and normalize whitespace value = " ".join(value.split()) data[key] = value return data def normalize_text(text): """ Normalize text for comparison by handling different quote styles and whitespace. """ if not isinstance(text, str): return str(text) # Replace various quote styles with standard quotes text = text.replace(""", "'").replace(""", "'") text = text.replace('"', '"').replace('"', '"') # Normalize whitespace text = " ".join(text.split()) return text.strip() async def verify() -> bool: """ Verifies that the ML extraction task has been completed correctly by checking the forum post. """ async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() try: # Navigate to the main page print("Navigating to forum...", file=sys.stderr) await page.goto(f"{BASE_URL}/", wait_until="networkidle") # Step 1: Check if account was created correctly by trying to login print("Step 1: Verifying account creation...", file=sys.stderr) user_button = page.locator('button:has-text("AIDataAnalyst2025")') if not await user_button.count(): # Try to login print("Attempting to login with AIDataAnalyst2025...", file=sys.stderr) # Click login link await page.click('a:has-text("Log in")') await page.wait_for_load_state("networkidle") # Fill login form await page.fill('input[name="_username"]', "AIDataAnalyst2025") await page.fill('input[name="_password"]', "SecurePass123!") # Submit login form await page.click('button:has-text("Log in")') await page.wait_for_load_state("networkidle") # Check if login successful user_button = page.locator('button:has-text("AIDataAnalyst2025")') if not await user_button.count(): print("FAILED: Account AIDataAnalyst2025 with password SecurePass123! 
cannot be logged in", file=sys.stderr)
                    print("This means the account was not created correctly", file=sys.stderr)
                    return False
                print("PASSED: Successfully logged in as AIDataAnalyst2025", file=sys.stderr)
            else:
                print("PASSED: Already logged in as AIDataAnalyst2025", file=sys.stderr)

            # Step 2: Check if submission was created correctly in MachineLearning forum
            print("\nStep 2: Verifying submission creation...", file=sys.stderr)
            print("Navigating to MachineLearning forum...", file=sys.stderr)
            await page.goto(
                f"{BASE_URL}/f/MachineLearning", wait_until="networkidle"
            )

            # Look for the post with title "MachineLearning_Extraction"
            print(
                "Looking for submission with title 'MachineLearning_Extraction'...",
                file=sys.stderr,
            )
            post_link = page.locator('a:has-text("MachineLearning_Extraction")')
            if not await post_link.count():
                print(
                    "FAILED: Could not find submission with title 'MachineLearning_Extraction' in MachineLearning forum",
                    file=sys.stderr,
                )
                return False

            print("PASSED: Found submission 'MachineLearning_Extraction' in MachineLearning forum", file=sys.stderr)

            # Step 3: Check submission content matches expected values
            print("\nStep 3: Verifying submission content...", file=sys.stderr)

            # Click on the submission to view its content
            await post_link.first.click()
            await page.wait_for_load_state("networkidle")

            # Extract the submission body content
            # Try multiple possible selectors for the post body
            post_content = None
            selectors = [
                ".submission__body",
                ".post-body",
                ".RichText",
                '[class*="RichText"]',
                'div:has(> p:has-text("Deeplearning_Post_Count"))',
                'div:has-text("Deeplearning_Post_Count"):has-text("Page2_Last_Comment_Text")',
            ]

            for selector in selectors:
                content_element = page.locator(selector)
                if await content_element.count():
                    post_content = await content_element.first.inner_text()
                    if "Deeplearning_Post_Count" in post_content:
                        print(
                            f"Found submission content using selector: {selector}",
                            file=sys.stderr,
                        )
                        break

            if not post_content or "Deeplearning_Post_Count" not in post_content:
                print(
                    "FAILED: Could not find submission body with required format",
                    file=sys.stderr,
                )
                print(
                    "Expected body to contain 'Deeplearning_Post_Count' in pipe-separated format",
                    file=sys.stderr,
                )
                return False

            print("Found submission body content", file=sys.stderr)
            print(f"Raw content preview: {post_content[:200]}...", file=sys.stderr)

            # Parse the Key|Value format
            extracted_data = parse_key_value_format(post_content)
            print(f"Extracted data: {extracted_data}", file=sys.stderr)

            # Load expected values from label.txt
            label_path = Path(__file__).parent / "label.txt"
            if label_path.exists():
                with open(label_path, "r") as f:
                    expected_text = f.read().strip()
                expected_data = parse_key_value_format(expected_text)
                print("Loaded expected values from label.txt", file=sys.stderr)

            # Verify all required keys are present
            required_keys = [
                "Deeplearning_Post_Count",
                "ChatGPT_Tool_Vote_Count",
                "Transformer_Third_Result",
                "Page2_Top_Post_Title",
                "Page2_Top_Post_Votes",
                "Page2_Last_Comment_Username",
                "Page2_Last_Comment_Text",
            ]
            missing_keys = []
            for key in required_keys:
                if key not in extracted_data:
                    missing_keys.append(key)

            if missing_keys:
                print(
                    f"FAILED: Missing required keys in submission: {', '.join(missing_keys)}",
                    file=sys.stderr,
                )
                print(
                    "Expected all 7 fields to be present in pipe-separated format",
                    file=sys.stderr,
                )
                return False

            # Validate data format and content
            errors = []

            # Check numeric fields
            try:
                post_count = int(extracted_data["Deeplearning_Post_Count"])
                if (
                    "expected_data" in locals()
                    and "Deeplearning_Post_Count" in
expected_data ): expected_count = int(expected_data["Deeplearning_Post_Count"]) if post_count != expected_count: errors.append( f"Deeplearning_Post_Count mismatch: got {post_count}, expected {expected_count}" ) except ValueError: errors.append( f"Deeplearning_Post_Count must be a number, got: {extracted_data['Deeplearning_Post_Count']}" ) # If we have expected data, compare against it if "expected_data" in locals(): # Compare each field for key in required_keys: if key in expected_data and key in extracted_data: expected_val = normalize_text(expected_data[key]) actual_val = normalize_text(extracted_data[key]) # For numeric fields, compare as integers if key in [ "Deeplearning_Post_Count", "ChatGPT_Tool_Vote_Count", "Page2_Top_Post_Votes", ]: try: expected_int = int(expected_val) actual_int = int(actual_val) if expected_int != actual_int: errors.append( f"{key} mismatch: got {actual_int}, expected {expected_int}" ) except ValueError: errors.append( f"{key} should be numeric: got '{actual_val}'" ) else: # For text fields, compare normalized text if expected_val != actual_val: errors.append( f"{key} mismatch: got '{actual_val}', expected '{expected_val}'" ) else: # If no expected data, just do basic validation for key in required_keys: if key not in extracted_data: errors.append(f"Missing required key: {key}") elif ( not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]" ): errors.append(f"{key} was not filled in") if errors: print( "FAILED: Content validation failed with the following issues:", file=sys.stderr, ) for error in errors: print(f" - {error}", file=sys.stderr) print("\nExpected values from label.txt:", file=sys.stderr) if "expected_data" in locals(): for key in required_keys: if key in expected_data: print(f" {key}: {expected_data[key]}", file=sys.stderr) return False # All checks passed print("\n=== VERIFICATION SUCCESSFUL ===") print("✓ Step 1: Account AIDataAnalyst2025 can login with password SecurePass123!") print("✓ Step 2: Submission 'MachineLearning_Extraction' found in MachineLearning forum") print("✓ Step 3: All submission content matches expected values:") print(f" - Deeplearning_Post_Count: {extracted_data['Deeplearning_Post_Count']}") print(f" - ChatGPT_Tool_Vote_Count: {extracted_data['ChatGPT_Tool_Vote_Count']}") print(f" - Transformer_Third_Result: {extracted_data['Transformer_Third_Result']}") print(f" - Page2_Top_Post_Title: {extracted_data['Page2_Top_Post_Title']}") print(f" - Page2_Top_Post_Votes: {extracted_data['Page2_Top_Post_Votes']}") print(f" - Page2_Last_Comment_Username: {extracted_data['Page2_Last_Comment_Username']}") print(f" - Page2_Last_Comment_Text: {extracted_data['Page2_Last_Comment_Text']}") print("✓ All data in correct pipe-separated markdown format") return True except PlaywrightTimeoutError as e: print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr) return False except Exception as e: print(f"Error: Unexpected error - {str(e)}", file=sys.stderr) return False finally: await browser.close() def main(): """ Executes the verification process and exits with a status code. 
""" result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/reddit/budget_europe_travel/description.md ================================================ Create a comprehensive budget travel resource for European travel on the Postmill forum platform by establishing a dedicated travel community with integrated wiki guides and engaging content. **Task Requirements:** 1. Register a new account with username "EuroTravelPlanner" and password "SecureTravel2024!" 2. After successful registration and login, search for "Europe travel budget" using the search bar to check for existing content 3. Create a new forum dedicated to budget European travel: - Forum name: "BudgetEuropeTravel" - Title: "Budget Travel Europe" - Description: "Community for sharing money-saving tips for European travel" - Sidebar content: "Share your best European travel deals and budget tips here!" 4. Create a comprehensive wiki page for your forum: - Path: "europe-travel-budget-guide" - Title: "Complete Budget Travel Guide for Europe 2024" - Content must contain: "Eurail passes and budget airlines" 5. Navigate to your newly created forum and create your first post: - Title: "My 14-day Europe trip for under 1000 - Complete itinerary" - Body must contain: "budget guide wiki" 6. Search for "travel insurance Europe" using the search function: - upvote the first post 7. Navigate to user settings and configure: - Set timezone to "Europe/Amsterdam" (to match your travel planning focus) - Ensure "Notify on reply" is checked (enabled) for community engagement ================================================ FILE: tasks/playwright_webarena/standard/reddit/budget_europe_travel/meta.json ================================================ { "task_id": "budget_europe_travel", "task_name": "Budget Europe Travel", "category_id": "reddit", "category_name": "Reddit", "description": "Create comprehensive budget travel resource for European destinations by establishing dedicated travel community with integrated wiki guides and engaging travel content.", "author": "Fanqing Meng", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "content submission", "user interaction" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/reddit/budget_europe_travel/verify.py ================================================ import asyncio import sys import re import os from pathlib import Path from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/") def normalize_text(text): """ Normalize text for comparison by handling different quote styles and whitespace. """ if not isinstance(text, str): return str(text) # Replace various quote styles with standard quotes text = text.replace('\'', "'").replace('\'', "'") text = text.replace('"', '"').replace('"', '"') text = text.replace('&', '&') # Normalize whitespace text = ' '.join(text.split()) return text.strip() async def verify() -> bool: """ Verifies that the budget Europe travel resource task has been completed correctly. 
""" async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() verification_passed = True try: # 1. Check if account can login with correct credentials print("="*60) print("Step 1: Verifying account login with credentials...", file=sys.stderr) print("="*60) await page.goto(f"{BASE_URL}/", wait_until='networkidle') # First logout if already logged in user_button = page.locator('button:has-text("EuroTravelPlanner")') if await user_button.count(): print("Already logged in, logging out first...", file=sys.stderr) await user_button.click() logout_link = page.locator('a:has-text("Log out")') if await logout_link.count(): await logout_link.click() await page.wait_for_load_state('networkidle') # Now try to login with the specified credentials print("Attempting to login with username 'EuroTravelPlanner' and password 'SecureTravel2024!'...", file=sys.stderr) # Navigate to login page login_link = page.locator('a:has-text("Log in")') if await login_link.count(): await login_link.click() await page.wait_for_load_state('networkidle') else: print("❌ ERROR: Cannot find login link", file=sys.stderr) verification_passed = False if verification_passed: # Fill login form with exact credentials await page.fill('input[name="_username"]', 'EuroTravelPlanner') await page.fill('input[name="_password"]', 'SecureTravel2024!') # Submit login login_button = page.locator('button[type="submit"]:has-text("Log in")') if not await login_button.count(): login_button = page.locator('button:has-text("Log in")') await login_button.click() await page.wait_for_load_state('networkidle') # Verify login success user_button = page.locator('button:has-text("EuroTravelPlanner")') if not await user_button.count(): print("❌ ERROR: Login failed with username 'EuroTravelPlanner' and password 'SecureTravel2024!'", file=sys.stderr) verification_passed = False else: print("✓ Account login successful with correct credentials", file=sys.stderr) # 2. 
Check if forum exists and has correct properties print("\n" + "="*60) print("Step 2: Checking forum existence and properties...", file=sys.stderr) print("="*60) # Check if forum exists at /f/BudgetEuropeTravel await page.goto(f"{BASE_URL}/f/BudgetEuropeTravel", wait_until='networkidle') # Check if we get 404 or the forum exists page_content = await page.content() page_title = await page.title() if "404" in page_title or "not found" in page_title.lower() or "Page not found" in page_content: print("❌ ERROR: Forum /f/BudgetEuropeTravel does not exist (404)", file=sys.stderr) verification_passed = False else: print("✓ Forum /f/BudgetEuropeTravel exists", file=sys.stderr) # Navigate to edit page to check properties await page.goto(f"{BASE_URL}/f/BudgetEuropeTravel/edit", wait_until='networkidle') # Check if we can access edit page edit_page_content = await page.content() edit_page_title = await page.title() if "404" in edit_page_title or "not found" in edit_page_title.lower() or "Page not found" in edit_page_content: print("❌ ERROR: Cannot access forum edit page at /f/BudgetEuropeTravel/edit", file=sys.stderr) verification_passed = False else: print("✓ Forum edit page accessible", file=sys.stderr) # Check forum title title_input = page.locator('input[name*="title"], input#forum_title') if await title_input.count(): title_value = await title_input.input_value() if title_value != "Budget Travel Europe": print(f"❌ ERROR: Forum title is '{title_value}', expected 'Budget Travel Europe'", file=sys.stderr) verification_passed = False else: print("✓ Forum title correct: 'Budget Travel Europe'", file=sys.stderr) else: print("❌ ERROR: Cannot find forum title field", file=sys.stderr) verification_passed = False # Check forum description desc_input = page.locator('textarea[name*="description"], input[name*="description"]') if await desc_input.count(): desc_value = await desc_input.input_value() expected_desc = "Community for sharing money-saving tips for European travel" if desc_value != expected_desc: print(f"❌ ERROR: Forum description is '{desc_value}', expected '{expected_desc}'", file=sys.stderr) verification_passed = False else: print("✓ Forum description correct", file=sys.stderr) else: print("❌ ERROR: Cannot find forum description field", file=sys.stderr) verification_passed = False # Check sidebar content sidebar_input = page.locator('textarea[name*="sidebar"]') if await sidebar_input.count(): sidebar_value = await sidebar_input.input_value() expected_sidebar = "Share your best European travel deals and budget tips here!" if sidebar_value != expected_sidebar: print(f"❌ ERROR: Forum sidebar is '{sidebar_value}', expected '{expected_sidebar}'", file=sys.stderr) verification_passed = False else: print("✓ Forum sidebar correct", file=sys.stderr) else: print("❌ ERROR: Cannot find forum sidebar field", file=sys.stderr) verification_passed = False # 3. 
Check wiki page existence and content print("\n" + "="*60) print("Step 3: Checking wiki page existence and content...", file=sys.stderr) print("="*60) # Try the wiki URL with /wiki/ path await page.goto(f"{BASE_URL}/wiki/europe-travel-budget-guide", wait_until='networkidle') wiki_page_content = await page.content() wiki_page_title = await page.title() if "404" in wiki_page_title or "not found" in wiki_page_title.lower() or "Page not found" in wiki_page_content: print("❌ ERROR: Wiki page does not exist at /wiki/europe-travel-budget-guide", file=sys.stderr) verification_passed = False else: print("✓ Wiki page exists at /wiki/europe-travel-budget-guide", file=sys.stderr) # Check wiki title wiki_title_found = False expected_wiki_title = "Complete Budget Travel Guide for Europe 2024" # Try multiple selectors for wiki title wiki_title_selectors = [ f'h1:has-text("{expected_wiki_title}")', f'h1:text-is("{expected_wiki_title}")', 'h1' ] for selector in wiki_title_selectors: wiki_title_elem = page.locator(selector) if await wiki_title_elem.count(): title_text = await wiki_title_elem.first.text_content() if expected_wiki_title in title_text: wiki_title_found = True break if not wiki_title_found: print(f"❌ ERROR: Wiki title '{expected_wiki_title}' not found", file=sys.stderr) verification_passed = False else: print(f"✓ Wiki title correct: '{expected_wiki_title}'", file=sys.stderr) # Check for required content in wiki required_wiki_content = "Eurail passes and budget airlines" if required_wiki_content not in wiki_page_content: print(f"❌ ERROR: Wiki content must contain '{required_wiki_content}'", file=sys.stderr) verification_passed = False else: print(f"✓ Wiki content contains required text: '{required_wiki_content}'", file=sys.stderr) # 4. Check for post in the forum print("\n" + "="*60) print("Step 4: Checking for post in forum...", file=sys.stderr) print("="*60) await page.goto(f"{BASE_URL}/f/BudgetEuropeTravel", wait_until='networkidle') expected_post_title = "My 14-day Europe trip for under 1000 - Complete itinerary" post_link = page.locator(f'a:has-text("{expected_post_title}")') if not await post_link.count(): print(f"❌ ERROR: Post with title '{expected_post_title}' not found in forum", file=sys.stderr) verification_passed = False else: print(f"✓ Post found with title: '{expected_post_title}'", file=sys.stderr) # Click on the post to check its content await post_link.first.click() await page.wait_for_load_state('networkidle') # Check if post contains required text post_page_content = await page.content() required_post_content = "budget guide wiki" if required_post_content not in post_page_content: print(f"❌ ERROR: Post body must contain '{required_post_content}'", file=sys.stderr) verification_passed = False else: print(f"✓ Post content contains required text: '{required_post_content}'", file=sys.stderr) # 5. 
Check upvote on search result print("\n" + "="*60) print("Step 5: Checking upvote on search result...", file=sys.stderr) print("="*60) # Navigate to search results for "travel insurance Europe" await page.goto(f"{BASE_URL}/search?q=travel+insurance+Europe", wait_until='networkidle') # Check if we're on search results page if "/search" not in page.url: print("❌ ERROR: Not on search results page", file=sys.stderr) verification_passed = False else: print("✓ On search results page for 'travel insurance Europe'", file=sys.stderr) # Check for upvoted posts upvote_found = False # Method 1: Check for "Retract upvote" button (indicates user has upvoted) retract_buttons = page.locator('button:has-text("Retract upvote")') if await retract_buttons.count() > 0: print("✓ Found upvoted post (Retract upvote button present)", file=sys.stderr) upvote_found = True # Method 2: Check for posts with upvote count >= 1 if not upvote_found: # Look for vote counts vote_elements = page.locator('div.vote, span.vote-count, [class*="vote"]') for i in range(await vote_elements.count()): vote_elem = vote_elements.nth(i) vote_text = await vote_elem.text_content() try: # Extract number from vote text import re numbers = re.findall(r'\d+', vote_text) if numbers: vote_count = int(numbers[0]) if vote_count >= 1: print(f"✓ Found post with {vote_count} upvote(s)", file=sys.stderr) upvote_found = True break except: continue if not upvote_found: print("❌ ERROR: No upvoted posts found in search results", file=sys.stderr) verification_passed = False # 6. Check user settings print("\n" + "="*60) print("Step 6: Checking user settings...", file=sys.stderr) print("="*60) await page.goto(f"{BASE_URL}/user/EuroTravelPlanner/preferences", wait_until='networkidle') # Check timezone setting timezone_correct = False timezone_select = page.locator('select[name*="timezone"], select#timezone') if await timezone_select.count(): selected_value = await timezone_select.input_value() if selected_value == "Europe/Amsterdam": print("✓ Timezone correctly set to 'Europe/Amsterdam'", file=sys.stderr) timezone_correct = True else: # Check selected option text selected_option = timezone_select.locator('option[selected]') if await selected_option.count(): option_text = await selected_option.text_content() if "Amsterdam" in option_text: print("✓ Timezone correctly set to Europe/Amsterdam", file=sys.stderr) timezone_correct = True else: print(f"❌ ERROR: Timezone is set to '{option_text}', expected 'Europe/Amsterdam'", file=sys.stderr) verification_passed = False else: print(f"❌ ERROR: Timezone is '{selected_value}', expected 'Europe/Amsterdam'", file=sys.stderr) verification_passed = False else: print("❌ ERROR: Cannot find timezone selector", file=sys.stderr) verification_passed = False # Check "Notify on reply" setting notify_correct = False # Try multiple selectors for the checkbox notify_selectors = [ 'input[type="checkbox"]:near(:text("Notify on reply"))', 'label:has-text("Notify on reply") input[type="checkbox"]', 'input[type="checkbox"][name*="notify"]', 'input[type="checkbox"][id*="notify"]' ] for selector in notify_selectors: notify_checkbox = page.locator(selector) if await notify_checkbox.count(): is_checked = await notify_checkbox.first.is_checked() if is_checked: print("✓ 'Notify on reply' is enabled (checked)", file=sys.stderr) notify_correct = True else: print("❌ ERROR: 'Notify on reply' is not enabled (unchecked)", file=sys.stderr) verification_passed = False break if not notify_correct and verification_passed: print("❌ ERROR: Cannot verify 
'Notify on reply' setting", file=sys.stderr) verification_passed = False # Final summary print("\n" + "="*60) if verification_passed: print("✅ SUCCESS: All verification checks passed!", file=sys.stderr) else: print("❌ FAILED: One or more verification checks failed!", file=sys.stderr) print("="*60) return verification_passed except PlaywrightTimeoutError as e: print(f"❌ ERROR: Timeout occurred - {str(e)}", file=sys.stderr) return False except Exception as e: print(f"❌ ERROR: Unexpected error - {str(e)}", file=sys.stderr) return False finally: await browser.close() def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/reddit/buyitforlife_research/description.md ================================================ Research durable kitchen appliances on the BuyItForLife forum by extracting information about the most popular posts and their community engagement. **Task Requirements:** 1. Register a new account with username 'buyitforlife_researcher' and password 'BIFL2024Research!' 2. After successful registration, navigate to the BuyItForLife forum 3. Find the top 3 posts with the highest upvote counts and extract: - Exact post title - Exact upvote count - Exact number of comments 4. For the post with the highest upvotes from step 3, click through to read the comments and find the comment with the most upvotes. Record: - The exact comment text - The username of the commenter 5. From the BuyItForLife forum, identify the usernames of the authors who posted the top 3 posts with the highest number of upvotes 6. After collecting all the data, submit a new post in the BuyItForLife forum with: - Title: "Research Report for BuyItForLife" - Body text must be EXACTLY these lines without anything (keep the keys as-is, only replace the values after the pipe, follow the markdown format): ``` - Post1_Title|FILL_VALUE - Post1_Upvotes|FILL_VALUE - Post1_Comments|FILL_VALUE - Post2_Title|FILL_VALUE - Post2_Upvotes|FILL_VALUE - Post2_Comments|FILL_VALUE - Post3_Title|FILL_VALUE - Post3_Upvotes|FILL_VALUE - Post3_Comments|FILL_VALUE - TopComment_Text|FILL_VALUE - TopComment_Username|FILL_VALUE - Post1_Author|FILL_VALUE - Post2_Author|FILL_VALUE - Post3_Author|FILL_VALUE ``` ================================================ FILE: tasks/playwright_webarena/standard/reddit/buyitforlife_research/label.txt ================================================ - Post1_Title|Hand me down name tag trail on this child's jacket - Post1_Upvotes|14487 - Post1_Comments|163 - Post2_Title|My Grandmother's oven, circa 1966. Many holiday meals were cooked with love here, right up until the day she passed. Aside from one shattered glass door that was replaced, everything still works! 
- Post2_Upvotes|6413 - Post2_Comments|205 - Post3_Title|1956 Frigidaire Range - Post3_Upvotes|5797 - Post3_Comments|190 - TopComment_Text|Rei does this too - TopComment_Username|ATeaformeplease - Post1_Author|Mofomania - Post2_Author|_Mr_Roboto_ - Post3_Author|dezualy ================================================ FILE: tasks/playwright_webarena/standard/reddit/buyitforlife_research/meta.json ================================================ { "task_id": "buyitforlife_research", "task_name": "Buy It For Life Research", "category_id": "reddit", "category_name": "Reddit", "description": "Research durable lifetime-quality products by creating forum account, analyzing community recommendations, extracting product data, and compiling comprehensive durability report with voting metrics.", "author": "Fanqing Meng", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "user interaction", "data extraction", "search aggregation", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/reddit/buyitforlife_research/verify.py ================================================ import asyncio import sys import re import os from pathlib import Path from playwright.async_api import ( async_playwright, TimeoutError as PlaywrightTimeoutError, ) BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/") def parse_markdown_list_format(text): """ Parse the markdown list format (- Key|Value) from the submission body. Also handles plain Key|Value format without the dash. """ data = {} # Pattern to match lines like: - Key|Value or just Key|Value pattern = r'^(?:-\s*)?([^|]+)\|(.+)$' lines = text.strip().split('\n') for line in lines: line = line.strip() if '|' in line: match = re.match(pattern, line) if match: key = match.group(1).strip() value = match.group(2).strip() data[key] = value return data def normalize_text(text): """ Normalize text for comparison by handling different quote styles and whitespace. """ if not isinstance(text, str): return str(text) # Replace various quote styles with standard quotes text = text.replace(""", "'").replace(""", "'") text = text.replace('"', '"').replace('"', '"') # Normalize whitespace text = " ".join(text.split()) return text.strip() async def check_account_login(page): """ Check if the account can be successfully logged in. Returns True if login successful, False otherwise. 
""" try: # Navigate to main page await page.goto(f"{BASE_URL}/", wait_until="networkidle") # Check if already logged in user_button = page.locator('button[class*="UserButton"]') if await user_button.count(): username = await user_button.inner_text() if username.lower() == "buyitforlife_researcher": print(f"Already logged in as {username}", file=sys.stderr) return True else: # Log out first await user_button.click() await page.click('a:has-text("Log out")') await page.wait_for_load_state("networkidle") # Try to login await page.click('a:has-text("Log in")') await page.wait_for_load_state("networkidle") await page.fill('input[name="_username"]', "buyitforlife_researcher") await page.fill('input[name="_password"]', "BIFL2024Research!") await page.click('button:has-text("Log in")') await page.wait_for_load_state("networkidle") # Check if login successful user_button = page.locator('button:has-text("buyitforlife_researcher")') if await user_button.count(): print("Successfully logged in as buyitforlife_researcher", file=sys.stderr) return True else: print("Login failed for buyitforlife_researcher", file=sys.stderr) return False except Exception as e: print(f"Error during login check: {str(e)}", file=sys.stderr) return False async def check_submission_exists(page): """ Check if the submission exists in the BuyItForLife forum with correct content. Returns (exists, content_data) tuple. """ try: # Navigate to BuyItForLife forum await page.goto(f"{BASE_URL}/f/BuyItForLife", wait_until="networkidle") # Look for the post with correct title print("Looking for post 'Research Report for BuyItForLife'...", file=sys.stderr) post_locator = page.locator('a:has-text("Research Report for BuyItForLife")') if not await post_locator.count(): print("Error: Could not find post with title 'Research Report for BuyItForLife'", file=sys.stderr) return False, None # Click on the post to view its content await post_locator.first.click() await page.wait_for_load_state("networkidle") # Get the post content post_content = None selectors = [ '.PostFullItem-body', '.Post-body', '.PostItem-body', '.item-RichText', '[class*="RichText"]', 'div:has-text("Post1_Title")', ] for selector in selectors: post_content_element = page.locator(selector) if await post_content_element.count(): # Get the text content, handling multiple elements if needed if await post_content_element.count() > 1: for i in range(await post_content_element.count()): text = await post_content_element.nth(i).inner_text() if "Post1_Title" in text: post_content = text print(f"Found post content using selector: {selector} (element {i})", file=sys.stderr) break else: post_content = await post_content_element.first.inner_text() print(f"Found post content using selector: {selector}", file=sys.stderr) if post_content and "Post1_Title" in post_content: break if not post_content: print("Error: Could not find post content element", file=sys.stderr) return False, None print("Post content found:", file=sys.stderr) print(post_content[:200] + "..." if len(post_content) > 200 else post_content, file=sys.stderr) # Parse the markdown list format extracted_data = parse_markdown_list_format(post_content) print(f"Extracted data: {extracted_data}", file=sys.stderr) return True, extracted_data except Exception as e: print(f"Error checking submission: {str(e)}", file=sys.stderr) return False, None async def verify() -> bool: """ Verifies that the BuyItForLife research task has been completed correctly. Checks: 1. Account creation (can login with credentials) 2. 
Submission exists with correct title 3. Submission content matches expected format and values """ async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() try: # Step 1: Check account creation print("=== Step 1: Checking account creation ===", file=sys.stderr) account_ok = await check_account_login(page) if not account_ok: print("Error: Account 'buyitforlife_researcher' cannot be logged in", file=sys.stderr) return False # Step 2: Check submission exists and get content print("\n=== Step 2: Checking submission ===", file=sys.stderr) submission_exists, extracted_data = await check_submission_exists(page) if not submission_exists: print("Error: Submission not found in BuyItForLife forum", file=sys.stderr) return False if not extracted_data: print("Error: Could not extract data from submission", file=sys.stderr) return False # Step 3: Load expected data from label.txt print("\n=== Step 3: Validating submission content ===", file=sys.stderr) label_path = Path(__file__).parent / "label.txt" if not label_path.exists(): print("Error: label.txt not found", file=sys.stderr) return False with open(label_path, "r") as f: expected_text = f.read().strip() expected_data = parse_markdown_list_format(expected_text) print(f"Expected data from label.txt: {expected_data}", file=sys.stderr) # Verify all required keys are present required_keys = [ "Post1_Title", "Post1_Upvotes", "Post1_Comments", "Post2_Title", "Post2_Upvotes", "Post2_Comments", "Post3_Title", "Post3_Upvotes", "Post3_Comments", "TopComment_Text", "TopComment_Username", "Post1_Author", "Post2_Author", "Post3_Author", ] missing_keys = [] for key in required_keys: if key not in extracted_data: missing_keys.append(key) if missing_keys: print(f"Error: Missing required keys: {', '.join(missing_keys)}", file=sys.stderr) return False # Compare each field with expected values errors = [] for key in required_keys: if key in expected_data and key in extracted_data: expected_val = normalize_text(expected_data[key]) actual_val = normalize_text(extracted_data[key]) # For numeric fields, compare as integers if "Upvotes" in key or "Comments" in key: try: expected_int = int(expected_val) actual_int = int(actual_val) if expected_int != actual_int: errors.append(f"{key} mismatch: got {actual_int}, expected {expected_int}") except ValueError: errors.append(f"{key} should be numeric: got '{actual_val}'") else: # For text fields, special handling for usernames with underscores if "Author" in key or key == "TopComment_Username": expected_core = expected_val.strip('_') actual_core = actual_val.strip('_') if expected_core != actual_core: errors.append(f"{key} mismatch: got '{actual_val}', expected '{expected_val}'") else: if expected_val != actual_val: errors.append(f"{key} mismatch: got '{actual_val}', expected '{expected_val}'") # Verify upvotes are in descending order try: post1_votes = int(extracted_data["Post1_Upvotes"]) post2_votes = int(extracted_data["Post2_Upvotes"]) post3_votes = int(extracted_data["Post3_Upvotes"]) if not (post1_votes >= post2_votes >= post3_votes): errors.append(f"Posts should be ordered by upvotes: {post1_votes} >= {post2_votes} >= {post3_votes}") except (ValueError, KeyError): pass # Already reported above if errors: print("Error: Validation failed with the following issues:", file=sys.stderr) for error in errors: print(f" - {error}", file=sys.stderr) return False # All checks passed print("\n=== SUCCESS ===", file=sys.stderr) print("✓ 
Account 'buyitforlife_researcher' created and can login", file=sys.stderr) print("✓ Submission 'Research Report for BuyItForLife' found in correct forum", file=sys.stderr) print("✓ All 14 required fields present and correct", file=sys.stderr) print("✓ Data matches expected values from label.txt", file=sys.stderr) print("✓ Posts ordered by upvotes (descending)", file=sys.stderr) return True except PlaywrightTimeoutError as e: print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr) return False except Exception as e: print(f"Error: Unexpected error - {str(e)}", file=sys.stderr) return False finally: await browser.close() def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/reddit/llm_research_summary/description.md ================================================ I need you to perform a comprehensive analysis of Large Language Model discussions on this forum platform and create a summary post. This analysis will help me understand the community's engagement with LLM topics. **Task Requirements:** 1. Register a new account with username 'llm_analyst_2024' and password 'LLMExpert2024!' 2. After successful registration, navigate to the MachineLearning forum and analyze posts about Large Language Models: - Count all posts on the first page that contain 'GPT', 'ChatGPT', or 'LLM' in their titles 3. Identify the top 3 posts with the highest upvote counts from your LLM-related posts list and record their complete details 4. Navigate to the deeplearning forum and go to page 2: - Find the post with the most comments on page 2 - Record its exact title and comment count 5. Create a new submission in the MachineLearning forum with: - Title: "LLM Research Summary: GPT Discussions Analysis [2024]" - Body text must be EXACTLY these lines without anything (keep the keys as-is, only replace the values after the pipe, follow the markdown format): ``` - Total_LLM_Posts|FILL_VALUE - Top1_Title|FILL_VALUE - Top1_Upvotes|FILL_VALUE - Top1_Date|FILL_VALUE - Top2_Title|FILL_VALUE - Top2_Upvotes|FILL_VALUE - Top2_Date|FILL_VALUE - Top3_Title|FILL_VALUE - Top3_Upvotes|FILL_VALUE - Top3_Date|FILL_VALUE - Deeplearning_MostDiscussed|FILL_VALUE - Deeplearning_Comments|FILL_VALUE ``` ================================================ FILE: tasks/playwright_webarena/standard/reddit/llm_research_summary/label.txt ================================================ - Total_LLM_Posts|9 - Top1_Title|[P] I made a command-line tool that explains your errors using ChatGPT (link in comments) - Top1_Upvotes|2655 - Top1_Date|3 years ago - Top2_Title|[P] I built Adrenaline, a debugger that fixes errors and explains them with GPT-3 - Top2_Upvotes|1542 - Top2_Date|3 years ago - Top3_Title|[N] OpenAI may have benchmarked GPT-4's coding ability on it's own training data - Top3_Upvotes|925 - Top3_Date|2 years ago - Deeplearning_MostDiscussed|Do companies actually care about their model's training/inference speed? 
- Deeplearning_Comments|39

================================================
FILE: tasks/playwright_webarena/standard/reddit/llm_research_summary/meta.json
================================================
{
  "task_id": "llm_research_summary",
  "task_name": "LLM Research Summary",
  "category_id": "reddit",
  "category_name": "Reddit",
  "description": "Aggregate and analyze LLM research discussions across multiple forums, collect trending topics, compile technical insights, and create comprehensive summary post with community engagement.",
  "author": "Fanqing Meng",
  "created_at": "2025-08-12",
  "difficulty": "L3",
  "tags": [
    "data extraction",
    "search aggregation",
    "content submission",
    "user interaction"
  ],
  "mcp": [
    "playwright"
  ],
  "meta_data": {
    "stateType": "video",
    "stateContent": null,
    "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
    "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
  }
}

================================================
FILE: tasks/playwright_webarena/standard/reddit/llm_research_summary/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
    async_playwright,
    TimeoutError as PlaywrightTimeoutError,
)

# Read the base URL from the environment variable; fall back to the local default.
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")


def parse_key_value_format(text):
    """
    Parse the Key|Value format from the submission body.
    Handles both pipe (|) and colon (:) separators for compatibility.
    """
    data = {}

    # Try to parse with pipe separator first (expected format)
    lines = text.strip().split('\n')
    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Remove markdown list prefix if present
        if line.startswith('- '):
            line = line[2:]
        elif line.startswith('* '):
            line = line[2:]

        # Try pipe separator first
        if '|' in line:
            parts = line.split('|', 1)
            if len(parts) == 2:
                key = parts[0].strip()
                value = parts[1].strip()
                data[key] = value
        # Fallback to colon separator for label.txt compatibility
        elif ':' in line:
            parts = line.split(':', 1)
            if len(parts) == 2:
                key = parts[0].strip()
                value = parts[1].strip()
                data[key] = value

    return data


def normalize_text(text):
    """
    Normalize text for comparison by handling different quote styles and whitespace.
    """
    if not isinstance(text, str):
        return str(text)

    # Replace curly quote styles with standard straight quotes
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')

    # Normalize whitespace
    text = " ".join(text.split())

    return text.strip()


async def verify() -> bool:
    """
    Verifies that the LLM analysis task has been completed correctly.
""" async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() try: # Navigate to the main page print("Navigating to forum...", file=sys.stderr) await page.goto(f"{BASE_URL}/", wait_until="networkidle") # Check if logged in as llm_analyst_2024 user_button = page.locator('button:has-text("llm_analyst_2024")') if not await user_button.count(): # Try to login print("Not logged in, attempting to login...", file=sys.stderr) await page.click('a:has-text("Log in")') await page.wait_for_load_state("networkidle") await page.fill('input[name="_username"]', "llm_analyst_2024") await page.fill('input[name="_password"]', "LLMExpert2024!") await page.click('button:has-text("Log in")') await page.wait_for_load_state("networkidle") user_button = page.locator('button:has-text("llm_analyst_2024")') if not await user_button.count(): print("Error: Login failed for llm_analyst_2024", file=sys.stderr) return False print("Successfully logged in as llm_analyst_2024", file=sys.stderr) else: print("Already logged in as llm_analyst_2024", file=sys.stderr) # Navigate to MachineLearning forum print("Navigating to MachineLearning forum...", file=sys.stderr) await page.goto( f"{BASE_URL}/f/MachineLearning", wait_until="networkidle" ) # Look for the submission with our specific title print( "Looking for submission 'LLM Research Summary: GPT Discussions Analysis [2024]'...", file=sys.stderr, ) post_link = page.locator( 'a:has-text("LLM Research Summary: GPT Discussions Analysis [2024]")' ) if not await post_link.count(): print( "Error: Could not find submission with required title", file=sys.stderr, ) return False # Click on the submission to view its content await post_link.first.click() await page.wait_for_load_state("networkidle") # Extract the submission body content # Try multiple possible selectors for the post body post_content = None selectors = [ ".submission__body", ".post-body", ".RichText", '[class*="RichText"]', 'div:has(> p:has-text("Total_LLM_Posts"))', 'div:has-text("Total_LLM_Posts"):has-text("Deeplearning_Comments")', ] for selector in selectors: content_element = page.locator(selector) if await content_element.count(): post_content = await content_element.first.inner_text() if "Total_LLM_Posts" in post_content: print( f"Found submission content using selector: {selector}", file=sys.stderr, ) break if not post_content or "Total_LLM_Posts" not in post_content: print( "Error: Could not find submission body with required format", file=sys.stderr, ) return False print("Submission content found, parsing data...", file=sys.stderr) print(f"Raw content: {post_content[:200]}...", file=sys.stderr) # Parse the Key: Value format extracted_data = parse_key_value_format(post_content) print(f"Extracted data: {extracted_data}", file=sys.stderr) # Load expected values from label.txt label_path = Path(__file__).parent / "label.txt" if label_path.exists(): with open(label_path, "r") as f: expected_text = f.read().strip() expected_data = parse_key_value_format(expected_text) print("Loaded expected values from label.txt", file=sys.stderr) # Verify all required keys are present required_keys = [ "Total_LLM_Posts", "Top1_Title", "Top1_Upvotes", "Top1_Date", "Top2_Title", "Top2_Upvotes", "Top2_Date", "Top3_Title", "Top3_Upvotes", "Top3_Date", "Deeplearning_MostDiscussed", "Deeplearning_Comments", ] missing_keys = [] for key in required_keys: if key not in extracted_data: missing_keys.append(key) if missing_keys: print( f"Error: 
Missing required keys: {', '.join(missing_keys)}", file=sys.stderr, ) return False # Validate data format and content errors = [] # Check Total_LLM_Posts is a number and matches expected try: total_posts = int(extracted_data["Total_LLM_Posts"]) if "expected_data" in locals() and "Total_LLM_Posts" in expected_data: expected_total = int(expected_data["Total_LLM_Posts"]) if total_posts != expected_total: errors.append( f"Total_LLM_Posts mismatch: got {total_posts}, expected {expected_total}" ) elif total_posts < 5: # Based on exploration, should be at least 5 errors.append(f"Total_LLM_Posts seems too low: {total_posts}") except ValueError: errors.append( f"Total_LLM_Posts must be a number, got: {extracted_data['Total_LLM_Posts']}" ) # If we have expected data, compare against it if "expected_data" in locals(): # Compare each field for key in required_keys: if key in expected_data and key in extracted_data: expected_val = normalize_text(expected_data[key]) actual_val = normalize_text(extracted_data[key]) # For numeric fields, compare as integers if ( "Upvotes" in key or "Comments" in key or key == "Total_LLM_Posts" ): try: expected_int = int(expected_val) actual_int = int(actual_val) if expected_int != actual_int: errors.append( f"{key} mismatch: got {actual_int}, expected {expected_int}" ) except ValueError: errors.append( f"{key} should be numeric: got '{actual_val}'" ) else: # For text fields, compare normalized text if expected_val != actual_val: errors.append( f"{key} mismatch: got '{actual_val}', expected '{expected_val}'" ) else: # If no expected data, just do basic validation for key in required_keys: if key not in extracted_data: errors.append(f"Missing required key: {key}") elif ( not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]" ): errors.append(f"{key} was not filled in") # Verify upvotes are in descending order for top 3 try: top1_votes = int(extracted_data["Top1_Upvotes"]) top2_votes = int(extracted_data["Top2_Upvotes"]) top3_votes = int(extracted_data["Top3_Upvotes"]) if not (top1_votes >= top2_votes >= top3_votes): errors.append( f"Top posts should be ordered by upvotes: {top1_votes} >= {top2_votes} >= {top3_votes}" ) except (ValueError, KeyError): pass # Already reported above if errors: print( "Error: Validation failed with the following issues:", file=sys.stderr, ) for error in errors: print(f" - {error}", file=sys.stderr) return False # All checks passed print("Success: LLM analysis task completed successfully.") print("- Account llm_analyst_2024 verified") print( "- Submission 'LLM Research Summary: GPT Discussions Analysis [2024]' found" ) print( f"- Total LLM-related posts analyzed: {extracted_data['Total_LLM_Posts']}" ) print("- Top 3 posts by upvotes identified and documented") print( f"- Deeplearning forum page 2 most discussed post: {extracted_data['Deeplearning_MostDiscussed']}" ) print("- All data in correct Key: Value format with 12 lines") return True except PlaywrightTimeoutError as e: print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr) return False except Exception as e: print(f"Error: Unexpected error - {str(e)}", file=sys.stderr) return False finally: await browser.close() def main(): """ Executes the verification process and exits with a status code. 
""" result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/description.md ================================================ I need you to analyze wonderful movie discussions on Postmill and create a summary post. This analysis will help me understand the community's engagement with movie topics. **Task Requirements:** 1. Register a new account with username 'movie_reviewer_2024' and password 'movie_reviewer_2024' 2. After successful registration, navigate to the movies forum and analyze the posts on the first page: - Count all posts that have 4-digit years in their titles 3. Identify the top 3 posts with the highest upvote counts from ALL posts on the first page and record their complete details 4. Find these specific posts and record their data: - The 'Rittenhouse Square' poster post (record exact upvotes and comments) 5. Navigate through the first 5 pages of movies forum and count the total number of image/poster submissions across all 5 pages 6. Create a new submission in the movies forum with: - Title: "Wonderful Movies Analysis: Community Favorites [2024]" - Body text must be EXACTLY these lines without anything (keep the keys as-is, only replace the values after the pipe, follow the markdown format): ``` - Total_Year_Posts|FILL_VALUE - Top1_Title|FILL_VALUE - Top1_Upvotes|FILL_VALUE - Top1_Comments|FILL_VALUE - Top2_Title|FILL_VALUE - Top2_Upvotes|FILL_VALUE - Top2_Comments|FILL_VALUE - Top3_Title|FILL_VALUE - Top3_Upvotes|FILL_VALUE - Top3_Comments|FILL_VALUE - Rittenhouse_Upvotes|FILL_VALUE - Rittenhouse_Comments|FILL_VALUE - Total_Image_Posts_5Pages|FILL_VALUE ``` ================================================ FILE: tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/label.txt ================================================ - Total_Year_Posts|1 - Top1_Title|Who will win the Oscar for ACTRESS IN A SUPPORTING ROLE? - Top1_Upvotes|9933 - Top1_Comments|23 - Top2_Title|Who will win the Oscar for FILM EDITING? 
- Top2_Upvotes|7720 - Top2_Comments|20 - Top3_Title|Cindy Williams Dies: 'Laverne & Shirley' Star Who Appeared In 'American Graffiti' & 'The Conversation' Was 75 - Top3_Upvotes|5268 - Top3_Comments|190 - Rittenhouse_Upvotes|2761 - Rittenhouse_Comments|182 - Total_Image_Posts_5Pages|6 ================================================ FILE: tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/meta.json ================================================ { "task_id": "movie_reviewer_analysis", "task_name": "Movie Reviewer Analysis", "category_id": "reddit", "category_name": "Reddit", "description": "Analyze movie review patterns by creating reviewer profile, collecting ratings data, tracking review trends, and generating analytical report on community movie preferences and discussions.", "author": "Fanqing Meng", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "user interaction", "data extraction", "comparative analysis", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/verify.py ================================================ import asyncio import sys import re import os from pathlib import Path from playwright.async_api import ( async_playwright, TimeoutError as PlaywrightTimeoutError, ) # 从环境变量读取 base_url,默认回退到原地址 BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/") print(f"Using base URL: {BASE_URL}") def parse_key_value_format(text): """ Parse the Key|Value format from the submission body using regex. Works regardless of line breaks. """ data = {} # Define patterns for each field with the pipe separator patterns = { "Total_Year_Posts": r"Total_Year_Posts\s*\|\s*(\d+)", "Top1_Title": r"Top1_Title\s*\|\s*(.+?)(?=\nTop1_Upvotes|$)", "Top1_Upvotes": r"Top1_Upvotes\s*\|\s*(\d+)", "Top1_Comments": r"Top1_Comments\s*\|\s*(\d+)", "Top2_Title": r"Top2_Title\s*\|\s*(.+?)(?=\nTop2_Upvotes|$)", "Top2_Upvotes": r"Top2_Upvotes\s*\|\s*(\d+)", "Top2_Comments": r"Top2_Comments\s*\|\s*(\d+)", "Top3_Title": r"Top3_Title\s*\|\s*(.+?)(?=\nTop3_Upvotes|$)", "Top3_Upvotes": r"Top3_Upvotes\s*\|\s*(\d+)", "Top3_Comments": r"Top3_Comments\s*\|\s*(\d+)", "Rittenhouse_Upvotes": r"Rittenhouse_Upvotes\s*\|\s*(\d+)", "Rittenhouse_Comments": r"Rittenhouse_Comments\s*\|\s*(\d+)", "Total_Image_Posts_5Pages": r"Total_Image_Posts_5Pages\s*\|\s*(\d+)", } # Extract each field using regex for key, pattern in patterns.items(): match = re.search(pattern, text, re.DOTALL | re.MULTILINE) if match: # For title fields, clean up newlines and extra spaces value = match.group(1).strip() if "Title" in key: # Replace newlines with spaces and normalize whitespace value = " ".join(value.split()) data[key] = value return data def normalize_text(text): """ Normalize text for comparison by handling different quote styles and whitespace. """ if not isinstance(text, str): return str(text) # Replace various quote styles with standard quotes text = text.replace(""", "'").replace(""", "'") text = text.replace('"', '"').replace('"', '"') text = text.replace("&", "&") # Normalize whitespace text = " ".join(text.split()) return text.strip() async def verify() -> bool: """ Verifies that the wonderful movies analysis task has been completed correctly. 
""" async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() try: # Navigate to the main page print("Navigating to forum...", file=sys.stderr) await page.goto(f"{BASE_URL}/", wait_until="networkidle") # Check if logged in as movie_reviewer_2024 user_button = page.locator('button:has-text("movie_reviewer_2024")') if not await user_button.count(): # Try to login print("Not logged in, attempting to login...", file=sys.stderr) await page.click('a:has-text("Log in")') await page.wait_for_load_state("networkidle") await page.fill('input[name="_username"]', "movie_reviewer_2024") await page.fill('input[name="_password"]', "movie_reviewer_2024") await page.click('button:has-text("Log in")') await page.wait_for_load_state("networkidle") user_button = page.locator('button:has-text("movie_reviewer_2024")') if not await user_button.count(): print( "Error: Login failed for movie_reviewer_2024", file=sys.stderr ) return False print("Successfully logged in as movie_reviewer_2024", file=sys.stderr) else: print("Already logged in as movie_reviewer_2024", file=sys.stderr) # Navigate to movies forum print("Navigating to movies forum...", file=sys.stderr) await page.goto( f"{BASE_URL}/f/movies", wait_until="networkidle" ) # Look for the submission with our specific title print( "Looking for submission 'Wonderful Movies Analysis: Community Favorites [2024]'...", file=sys.stderr, ) post_link = page.locator( 'a:has-text("Wonderful Movies Analysis: Community Favorites [2024]")' ) if not await post_link.count(): print( "Error: Could not find submission with required title", file=sys.stderr, ) return False # Click on the submission to view its content await post_link.first.click() await page.wait_for_load_state("networkidle") # Extract the submission body content # Try multiple possible selectors for the post body post_content = None selectors = [ ".submission__body", ".post-body", ".RichText", '[class*="RichText"]', 'div:has(> p:has-text("Total_Year_Posts"))', 'div:has-text("Total_Year_Posts"):has-text("Total_Image_Posts_5Pages")', ] for selector in selectors: content_element = page.locator(selector) if await content_element.count(): post_content = await content_element.first.inner_text() if "Total_Year_Posts" in post_content: print( f"Found submission content using selector: {selector}", file=sys.stderr, ) break if not post_content or "Total_Year_Posts" not in post_content: print( "Error: Could not find submission body with required format", file=sys.stderr, ) return False print("Submission content found, parsing data...", file=sys.stderr) print(f"Raw content: {post_content[:200]}...", file=sys.stderr) # Parse the Key: Value format extracted_data = parse_key_value_format(post_content) print(f"Extracted data: {extracted_data}", file=sys.stderr) # Load expected values from label.txt label_path = Path(__file__).parent / "label.txt" if label_path.exists(): with open(label_path, "r") as f: expected_text = f.read().strip() expected_data = parse_key_value_format(expected_text) print("Loaded expected values from label.txt", file=sys.stderr) # Verify all required keys are present required_keys = [ "Total_Year_Posts", "Top1_Title", "Top1_Upvotes", "Top1_Comments", "Top2_Title", "Top2_Upvotes", "Top2_Comments", "Top3_Title", "Top3_Upvotes", "Top3_Comments", "Rittenhouse_Upvotes", "Rittenhouse_Comments", "Total_Image_Posts_5Pages", ] missing_keys = [] for key in required_keys: if key not in extracted_data: 
missing_keys.append(key) if missing_keys: print( f"Error: Missing required keys: {', '.join(missing_keys)}", file=sys.stderr, ) return False # Validate data format and content errors = [] # Check Total_Year_Posts is a number and matches expected try: total_posts = int(extracted_data["Total_Year_Posts"]) if "expected_data" in locals() and "Total_Year_Posts" in expected_data: expected_total = int(expected_data["Total_Year_Posts"]) if total_posts != expected_total: errors.append( f"Total_Year_Posts mismatch: got {total_posts}, expected {expected_total}" ) except ValueError: errors.append( f"Total_Year_Posts must be a number, got: {extracted_data['Total_Year_Posts']}" ) # If we have expected data, compare against it if "expected_data" in locals(): # Compare each field for key in required_keys: if key in expected_data and key in extracted_data: expected_val = normalize_text(expected_data[key]) actual_val = normalize_text(extracted_data[key]) # For numeric fields, compare as integers if ( "Upvotes" in key or "Comments" in key or key == "Total_Year_Posts" or key == "Total_Image_Posts_5Pages" ): try: expected_int = int(expected_val) actual_int = int(actual_val) if expected_int != actual_int: errors.append( f"{key} mismatch: got {actual_int}, expected {expected_int}" ) except ValueError: errors.append( f"{key} should be numeric: got '{actual_val}'" ) else: # For text fields, compare normalized text if expected_val != actual_val: errors.append( f"{key} mismatch: got '{actual_val}', expected '{expected_val}'" ) else: # If no expected data, just do basic validation for key in required_keys: if key not in extracted_data: errors.append(f"Missing required key: {key}") elif ( not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]" ): errors.append(f"{key} was not filled in") if errors: print( "Error: Validation failed with the following issues:", file=sys.stderr, ) for error in errors: print(f" - {error}", file=sys.stderr) return False # All checks passed print("Success: Wonderful movies analysis task completed successfully.") print("- Account movie_reviewer_2024 verified") print( "- Submission 'Wonderful Movies Analysis: Community Favorites [2024]' found" ) print(f"- Total posts with years: {extracted_data['Total_Year_Posts']}") print("- Top 3 posts by upvotes identified and documented") print( f"- Rittenhouse Square data: {extracted_data['Rittenhouse_Upvotes']} upvotes, {extracted_data['Rittenhouse_Comments']} comments" ) print( f"- Total image posts across 5 pages: {extracted_data['Total_Image_Posts_5Pages']}" ) print("- All data in correct Key|Value format") return True except PlaywrightTimeoutError as e: print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr) return False except Exception as e: print(f"Error: Unexpected error - {str(e)}", file=sys.stderr) return False finally: await browser.close() def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/description.md ================================================ I'm conducting research on NBA player discussions in online sports communities. Please help me create a comprehensive analysis. **Task Requirements:** 1. Register a new account with username 'NBA_DataAnalyst_2024' and password 'Research#2024!' 2. 
Navigate to the sports forum and search for posts containing 'NBA' in their titles:
   - Collect data from the 5 NBA-related posts with the most comments
   - For each post, record: the exact post title, vote count, comment count, and the username of the person who submitted it
3. Visit the user profile of 'BCLetsRide69':
   - Count his total submissions
4. Create a new submission in the sports forum with:
   - Title: "Statistical Analysis: NBA Content Engagement on This Forum"
   - Body text must be EXACTLY these lines and nothing else (keep the keys as-is, only replace the values after the pipe, follow the markdown format):
```
- Total_NBA_Posts|FILL_VALUE
- Top1_Title|FILL_VALUE
- Top1_Votes|FILL_VALUE
- Top1_Comments|FILL_VALUE
- Top1_Author|FILL_VALUE
- Top2_Title|FILL_VALUE
- Top2_Votes|FILL_VALUE
- Top2_Comments|FILL_VALUE
- Top2_Author|FILL_VALUE
- Top3_Title|FILL_VALUE
- Top3_Votes|FILL_VALUE
- Top3_Comments|FILL_VALUE
- Top3_Author|FILL_VALUE
- Top4_Title|FILL_VALUE
- Top4_Votes|FILL_VALUE
- Top4_Comments|FILL_VALUE
- Top4_Author|FILL_VALUE
- Top5_Title|FILL_VALUE
- Top5_Votes|FILL_VALUE
- Top5_Comments|FILL_VALUE
- Top5_Author|FILL_VALUE
- BCLetsRide69_Total_Posts|FILL_VALUE
```

================================================ FILE: tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/label.txt ================================================

- Total_NBA_Posts|20
- Top1_Title|Hamby claims [WNBA Champ] Aces 'unprofessional' after trade
- Top1_Votes|614
- Top1_Comments|170
- Top1_Author|Responsible-Lunch815
- Top2_Title|Heat place literally every player on injury report after receiving NBA fine ahead of Mexico City game
- Top2_Votes|1266
- Top2_Comments|145
- Top2_Author|XXmynameisNeganXX
- Top3_Title|[ESPN] Announced attendance at the Alamodome tonight|68,323, a new single-game NBA record, in the Spurs' first game there since Game 4 of the 2002 Western Conference Semifinals.
- Top3_Votes|1511
- Top3_Comments|101
- Top3_Author|dragon8811
- Top4_Title|Phoenix Mercury confirm Brittney Griner’s return to WNBA
- Top4_Votes|0
- Top4_Comments|42
- Top4_Author|rejs7
- Top5_Title|Perspective | Kyrie Irving lit a flame. The NBA, top to bottom, watched the fire spread.
- Top5_Votes|74
- Top5_Comments|32
- Top5_Author|tomyland
- BCLetsRide69_Total_Posts|48

================================================ FILE: tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/meta.json ================================================

{
  "task_id": "nba_statistics_analysis",
  "task_name": "NBA Statistics Analysis",
  "category_id": "reddit",
  "category_name": "Reddit",
  "description": "Create sports analytics account, collect NBA player statistics from forum discussions, analyze basketball performance metrics, and compile comprehensive statistical report with community insights.",
  "author": "Fanqing Meng",
  "created_at": "2025-08-12",
  "difficulty": "L3",
  "tags": [
    "user interaction",
    "data extraction",
    "comparative analysis",
    "content submission"
  ],
  "mcp": [
    "playwright"
  ],
  "meta_data": {
    "stateType": "video",
    "stateContent": null,
    "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
    "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
  }
}

================================================ FILE: tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/verify.py ================================================

import asyncio
import sys
import re
import os
from pathlib import Path

from playwright.async_api import (
    async_playwright,
    TimeoutError as PlaywrightTimeoutError,
)

# Read the base_url from the environment variable, falling back to the local default
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")


def parse_key_value_format(text):
    """
    Parse the Key|Value format from the submission body.
    This handles both the expected format from label.txt and the submission format.
    """
    data = {}

    # Split by lines and parse each line
    lines = text.strip().split('\n')

    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # Remove bullet point if present
        if line.startswith('- '):
            line = line[2:]
        elif line.startswith('• '):
            line = line[2:]

        # Parse pipe-separated format
        if '|' in line:
            parts = line.split('|', 1)
            if len(parts) == 2:
                key = parts[0].strip()
                value = parts[1].strip()
                if value and value != 'FILL_VALUE':
                    data[key] = value

    return data


def normalize_text(text):
    """
    Normalize text for comparison by handling different quote styles and whitespace.
    """
    if not isinstance(text, str):
        return str(text)

    # Replace various quote styles with standard quotes
    text = text.replace("\u201c", '"').replace("\u201d", '"')  # LEFT/RIGHT DOUBLE QUOTATION MARK
    # Also normalize apostrophes - use unicode escapes to be safe
    text = text.replace("\u2019", "'")  # RIGHT SINGLE QUOTATION MARK
    text = text.replace("\u2018", "'")  # LEFT SINGLE QUOTATION MARK

    # Normalize whitespace
    text = " ".join(text.split())

    return text.strip()


async def verify() -> bool:
    """
    Verifies that the NBA analysis task has been completed correctly.
""" async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() try: # Navigate to the main page print("Navigating to forum...", file=sys.stderr) await page.goto(f"{BASE_URL}/", wait_until="networkidle") # Check if logged in as NBA_DataAnalyst_2024 user_button = page.locator('button:has-text("NBA_DataAnalyst_2024")') if not await user_button.count(): # Try to login print("Not logged in, attempting to login...", file=sys.stderr) await page.click('a:has-text("Log in")') await page.wait_for_load_state("networkidle") await page.fill('input[name="_username"]', "NBA_DataAnalyst_2024") await page.fill('input[name="_password"]', "Research#2024!") await page.click('button:has-text("Log in")') await page.wait_for_load_state("networkidle") user_button = page.locator('button:has-text("NBA_DataAnalyst_2024")') if not await user_button.count(): print( "Error: Login failed for NBA_DataAnalyst_2024", file=sys.stderr ) return False print("Successfully logged in as NBA_DataAnalyst_2024", file=sys.stderr) else: print("Already logged in as NBA_DataAnalyst_2024", file=sys.stderr) # Navigate to sports forum to check submission print("Navigating to sports forum to check submission...", file=sys.stderr) await page.goto( f"{BASE_URL}/f/sports", wait_until="networkidle" ) # Look for the submission with our specific title print( "Looking for submission 'Statistical Analysis: NBA Content Engagement on This Forum'...", file=sys.stderr, ) post_link = page.locator( 'a:has-text("Statistical Analysis: NBA Content Engagement on This Forum")' ) if not await post_link.count(): print( "Error: Could not find submission with required title", file=sys.stderr, ) return False # Click on the submission to view its content await post_link.first.click() await page.wait_for_load_state("networkidle") # Extract the submission body content # Try multiple possible selectors for the post body post_content = None selectors = [ ".submission__body", ".post-body", ".RichText", '[class*="RichText"]', 'div:has(> p:has-text("Total_NBA_Posts"))', 'div:has-text("Total_NBA_Posts"):has-text("Most_Popular_NBA_Author")', ] for selector in selectors: content_element = page.locator(selector) if await content_element.count(): post_content = await content_element.first.inner_text() if "Total_NBA_Posts" in post_content: print( f"Found submission content using selector: {selector}", file=sys.stderr, ) break if not post_content or "Total_NBA_Posts" not in post_content: print( "Error: Could not find submission body with required format", file=sys.stderr, ) return False print("Submission content found, parsing data...", file=sys.stderr) print(f"Raw content: {post_content[:200]}...", file=sys.stderr) # Parse the Key: Value format extracted_data = parse_key_value_format(post_content) print(f"Extracted data: {extracted_data}", file=sys.stderr) # Load expected values from label.txt label_path = Path(__file__).parent / "label.txt" if label_path.exists(): with open(label_path, "r") as f: expected_text = f.read().strip() expected_data = parse_key_value_format(expected_text) print("Loaded expected values from label.txt", file=sys.stderr) # Verify all required keys are present required_keys = [ "Total_NBA_Posts", "Top1_Title", "Top1_Votes", "Top1_Comments", "Top1_Author", "Top2_Title", "Top2_Votes", "Top2_Comments", "Top2_Author", "Top3_Title", "Top3_Votes", "Top3_Comments", "Top3_Author", "Top4_Title", "Top4_Votes", "Top4_Comments", "Top4_Author", "Top5_Title", "Top5_Votes", 
"Top5_Comments", "Top5_Author", "BCLetsRide69_Total_Posts", ] missing_keys = [] for key in required_keys: if key not in extracted_data: missing_keys.append(key) if missing_keys: print( f"Error: Missing required keys: {', '.join(missing_keys)}", file=sys.stderr, ) return False # Validate data format and content errors = [] # Check Total_NBA_Posts is a number and matches expected try: total_posts = int(extracted_data["Total_NBA_Posts"]) if "expected_data" in locals() and "Total_NBA_Posts" in expected_data: expected_total = int(expected_data["Total_NBA_Posts"]) if total_posts != expected_total: errors.append( f"Total_NBA_Posts mismatch: got {total_posts}, expected {expected_total}" ) elif ( total_posts < 5 ): # Should be at least 5 since we're collecting top 5 errors.append(f"Total_NBA_Posts seems too low: {total_posts}") except ValueError: errors.append( f"Total_NBA_Posts must be a number, got: {extracted_data['Total_NBA_Posts']}" ) # If we have expected data, compare against it if "expected_data" in locals(): # Compare each field for key in required_keys: if key in expected_data and key in extracted_data: expected_val = normalize_text(expected_data[key]) actual_val = normalize_text(extracted_data[key]) # For numeric fields, compare as integers if ( "Votes" in key or "Comments" in key or key == "Total_NBA_Posts" or key == "BCLetsRide69_Total_Posts" ): try: expected_int = int(expected_val) actual_int = int(actual_val) if expected_int != actual_int: errors.append( f"{key} mismatch: got {actual_int}, expected {expected_int}" ) except ValueError: errors.append( f"{key} should be numeric: got '{actual_val}'" ) else: # For text fields, compare normalized text if expected_val != actual_val: errors.append( f"{key} mismatch: got '{actual_val}', expected '{expected_val}'" ) else: # If no expected data, just do basic validation for key in required_keys: if key not in extracted_data: errors.append(f"Missing required key: {key}") elif ( not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]" ): errors.append(f"{key} was not filled in") if errors: print( "Error: Validation failed with the following issues:", file=sys.stderr, ) for error in errors: print(f" - {error}", file=sys.stderr) return False # All checks passed print("Success: NBA analysis task completed successfully.") print("- Account NBA_DataAnalyst_2024 verified") print( "- Submission 'Statistical Analysis: NBA Content Engagement on This Forum' found" ) print( f"- Total NBA-related posts analyzed: {extracted_data['Total_NBA_Posts']}" ) print("- Top 5 posts identified and documented") print( f"- BCLetsRide69's total posts: {extracted_data['BCLetsRide69_Total_Posts']}" ) print("- All data in correct Key|Value format") return True except PlaywrightTimeoutError as e: print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr) return False except Exception as e: print(f"Error: Unexpected error - {str(e)}", file=sys.stderr) return False finally: await browser.close() def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/reddit/routine_tracker_forum/description.md ================================================ Create a daily routine tracking system on the Postmill forum platform by setting up a new account and engaging with productivity-related content in the LifeProTips community. **Task Requirements:** 1. 
Create a new account with the exact username "RoutineTracker2025" and password "DailyRoutine123!" 2. Locate the following specific post: - Title: "LPT: Use your calendar as your to-do list. Assigning dedicated time to tasks increases the likelyhood of you acting upon it." - Record the content of the most upvoted comment of this post 3. Create a new submission in the LifeProTips forum with these exact specifications: - Title: "My 5-Step Morning Routine That Increased My Productivity by 200%" - Body content must be the content recorded in the above step 4. After successfully posting, verify that your post appears in the LifeProTips forum listing 5. Engage with the community by upvoting exactly these two posts: - The calendar to-do list post you found in step 2 - The post titled "LPT: clean your stovetop after using the oven. The heat loosens grime for easy removal" ================================================ FILE: tasks/playwright_webarena/standard/reddit/routine_tracker_forum/meta.json ================================================ { "task_id": "routine_tracker_forum", "task_name": "Routine Tracker Forum", "category_id": "reddit", "category_name": "Reddit", "description": "Establish productivity community by creating account, setting up routine tracking forum, implementing daily habit systems, and engaging members with structured productivity challenges and resources.", "author": "Fanqing Meng", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "user interaction", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/reddit/routine_tracker_forum/verify.py ================================================ import asyncio import sys import os from pathlib import Path from datetime import datetime from playwright.async_api import ( async_playwright, TimeoutError as PlaywrightTimeoutError, ) BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/") async def verify() -> bool: """ Verifies that the daily routine tracking setup has been completed correctly on the forum. 
""" async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() try: # Step 1: Check if account can be logged in print("Step 1: Verifying account login...", file=sys.stderr) await page.goto(f"{BASE_URL}/", wait_until="networkidle") # Check if already logged in user_button = page.locator('button:has-text("RoutineTracker2025")') if not await user_button.count(): # Try to login print("Not logged in, attempting to login...", file=sys.stderr) # Click login link await page.click('a:has-text("Log in")') await page.wait_for_load_state("networkidle") # Fill login form await page.fill('input[name="_username"]', "RoutineTracker2025") await page.fill('input[name="_password"]', "DailyRoutine123!") # Submit login form await page.click('button:has-text("Log in")') await page.wait_for_load_state("networkidle") # Check if login successful user_button = page.locator('button:has-text("RoutineTracker2025")') if not await user_button.count(): print("Error: Account login failed for RoutineTracker2025", file=sys.stderr) return False print("✓ Account login successful", file=sys.stderr) else: print("✓ Already logged in as RoutineTracker2025", file=sys.stderr) # Step 2: Check if the post exists in LifeProTips forum with correct content print("Step 2: Verifying post in LifeProTips forum...", file=sys.stderr) await page.goto( f"{BASE_URL}/f/LifeProTips", wait_until="networkidle" ) # Check for the created post expected_title = "My 5-Step Morning Routine That Increased My Productivity by 200%" post_link = page.locator(f'a:has-text("{expected_title}")') if not await post_link.count(): print(f"Error: Post with title '{expected_title}' not found in LifeProTips forum", file=sys.stderr) return False # Click on the post to verify content await post_link.click() await page.wait_for_load_state("networkidle") # Verify post content - this should be the content from the most upvoted comment of the calendar post expected_content = "As a college student, having a visible reminder of the assignments I have and when they are due is super helpful for me. It also just feels good to erase them from the board once they are completed." # Check if the content exists in the page content_found = False article_content = await page.locator("article").text_content() if article_content and expected_content in article_content: content_found = True if not content_found: print(f"Error: Post content does not match expected content", file=sys.stderr) print(f"Expected: {expected_content}", file=sys.stderr) return False print("✓ Post found in LifeProTips with correct title and content", file=sys.stderr) # Step 3: Check upvotes via search print("Step 3: Verifying upvotes on posts...", file=sys.stderr) # Check first post upvote search_url1 = f"{BASE_URL}/search?q=LPT%3A+Use+your+calendar+as+your+to-do+list.+Assigning+dedicated+time+to+tasks+increases+the+likelyhood+of+you+acting+upon+it." 
await page.goto(search_url1, wait_until="networkidle") # Find the post and check its upvote count posts = await page.locator("article").all() calendar_upvoted = False for post in posts: title_elem = post.locator("h1 a") if await title_elem.count(): title = await title_elem.text_content() if "Use your calendar as your to-do list" in title: # Check upvote count vote_count_elem = post.locator("span.vote__net-score") if await vote_count_elem.count(): vote_count = await vote_count_elem.text_content() if vote_count and vote_count.strip() == "1": calendar_upvoted = True print("✓ Calendar post upvoted (count: 1)", file=sys.stderr) break if not calendar_upvoted: print("Error: Calendar post not upvoted or upvote count is not 1", file=sys.stderr) return False # Check second post upvote search_url2 = f"{BASE_URL}/search?q=LPT%3A+clean+your+stovetop+after+using+the+oven.+The+heat+loosens+grime+for+easy+removal" await page.goto(search_url2, wait_until="networkidle") posts = await page.locator("article").all() stovetop_upvoted = False for post in posts: title_elem = post.locator("h1 a") if await title_elem.count(): title = await title_elem.text_content() if "clean your stovetop after using the oven" in title: # Check upvote count vote_count_elem = post.locator("span.vote__net-score") if await vote_count_elem.count(): vote_count = await vote_count_elem.text_content() if vote_count and vote_count.strip() == "1": stovetop_upvoted = True print("✓ Stovetop post upvoted (count: 1)", file=sys.stderr) break if not stovetop_upvoted: print("Error: Stovetop post not upvoted or upvote count is not 1", file=sys.stderr) return False print("Success: All verification steps passed!") return True except PlaywrightTimeoutError as e: print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr) return False except Exception as e: print(f"Error: Unexpected error - {str(e)}", file=sys.stderr) return False finally: await browser.close() def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping/advanced_product_analysis/description.md ================================================ **Task Requirements:** 1. Search for products with 'Ginger' in the Product Name field and price range $50.00 to $100.00 2. Add Q Mixers Premium Ginger Ale product to the comparison list 3. Find Intel NUC Kit product in Electronics category and add it to the comparison list 4. From the comparison page: - Record SKU numbers for both products - Add all products to cart 5. Record the total cart value 6. On the Ginger Ale product detail page, record: - Number of customer reviews - Name of the most recent reviewer (on top of the first page) 7. 
Output your findings in this format: ``` <answer> GingerAleSKU|sku IntelNUCSKU|sku CartTotal|amount ReviewCount|count LatestReviewer|name </answer> ``` **Example Output:** ``` <answer> GingerAleSKU|XXXXXXXXX IntelNUCSKU|XXXXXXXXX CartTotal|$XXX.XX ReviewCount|XX LatestReviewer|name </answer> ``` ================================================ FILE: tasks/playwright_webarena/standard/shopping/advanced_product_analysis/label.txt ================================================ GingerAleSKU|B071KC37VD IntelNUCSKU|B01DJ9XID4 CartTotal|$668.49 ReviewCount|12 LatestReviewer|jwm ================================================ FILE: tasks/playwright_webarena/standard/shopping/advanced_product_analysis/meta.json ================================================ { "task_id": "advanced_product_analysis", "task_name": "Advanced Product Analysis", "category_id": "shopping", "category_name": "Shopping", "description": "Perform comprehensive product analysis including feature comparisons, price tracking, review aggregation, customer sentiment analysis, and generate detailed recommendation reports for informed purchasing decisions.", "author": "Yaoqi Ye", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "data extraction", "comparative analysis", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping/advanced_product_analysis/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" ): content = message.get("content", []) for item in content: if item.get("type") == "output_text": return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>xxx</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: return None answer_content = match.group(1).strip() # Parse each line result = {} lines = answer_content.split("\n") if len(lines) != 5: print(f"Error: Expected 5 lines in answer, got {len(lines)}", file=sys.stderr) return None for line in lines: if "|" in line: key, value = line.split("|", 1) result[key.strip()] = value.strip() return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. 
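    For this task the returned dictionary (values taken from the label.txt shown
    above) would be:

        {"GingerAleSKU": "B071KC37VD", "IntelNUCSKU": "B01DJ9XID4",
         "CartTotal": "$668.49", "ReviewCount": "12", "LatestReviewer": "jwm"}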
""" try: with open(label_path, "r") as f: lines = f.read().strip().split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. """ if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Special handling for different types of values if key == "GingerAleSKU": # Check exact SKU match if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "IntelNUCSKU": # Check exact SKU match if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "CartTotal": # For price fields, only support $XX.XX format # Check if model value has correct format if not model_value.startswith("$"): mismatches.append( f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" ) else: # Normalize and compare values expected_clean = expected_value.replace("$", "").replace(",", "") model_clean = model_value.replace("$", "").replace(",", "") if expected_clean != model_clean: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "ReviewCount": # Check review count matches if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "LatestReviewer": # Check reviewer name (allow partial match for names) if expected_value.lower() not in model_value.lower() and model_value.lower() not in expected_value.lower(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # Exact match for other fields if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the advanced product analysis task has been completed correctly. First checks the model's answer against the expected label. 
""" # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) return True else: print( "Warning: Could not parse answer format from model response", file=sys.stderr, ) return False else: print("No model response found", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/description.md ================================================ **Task Requirements:** 1. In Video Games category, count products with customer rating 70% or higher in the first 2 pages 2. Sort products by price (ascending) and identify the cheapest product that has customer reviews 3. Find product with SKU 'B07D6LSCXZ' (N64 Controller), add to cart with quantity 3 4. Add products with SKU 'B071DR5V1K' and 'B082LZ4451' to comparison list, then count total products on comparison page 5. In cart, update N64 Controller quantity to 5 and record the subtotal for this item 6. Proceed to checkout and fill shipping form: - Email: test.buyer@example.com - First Name: Alice - Last Name: Johnson - Street Address: 456 Oak Avenue - Country: United States - State/Province: California - City: San Francisco - Zip Code: 94102 - Phone: 415-555-0123 Then count available shipping methods 7. 
Output your findings in this format: ``` <answer> Products70Plus|count CheapestReviewedSKU|sku CheapestReviewedPrice|price ComparisonCount|count N64Subtotal|amount CheckoutEmail|test.buyer@example.com ShippingState|California ShippingMethods|count </answer> ``` ================================================ FILE: tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/label.txt ================================================ Products70Plus|7 CheapestReviewedSKU|B014HDAUAA CheapestReviewedPrice|$0.99 ComparisonCount|2 N64Subtotal|$84.95 CheckoutEmail|test.buyer@example.com ShippingState|California ShippingMethods|1 ================================================ FILE: tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/meta.json ================================================ { "task_id": "gaming_accessories_analysis", "task_name": "Gaming Accessories Analysis", "category_id": "shopping", "category_name": "Shopping", "description": "Research gaming peripherals by analyzing technical specifications, comparing performance metrics, evaluating user reviews, tracking price trends, and creating detailed gaming accessory recommendations.", "author": "Yaoqi Ye", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "search aggregation", "comparative analysis", "data extraction" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" and message.get("type") == "message" ): content = message.get("content", []) for item in content: if item.get("type") == "output_text": return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: return None answer_content = match.group(1).strip() # Parse each line result = {} lines = answer_content.split("\n") if len(lines) != 8: print(f"Error: Expected 8 lines in answer, got {len(lines)}", file=sys.stderr) return None for line in lines: if "|" in line: key, value = line.split("|", 1) result[key.strip()] = value.strip() return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. 
Returns a dictionary with the expected values. """ try: with open(label_path, "r") as f: lines = f.read().strip().split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. """ if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Special handling for different types of values if key in ["CheapestReviewedPrice", "N64Subtotal"]: # For price fields, only support $XX.XX format # Check if model value has correct format if not model_value.startswith("$"): mismatches.append( f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" ) else: # Normalize and compare values expected_clean = expected_value.replace("$", "").replace(",", "") model_clean = model_value.replace("$", "").replace(",", "") if expected_clean != model_clean: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "CheckoutEmail": # Email should match exactly (case-insensitive) if model_value.lower() != expected_value.lower(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "Products70Plus": # For count fields, allow some flexibility (products might change) # But still check if it's a reasonable number try: model_count = int(model_value) expected_count = int(expected_value) # Allow up to 2 products difference (in case of dynamic content) if abs(model_count - expected_count) > 2: mismatches.append( f"{key}: expected around '{expected_value}', got '{model_value}'" ) except ValueError: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # Exact match for other fields if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the gaming accessories analysis task has been completed correctly. Checks the model's answer against the expected label. 
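    Note: compare_answers below allows Products70Plus to differ from the labelled
    count by up to 2 (e.g. 5-9 would pass against the expected 7, since catalogue
    contents can shift); every other field must match its label exactly, with
    prices compared after stripping '$' and ',' and CheckoutEmail compared
    case-insensitively.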
""" # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) return True else: print( "Warning: Could not parse answer format from model response", file=sys.stderr, ) return False else: print("No model response found", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping/health_routine_optimization/description.md ================================================ ## Task Requirements 1. Search for products with `vitamin` in Description and price range `$0.00` to `$99.99`. Record total search results count. 2. In "Health & Household" category with price filter `$0.00 - $99.99`: - Add "LOOPACELL AG13 LR44 L1154 357 76A A76 Button Cell Battery 10 Pack" to comparison - Add "Energizer MAX C Batteries, Premium Alkaline C Cell Batteries (8 Battery Count)" to comparison - Record each battery's price - Verify comparison list has 2 items 3. Search `Elmwood Inn Fine Teas`, find "Elmwood Inn Fine Teas, Orange Vanilla Caffeine-free Fruit Infusion, 16-Ounce Pouch": - Record SKU, rating percentage, and review count - Add to cart with quantity 2 4. Search `energy`, sort by Relevance (descending): - Find "V8 +Energy, Healthy Energy Drink, Steady Energy from Black and Green Tea, Pomegranate Blueberry, 8 Ounce Can ,Pack of 24" - Record its position (1st, 2nd, 3rd, etc.) - Add to cart with quantity 1 5. 
In cart: - Record unique products count, total quantity, and subtotal - Then update Elmwood tea quantity to 3 - Record new subtotal ## Output Format ``` <answer> AdvancedSearchResults|XXXX Battery1Name|LOOPACELL AG13 LR44 Battery1Price|$X.XX Battery2Name|Energizer MAX C Battery2Price|$XX.XX ComparisonCount|X TeaSKU|XXXXXXXXXX TeaRating|XXX% TeaReviews|X V8Position|Xth CartUniqueProducts|X CartTotalQuantity|X InitialSubtotal|$XX.XX FinalSubtotal|$XX.XX </answer> ``` ================================================ FILE: tasks/playwright_webarena/standard/shopping/health_routine_optimization/label.txt ================================================ <answer> AdvancedSearchResults|2906 Battery1Name|LOOPACELL AG13 LR44 Battery1Price|$3.72 Battery2Name|Energizer MAX C Battery2Price|$14.87 ComparisonCount|2 TeaSKU|B0040WHKIY TeaRating|95% TeaReviews|4 V8Position|3rd CartUniqueProducts|2 CartTotalQuantity|3 InitialSubtotal|$53.19 FinalSubtotal|$72.55 </answer> ================================================ FILE: tasks/playwright_webarena/standard/shopping/health_routine_optimization/meta.json ================================================ { "task_id": "health_routine_optimization", "task_name": "Health Routine Optimization", "category_id": "shopping", "category_name": "Shopping", "description": "Optimize health and wellness product selections by analyzing nutritional supplements, fitness equipment, creating personalized routines, and tracking health metrics for lifestyle improvements.", "author": "Yaoqi Ye", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "data extraction", "comparative analysis", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping/health_routine_optimization/verify.py ================================================ import asyncio import sys import os import json import re from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" and message.get("type") == "message" ): content = message.get("content", []) for item in content: if item.get("type") == "output_text": return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. 
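    Each "Key|Value" line inside the <answer> block becomes one dictionary entry,
    e.g. (illustrative) "TeaSKU|B0040WHKIY" -> {"TeaSKU": "B0040WHKIY"}; the block
    must contain exactly 14 such lines (one per required field) or None is returned.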
""" if not text: return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: return None answer_content = match.group(1).strip() # Parse each line result = {} lines = answer_content.split("\n") if len(lines) != 14: print(f"Error: Expected 14 lines in answer, got {len(lines)}", file=sys.stderr) return None for line in lines: if "|" in line: key, value = line.split("|", 1) result[key.strip()] = value.strip() return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, "r") as f: content = f.read().strip() # Parse the answer from the label file # The label file contains <answer>...</answer> tags match = re.search(r"<answer>(.*?)</answer>", content, re.IGNORECASE | re.DOTALL) if match: answer_content = match.group(1).strip() lines = answer_content.split("\n") else: # Fallback: treat the whole file as answer content lines = content.split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. """ if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Special handling for different types of values if key in ["Battery1Price", "Battery2Price", "InitialSubtotal", "FinalSubtotal"]: # For price fields, only support $XX.XX format # Check if model value has correct format if not model_value.startswith("$"): mismatches.append( f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" ) else: # Normalize and compare values expected_clean = expected_value.replace("$", "").replace(",", "") model_clean = model_value.replace("$", "").replace(",", "") if expected_clean != model_clean: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # Exact match for other fields if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the health routine optimization task has been completed correctly. Checks the model's answer against the expected label. 
""" # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) return True else: print( "Warning: Could not parse answer format from model response", file=sys.stderr, ) return False else: print("No model response found", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping/holiday_baking_competition/description.md ================================================ **Task Requirements:** 1. Search 'gingerbread', sort by price (high to low): - Add most expensive product to comparison list - Record SKU of second most expensive product 2. Search 'cookie' with price range $20.00-$40.00: - Find product with highest rating % and at least 5 reviews in the first 2 pages (if tied, choose lowest price) - Record SKU and rating % - Select "Cookies: Oatmeal Chocolate Chunk" flavor if required - Add to cart with quantity 2 3. Search 'chocolate', sort by price (low to high): - Find cheapest product with at least 1 review - Record price and review count - Select "Peanut Butter Flavor" if required - Add to cart with quantity 3 4. In cart: - Update cookie quantity from 2 to 5 - Record cart subtotal and total items count 5. 
Search 'gingerbread', go to page 2: - Find third product on page 2 - Record SKU, price, and manufacturer **Output Format:** ``` <answer> SecondGingerbreadSKU|sku HighestRatedCookieSKURating|sku:rating% CheapestChocolatePriceReviews|$price:reviews CartSubtotalAfterUpdate|$amount TotalCartItems|count Page2ThirdProductSKUPrice|sku:$price ProductManufacturer|manufacturer </answer> ``` ================================================ FILE: tasks/playwright_webarena/standard/shopping/holiday_baking_competition/label.txt ================================================ SecondGingerbreadSKU|B0075AO9RI HighestRatedCookieSKURating|B0951CPYV7:86% CheapestChocolatePriceReviews|$1.04:12 CartSubtotalAfterUpdate|$128.07 TotalCartItems|8 Page2ThirdProductSKUPrice|B09RPXCB47:$21.99 ProductManufacturer|That Melanin Tho ================================================ FILE: tasks/playwright_webarena/standard/shopping/holiday_baking_competition/meta.json ================================================ { "task_id": "holiday_baking_competition", "task_name": "Holiday Baking Competition", "category_id": "shopping", "category_name": "Shopping", "description": "Research baking supplies for competition preparation including ingredient quality analysis, equipment comparisons, recipe optimization, and creating comprehensive shopping list with budget recommendations.", "author": "Yaoqi Ye", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "search aggregation", "comparative analysis", "inventory management" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping/holiday_baking_competition/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" and message.get("type") == "message" ): content = message.get("content", []) for item in content: if item.get("type") == "output_text": return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. 
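    Lines are split only on the first '|', so compound values such as
    "HighestRatedCookieSKURating|B0951CPYV7:86%" are kept whole as "B0951CPYV7:86%";
    compare_answers below splits these sku:rating, price:reviews and sku:price pairs
    on ':' when checking them. Exactly 7 lines are expected, otherwise None is
    returned.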
""" if not text: return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: return None answer_content = match.group(1).strip() # Parse each line result = {} lines = answer_content.split("\n") if len(lines) != 7: print(f"Error: Expected 7 lines in answer, got {len(lines)}", file=sys.stderr) return None for line in lines: if "|" in line: key, value = line.split("|", 1) result[key.strip()] = value.strip() return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, "r") as f: lines = f.read().strip().split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. """ if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Special handling for different types of values if key == "SecondGingerbreadSKU": # SKU should match exactly (case-insensitive) if model_value.upper() != expected_value.upper(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key in ["CartSubtotalAfterUpdate"]: # For price fields, only support $XX.XX format # Check if model value has correct format if not model_value.startswith("$"): mismatches.append( f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" ) else: # Normalize and compare values expected_clean = expected_value.replace("$", "").replace(",", "") model_clean = model_value.replace("$", "").replace(",", "") # Allow some tolerance for price calculations (within $0.01) try: expected_float = float(expected_clean) model_float = float(model_clean) if abs(expected_float - model_float) > 0.01: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) except ValueError: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key in ["TotalCartItems"]: # Should be a number if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key in ["HighestRatedCookieSKURating", "CheapestChocolatePriceReviews", "Page2ThirdProductSKUPrice"]: # Colon-separated fields (sku:rating, price:reviews, sku:price) if ":" in expected_value and ":" in model_value: expected_parts = expected_value.split(":", 1) model_parts = model_value.split(":", 1) if len(expected_parts) == 2 and len(model_parts) == 2: # For price fields, normalize the price part if key == "CheapestChocolatePriceReviews": # Check if price part has correct format ($XX.XX) if not model_parts[0].startswith("$"): mismatches.append( f"{key}: incorrect format - price part should start with '$', got '{model_value}'" ) else: expected_price = expected_parts[0].replace("$", "").replace(",", "") model_price = model_parts[0].replace("$", "").replace(",", "") try: if abs(float(expected_price) - float(model_price)) > 0.01 or expected_parts[1] != model_parts[1]: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) except ValueError: if 
expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "Page2ThirdProductSKUPrice": # Check if price part has correct format ($XX.XX) if not model_parts[1].startswith("$"): mismatches.append( f"{key}: incorrect format - price part should start with '$', got '{model_value}'" ) else: expected_price = expected_parts[1].replace("$", "").replace(",", "") model_price = model_parts[1].replace("$", "").replace(",", "") try: if expected_parts[0] != model_parts[0] or abs(float(expected_price) - float(model_price)) > 0.01: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) except ValueError: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # For rating fields, exact match if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # Exact match for other fields (like ProductManufacturer) if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the holiday baking competition task has been completed correctly. Checks the model's answer against the expected label. """ # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) return True else: print( "Warning: Could not parse answer format from model response", file=sys.stderr, ) return False else: print("No model response found", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/description.md ================================================ Complete the following tasks on One Stop Market (http://localhost:7770): **Task Requirements:** 1. In Chocolate subcategory, sort by price (ascending): - Record price and SKU of first 3 products 2. 
Search for 'tabletop' with price range $100.00-$200.00: - Find the cheapest tabletop that has the highest review rating with at least 3 reviews. - Record search results count - Record price of required tabletop 3. In "Computers & Accessories" subcategory with price filter $0.00-$9,999.99: - Sort by price (ascending) - Record price of cheapest item 4. Add these products to comparison: - "Little Secrets Chocolate Pieces, Peanut Butter Flavor" - "Multi Accessory Hub Adapter By JOBY" - "SanDisk Cruzer Glide 32GB (5 Pack) USB 2.0 Flash Drive" - Count total items on comparison page 5. In cart: - Add the cheapest chocolate product (from step 1) with "Peanut flavor" if available - Add cheapest computer accessory (from step 3) - Record cart subtotal and item count 6. Calculate: - Sum of 3 chocolate product prices - Price difference: cheapest tabletop minus cheapest computer accessory - Whether sum of 3 comparison items < $60 **Output Format:** ``` <answer> chocolate_products|Price1:SKU1;Price2:SKU2;Price3:SKU3 chocolate_sum|Total tabletop_search_count|Count tabletop_product|Price:SKU tabletop_reviews|NumbersOfReviews:Rating cheapest_computer_accessory|Price price_difference|Amount comparison_count|Count cart_subtotal|Amount cart_item_count|Count under_60_budget|YES/NO </answer> ``` ================================================ FILE: tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/label.txt ================================================ chocolate_products|$1.04:B071954ZDC;$1.89:B07ND598N3;$2.50:B01G26DMSC chocolate_sum|$5.43 tabletop_search_count|60 tabletop_product|$169.99:B09NPX5CDP tabletop_reviews|4:95% cheapest_computer_accessory|$1.17 price_difference|$168.82 comparison_count|3 cart_subtotal|$2.21 cart_item_count|2 under_60_budget|YES ================================================ FILE: tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/meta.json ================================================ { "task_id": "multi_category_budget_analysis", "task_name": "Multi Category Budget Analysis", "category_id": "shopping", "category_name": "Shopping", "description": "Analyze spending patterns across multiple product categories, optimize budget allocation, identify cost-saving opportunities, and generate comprehensive financial planning report with purchase recommendations.", "author": "Yaoqi Ye", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "data extraction", "search aggregation", "content submission", "comparative analysis", "inventory management" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. 
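# Hedged sketch of the arithmetic behind the multi_category_budget_analysis
# label.txt shown above: the derived fields are plain sums and differences of the
# recorded prices. The numbers come from that label; the rounding and "$XX.XX"
# formatting below are assumptions made only for illustration.
chocolate_prices = [1.04, 1.89, 2.50]   # first three Chocolate items, price ascending
tabletop_price = 169.99                 # selected tabletop (B09NPX5CDP)
cheapest_accessory = 1.17               # cheapest Computers & Accessories item

chocolate_sum = round(sum(chocolate_prices), 2)                       # -> 5.43
price_difference = round(tabletop_price - cheapest_accessory, 2)      # -> 168.82
cart_subtotal = round(chocolate_prices[0] + cheapest_accessory, 2)    # -> 2.21

assert f"${chocolate_sum:.2f}" == "$5.43"
assert f"${price_difference:.2f}" == "$168.82"
assert f"${cart_subtotal:.2f}" == "$2.21"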
""" messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" and message.get("type") == "message" ): content = message.get("content", []) for item in content: if item.get("type") == "output_text": return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: return None answer_content = match.group(1).strip() # Parse each line result = {} lines = answer_content.split("\n") if len(lines) != 11: print(f"Error: Expected 11 lines in answer, got {len(lines)}", file=sys.stderr) return None for line in lines: if "|" in line: key, value = line.split("|", 1) result[key.strip()] = value.strip() return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, "r") as f: lines = f.read().strip().split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. 
""" if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Special handling for different types of values if key == "chocolate_products": # Parse and compare chocolate products with price:SKU format expected_products = expected_value.split(";") model_products = model_value.split(";") if len(expected_products) != len(model_products): mismatches.append(f"{key}: expected {len(expected_products)} products, got {len(model_products)}") else: for i, (exp, mod) in enumerate(zip(expected_products, model_products)): exp_parts = exp.strip().split(":") mod_parts = mod.strip().split(":") if len(exp_parts) != 2 or len(mod_parts) != 2: mismatches.append(f"{key}: product {i+1} format error - expected 'price:SKU'") else: # Check price format (should start with $) if not mod_parts[0].startswith("$"): mismatches.append(f"{key}: product {i+1} price format error - expected '$XX.XX' format, got '{mod_parts[0]}'") elif exp_parts[0] != mod_parts[0] or exp_parts[1] != mod_parts[1]: mismatches.append(f"{key}: product {i+1} mismatch - expected '{exp}', got '{mod}'") elif key == "tabletop_product": # Parse and compare tabletop product with price:SKU format exp_parts = expected_value.strip().split(":") mod_parts = model_value.strip().split(":") if len(exp_parts) != 2 or len(mod_parts) != 2: mismatches.append(f"{key}: format error - expected 'price:SKU', got '{model_value}'") else: # Check price format (should start with $) if not mod_parts[0].startswith("$"): mismatches.append(f"{key}: price format error - expected '$XX.XX' format, got '{mod_parts[0]}'") elif exp_parts[0] != mod_parts[0] or exp_parts[1] != mod_parts[1]: mismatches.append(f"{key}: mismatch - expected '{expected_value}', got '{model_value}'") elif key == "tabletop_reviews": # Parse and compare tabletop reviews with NumberOfReviews:Rating format exp_parts = expected_value.strip().split(":") mod_parts = model_value.strip().split(":") if len(exp_parts) != 2 or len(mod_parts) != 2: mismatches.append(f"{key}: format error - expected 'NumberOfReviews:Rating', got '{model_value}'") else: # Check if both parts match if exp_parts[0] != mod_parts[0] or exp_parts[1] != mod_parts[1]: mismatches.append(f"{key}: mismatch - expected '{expected_value}', got '{model_value}'") elif key in ["chocolate_sum", "price_difference", "cart_subtotal", "cheapest_computer_accessory"]: # For price fields, only support $XX.XX format # Check if model value has correct format if not model_value.startswith("$"): mismatches.append( f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" ) else: # Normalize and compare values expected_clean = expected_value.replace("$", "").replace(",", "") model_clean = model_value.replace("$", "").replace(",", "") if expected_clean != model_clean: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "under_60_budget": # Compare YES/NO value (case-insensitive) if expected_value.upper() != model_value.upper(): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key in ["tabletop_search_count", "comparison_count", "cart_item_count"]: # Numeric fields - exact match if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # Exact match for other fields if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if 
mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the multi-category budget analysis task has been completed correctly. """ # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) return True else: print("Warning: Could not parse answer format from model response", file=sys.stderr) return False else: print("No model response found", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping/printer_keyboard_search/description.md ================================================ 1. Search for a `printer capable of reducing blue light` that: - Is pink or purple (must be stated in product details, not from image) - Manufactured in Asia Record SKU ID and price 2. 
Find a keyboard with: - Bluetooth mode (must be stated either stated in details or title) - Price between $50.00-$100.00 - Highest review rating among matching products Record SKU ID, price, number of reviews, and review rating **Output Format:** ``` <answer> PrinterSKUID|id PrinterPrice|$XX.XX KeyboardSKUID|id KeyboardPrice|$XX.XX KeyboardReviews|XX KeyboardRating|XX% </answer> ``` ================================================ FILE: tasks/playwright_webarena/standard/shopping/printer_keyboard_search/label.txt ================================================ PrinterSKUID|B09J8KQX6V PrinterPrice|$248.04 KeyboardSKUID|B08JD7F3F5 KeyboardPrice|$85.99 KeyboardReviews|12 KeyboardRating|77% ================================================ FILE: tasks/playwright_webarena/standard/shopping/printer_keyboard_search/meta.json ================================================ { "task_id": "printer_keyboard_search", "task_name": "Printer Keyboard Search", "category_id": "shopping", "category_name": "Shopping", "description": "Search and evaluate office equipment by comparing printer specifications, keyboard ergonomics, analyzing user reviews, tracking prices, and generating detailed purchase recommendations report.", "author": "Yaoqi Ye", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "search aggregation", "comparative analysis", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping/printer_keyboard_search/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" and message.get("type") == "message" ): content = message.get("content", []) for item in content: if item.get("type") == "output_text": return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: return None answer_content = match.group(1).strip() # Parse each line result = {} lines = answer_content.split("\n") if len(lines) != 6: print(f"Error: Expected 6 lines in answer, got {len(lines)}", file=sys.stderr) return None for line in lines: if "|" in line: key, value = line.split("|", 1) result[key.strip()] = value.strip() return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. 
Returns a dictionary with the expected values. """ try: with open(label_path, "r") as f: lines = f.read().strip().split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. """ if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Special handling for different types of values if key in ["PrinterPrice", "KeyboardPrice"]: # For price fields, only support $XX.XX format # Check if model value has correct format if not model_value.startswith("$"): mismatches.append( f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" ) else: # Normalize and compare values expected_clean = expected_value.replace("$", "").replace(",", "") model_clean = model_value.replace("$", "").replace(",", "") if expected_clean != model_clean: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key in ["PrinterSKUID", "KeyboardSKUID"]: # SKU should match exactly (case-insensitive) if model_value.upper() != expected_value.upper(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "KeyboardReviews": # Number of reviews should match exactly if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "KeyboardRating": # Rating should match exactly (including % sign) if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # Exact match for other fields if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the electronic products task has been completed correctly. Checks the model's answer against the expected label. 
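# Hedged sketch of the field-specific rules in the printer_keyboard_search
# compare_answers above: prices must be reported in "$XX.XX" form (a leading "$"
# is required; "$" and thousands separators are stripped before comparison), and
# SKUs are matched case-insensitively. Sample values come from the label.txt above.
def price_matches(expected, got):
    if not got.startswith("$"):
        return False  # wrong format is rejected before any value comparison
    def clean(s):
        return s.replace("$", "").replace(",", "")
    return clean(expected) == clean(got)

def sku_matches(expected, got):
    return expected.upper() == got.upper()

assert price_matches("$248.04", "$248.04")
assert not price_matches("$248.04", "248.04")   # missing "$" prefix
assert sku_matches("B08JD7F3F5", "b08jd7f3f5")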
""" # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) return True else: print( "Warning: Could not parse answer format from model response", file=sys.stderr, ) return False else: print("No model response found", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping/running_shoes_purchase/description.md ================================================ 1. Find running shoes: - Price between $50.00-$60.00 - "running shoe" must appear in product name - Choose the one with highest number of reviews - Select black or white color, size 10 - Add to cart with quantity 2 2. Record from product page: SKU ID, price, number of reviews, review rating 3. Record cart subtotal **Output Format:** ``` <answer> SKUID|id Price|$XX.XX NumberOfReviews|XX ReviewRating|XX% Subtotal|$XX.XX </answer> ``` ================================================ FILE: tasks/playwright_webarena/standard/shopping/running_shoes_purchase/label.txt ================================================ SKUID|B08KKX1WXQ Price|$56.21 NumberOfReviews|46 ReviewRating|86% Subtotal|$112.42 ================================================ FILE: tasks/playwright_webarena/standard/shopping/running_shoes_purchase/meta.json ================================================ { "task_id": "running_shoes_purchase", "task_name": "Running Shoes Purchase", "category_id": "shopping", "category_name": "Shopping", "description": "Research running footwear by analyzing biomechanical features, comparing cushioning technologies, evaluating durability ratings, considering user preferences, and recommending optimal shoe selections.", "author": "Yaoqi Ye", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "search aggregation", "comparative analysis" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping/running_shoes_purchase/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. 
""" messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" and message.get("type") == "message" ): content = message.get("content", []) for item in content: if item.get("type") == "output_text": return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: return None answer_content = match.group(1).strip() # Parse each line result = {} lines = answer_content.split("\n") if len(lines) != 5: print(f"Error: Expected 5 lines in answer, got {len(lines)}", file=sys.stderr) return None for line in lines: if "|" in line: key, value = line.split("|", 1) result[key.strip()] = value.strip() return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, "r") as f: lines = f.read().strip().split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. 
""" if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Special handling for different types of values if key in ["Price", "Subtotal"]: # For price fields, only support $XX.XX format # Check if model value has correct format if not model_value.startswith("$"): mismatches.append( f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'" ) else: # Normalize and compare values expected_clean = expected_value.replace("$", "").replace(",", "") model_clean = model_value.replace("$", "").replace(",", "") # Allow small tolerance for price calculations (within $0.01) try: expected_float = float(expected_clean) model_float = float(model_clean) if abs(expected_float - model_float) > 0.01: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) except ValueError: if expected_clean != model_clean: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "SKUID": # SKU should match exactly (case-insensitive) if model_value.upper() != expected_value.upper(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "NumberOfReviews": # Number of reviews should match exactly if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "ReviewRating": # Rating should match exactly (including % sign) if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # Exact match for other fields if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the running shoes shopping task has been completed correctly. Checks the model's answer against the expected label. """ # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) return True else: print( "Warning: Could not parse answer format from model response", file=sys.stderr, ) return False else: print("No model response found", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. 
""" result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/description.md ================================================ Perform customer segmentation setup and analysis in the Magento Admin panel to establish new customer groups and manage customer profiles. **Task Requirements:** 1. Access the Magento Admin panel to begin customer segmentation setup. if need to login, login with username 'admin' and password 'admin1234' 2. Establish baseline metrics for customer groups: - Record the exact number shown in "records found" at the top of the grid - This will be your initial groups count 3. Create a specialized customer group for European premium customers: - Group Name: Premium Europe - Tax Class: Retail Customer - Save the group 4. Verify the customer group creation was successful: - After saving, return to Customer Groups list - Record the new total shown in "records found" 5. Establish baseline metrics for all customers database: - Record the exact number shown in "records found" at the top of the grid - This will be your initial customers count 6. Add a representative customer to the new premium group: - Create a new customer with the following details: - First Name: Isabella - Last Name: Romano - Email: isabella.romano@premium.eu - Associate to Website: Main Website - Group: The group you just created - Save the customer 7. Verify the customer creation was successful: - After saving, return to All Customers list - Record the new total shown in "records found" 8. Analyze recent customer activity patterns: - Navigate to Dashboard - Look at the "Last Orders" section - Record the customer name in the last row of the table 9. 
Compile all your findings and output them in the following exact format: ``` <answer> InitialGroups|count FinalGroups|count InitialCustomers|count FinalCustomers|count LastOrderCustomer|name </answer> ``` **Example Output:** ``` <answer> InitialGroups|XX FinalGroups|XX InitialCustomers|XXX FinalCustomers|XXX LastOrderCustomer|XXX </answer> ``` ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/label.txt ================================================ InitialGroups|4 FinalGroups|5 InitialCustomers|70 FinalCustomers|71 LastOrderCustomer|Ava Brown ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/meta.json ================================================ { "task_id": "customer_segmentation_setup", "task_name": "Customer Segmentation Setup", "category_id": "shopping_admin", "category_name": "Shopping Admin", "description": "Configure customer segmentation system in admin panel by defining demographic criteria, creating behavior-based segments, implementing targeting rules, and setting up automated marketing workflows.", "author": "Fanqing Meng", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "content submission", "inventory management" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path from playwright.async_api import ( async_playwright, TimeoutError as PlaywrightTimeoutError, ) # 从环境变量读取 base_url(shopping_admin 会注入 http://localhost:7780/admin),默认回退到本地 BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:7780/admin").rstrip("/") def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" ): content = message.get("content", []) for item in content: if item.get("type") == "output_text": return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. 
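# Hedged sketch of how the admin verifier above resolves its base URL: the
# WEBARENA_BASE_URL environment variable (injected as http://localhost:7780/admin
# for shopping_admin tasks, per the comment in the file) takes precedence, any
# trailing slash is stripped, and page URLs are built by appending a path. The
# example URLs printed here are illustrative.
import os

base_url = os.getenv("WEBARENA_BASE_URL", "http://localhost:7780/admin").rstrip("/")
customer_groups_url = f"{base_url}/customer/group/"
all_customers_url = f"{base_url}/customer/index/"

print(customer_groups_url)  # e.g. http://localhost:7780/admin/customer/group/
print(all_customers_url)    # e.g. http://localhost:7780/admin/customer/index/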
""" if not text: return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: return None answer_content = match.group(1).strip() # Parse each line result = {} lines = answer_content.split("\n") if len(lines) != 5: print(f"Error: Expected 5 lines in answer, got {len(lines)}", file=sys.stderr) return None for line in lines: if "|" in line: key, value = line.split("|", 1) result[key.strip()] = value.strip() return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, "r") as f: lines = f.read().strip().split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. """ if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Exact match for all fields if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the customer segmentation setup task has been completed correctly. First checks the model's answer against the expected label, then verifies the actual state in the Magento Admin. 
""" # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) else: print( "Warning: Could not parse answer format from model response", file=sys.stderr, ) print("Will proceed with browser verification only", file=sys.stderr) else: print( "No model response found, proceeding with browser verification", file=sys.stderr, ) # Browser verification for actual state print("\n=== Starting Browser Verification ===", file=sys.stderr) async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() try: # Navigate to Magento Admin print("Navigating to Magento Admin...", file=sys.stderr) await page.goto( f"{BASE_URL}/", wait_until="networkidle" ) # Check if already logged in, if not, login if "dashboard" not in page.url.lower(): print("Logging into Magento Admin...", file=sys.stderr) await page.fill('input[name="login[username]"]', "admin") await page.fill('input[name="login[password]"]', "admin1234") await page.click('button:has-text("Sign in")') await page.wait_for_load_state("networkidle") if "dashboard" not in page.url.lower(): print("Error: Login failed", file=sys.stderr) return False print("Successfully logged into Magento Admin", file=sys.stderr) # 1. Verify Customer Groups print("\nVerifying Customer Groups...", file=sys.stderr) await page.goto( f"{BASE_URL}/customer/group/", wait_until="networkidle", ) await page.wait_for_timeout(2000) # Wait for grid to load # Check for Premium Europe group premium_europe_exists = ( await page.locator("text=Premium Europe").count() > 0 ) if premium_europe_exists: print("✓ Found 'Premium Europe' customer group", file=sys.stderr) # Check if it has Retail Customer tax class # Look for Premium Europe row and check its tax class premium_row = page.locator('tr:has-text("Premium Europe")') if await premium_row.count() > 0: tax_class_text = await premium_row.locator("td").nth(2).inner_text() if "Retail Customer" in tax_class_text: print( "✓ Premium Europe has 'Retail Customer' tax class", file=sys.stderr, ) else: print( f"Warning: Premium Europe tax class is '{tax_class_text}'", file=sys.stderr, ) else: print("✗ 'Premium Europe' customer group not found", file=sys.stderr) return False # Check total groups count records_found = page.locator("text=records found").first if await records_found.count() > 0: count_text = await records_found.inner_text() print(f"Customer Groups count: {count_text}", file=sys.stderr) # Extract number import re match = re.search(r"(\d+)\s+records found", count_text) if match: groups_count = int(match.group(1)) print(f"✓ Customer groups count is {groups_count}", file=sys.stderr) # 2. 
Verify Customer print("\nVerifying Customer Isabella Romano...", file=sys.stderr) await page.goto( f"{BASE_URL}/customer/index/", wait_until="networkidle", ) await page.wait_for_timeout(3000) # Wait for grid to load # Check total customers count customer_records = page.locator("text=records found").first if await customer_records.count() > 0: count_text = await customer_records.inner_text() print(f"Customers count: {count_text}", file=sys.stderr) # Extract number match = re.search(r"(\d+)\s+records found", count_text) if match: customers_count = int(match.group(1)) print( f"✓ Total customers count is {customers_count}", file=sys.stderr ) # Verify against expected answer if available if expected_answer and "FinalCustomers" in expected_answer: expected_final = int(expected_answer["FinalCustomers"]) if customers_count == expected_final: print( f"✓ Customer count matches expected: {customers_count}", file=sys.stderr, ) else: print( f"✗ Customer count mismatch: Expected {expected_final} customers, found {customers_count}", file=sys.stderr, ) return False # Wait for the customer grid to load properly await page.wait_for_timeout(5000) # Check if Isabella Romano exists - first wait for grid to load grid_loaded = False for i in range(3): # Look for grid container and wait for it to populate grid_container = page.locator(".admin__data-grid-outer-wrap, .data-grid, table").first if await grid_container.count() > 0: # Check if there are customer rows loaded customer_rows = page.locator("td[data-column='email'], td:has-text('@')") if await customer_rows.count() > 0: grid_loaded = True break await page.wait_for_timeout(2000) if not grid_loaded: print("✗ Customer grid failed to load properly", file=sys.stderr) return False # Now check if Isabella Romano exists in the loaded grid isabella_exists = ( await page.locator("text=isabella.romano@premium.eu").count() > 0 ) if not isabella_exists: # Try searching for the customer to be more thorough try: search_box = page.locator('input[placeholder*="Search by keyword"], input[name="search"], [data-role="search"]').first if await search_box.count() > 0: await search_box.clear() await search_box.fill("isabella.romano@premium.eu") await page.keyboard.press("Enter") await page.wait_for_load_state("networkidle") await page.wait_for_timeout(3000) # Check again after search isabella_exists = ( await page.locator("text=isabella.romano@premium.eu").count() > 0 ) # Also check for "No records found" message no_records = await page.locator("text=We couldn't find any records., text=No records found").count() > 0 if no_records: print( "✗ Customer 'isabella.romano@premium.eu' not found - search returned no results", file=sys.stderr, ) return False except Exception as e: print(f"✗ Search failed: {str(e)}", file=sys.stderr) if isabella_exists: print( "✓ Found customer with email 'isabella.romano@premium.eu'", file=sys.stderr, ) else: print( "✗ Customer 'isabella.romano@premium.eu' not found", file=sys.stderr, ) return False # 3. 
Verify Dashboard Last Orders print("\nVerifying Dashboard Last Orders...", file=sys.stderr) await page.goto( f"{BASE_URL}/admin/dashboard/", wait_until="networkidle", ) await page.wait_for_timeout(2000) # Check for Last Orders section last_orders_exists = await page.locator("text=Last Orders").count() > 0 if last_orders_exists: print("✓ Found 'Last Orders' section on dashboard", file=sys.stderr) # Find the first customer in the table # Look for the table after "Last Orders" heading orders_table = ( page.locator("text=Last Orders") .locator("..") .locator("table") .first ) if await orders_table.count() > 0: # Get the last row in tbody last_row = orders_table.locator("tbody tr").last if await last_row.count() > 0: last_customer = await last_row.locator( "td" ).first.inner_text() print( f"✓ Last customer in Last Orders: {last_customer}", file=sys.stderr, ) # Verify against expected answer if available if expected_answer and "LastOrderCustomer" in expected_answer: if last_customer == expected_answer["LastOrderCustomer"]: print( f"✓ Last Order Customer matches expected: {last_customer}", file=sys.stderr, ) else: print( f"✗ Last Order Customer mismatch: Expected '{expected_answer['LastOrderCustomer']}' but actual is '{last_customer}'", file=sys.stderr, ) return False else: print( "Warning: 'Last Orders' section not found on dashboard", file=sys.stderr, ) # Summary of verification - only print if we reach this point (all checks passed) print("\n=== Browser Verification Summary ===", file=sys.stderr) print("✓ Magento Admin login successful", file=sys.stderr) print( "✓ Customer group 'Premium Europe' exists with correct tax class", file=sys.stderr, ) print("✓ Customer 'isabella.romano@premium.eu' found in system", file=sys.stderr) print("✓ Customer counts verified", file=sys.stderr) print("✓ Dashboard Last Orders section accessible", file=sys.stderr) return True except PlaywrightTimeoutError as e: print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr) return False except Exception as e: print(f"Error: Unexpected error - {str(e)}", file=sys.stderr) return False finally: await browser.close() def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/description.md ================================================ Our marketing team is planning a new promotion for our bestselling fitness products. We need to analyze the current performance of our top-selling items and their related promotions to optimize our strategy. **Task Requirements:** 1. If need to login, login with username 'admin' and password 'admin1234' 2. Start by checking our current bestsellers: - Identify the top 3 bestselling products based on their Price and Quantity - record their names, prices, and quantities sold - Note the total Revenue amount displayed - Check if any of these bestsellers appear in the Top Search Terms table - if yes, record the search term and its usage count, else output 'No:0' 3. Investigate these bestselling products in detail: - For each of the top 3 bestsellers identified, search for them by name and record: - Their SKU - Current inventory quantity - Whether they are 'Enabled' or 'Disabled' 4. 
Check if we have existing promotions for these products: - Look for any active rules that might apply to fitness/yoga products - Find if there's a rule offering percentage discount - record the rule name and discount percentage - Count total number of active rules 5. Analyze customer purchasing patterns: - Count total number of orders in the system - Note the ID of the most recent order 6. Review our top customers who might be interested: - Find the customer who appears in the Last Orders section of the dashboard with the highest total - Look up this customer in the All Customers list and record his email and customer group - Count how many other customers are in the same group 7. Compile your findings and output them in the following exact format: ``` <answer> Bestseller1|name:price:quantity:sku:inventory:status Bestseller2|name:price:quantity:sku:inventory:status Bestseller3|name:price:quantity:sku:inventory:status TotalRevenue|amount BestsellerInSearch|term:count PercentageDiscountRule|name:percentage ActiveRulesCount|count TotalOrders|count MostRecentOrderID|id TopCustomer|name:email:group SameGroupCustomers|count </answer> ``` **Example Output:** ``` <answer> Bestseller1|Product Name:$XX.XX:X:XXX(SKU):X:Enabled/Disabled Bestseller2|Product Name:$XX.XX:X:XXX(SKU):X:Enabled/Disabled Bestseller3|Product Name:$XX.XX:X:XXX(SKU):X:Enabled/Disabled TotalRevenue|$XX.XX BestsellerInSearch|Term:X or None:0 PercentageDiscountRule|Rule Name:XX% ActiveRulesCount|X TotalOrders|X MostRecentOrderID|X or None TopCustomer|Customer Name:email@example.com:Group Name SameGroupCustomers|X </answer> ``` ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/label.txt ================================================ Bestseller1|Sprite Stasis Ball 65 cm:$27.00:6:24-WG082-blue:100:Enabled Bestseller2|Quest Lumaflex™ Band:$19.00:6:24-UG01:100:Enabled Bestseller3|Sprite Yoga Strap 6 foot:$14.00:6:24-WG085:100:Enabled TotalRevenue|$0.00 BestsellerInSearch|No:0 PercentageDiscountRule|20% OFF Ever $200-plus purchase!*:20% ActiveRulesCount|4 TotalOrders|308 MostRecentOrderID|000000299 TopCustomer|Sarah Miller:sarah.miller@example.com:General SameGroupCustomers|70 ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/meta.json ================================================ { "task_id": "fitness_promotion_strategy", "task_name": "Fitness Promotion Strategy", "category_id": "shopping_admin", "category_name": "Shopping Admin", "description": "Develop fitness product promotion campaigns by analyzing sales data, creating targeted offers, configuring promotional rules, and implementing cross-selling strategies in admin dashboard.", "author": "Fanqing Meng", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "data extraction", "comparative analysis", "inventory management", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get 
the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, 'r') as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if message.get('role') == 'assistant' and message.get('status') == 'completed': content = message.get('content', []) for item in content: if item.get('type') == 'output_text': return item.get('text', '') print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: return None # Look for <answer>...</answer> pattern match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL) if not match: return None answer_content = match.group(1).strip() # Parse each line result = {} lines = answer_content.split('\n') # Skip the check for exact number of lines - just parse what we have # if len(lines) != 13: # print(f"Error: Expected 13 lines in answer, got {len(lines)}", file=sys.stderr) # return None for line in lines: if '|' in line: key, value = line.split('|', 1) result[key.strip()] = value.strip() return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, 'r') as f: lines = f.read().strip().split('\n') expected = {} for line in lines: if '|' in line: key, value = line.split('|', 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. 
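# Hedged sketch of the colon-separated Bestseller fields checked below
# (name:price:quantity:sku:inventory:status): the price part is compared after
# stripping "$" and ",", the inventory part as a float, and the remaining parts
# case-insensitively. The first value is the Bestseller1 line from label.txt
# above; the second is an illustrative model answer that should still match.
def bestseller_parts_match(expected, got):
    exp, mod = expected.split(":"), got.split(":")
    if len(exp) != 6 or len(mod) != 6:
        return False
    for i, (e, m) in enumerate(zip(exp, mod)):
        if i == 1:    # price
            if e.replace("$", "").replace(",", "") != m.replace("$", "").replace(",", ""):
                return False
        elif i == 4:  # inventory (may carry decimal places)
            if abs(float(e.replace(",", "")) - float(m.replace(",", ""))) > 0.0001:
                return False
        elif e.lower() != m.lower():
            return False
    return True

label_line = "Sprite Stasis Ball 65 cm:$27.00:6:24-WG082-blue:100:Enabled"
assert bestseller_parts_match(label_line,
                              "Sprite Stasis Ball 65 cm:$27.00:6:24-wg082-blue:100.0000:enabled")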
""" if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, '') # Special handling for different types of values if key in ['Bestseller1', 'Bestseller2', 'Bestseller3']: # Check if all parts match (name:price:quantity:sku:inventory:status) if ':' in expected_value and ':' in model_value: expected_parts = expected_value.split(':') model_parts = model_value.split(':') if len(expected_parts) == 6 and len(model_parts) == 6: # Compare each part for i, (exp, mod) in enumerate(zip(expected_parts, model_parts)): if i == 1: # Price field exp_clean = exp.replace('$', '').replace(',', '') mod_clean = mod.replace('$', '').replace(',', '') if exp_clean != mod_clean: mismatches.append(f"{key} price: expected '{exp}', got '{mod}'") elif i == 4: # Inventory field (may have decimal places) exp_float = float(exp.replace(',', '')) mod_float = float(mod.replace(',', '')) if abs(exp_float - mod_float) > 0.0001: mismatches.append(f"{key} inventory: expected '{exp}', got '{mod}'") else: if exp.lower() != mod.lower(): mismatches.append(f"{key} part {i}: expected '{exp}', got '{mod}'") else: mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'") else: if expected_value != model_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'LowestInventoryProduct': # Check product name and inventory if ':' in expected_value and ':' in model_value: expected_name, expected_inv = expected_value.rsplit(':', 1) model_name, model_inv = model_value.rsplit(':', 1) if expected_name.lower() != model_name.lower(): mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'") exp_float = float(expected_inv.replace(',', '')) mod_float = float(model_inv.replace(',', '')) if abs(exp_float - mod_float) > 0.0001: mismatches.append(f"{key} inventory: expected '{expected_inv}', got '{model_inv}'") else: if expected_value != model_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key in ['TotalRevenue', 'MinimumPurchaseRule']: # For price/amount fields, normalize format expected_clean = expected_value.replace('$', '').replace(',', '') model_clean = model_value.replace('$', '').replace(',', '') if expected_clean != model_clean: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'BestsellerInSearch': # Check search term and count if expected_value.lower() != model_value.lower(): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'PercentageDiscountRule': # Check rule name and percentage if ':' in expected_value and ':' in model_value: expected_name, expected_pct = expected_value.rsplit(':', 1) model_name, model_pct = model_value.rsplit(':', 1) if expected_name != model_name: mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'") # Normalize percentage (20% vs 20 vs 0.20) exp_pct_clean = expected_pct.replace('%', '').strip() mod_pct_clean = model_pct.replace('%', '').strip() if exp_pct_clean != mod_pct_clean: mismatches.append(f"{key} percentage: expected '{expected_pct}', got '{model_pct}'") else: if expected_value != model_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'TopCustomer': # Check name:email:group if ':' in expected_value and ':' in model_value: expected_parts = expected_value.split(':') model_parts = 
model_value.split(':') if len(expected_parts) == 3 and len(model_parts) == 3: exp_name, exp_email, exp_group = expected_parts mod_name, mod_email, mod_group = model_parts if exp_name != mod_name: mismatches.append(f"{key} name: expected '{exp_name}', got '{mod_name}'") if exp_email.lower() != mod_email.lower(): mismatches.append(f"{key} email: expected '{exp_email}', got '{mod_email}'") if exp_group.lower() != mod_group.lower(): mismatches.append(f"{key} group: expected '{exp_group}', got '{mod_group}'") else: mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'") else: if expected_value != model_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'MostRecentOrderDate': # Date format may vary, do flexible comparison if expected_value.lower() == 'none' and model_value.lower() == 'none': continue elif expected_value != model_value: # Could add more flexible date parsing here if needed mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") else: # Exact match for other fields (counts, etc.) if str(model_value) != str(expected_value): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the bestseller analysis and promotion task has been completed correctly. First checks the model's answer against the expected label, then optionally verifies the actual state in the Magento Admin. """ # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) return True else: print("Warning: Could not parse answer format from model response", file=sys.stderr) return False else: print("No model response found", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/description.md ================================================ Perform a comprehensive marketing and customer analysis workflow in the Magento Admin panel to understand search behavior patterns and promotional effectiveness. **Task Requirements:** 1. 
First, we need to access the system to begin our comprehensive analysis: if login is required, log in with username 'admin' and password 'admin1234' 2. Let's start by analyzing customer search behavior to understand what customers are looking for: Go to Search Terms in Reports and analyze the search data: - Identify the TOP 2 search terms with the highest number of hits (record exact terms and hit counts) - Find a search term that has 0 results but still has search hits (record exact term and hit count) - Count the total number of search terms displayed in the report 3. Next, we'll examine our promotional strategies to understand current marketing efforts: Navigate to Cart Price Rules and identify: - Find ALL rules that contain a coupon code - Record the exact coupon codes and the complete rule names for each - Count how many active rules exist in total 4. Now let's analyze our email marketing reach and subscriber engagement: Go to Newsletter Subscribers: - Apply a filter to show only 'Subscribed' status - Count the total number of subscribed users shown after filtering - Verify whether these TWO emails appear in the subscribed list: * john.smith.xyz@gmail.com * admin@magento.com 5. To support our analysis, we need to create test customer profiles for different segments: Create TWO new customers with the following details: Customer 1: - First Name: Marketing1 - Last Name: Analy - Email: marketdata1.analysis@magento.com - Associate to Website: Main Website - Group: General Customer 2: - First Name: Analytics1 - Last Name: Report - Email: analytics1.report@magento.com - Associate to Website: Main Website - Group: Wholesale 6. Finally, let's review overall business performance metrics from the main dashboard: Go to Dashboard and identify: - The names and sales quantities of the products that are both the best-selling and the most expensive - The total revenue displayed on the dashboard 7. 
Compile all your findings and output them in the following exact format at the end: ``` <answer> Top2SearchTerms|term1:hits1,term2:hits2 ZeroResultTerm|term:hits TotalSearchTerms|count CouponCodes|code1:rulename1,code2:rulename2 ActiveRulesCount|count SubscribedCount|count EmailVerification|john.smith.xyz@gmail.com:yes/no,admin@magento.com:yes/no TopProduct|name:quantity TotalRevenue|amount </answer> ``` **Example Output:** ``` <answer> Top2SearchTerms|term1:XX,term2:XX ZeroResultTerm|term:XX TotalSearchTerms|XX CouponCodes|CODE:Rule Name Here ActiveRulesCount|X SubscribedCount|XX EmailVerification|john.smith.xyz@gmail.com:yes/no,admin@magento.com:yes/no TopProduct|Product Name:XX TotalRevenue|$XX.XX </answer> ``` **Success Criteria:** - Successfully logged into Magento Admin - Navigated to Search Terms Report and identified top 2 terms - Found search term with 0 results but has hits - Counted total search terms in report - Located all Cart Price Rules with coupon codes - Extracted exact coupon codes and rule names - Counted active rules - Filtered Newsletter Subscribers by 'Subscribed' status - Counted total subscribed users - Verified presence of two specific email addresses - Created two new customers successfully - Found top bestselling product from dashboard - Identified total revenue from dashboard - Output answer in exact format with 9 data lines - Answer wrapped in <answer> tags ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/label.txt ================================================ Top2SearchTerms|hollister:19,Joust Bag:4 ZeroResultTerm|nike:3 TotalSearchTerms|7 CouponCodes|H20:$4 Luma water bottle (save 70%) ActiveRulesCount|4 SubscribedCount|1 EmailVerification|john.smith.xyz@gmail.com:yes,admin@magento.com:no TopProduct|Sprite Stasis Ball 65 cm:6 TotalRevenue|$0.00 ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/meta.json ================================================ { "task_id": "marketing_customer_analysis", "task_name": "Marketing Customer Analysis", "category_id": "shopping_admin", "category_name": "Shopping Admin", "description": "Analyze customer behavior patterns using admin analytics, segment user demographics, track purchase histories, evaluate campaign effectiveness, and generate comprehensive marketing intelligence reports.", "author": "Fanqing Meng", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "data extraction", "comparative analysis", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path from playwright.async_api import ( async_playwright, TimeoutError as PlaywrightTimeoutError, ) # Read base_url from the environment (shopping_admin injects http://localhost:7780/admin); fall back to the local default otherwise BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:7780/admin").rstrip("/") def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. 
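The messages list is scanned from the end, so the most recent completed assistant message is what gets treated as the model's final answer.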
""" messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" ): content = message.get("content", []) for item in content: if item.get("type") == "output_text": return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the new multi-line <answer>xxx</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: return None answer_content = match.group(1).strip() # Parse each line result = {} lines = answer_content.split("\n") if len(lines) != 9: print(f"Error: Expected 9 lines in answer, got {len(lines)}", file=sys.stderr) return None for line in lines: if "|" in line: key, value = line.split("|", 1) result[key.strip()] = value.strip() return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, "r") as f: lines = f.read().strip().split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. 
""" if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Special handling for different types of values if key == "Top2SearchTerms": # Check if both search terms are present with correct counts expected_terms = expected_value.split(",") model_terms = model_value.split(",") if set(expected_terms) != set(model_terms): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "EmailVerification": # Check email verification status expected_emails = dict( item.split(":") for item in expected_value.split(",") ) model_emails = dict( item.split(":") for item in model_value.split(",") if ":" in item ) if expected_emails != model_emails: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "CouponCodes": # Check if coupon code and rule name are present if "H20" not in model_value or "Luma water bottle" not in model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "TopProduct": # Check if product name and quantity match if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # Exact match for other fields if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the marketing analysis task has been completed correctly. First checks the model's answer against the expected label, then optionally verifies the actual state in the Magento Admin. 
""" # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) else: print( "Warning: Could not parse answer format from model response", file=sys.stderr, ) print("Will proceed with browser verification only", file=sys.stderr) else: print( "No model response found, proceeding with browser verification", file=sys.stderr, ) # Browser verification - only check customer creation (the critical task requirement) print("\n=== Starting Browser Verification ===", file=sys.stderr) async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context() page = await context.new_page() try: # Navigate to Magento Admin print("Navigating to Magento Admin...", file=sys.stderr) await page.goto( f"{BASE_URL}/", wait_until="networkidle" ) # Check if already logged in, if not, login if "dashboard" not in page.url.lower(): print("Logging into Magento Admin...", file=sys.stderr) await page.fill('input[name="login[username]"]', "admin") await page.fill('input[name="login[password]"]', "admin1234") await page.click('button:has-text("Sign in")') await page.wait_for_load_state("networkidle") if "dashboard" not in page.url.lower(): print("Error: Login failed", file=sys.stderr) return False print("Successfully logged into Magento Admin", file=sys.stderr) # Verify Customer Creation (the only critical check for task completion) print("Verifying Customer Creation...", file=sys.stderr) await page.goto( f"{BASE_URL}/customer/index/", wait_until="networkidle", ) # Wait for the customer grid to load try: await page.wait_for_selector("table", timeout=15000) except PlaywrightTimeoutError: print("Table not found, trying to proceed anyway...", file=sys.stderr) # Define customer requirements customer1_requirements = { "email": "marketdata1.analysis@magento.com", "first_name": "Marketing1", "last_name": "Analy", "group": "General", "website": "Main Website" } customer2_requirements = { "email": "analytics1.report@magento.com", "first_name": "Analytics1", "last_name": "Report", "group": "Wholesale", "website": "Main Website" } async def check_customer_exists(customer_requirements): """Check if a customer exists by looking for their details in the customer grid""" email = customer_requirements["email"] first_name = customer_requirements["first_name"] last_name = customer_requirements["last_name"] group = customer_requirements["group"] # First check if email exists in current page without searching email_found = await page.locator(f"*:has-text('{email}')").count() > 0 if not email_found: # Try searching for the customer try: search_box = page.locator('input[placeholder*="Search by keyword"]').first await search_box.clear() await search_box.fill(email) await 
page.keyboard.press("Enter") await page.wait_for_load_state("networkidle") await page.wait_for_timeout(2000) # Check again after search email_found = await page.locator(f"*:has-text('{email}')").count() > 0 except: pass if not email_found: return False, f"Email {email} not found" # More precise validation: find the row containing this customer's email # Then check if the required fields are in the same row or nearby context try: # Find the specific row containing this email email_cell = page.locator(f"td:has-text('{email}')").first if await email_cell.count() == 0: # Fall back to broader search email_cell = page.locator(f"*:has-text('{email}')").first # Get the parent row or container row = email_cell.locator("xpath=ancestor::tr[1]") if await row.count() == 0: # Fall back to getting nearby content row = email_cell.locator("xpath=..") # Get the text content of the row/container row_text = await row.text_content() if await row.count() > 0 else "" # If we can't get a specific row, fall back to broader validation if not row_text or len(row_text.strip()) < 10: # Search in nearby cells or elements nearby_elements = page.locator(f"*:has-text('{email}')").locator("xpath=../following-sibling::* | xpath=../preceding-sibling::*") nearby_count = await nearby_elements.count() nearby_text = "" for i in range(min(nearby_count, 5)): # Check up to 5 nearby elements element_text = await nearby_elements.nth(i).text_content() if element_text: nearby_text += element_text + " " row_text = row_text + " " + nearby_text # Check if required fields are present in the row/context required_fields = [first_name, last_name, group] found_fields = [email] # Email is already confirmed missing_fields = [] for field in required_fields: if field in row_text: found_fields.append(field) else: missing_fields.append(field) if missing_fields: return False, f"Customer found but missing fields in row context: {', '.join(missing_fields)}. Row text: {row_text[:100]}..." 
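# Reaching this point means the email plus the first name, last name and customer group
# were all found inside the same grid row (or its immediate context), so the record is
# treated as genuinely present rather than merely mentioned elsewhere on the page.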
return True, f"Customer verified with all required fields: {', '.join(found_fields)}" except Exception as e: # Fall back to original simple validation page_content = await page.content() required_fields = [first_name, last_name, group, email] found_fields = [] missing_fields = [] for field in required_fields: if field in page_content: found_fields.append(field) else: missing_fields.append(field) if missing_fields: return False, f"Customer found but missing fields (fallback): {', '.join(missing_fields)}" return True, f"Customer verified with all required fields (fallback): {', '.join(found_fields)}" # Check both customers customer1_exists, customer1_msg = await check_customer_exists(customer1_requirements) customer2_exists, customer2_msg = await check_customer_exists(customer2_requirements) print( f"Customer 1 (marketdata1.analysis@magento.com): {'Found' if customer1_exists else 'Not Found'} - {customer1_msg}", file=sys.stderr, ) print( f"Customer 2 (analytics1.report@magento.com): {'Found' if customer2_exists else 'Not Found'} - {customer2_msg}", file=sys.stderr, ) if not (customer1_exists and customer2_exists): print("Error: Required customers were not found in the system", file=sys.stderr) return False print("✓ Both required customers found in the system", file=sys.stderr) return True except PlaywrightTimeoutError as e: print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr) return False except Exception as e: print(f"Error: Unexpected error - {str(e)}", file=sys.stderr) return False finally: await browser.close() def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/description.md ================================================ Our company is planning to expand sales operations to New York state and needs a comprehensive analysis of our current sales performance and tax implications. Please help me gather critical data for our expansion feasibility report. **Task Requirements:** 1. Log in with username 'admin' and password 'admin1234' 2. First, analyze our current sales performance on the dashboard: - Check the 'Lifetime Sales' amount displayed - In the Bestsellers table, identify which product has lowest price and record its exact name, price, and quantity sold - Find if this same product appears in the 'Last Orders' table, and if so, note which customer(s) ordered it, if no, note 'No' 3. Since we're expanding to New York, we need check tax: - Find and record the exact tax rate for New York state - Compare it with California's tax rate - record which state has a higher rate - Count how many different US states currently have tax configurations 4. You need to understand our order status of stores processing for the NY market: - Filter orders to show only statuses that are 'Visible On Storefront = Yes' - Among these visible statuses, identify if exists one has the status code 'processing' (Yes or No), - Check if this 'processing' status is set as a 'Default Status' (Yes or No) 5. Since New York orders might need special handling, check all stores: - Note the number of website configured - Record the store code for the first Main Website Store 6. 
For inventory planning, check the sources of it: - Check if the Default Source is currently 'Enabled' or shows as 'Disabled' for Pickup Location - Click the 'Edit' link for the Default Source and check if there's a 'State/Province' field (Yes or No) 7. Finally, return to the Dashboard and examine the revenue metrics: - Record the current Revenue amount shown - Check if Tax and Shipping amounts are both $0.00 (Yes or No) **Please provide your findings in the following exact format:** ``` <answer> Lifetime_Sales_Amount|amount Cheap_Bestseller_Name|name Second_Bestseller_Price|price Second_Bestseller_Quantity|quantity Product_In_Last_Orders|yes_or_no NY_Tax_Rate|rate CA_Tax_Rate|rate Higher_Tax_State|state Total_States_With_Tax|count Processing_Visible_Storefront|Yes_or_No Processing_Default_Status|Yes_or_No Number_Of_Websites|count Main_Store_Code|code Default_Source_Pickup_Status|status Default_Source_State|state_or_none Dashboard_Revenue|amount Tax_Shipping_Zero|yes_or_no </answer> ``` **Example Output:** ``` <answer> Lifetime_Sales_Amount|$XX.XX Cheap_Bestseller_Name|Product Name Here Second_Bestseller_Price|$XX.XX Second_Bestseller_Quantity|XX Product_In_Last_Orders|Yes/No NY_Tax_Rate|X.XXXX CA_Tax_Rate|X.XXXX Higher_Tax_State|XX Total_States_With_Tax|XX Processing_Visible_Storefront|Yes/No Processing_Default_Status|Yes/No Number_Of_Websites|X Main_Store_Code|code_here Default_Source_Pickup_Status|Enabled/Disabled Default_Source_State|State or None Dashboard_Revenue|$XX.XX Tax_Shipping_Zero|Yes/No </answer> ``` ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/label.txt ================================================ Lifetime_Sales_Amount|$0.00 Cheap_Bestseller_Name|Sprite Yoga Strap 6 foot Second_Bestseller_Price|$14.00 Second_Bestseller_Quantity|6 Product_In_Last_Orders|No NY_Tax_Rate|8.3750 CA_Tax_Rate|8.2500 Higher_Tax_State|NY Total_States_With_Tax|2 Processing_Visible_Storefront|Yes Processing_Default_Status|Yes Number_Of_Websites|1 Main_Store_Code|main_website_store Default_Source_Pickup_Status|Enabled Default_Source_State|No Dashboard_Revenue|$0.00 Tax_Shipping_Zero|Yes ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/meta.json ================================================ { "task_id": "ny_expansion_analysis", "task_name": "NY Expansion Analysis", "category_id": "shopping_admin", "category_name": "Shopping Admin", "description": "Prepare New York market expansion strategy by analyzing regional demographics, evaluating competitor presence, assessing logistics requirements, and creating detailed market entry plan.", "author": "Fanqing Meng", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "data extraction", "comparative analysis", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. 
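The file is expected to hold a JSON list of message objects; an entry this parser accepts looks roughly like {"role": "assistant", "status": "completed", "content": [{"type": "output_text", "text": "<answer>...</answer>"}]} (an illustrative shape inferred from the parsing below, not a guaranteed schema).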
Returns the last assistant message text. """ messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("ERROR: MCP_MESSAGES environment variable not set", file=sys.stderr) return None # Check if file exists if not Path(messages_path).exists(): print(f"ERROR: Messages file not found at path: {messages_path}", file=sys.stderr) return None try: with open(messages_path, 'r') as f: content = f.read() # Check if file is empty if not content or content.strip() == '""': print("ERROR: Messages file is empty or contains only empty string", file=sys.stderr) return None messages = json.loads(content) # Check if messages is a list if not isinstance(messages, list): print(f"ERROR: Messages file should contain a list, got {type(messages).__name__}", file=sys.stderr) return None # Find the last assistant message for message in reversed(messages): if message.get('role') == 'assistant' and message.get('status') == 'completed': content = message.get('content', []) if not content: print("WARNING: Assistant message has empty content", file=sys.stderr) continue for item in content: if item.get('type') == 'output_text': text = item.get('text', '') if not text: print("WARNING: Output text is empty", file=sys.stderr) continue return text print("ERROR: No assistant response with output_text found in messages", file=sys.stderr) return None except json.JSONDecodeError as e: print(f"ERROR: Invalid JSON in messages file: {str(e)}", file=sys.stderr) return None except Exception as e: print(f"ERROR: Unexpected error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: print("ERROR: No text provided to parse", file=sys.stderr) return None # Look for <answer>...</answer> pattern match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL) if not match: print("ERROR: No <answer> tags found in the response", file=sys.stderr) print(f" Response preview: {text[:200]}...", file=sys.stderr) return None answer_content = match.group(1).strip() if not answer_content: print("ERROR: Empty content between <answer> tags", file=sys.stderr) return None # Parse each line result = {} lines = answer_content.split('\n') # Expected keys that should be present expected_keys = [ 'Lifetime_Sales_Amount', 'Cheap_Bestseller_Name', 'Second_Bestseller_Price', 'Second_Bestseller_Quantity', 'Product_In_Last_Orders', 'NY_Tax_Rate', 'CA_Tax_Rate', 'Higher_Tax_State', 'Total_States_With_Tax', 'Processing_Visible_Storefront', 'Processing_Default_Status', 'Number_Of_Websites', 'Main_Store_Code', 'Default_Source_Pickup_Status', 'Default_Source_State', 'Dashboard_Revenue', 'Tax_Shipping_Zero' ] parsed_keys = [] for line in lines: line = line.strip() if not line: continue if '|' not in line: print(f"ERROR: Line missing pipe separator '|': {line}", file=sys.stderr) continue parts = line.split('|', 1) if len(parts) != 2: print(f"ERROR: Invalid line format: {line}", file=sys.stderr) continue key, value = parts key = key.strip() value = value.strip() if not key: print(f"ERROR: Empty key in line: {line}", file=sys.stderr) continue result[key] = value parsed_keys.append(key) # Check for missing expected keys missing_keys = set(expected_keys) - set(parsed_keys) if missing_keys: print(f"ERROR: Missing expected keys: {', '.join(sorted(missing_keys))}", file=sys.stderr) # Check for unexpected keys unexpected_keys = set(parsed_keys) - 
set(expected_keys) if unexpected_keys: print(f"WARNING: Unexpected keys found: {', '.join(sorted(unexpected_keys))}", file=sys.stderr) if not result: print("ERROR: No valid key-value pairs parsed from answer", file=sys.stderr) return None return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, 'r') as f: lines = f.read().strip().split('\n') expected = {} for line in lines: if '|' in line: key, value = line.split('|', 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. """ if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, '') # Special handling for different types of values if key in ['Lifetime_Sales_Amount', 'Second_Bestseller_Price', 'Dashboard_Revenue']: # For price/amount fields, normalize format expected_clean = expected_value.replace('$', '').replace(',', '') model_clean = model_value.replace('$', '').replace(',', '') if expected_clean != model_clean: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key in ['NY_Tax_Rate', 'CA_Tax_Rate']: # Tax rates - allow different decimal formats expected_clean = expected_value.replace('%', '').strip() model_clean = model_value.replace('%', '').strip() # Convert to float for comparison try: if float(expected_clean) != float(model_clean): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") except ValueError: if expected_clean != model_clean: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key in ['Product_In_Last_Orders', 'Processing_Visible_Storefront', 'Processing_Default_Status', 'Tax_Shipping_Zero']: # Yes/No fields - case insensitive if model_value.lower() != expected_value.lower(): mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'Empty_Rows_Yes_Effect': # Allow flexible descriptions for this field # Just check if model provided some reasonable description if not model_value or len(model_value) < 5: mismatches.append(f"{key}: expected meaningful description, got '{model_value}'") elif key == 'Order_Status_Options': # Check if main options are mentioned expected_options = set(opt.strip() for opt in expected_value.split(',')) model_options = set(opt.strip() for opt in model_value.split(',')) if expected_options != model_options: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") elif key == 'Chart_Disabled_Message': # Allow some flexibility in message text # Check for key words if 'disabled' not in model_value.lower() and 'enable' not in model_value.lower(): mismatches.append(f"{key}: expected message about chart being disabled, got '{model_value}'") elif key == 'Default_Source_State': # Handle 'None' or empty state expected_normalized = expected_value.lower() if expected_value.lower() != 'none' else '' model_normalized = model_value.lower() if model_value.lower() != 'none' else '' if expected_normalized != model_normalized: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") else: # Exact match for other fields if model_value != 
expected_value: mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'") if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the NY expansion analysis task has been completed correctly. First checks the model's answer against the expected label, then optionally verifies the actual state in the Magento Admin. """ print("\n=== Starting Verification ===", file=sys.stderr) # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer print("Loading expected answer from label.txt...", file=sys.stderr) expected_answer = load_expected_answer(label_path) if not expected_answer: print("FATAL ERROR: Could not load expected answer from label.txt", file=sys.stderr) return False print(f"Expected answer loaded with {len(expected_answer)} keys", file=sys.stderr) # Get model's response from MCP_MESSAGES print("\nReading model response from MCP_MESSAGES...", file=sys.stderr) model_response = get_model_response() if not model_response: print("FATAL ERROR: No valid model response found", file=sys.stderr) return False print(f"Model response found (length: {len(model_response)} chars)", file=sys.stderr) print("\nParsing answer format from model response...", file=sys.stderr) model_answer = parse_answer_format(model_response) if not model_answer: print("FATAL ERROR: Could not parse answer format from model response", file=sys.stderr) return False print(f"\n=== Model Answer Parsed Successfully ===", file=sys.stderr) print(f"Parsed {len(model_answer)} key-value pairs", file=sys.stderr) for key, value in model_answer.items(): print(f" {key}: {value}", file=sys.stderr) # Compare answers print("\n=== Comparing Model Answer with Expected Answer ===", file=sys.stderr) answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nFATAL ERROR: Model answer does not match expected answer", file=sys.stderr) print("Verification FAILED", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) print("Verification PASSED", file=sys.stderr) return True def main(): """ Executes the verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/description.md ================================================ Perform a comprehensive products and sales analysis in the Magento Admin panel to identify inventory status and sales performance metrics. **Task Requirements:** 1. if need to login, login with username 'admin' and password 'admin1234' 2. Analyze product inventory and catalog details, perform the following: - Search for all products containing 'Yoga' in their name - count the exact number of results - Clear the search and find the product with SKU 'WH11' - record its exact price - Apply a filter to show only products with Quantity = 0.0000 - count how many products match 3. 
To identify top-selling products and revenue metrics, navigate to the Dashboard and, from the Bestsellers table: - Identify the product with the lowest price and lowest quantity - record the product name and quantity sold - Find the second cheapest product in the table - record its exact quantity sold - Note the total Revenue amount displayed in the dashboard 4. Gather all customers' information and demographics: - Find customer 'Sarah Miller' - record her exact email address - Count the total number of customers shown in the grid 5. Review order status and customer purchase history; go to Orders under Sales: - Count the total number of orders with 'Pending' status - Find the order ID of Grace Nguyen's order with Complete status and the highest price (starting with "000") 6. To provide a comprehensive report of all gathered data, compile all your findings and output them in the following exact format: ``` <answer> YogaProducts|count WH11Price|price ZeroQuantityProducts|count LowestProduct|name:quantity QuestLumaflexQuantity|quantity DashboardRevenue|amount SarahMillerEmail|email TotalCustomers|count PendingOrders|count GraceNguyenOrderID|orderid </answer> ``` **Example Output:** ``` <answer> YogaProducts|XX WH11Price|$XX.XX ZeroQuantityProducts|XX LowestProduct|Product Name Here:XX QuestLumaflexQuantity|XX DashboardRevenue|$XX.XX SarahMillerEmail|email@example.com TotalCustomers|XX PendingOrders|X GraceNguyenOrderID|00000XXXX </answer> ``` ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/label.txt ================================================ YogaProducts|171 WH11Price|$54.00 ZeroQuantityProducts|150 LowestProduct|Sprite Stasis Ball 55 cm foot:5 QuestLumaflexQuantity|6 DashboardRevenue|$0.00 SarahMillerEmail|helloworld@yahoo.com TotalCustomers|72 PendingOrders|10 GraceNguyenOrderID|000000189 ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/meta.json ================================================ { "task_id": "products_sales_analysis", "task_name": "Products Sales Analysis", "category_id": "shopping_admin", "category_name": "Shopping Admin", "description": "Generate comprehensive sales performance reports by extracting product metrics, analyzing revenue trends, identifying top performers, evaluating inventory turnover, and creating actionable insights.", "author": "Fanqing Meng", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "data extraction", "comparative analysis", "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. 
""" messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" ): content = message.get("content", []) for item in content: if item.get("type") == "output_text": return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: print("Error: No text provided to parse", file=sys.stderr) return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: print("Error: No <answer>...</answer> tags found in response", file=sys.stderr) return None answer_content = match.group(1).strip() if not answer_content: print("Error: Empty answer content", file=sys.stderr) return None # Parse each line result = {} lines = [line.strip() for line in answer_content.split("\n") if line.strip()] if len(lines) != 10: print(f"Error: Expected 10 lines in answer, got {len(lines)}", file=sys.stderr) print(f"Lines found: {lines}", file=sys.stderr) return None # Expected keys for validation expected_keys = [ "YogaProducts", "WH11Price", "ZeroQuantityProducts", "LowestProduct", "QuestLumaflexQuantity", "DashboardRevenue", "SarahMillerEmail", "TotalCustomers", "PendingOrders", "GraceNguyenOrderID" ] for line in lines: if "|" not in line: print(f"Error: Line missing '|' separator: {line}", file=sys.stderr) return None parts = line.split("|", 1) if len(parts) != 2: print(f"Error: Invalid line format: {line}", file=sys.stderr) return None key, value = parts[0].strip(), parts[1].strip() if not key or not value: print(f"Error: Empty key or value in line: {line}", file=sys.stderr) return None result[key] = value # Validate all expected keys are present missing_keys = set(expected_keys) - set(result.keys()) if missing_keys: print(f"Error: Missing required keys: {missing_keys}", file=sys.stderr) return None return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, "r") as f: lines = f.read().strip().split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. 
""" if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Special handling for different types of values if key == "LowestProduct": # Check if product name and quantity match (format: "Product Name:quantity") if ":" in expected_value and ":" in model_value: expected_name, expected_qty = expected_value.rsplit(":", 1) model_name, model_qty = model_value.rsplit(":", 1) if expected_name != model_name or expected_qty != model_qty: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key in ["WH11Price", "DashboardRevenue"]: # For price/amount fields, normalize format expected_clean = expected_value.replace("$", "").replace(",", "") model_clean = model_value.replace("$", "").replace(",", "") if expected_clean != model_clean: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "SarahMillerEmail": # Email should match exactly if model_value.lower() != expected_value.lower(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # Exact match for other fields if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the products and sales analysis task has been completed correctly. First checks the model's answer against the expected label, then optionally verifies the actual state in the Magento Admin. """ # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer expected_answer = load_expected_answer(label_path) if not expected_answer: print("Error: Could not load expected answer from label.txt", file=sys.stderr) return False # Get model's response from MCP_MESSAGES model_response = get_model_response() if model_response: print("Found model response, parsing answer format...", file=sys.stderr) model_answer = parse_answer_format(model_response) if model_answer: print("\n=== Model Answer Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f"{key}: {value}", file=sys.stderr) # Compare answers answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\nModel answer does not match expected answer", file=sys.stderr) return False print("\n✓ Model answer matches expected answer", file=sys.stderr) return True else: print( "Warning: Could not parse answer format from model response", file=sys.stderr, ) return False else: print("No model response found", file=sys.stderr) return False def main(): """ Executes the verification process and exits with a status code. 
""" result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/description.md ================================================ Perform a comprehensive sales and inventory analysis by extracting specific metrics from multiple sections of the Magento Admin panel. **Task Requirements:** 1. Login with username 'admin' and password 'admin1234' 2. To analyze product inventory and identify key items, check all products: - Search for all products containing 'Sprite' in their name - count the exact number of results - Clear the search and filter products by Quantity = 100.0000 - count how many products match - Find the product with SKU 'WS12' - record its exact name and price 3. To understand sales performance and order status, we need check all orders: - Search for all orders with 'Pending' status - count the total number - Find Grace Nguyen's Complete and the most cheap order - record the order ID (starts with "000") - Find the order with the highest Grand Total - record the customer name and amount 4. To examine bestselling products and search trends, from the main page: - In the Bestsellers table, identify the product with most quantity but and lowest price - record its name and quantity sold - Find 'Overnight Duffle' and record its exact price - In the Top Search Terms table, find 'hollister' and record its position number (1st, 2nd, etc.) 5. To analyze customer demographics and account information, go to All Customers: - Search for customers with its email address containing 'costello' - count the results - Find Sarah Miller's customer record - record her Group and extract Customer Since date 6. To review payment status and billing information, navigate to Invoices: - Find all invoices with 'Paid' status - count them - Find the invoice for order #000000002 - record the Bill-to Name 7. 
To provide a comprehensive report of all gathered data, compile all findings and output them in the following exact format: ``` <answer> SpriteProducts|count Quantity100Products|count WS12Info|name:price PendingOrders|count GraceOrderID|orderid HighestOrderInfo|customer:amount CheapProduct|name:quantity OvernightDufflePrice|price HollisterPosition|position CostelloCustomers|count SarahMillerInfo|group:date PaidInvoices|count Invoice002BillTo|name </answer> ``` **Example Output:** ``` <answer> SpriteProducts|XX Quantity100Products|XX WS12Info|Product Name Here:$XX.XX PendingOrders|X GraceOrderID|00000XXXX HighestOrderInfo|Customer Name:$XXX.XX CheapProduct|Product Name:XX OvernightDufflePrice|$XX.XX HollisterPosition|Xth CostelloCustomers|X SarahMillerInfo|Group Name:MMM DD, YYYY PaidInvoices|X Invoice002BillTo|Customer Name </answer> ``` ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/label.txt ================================================ SpriteProducts|16 Quantity100Products|1886 WS12Info|Radiant Tee:$22.00 PendingOrders|10 GraceOrderID|000000114 HighestOrderInfo|Samantha Jones:$292.40 CheapProduct|Sprite Yoga Strap 6 foot:6 OvernightDufflePrice|$45.00 HollisterPosition|1st CostelloCustomers|0 SarahMillerInfo|General:Apr 19, 2023 5:45:07 PM PaidInvoices|2 Invoice002BillTo|Veronica Costello ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/meta.json ================================================ { "task_id": "sales_inventory_analysis", "task_name": "Sales Inventory Analysis", "category_id": "shopping_admin", "category_name": "Shopping Admin", "description": "Analyze sales patterns and inventory levels to optimize stock management, identify slow-moving items, predict demand trends, and generate restocking recommendations.", "author": "Fanqing Meng", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "data extraction", "comparative analysis", "inventory management" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/verify.py ================================================ import asyncio import sys import re import os import json from pathlib import Path def get_model_response(): """ Get the model's response from the MCP_MESSAGES environment variable. Returns the last assistant message text. 
""" messages_path = os.getenv("MCP_MESSAGES") print(f"MCP_MESSAGES: {messages_path}") if not messages_path: print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr) return None try: with open(messages_path, "r") as f: messages = json.load(f) # Find the last assistant message with type='message', status='completed' for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" and message.get("type") == "message" ): content = message.get("content", []) for item in content: # Check for both 'text' and 'output_text' types if item.get("type") in ["text", "output_text"]: return item.get("text", "") print("Warning: No assistant response found in messages", file=sys.stderr) return None except Exception as e: print(f"Error reading messages file: {str(e)}", file=sys.stderr) return None def parse_answer_format(text): """ Parse the <answer>...</answer> format from the agent's output. Returns a dictionary with the parsed values. """ if not text: print("ERROR: No text provided to parse", file=sys.stderr) return None # Look for <answer>...</answer> pattern match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL) if not match: print("ERROR: No <answer>...</answer> tags found in the response", file=sys.stderr) print("Response text preview (first 200 chars):", text[:200], file=sys.stderr) return None answer_content = match.group(1).strip() print(f"Found answer content with {len(answer_content)} characters", file=sys.stderr) # Parse each line result = {} lines = answer_content.split("\n") # Expected keys for this task expected_keys = [ "SpriteProducts", "Quantity100Products", "WS12Info", "PendingOrders", "GraceOrderID", "HighestOrderInfo", "CheapProduct", "OvernightDufflePrice", "HollisterPosition", "CostelloCustomers", "SarahMillerInfo", "PaidInvoices", "Invoice002BillTo" ] if len(lines) != 13: print(f"ERROR: Expected 13 lines in answer, got {len(lines)}", file=sys.stderr) print(f"Lines found: {lines}", file=sys.stderr) return None for i, line in enumerate(lines, 1): if "|" not in line: print(f"ERROR: Line {i} does not contain pipe separator '|': '{line}'", file=sys.stderr) return None parts = line.split("|", 1) if len(parts) != 2: print(f"ERROR: Line {i} could not be split into key|value: '{line}'", file=sys.stderr) return None key, value = parts result[key.strip()] = value.strip() # Check if all expected keys are present missing_keys = set(expected_keys) - set(result.keys()) if missing_keys: print(f"ERROR: Missing expected keys: {missing_keys}", file=sys.stderr) print(f"Keys found: {list(result.keys())}", file=sys.stderr) return None # Check for unexpected keys extra_keys = set(result.keys()) - set(expected_keys) if extra_keys: print(f"WARNING: Unexpected keys found: {extra_keys}", file=sys.stderr) return result def load_expected_answer(label_path): """ Load the expected answer from label.txt file. Returns a dictionary with the expected values. """ try: with open(label_path, "r") as f: lines = f.read().strip().split("\n") expected = {} for line in lines: if "|" in line: key, value = line.split("|", 1) expected[key.strip()] = value.strip() return expected except Exception as e: print(f"Error reading label file: {str(e)}", file=sys.stderr) return None def compare_answers(model_answer, expected_answer): """ Compare the model's answer with the expected answer. Returns True if all key information matches, False otherwise. 
""" if not model_answer or not expected_answer: return False # Check each expected key mismatches = [] for key, expected_value in expected_answer.items(): model_value = model_answer.get(key, "") # Special handling for different types of values if key == "WS12Info": # Check if product name and price match (format: name:price) if ":" in expected_value and ":" in model_value: expected_name, expected_price = expected_value.rsplit(":", 1) model_name, model_price = model_value.rsplit(":", 1) # Normalize price format expected_price_clean = expected_price.replace("$", "").replace(",", "") model_price_clean = model_price.replace("$", "").replace(",", "") if ( expected_name != model_name or expected_price_clean != model_price_clean ): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "GraceOrderID": # Order ID should start with "000" and match exactly if not model_value.startswith("000"): mismatches.append( f"{key}: expected to start with '000', got '{model_value}'" ) elif model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "HighestOrderInfo": # Check format customer:amount if ":" in expected_value and ":" in model_value: expected_customer, expected_amount = expected_value.rsplit(":", 1) model_customer, model_amount = model_value.rsplit(":", 1) # Normalize amount format expected_amount_clean = expected_amount.replace("$", "").replace( ",", "" ) model_amount_clean = model_amount.replace("$", "").replace(",", "") if ( expected_customer != model_customer or expected_amount_clean != model_amount_clean ): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "Position2Product": # Check if product name and quantity match if ":" in expected_value and ":" in model_value: expected_name, expected_qty = expected_value.rsplit(":", 1) model_name, model_qty = model_value.rsplit(":", 1) if expected_name != model_name or expected_qty != model_qty: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "OvernightDufflePrice": # Normalize price format expected_clean = expected_value.replace("$", "").replace(",", "") model_clean = model_value.replace("$", "").replace(",", "") if expected_clean != model_clean: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "HollisterPosition": # Position format (1st, 2nd, 3rd, etc.) 
if model_value.lower() != expected_value.lower(): mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "SarahMillerInfo": # Format: group:date if ":" in expected_value and ":" in model_value: expected_group, expected_date = expected_value.split(":", 1) model_group, model_date = model_value.split(":", 1) # Allow some flexibility in date format if expected_group != model_group: mismatches.append( f"{key}: expected group '{expected_group}', got '{model_group}'" ) # For date, check if key parts match if not (expected_date in model_date or model_date in expected_date): mismatches.append( f"{key}: expected date '{expected_date}', got '{model_date}'" ) else: if expected_value != model_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) elif key == "Invoice002BillTo": # Name should match exactly if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) else: # Exact match for count fields and other numeric values if model_value != expected_value: mismatches.append( f"{key}: expected '{expected_value}', got '{model_value}'" ) if mismatches: print("\n=== Answer Comparison Mismatches ===", file=sys.stderr) for mismatch in mismatches: print(f"✗ {mismatch}", file=sys.stderr) return False print("\n=== Answer Comparison ===", file=sys.stderr) print("✓ All key information matches the expected answer", file=sys.stderr) return True async def verify() -> bool: """ Verifies that the sales and inventory analysis task has been completed correctly. First checks the model's answer against the expected label, then optionally verifies the actual state in the Magento Admin. """ print("\n" + "="*60, file=sys.stderr) print("Starting verification of Task 5", file=sys.stderr) print("="*60, file=sys.stderr) # Get the label file path label_path = Path(__file__).parent / "label.txt" # Load expected answer print("\n--- Loading Expected Answer ---", file=sys.stderr) expected_answer = load_expected_answer(label_path) if not expected_answer: print("FATAL ERROR: Could not load expected answer from label.txt", file=sys.stderr) return False print(f"Successfully loaded {len(expected_answer)} expected values", file=sys.stderr) # Get model's response from MCP_MESSAGES print("\n--- Loading Model Response ---", file=sys.stderr) model_response = get_model_response() if not model_response: print("FATAL ERROR: No model response found in MCP_MESSAGES", file=sys.stderr) return False print(f"Found model response ({len(model_response)} characters)", file=sys.stderr) print("\n--- Parsing Answer Format ---", file=sys.stderr) model_answer = parse_answer_format(model_response) if not model_answer: print("\nFATAL ERROR: Could not parse answer format from model response", file=sys.stderr) print("Verification FAILED", file=sys.stderr) return False print("\n=== Model Answer Successfully Parsed ===", file=sys.stderr) for key, value in model_answer.items(): print(f" {key}: {value}", file=sys.stderr) # Compare answers print("\n--- Comparing Answers ---", file=sys.stderr) answer_match = compare_answers(model_answer, expected_answer) if not answer_match: print("\n" + "="*60, file=sys.stderr) print("VERIFICATION FAILED: Model answer does not match expected answer", file=sys.stderr) print("="*60, file=sys.stderr) return False print("\n" + "="*60, file=sys.stderr) print("✓ VERIFICATION PASSED: Model answer matches expected answer", file=sys.stderr) print("="*60, file=sys.stderr) return True def main(): """ Executes the 
verification process and exits with a status code. """ result = asyncio.run(verify()) sys.exit(0 if result else 1) if __name__ == "__main__": main() ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/description.md ================================================ Perform comprehensive search and filtering operations in the Magento Admin panel to extract specific business insights using advanced search techniques. **Task Requirements:** 1. Login with username 'admin' and password 'admin1234' 2. To analyze search behavior and term effectiveness, check the Search Terms of Marketing and perform complex filtering: - Search for all terms containing 'tank' in their name - count the exact number of results - Clear filters and find terms with exactly 0 results - count how many such terms exist - Apply a filter to show only terms with more than 10 uses - record the term with highest uses and its count (You need to see how many there are and record them all.) - Find the search term that has results between 20-30 - record its name and exact result count 3. To gather detailed marketing insights from search data, go to Search Terms in Reports: - Apply filter for terms with more than 15 hits - count total filtered results - Find the term with ID between 10-15 that has the most results - record term name and result count (You need to see how many there are and record them all.) - Filter to show only terms from "Default Store View" - count total results 4. To examine real-time search trends and top performers, from the Dashboard, perform targeted searches: - In the 'Top Search Terms' table, find the term with exactly 1 result - record its name and uses - In the 'Last Search Terms' table, identify the term with the both the highest number of results and uses - record name and the number of results - In the 'Bestsellers' tab, find the product at position #3 - record name and quantity 5. To identify patterns in search usage and results, navigate to Search Terms (main grid) in step 2: - Sort by 'Uses' column (descending) - record the top term and its uses count - Sort by 'Results' column (ascending) - record the first non-zero result term and its count - Count total number of unique search terms in the system 6. To provide a comprehensive report of all gathered data, compile all findings and output in the following exact format: ``` <answer> TankSearchCount|count ZeroResultsCount|count HighestUseTerm|term:uses Results20to30Term|term1:results1|term2:result2|term3:result3|... Hits15PlusCount|count ID10to15MaxResults|term:results DefaultStoreViewCount|count OneResultTerm|term1:uses1|term2:uses2|term3:uses3|... HighestResultLastSearch|term:results Position3Bestseller|product:quantity TopUseTerm|term:uses FirstNonZeroResult|term:results TotalUniqueTerms|count </answer> ``` **Example Output:** ``` <answer> TankSearchCount|X ZeroResultsCount|X HighestUseTerm|search_term:XX Results20to30Term|search_term1:XX1|search_term2:XX2|search_term3:XX3|... Hits15PlusCount|X ID10to15MaxResults|Product Name:XX DefaultStoreViewCount|X OneResultTerm|search_term1:XX1|search_term2:XX2|search_term3:XX3|... 
HighestResultLastSearch|search_term:XX Position3Bestseller|Product Name:X TopUseTerm|search_term:XX FirstNonZeroResult|search_term:X TotalUniqueTerms|X </answer> ``` **Success Criteria:** - Successfully logged into Magento Admin - Applied complex search filters in Search Terms section - Used range filters for results and hits - Sorted columns to find specific records - Navigated between different report views - Extracted data from filtered and sorted results - Counted records accurately after applying filters - Output answer in exact format with 13 data lines - Answer wrapped in <answer> tags ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/label.txt ================================================ TankSearchCount|2 ZeroResultsCount|1 HighestUseTerm|hollister:19 Results20to30Term|Antonia Racer Tank:23|tanks:23 Hits15PlusCount|1 ID10to15MaxResults|Antonia Racer Tank:23 DefaultStoreViewCount|7 OneResultTerm|hollister:19|WP10:1 HighestResultLastSearch|Antonia Racer Tank:23 Position3Bestseller|Sprite Stasis Ball 65 cm:6 TopUseTerm|hollister:19 FirstNonZeroResult|WP10:1 TotalUniqueTerms|7 ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/meta.json ================================================ { "task_id": "search_filtering_operations", "task_name": "Search Filtering Operations", "category_id": "shopping_admin", "category_name": "Shopping Admin", "description": "Configure advanced search and filtering systems in admin interface, implement category hierarchies, set up attribute filters, and optimize search algorithms for user experience.", "author": "Fanqing Meng", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "content submission" ], "mcp": [ "playwright" ], "meta_data": { "stateType": "video", "stateContent": null, "stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4", "stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker" } } ================================================ FILE: tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/verify.py ================================================ import re import json import os import sys def verify(messages): """ Verify that the agent has successfully performed complex search and filtering operations in the Magento Admin panel and extracted all required information correctly. 
Args: messages: List of message dictionaries containing the conversation Returns: Dictionary with 'valid' boolean and 'reason' string """ # Find the last assistant message with status "completed" and type "message" answer_content = None for message in reversed(messages): if ( message.get("role") == "assistant" and message.get("status") == "completed" and message.get("type") == "message" and message.get("content") ): # Extract text from content structure content = message["content"] if isinstance(content, list): for item in content: if isinstance(item, dict) and item.get("type") == "output_text": text = item.get("text", "") # Look for answer tags with case-insensitive search answer_match = re.search( r"<answer>(.*?)</answer>", text, re.DOTALL | re.IGNORECASE ) if answer_match: answer_content = answer_match.group(1).strip() break elif isinstance(content, str): # Look for answer tags in string content answer_match = re.search(r"<answer>(.*?)</answer>", content, re.DOTALL | re.IGNORECASE) if answer_match: answer_content = answer_match.group(1).strip() break if answer_content: break if not answer_content: return {"valid": False, "reason": "No answer found in <answer> tags"} # Expected format - each line should have a key|value pair expected_keys = [ "TankSearchCount", "ZeroResultsCount", "HighestUseTerm", "Results20to30Term", "Hits15PlusCount", "ID10to15MaxResults", "DefaultStoreViewCount", "OneResultTerm", "HighestResultLastSearch", "Position3Bestseller", "TopUseTerm", "FirstNonZeroResult", "TotalUniqueTerms", ] # Parse the answer lines = answer_content.strip().split("\n") # Check if we have exactly 13 lines if len(lines) != 13: return {"valid": False, "reason": f"Expected 13 data lines, found {len(lines)}"} # Parse each line and validate format extracted_data = {} for line in lines: if "|" not in line: return { "valid": False, "reason": f"Invalid format in line: {line}. Expected 'key|value' format", } parts = line.split("|", 1) if len(parts) != 2: return {"valid": False, "reason": f"Invalid format in line: {line}"} key, value = parts extracted_data[key] = value # Check all required keys are present missing_keys = set(expected_keys) - set(extracted_data.keys()) if missing_keys: return { "valid": False, "reason": f"Missing required keys: {', '.join(missing_keys)}", } # Validate specific data formats and expected values based on the current data # 1. TankSearchCount should be a number (2 terms containing 'tank') if not extracted_data["TankSearchCount"].isdigit(): return { "valid": False, "reason": f"TankSearchCount should be a number, got: {extracted_data['TankSearchCount']}", } # Expected: "Antonia Racer Tank" and "tanks" contain 'tank' if extracted_data["TankSearchCount"] != "2": return { "valid": False, "reason": f"TankSearchCount should be '2', got: {extracted_data['TankSearchCount']}", } # 2. ZeroResultsCount should be a number (nike has 0 results) if not extracted_data["ZeroResultsCount"].isdigit(): return { "valid": False, "reason": f"ZeroResultsCount should be a number, got: {extracted_data['ZeroResultsCount']}", } if extracted_data["ZeroResultsCount"] != "1": return { "valid": False, "reason": f"ZeroResultsCount should be '1', got: {extracted_data['ZeroResultsCount']}", } # 3. 
HighestUseTerm should be in format "term:uses" if ":" not in extracted_data["HighestUseTerm"]: return { "valid": False, "reason": f"HighestUseTerm should be in format 'term:uses', got: {extracted_data['HighestUseTerm']}", } # hollister has 19 uses (highest among terms with > 10 uses) if extracted_data["HighestUseTerm"] != "hollister:19": return { "valid": False, "reason": f"HighestUseTerm should be 'hollister:19', got: {extracted_data['HighestUseTerm']}", } # 4. Results20to30Term should be in format "term:results" if ":" not in extracted_data["Results20to30Term"]: return { "valid": False, "reason": f"Results20to30Term should be in format 'term:results', got: {extracted_data['Results20to30Term']}", } # Both "tanks" and "Antonia Racer Tank" have 23 results (between 20-30) valid_results20to30 = ["tanks:23", "Antonia Racer Tank:23"] # Check if answer contains one of the valid values or both separated by | if not any( val in extracted_data["Results20to30Term"] for val in valid_results20to30 ): return { "valid": False, "reason": f"Results20to30Term should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['Results20to30Term']}", } # 5. Hits15PlusCount should be a number (only hollister has 19 hits > 15) if not extracted_data["Hits15PlusCount"].isdigit(): return { "valid": False, "reason": f"Hits15PlusCount should be a number, got: {extracted_data['Hits15PlusCount']}", } if extracted_data["Hits15PlusCount"] != "1": return { "valid": False, "reason": f"Hits15PlusCount should be '1', got: {extracted_data['Hits15PlusCount']}", } # 6. ID10to15MaxResults should be in format "term:results" if ":" not in extracted_data["ID10to15MaxResults"]: return { "valid": False, "reason": f"ID10to15MaxResults should be in format 'term:results', got: {extracted_data['ID10to15MaxResults']}", } # ID 11 is hollister (1 result), ID 13 is Antonia Racer Tank (23 results) if extracted_data["ID10to15MaxResults"] != "Antonia Racer Tank:23": return { "valid": False, "reason": f"ID10to15MaxResults should be 'Antonia Racer Tank:23', got: {extracted_data['ID10to15MaxResults']}", } # 7. DefaultStoreViewCount should be a number (all 7 terms are from Default Store View) if not extracted_data["DefaultStoreViewCount"].isdigit(): return { "valid": False, "reason": f"DefaultStoreViewCount should be a number, got: {extracted_data['DefaultStoreViewCount']}", } if extracted_data["DefaultStoreViewCount"] != "7": return { "valid": False, "reason": f"DefaultStoreViewCount should be '7', got: {extracted_data['DefaultStoreViewCount']}", } # 8. OneResultTerm should be in format "term:uses" if ":" not in extracted_data["OneResultTerm"]: return { "valid": False, "reason": f"OneResultTerm should be in format 'term:uses', got: {extracted_data['OneResultTerm']}", } # Both hollister and WP10 have exactly 1 result valid_one_result = ["hollister:19", "WP10:1"] if not any(val in extracted_data["OneResultTerm"] for val in valid_one_result): return { "valid": False, "reason": f"OneResultTerm should contain 'hollister:19' or 'WP10:1', got: {extracted_data['OneResultTerm']}", } # 9. 
HighestResultLastSearch should be in format "term:results" if ":" not in extracted_data["HighestResultLastSearch"]: return { "valid": False, "reason": f"HighestResultLastSearch should be in format 'term:results', got: {extracted_data['HighestResultLastSearch']}", } # In Last Search Terms: tanks and Antonia Racer Tank both have 23 results (highest) valid_highest_last = ["tanks:23", "Antonia Racer Tank:23"] if not any( val in extracted_data["HighestResultLastSearch"] for val in valid_highest_last ): return { "valid": False, "reason": f"HighestResultLastSearch should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['HighestResultLastSearch']}", } # 10. Position3Bestseller should be in format "product:quantity" if ":" not in extracted_data["Position3Bestseller"]: return { "valid": False, "reason": f"Position3Bestseller should be in format 'product:quantity', got: {extracted_data['Position3Bestseller']}", } # Position 3 in Bestsellers is "Sprite Stasis Ball 65 cm" with quantity 6 if extracted_data["Position3Bestseller"] != "Sprite Stasis Ball 65 cm:6": return { "valid": False, "reason": f"Position3Bestseller should be 'Sprite Stasis Ball 65 cm:6', got: {extracted_data['Position3Bestseller']}", } # 11. TopUseTerm should be in format "term:uses" if ":" not in extracted_data["TopUseTerm"]: return { "valid": False, "reason": f"TopUseTerm should be in format 'term:uses', got: {extracted_data['TopUseTerm']}", } # hollister has 19 uses (highest) if extracted_data["TopUseTerm"] != "hollister:19": return { "valid": False, "reason": f"TopUseTerm should be 'hollister:19', got: {extracted_data['TopUseTerm']}", } # 12. FirstNonZeroResult should be in format "term:results" if ":" not in extracted_data["FirstNonZeroResult"]: return { "valid": False, "reason": f"FirstNonZeroResult should be in format 'term:results', got: {extracted_data['FirstNonZeroResult']}", } # When sorted by results ascending, first non-zero is WP10 (has 1 result) if extracted_data["FirstNonZeroResult"] != "WP10:1": return { "valid": False, "reason": f"FirstNonZeroResult should be 'WP10:1', got: {extracted_data['FirstNonZeroResult']}", } # 13. 
TotalUniqueTerms should be a number if not extracted_data["TotalUniqueTerms"].isdigit(): return { "valid": False, "reason": f"TotalUniqueTerms should be a number, got: {extracted_data['TotalUniqueTerms']}", } # There are 7 unique search terms in the system if extracted_data["TotalUniqueTerms"] != "7": return { "valid": False, "reason": f"TotalUniqueTerms should be '7', got: {extracted_data['TotalUniqueTerms']}", } # All validations passed return { "valid": True, "reason": "All complex search and filtering operations completed successfully", } if __name__ == "__main__": # Load messages from environment variable messages_path = os.getenv("MCP_MESSAGES") if not messages_path: print( json.dumps( {"valid": False, "reason": "MCP_MESSAGES environment variable not set"} ) ) exit(1) try: with open(messages_path, "r") as f: messages = json.load(f) except Exception as e: print( json.dumps({"valid": False, "reason": f"Failed to load messages: {str(e)}"}) ) exit(1) # Run verification result = verify(messages) print(json.dumps(result)) # Exit with appropriate code based on verification result sys.exit(0 if result["valid"] else 1) ================================================ FILE: tasks/postgres/easy/.gitkeep ================================================ ================================================ FILE: tasks/postgres/easy/chinook/customer_data_migration_basic/description.md ================================================ Migrate customer data from an acquired company to PostgreSQL using efficient bulk operations. ## Your Mission: Chinook Music Store has recently acquired "MelodyMart," a competing music retailer. Their customer database needs to be migrated into Chinook's PostgreSQL database. ## Migration Requirements: 1. **Process all customer records from the data table below** and migrate them into the `Customer` table 2. 
**Apply business logic during migration**: - Assign `CustomerID` values starting from the next available ID - Assign all customers to support representative with EmployeeId 3 - Set `Fax` field to NULL for all migrated customers ## Customer Data to Migrate: | FirstName | LastName | Company | Address | City | State | Country | PostalCode | Phone | Email | |-----------|----------|---------|---------|------|-------|---------|------------|-------|--------| | Danielle | Johnson | Sanchez-Taylor | 819 Johnson Course | East William | AK | USA | 74064 | 386-3794 | danielle.johnson@sancheztaylor.com | | Katherine | Moore | Peterson-Moore | 16155 Roman Stream Suite 816 | New Kellystad | OK | USA | 25704 | 103-4131 | katherine_moore@petersonmoore.org | | Joshua | Reid | Martin-Kelly | 192 Frank Light Suite 835 | East Lydiamouth | MO | USA | 35594 | 139-5376 | joshua_reid@martinkelly.org | ================================================ FILE: tasks/postgres/easy/chinook/customer_data_migration_basic/meta.json ================================================ { "task_id": "customer_data_migration_basic", "task_name": "Customer Data Migration Basic", "category_id": "chinook", "category_name": "Chinook", "description": "Load the MelodyMart customer rows into the Customer table with new ids, SupportRepId = 3, and Fax values set to NULL.", "author": "Lingxiao Du", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "data migration", "transactional operations" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: 
btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef \"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql" } } ================================================ FILE: tasks/postgres/easy/chinook/customer_data_migration_basic/verify.py ================================================ """ Verification script for PostgreSQL Task 2: Customer Data Migration """ import os import sys import psycopg2 import pickle def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def load_expected_customers(): """Load the expected customer data from pickle file.""" import os script_dir = os.path.dirname(os.path.abspath(__file__)) pkl_path = os.path.join(script_dir, 'customer_data.pkl') try: with open(pkl_path, 'rb') as f: return pickle.load(f) except FileNotFoundError: print(f"❌ customer_data.pkl not found at {pkl_path}. 
Please generate customer data first.") return None except Exception as e: print(f"❌ Error loading customer data: {e}") return None def verify_migrated_customers(conn, expected_customers) -> bool: """Verify migrated customers by comparing with expected data as sets.""" with conn.cursor() as cur: # Get all customers with ID > 59 (the migrated ones) cur.execute(''' SELECT "FirstName", "LastName", "Company", "Address", "City", "State", "Country", "PostalCode", "Phone", "Email", "SupportRepId", "Fax" FROM "Customer" WHERE "CustomerId" > 59 ''') actual_customers = cur.fetchall() if len(actual_customers) != len(expected_customers): print(f"❌ Expected {len(expected_customers)} migrated customers, found {len(actual_customers)}") return False # Convert expected customers to tuples for set comparison expected_tuples = set() for expected in expected_customers: expected_tuple = ( expected['FirstName'], expected['LastName'], expected['Company'], expected['Address'], expected['City'], expected['State'], expected['Country'], expected['PostalCode'], expected['Phone'], expected['Email'], 3, None # SupportRepId=3, Fax=None ) expected_tuples.add(expected_tuple) # Convert actual customers to set with proper type conversion actual_tuples = set() for row in actual_customers: # Convert all fields to strings for consistent comparison actual_tuple = ( str(row[0]) if row[0] is not None else '', # FirstName str(row[1]) if row[1] is not None else '', # LastName str(row[2]) if row[2] is not None else '', # Company str(row[3]) if row[3] is not None else '', # Address str(row[4]) if row[4] is not None else '', # City str(row[5]) if row[5] is not None else '', # State str(row[6]) if row[6] is not None else '', # Country str(row[7]) if row[7] is not None else '', # PostalCode str(row[8]) if row[8] is not None else '', # Phone str(row[9]) if row[9] is not None else '', # Email int(row[10]) if row[10] is not None else None, # SupportRepId row[11] # Fax (should be None) ) actual_tuples.add(actual_tuple) # Check if sets are equal if actual_tuples != expected_tuples: missing_in_actual = expected_tuples - actual_tuples extra_in_actual = actual_tuples - expected_tuples print(f"❌ Customer data sets don't match!") if missing_in_actual: print(f" Missing {len(missing_in_actual)} expected customers") for missing in list(missing_in_actual)[:3]: # Show first 3 print(f" Missing: {missing[0]} {missing[1]} - {missing[2]}") if len(missing_in_actual) > 3: print(f" ... and {len(missing_in_actual) - 3} more") if extra_in_actual: print(f" Found {len(extra_in_actual)} unexpected customers") for extra in list(extra_in_actual)[:3]: # Show first 3 print(f" Extra: {extra[0]} {extra[1]} - {extra[2]}") if len(extra_in_actual) > 3: print(f" ... 
and {len(extra_in_actual) - 3} more") return False print(f"✅ All {len(expected_customers)} customers migrated correctly") print(f"✅ All customers assigned to SupportRepId 3") print(f"✅ All customers have Fax field set to NULL") print(f"✅ Customer data sets match exactly (order-independent)") return True def main(): """Main verification function.""" print("=" * 60) print("Verifying Customer Data Migration Task") print("=" * 60) # Load expected customer data expected_customers = load_expected_customers() if not expected_customers: sys.exit(1) print(f"Loaded {len(expected_customers)} expected customer records") # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify migration success = verify_migrated_customers(conn, expected_customers) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/easy/chinook/update_employee_info/description.md ================================================ Update employee information and reorganize the reporting structure in the Chinook database to reflect organizational changes. ## Your Tasks: ### **UPDATE: Modify Existing Employee Information** - Change Andrew Adams (EmployeeId = 1) title from 'General Manager' to 'CEO' - Update Nancy Edwards (EmployeeId = 2) phone number to '+1 (403) 555-9999' - Change all employees with Title = 'IT Staff' to have Title = 'IT Specialist' ## Requirements: - Use UPDATE statements to modify the existing records - The title update for 'IT Staff' should affect all matching employees ## Expected Results: After completing the updates: - Andrew Adams should have Title = 'CEO' - Nancy Edwards should have Phone = '+1 (403) 555-9999' - All employees previously with Title = 'IT Staff' should now have Title = 'IT Specialist' This task practices UPDATE operations for both employee information and organizational hierarchy management. 
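
For reference, below is a minimal sketch of one way to apply these updates with psycopg2, the same client library the repository's verify scripts use. It assumes the same `POSTGRES_*` environment variables the verifier reads; any equivalent SQL that produces the expected results is acceptable.

```python
"""Sketch only: apply the three UPDATE statements described above."""
import os

import psycopg2

conn = psycopg2.connect(
    host=os.getenv("POSTGRES_HOST", "localhost"),
    port=int(os.getenv("POSTGRES_PORT", 5432)),
    dbname=os.getenv("POSTGRES_DATABASE"),
    user=os.getenv("POSTGRES_USERNAME"),
    password=os.getenv("POSTGRES_PASSWORD"),
)

with conn, conn.cursor() as cur:
    # Andrew Adams (EmployeeId = 1): 'General Manager' -> 'CEO'
    cur.execute('UPDATE "Employee" SET "Title" = %s WHERE "EmployeeId" = %s', ("CEO", 1))
    # Nancy Edwards (EmployeeId = 2): new phone number
    cur.execute('UPDATE "Employee" SET "Phone" = %s WHERE "EmployeeId" = %s',
                ("+1 (403) 555-9999", 2))
    # Every employee currently titled 'IT Staff' becomes 'IT Specialist'
    cur.execute('UPDATE "Employee" SET "Title" = %s WHERE "Title" = %s',
                ("IT Specialist", "IT Staff"))

conn.close()
```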
================================================ FILE: tasks/postgres/easy/chinook/update_employee_info/meta.json ================================================ { "task_id": "update_employee_info", "task_name": "Update Employee Info", "category_id": "chinook", "category_name": "Chinook", "description": "Update Chinook employee records so Andrew Adams becomes CEO, Nancy Edwards receives the new phone number, and every \"IT Staff\" title becomes \"IT Specialist.\"", "author": "Lingxiao Du", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "data updates", "organizational change" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n 
GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef \"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql" } } ================================================ FILE: tasks/postgres/easy/chinook/update_employee_info/verify.py ================================================ """ Verification script for PostgreSQL Task 3: Employee Hierarchy Management """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.01 tolerance For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, Decimal): if abs(float(actual) - float(expected)) > 0.01: return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_employee_count_and_titles(conn) -> bool: """Verify the final employee count and title changes.""" with conn.cursor() as cur: # Check the final verification query results cur.execute(""" SELECT COUNT(*) as total_employees, COUNT(CASE WHEN "Title" = 'CEO' THEN 1 END) as ceo_count, COUNT(CASE WHEN "Title" = 'IT Specialist' THEN 1 END) as it_specialist_count FROM "Employee" """) result = cur.fetchone() total_employees, ceo_count, it_specialist_count = result if total_employees != 8: print(f"❌ Expected 8 total employees, got {total_employees}") return False if ceo_count != 1: print(f"❌ Expected 1 CEO, got {ceo_count}") return False if it_specialist_count != 2: print(f"❌ Expected 2 IT Specialists, got {it_specialist_count}") return False print("✅ Employee count and title verification passed") return True def verify_specific_employees(conn) -> bool: """Verify specific employee records and modifications.""" with conn.cursor() as cur: # Check all employee fields in one query cur.execute(""" SELECT "EmployeeId", "LastName", "FirstName", "Title", "ReportsTo", "BirthDate", "HireDate", "Address", "City", "State", "Country", "PostalCode", "Phone", "Fax", "Email" FROM "Employee" WHERE "EmployeeId" IN (1, 2) ORDER BY "EmployeeId" """) employees = 
cur.fetchall() from datetime import datetime expected = [ # Andrew Adams (ID 1) - Title changes to 'CEO', phone stays original, ReportsTo stays None (1, 'Adams', 'Andrew', 'CEO', None, datetime(1962, 2, 18), datetime(2002, 8, 14), '11120 Jasper Ave NW', 'Edmonton', 'AB', 'Canada', 'T5K 2N1', '+1 (780) 428-9482', '+1 (780) 428-3457', 'andrew@chinookcorp.com'), # Nancy Edwards (ID 2) - Phone changes, title stays 'Sales Manager', ReportsTo stays 1 (2, 'Edwards', 'Nancy', 'Sales Manager', 1, datetime(1958, 12, 8), datetime(2002, 5, 1), '825 8 Ave SW', 'Calgary', 'AB', 'Canada', 'T2P 2T3', '+1 (403) 555-9999', '+1 (403) 262-3322', 'nancy@chinookcorp.com'), ] if len(employees) != 2: print(f"❌ Expected 2 key employees, found {len(employees)}") return False # Full field comparison for all employees using rows_match for actual, expected_emp in zip(employees, expected): if not rows_match(actual, expected_emp): print(f"❌ Employee {actual[0]} row mismatch: expected {expected_emp}, got {actual}") return False print("✅ Specific employee verification passed - all fields match exactly") return True def main(): """Main verification function.""" print("=" * 50) print("Verifying Task 3: Employee Hierarchy Management") print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Run verification checks with short-circuit evaluation success = ( verify_employee_count_and_titles(conn) and verify_specific_employees(conn) ) conn.close() if success: print("\n🎉 Task verification: PASS") print("All employee hierarchy management operations completed correctly!") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/easy/dvdrental/create_payment_index/description.md ================================================ Create an index to optimize customer payment queries in the DVD rental database. ## Your Task: Create an index on the `customer_id` column of the `payment` table to improve query performance. ## Requirements: - Create an index on the `payment` table's `customer_id` column - The index name can be anything you choose (e.g., `idx_payment_customer_id`) - Use the standard CREATE INDEX syntax ## Why This Helps: The `customer_id` column is frequently used in: - JOIN operations between customer and payment tables - WHERE clauses filtering by customer - Subqueries that look up payments for specific customers Adding an index will significantly speed up these operations. 
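
For reference, a minimal psycopg2 sketch of creating such an index follows; it assumes the same `POSTGRES_*` environment variables the verify scripts use, and `idx_payment_customer_id` is only the suggested example name, since any index covering `payment.customer_id` satisfies the task.

```python
"""Sketch only: create the suggested index on payment(customer_id)."""
import os

import psycopg2

conn = psycopg2.connect(
    host=os.getenv("POSTGRES_HOST", "localhost"),
    port=int(os.getenv("POSTGRES_PORT", 5432)),
    dbname=os.getenv("POSTGRES_DATABASE"),
    user=os.getenv("POSTGRES_USERNAME"),
    password=os.getenv("POSTGRES_PASSWORD"),
)

with conn, conn.cursor() as cur:
    # The index name is free to choose; this uses the example from the description.
    cur.execute(
        "CREATE INDEX IF NOT EXISTS idx_payment_customer_id ON payment (customer_id)"
    )

conn.close()
```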
================================================ FILE: tasks/postgres/easy/dvdrental/create_payment_index/meta.json ================================================ { "task_id": "create_payment_index", "task_name": "Create Payment Index", "category_id": "dvdrental", "category_name": "DVD Rental", "description": "Add an index on payment.customer_id to speed up the customer payment lookups in the DVD Rental database.", "author": "Lingxiao Du", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "performance optimization", "indexing" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"mpaa_rating\" {\n \"G\"\n \"PG\"\n \"PG-13\"\n \"R\"\n \"NC-17\"\n}\n\nTable \"customer\" {\n \"customer_id\" int4 [pk, not null, increment]\n \"store_id\" int2 [not null]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"email\" varchar(50)\n \"address_id\" int2 [not null]\n \"activebool\" bool [not null, default: true]\n \"create_date\" date [not null, default: `('now'::text)::date`]\n \"last_update\" timestamp [default: `now()`]\n \"active\" int4\n\n Indexes {\n address_id [type: btree, name: \"idx_fk_address_id\"]\n store_id [type: btree, name: \"idx_fk_store_id\"]\n last_name [type: btree, name: \"idx_last_name\"]\n }\n}\n\nTable \"actor\" {\n \"actor_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n last_name [type: btree, name: \"idx_actor_last_name\"]\n }\n}\n\nTable \"category\" {\n \"category_id\" int4 [pk, not null, increment]\n \"name\" varchar(25) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"film\" {\n \"film_id\" int4 [pk, not null, increment]\n \"title\" varchar(255) [not null]\n \"description\" text\n \"release_year\" int4\n \"language_id\" int2 [not null]\n \"rental_duration\" int2 [not null, default: 3]\n \"rental_rate\" numeric(4,2) [not null, default: 4.99]\n \"length\" int2\n \"replacement_cost\" numeric(5,2) [not null, default: 19.99]\n \"rating\" mpaa_rating [default: 'G']\n \"last_update\" timestamp [not null, default: `now()`]\n \"special_features\" \"text[]\"\n \"fulltext\" tsvector [not null]\n\n Indexes {\n fulltext [type: gist, name: \"film_fulltext_idx\"]\n language_id [type: btree, name: \"idx_fk_language_id\"]\n title [type: btree, name: \"idx_title\"]\n }\n}\n\nTable \"film_actor\" {\n \"actor_id\" int2 [not null]\n \"film_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (actor_id, film_id) [type: btree, name: \"film_actor_pkey\"]\n film_id [type: btree, name: \"idx_fk_film_id\"]\n }\n}\n\nTable \"film_category\" {\n \"film_id\" int2 [not null]\n \"category_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (film_id, category_id) [type: btree, name: \"film_category_pkey\"]\n }\n}\n\nTable \"address\" {\n \"address_id\" int4 [pk, not null, increment]\n \"address\" varchar(50) [not null]\n \"address2\" varchar(50)\n \"district\" varchar(20) [not null]\n \"city_id\" int2 [not null]\n \"postal_code\" varchar(10)\n \"phone\" varchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n city_id [type: btree, name: \"idx_fk_city_id\"]\n }\n}\n\nTable \"city\" {\n \"city_id\" int4 [pk, not null, increment]\n \"city\" varchar(50) [not null]\n \"country_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: 
`now()`]\n\n Indexes {\n country_id [type: btree, name: \"idx_fk_country_id\"]\n }\n}\n\nTable \"country\" {\n \"country_id\" int4 [pk, not null, increment]\n \"country\" varchar(50) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"inventory\" {\n \"inventory_id\" int4 [pk, not null, increment]\n \"film_id\" int2 [not null]\n \"store_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (store_id, film_id) [type: btree, name: \"idx_store_id_film_id\"]\n }\n}\n\nTable \"language\" {\n \"language_id\" int4 [pk, not null, increment]\n \"name\" bpchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"payment\" {\n \"payment_id\" int4 [pk, not null, increment]\n \"customer_id\" int2 [not null]\n \"staff_id\" int2 [not null]\n \"rental_id\" int4 [not null]\n \"amount\" numeric(5,2) [not null]\n \"payment_date\" timestamp [not null]\n\n Indexes {\n rental_id [type: btree, name: \"idx_fk_rental_id\"]\n staff_id [type: btree, name: \"idx_fk_staff_id\"]\n }\n}\n\nTable \"rental\" {\n \"rental_id\" int4 [pk, not null, increment]\n \"rental_date\" timestamp [not null]\n \"inventory_id\" int4 [not null]\n \"customer_id\" int2 [not null]\n \"return_date\" timestamp\n \"staff_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (rental_date, inventory_id, customer_id) [type: btree, name: \"idx_unq_rental_rental_date_inventory_id_customer_id\"]\n inventory_id [type: btree, name: \"idx_fk_inventory_id\"]\n }\n}\n\nTable \"staff\" {\n \"staff_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"address_id\" int2 [not null]\n \"email\" varchar(50)\n \"store_id\" int2 [not null]\n \"active\" bool [not null, default: true]\n \"username\" varchar(16) [not null]\n \"password\" varchar(40)\n \"last_update\" timestamp [not null, default: `now()`]\n \"picture\" bytea\n}\n\nTable \"store\" {\n \"store_id\" int4 [pk, not null, increment]\n \"manager_staff_id\" int2 [unique, not null]\n \"address_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nRef \"fk_address_city\":\"city\".\"city_id\" < \"address\".\"city_id\"\n\nRef \"fk_city\":\"country\".\"country_id\" < \"city\".\"country_id\"\n\nRef \"customer_address_id_fkey\":\"address\".\"address_id\" < \"customer\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"film_language_id_fkey\":\"language\".\"language_id\" < \"film\".\"language_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_actor_id_fkey\":\"actor\".\"actor_id\" < \"film_actor\".\"actor_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_film_id_fkey\":\"film\".\"film_id\" < \"film_actor\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_category_id_fkey\":\"category\".\"category_id\" < \"film_category\".\"category_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_film_id_fkey\":\"film\".\"film_id\" < \"film_category\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"inventory_film_id_fkey\":\"film\".\"film_id\" < \"inventory\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"payment_customer_id_fkey\":\"customer\".\"customer_id\" < \"payment\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"payment_rental_id_fkey\":\"rental\".\"rental_id\" < \"payment\".\"rental_id\" [update: cascade, delete: set null]\n\nRef \"payment_staff_id_fkey\":\"staff\".\"staff_id\" < 
\"payment\".\"staff_id\" [update: cascade, delete: restrict]\n\nRef \"rental_customer_id_fkey\":\"customer\".\"customer_id\" < \"rental\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"rental_inventory_id_fkey\":\"inventory\".\"inventory_id\" < \"rental\".\"inventory_id\" [update: cascade, delete: restrict]\n\nRef \"rental_staff_id_key\":\"staff\".\"staff_id\" < \"rental\".\"staff_id\"\n\nRef \"staff_address_id_fkey\":\"address\".\"address_id\" < \"staff\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_address_id_fkey\":\"address\".\"address_id\" < \"store\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_manager_staff_id_fkey\":\"staff\".\"staff_id\" < \"store\".\"manager_staff_id\" [update: cascade, delete: restrict]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/gordonkwokkwok/DVD-Rental-PostgreSQL-Project" } } ================================================ FILE: tasks/postgres/easy/dvdrental/create_payment_index/verify.py ================================================ """ Verification script for PostgreSQL Task 1: Customer Payment Query Optimization """ import os import sys import psycopg2 def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def check_payment_customer_id_index(conn) -> bool: """Check if there's any index on payment.customer_id column.""" with conn.cursor() as cur: cur.execute(""" SELECT indexname, indexdef FROM pg_indexes WHERE schemaname = 'public' AND tablename = 'payment' AND indexdef LIKE '%customer_id%' """) indexes = cur.fetchall() print(indexes) return len(indexes) > 0, indexes def main(): """Main verification function.""" print("=" * 60) print("PostgreSQL Task 1 Verification: Customer Payment Query Optimization") print("=" * 60) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) print("\n🔍 Checking for customer_id index on payment table...") # Check if any index exists on payment.customer_id has_index, indexes = check_payment_customer_id_index(conn) if has_index: print("✅ Found index(es) on payment.customer_id:") for index_name, index_def in indexes: print(f" - {index_name}: {index_def}") else: print("❌ No index found on payment.customer_id column") conn.close() if has_index: print(f"\n🎉 Task verification: PASS") print(f" - Index on payment.customer_id exists") sys.exit(0) else: print(f"\n❌ Task verification: FAIL") print(f" - No index found on payment.customer_id") print(f" - Create an index on payment(customer_id) to optimize the queries") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/easy/employees/department_summary_view/description.md ================================================ Create an executive department summary view to provide quick insights into departmental metrics for leadership dashboards. This view will consolidate key department statistics in one easily accessible place. 
## Your Task: **Create the executive department summary view** — build a materialized view called `exec_department_summary` in the `employees` schema with these exact columns: * `department_name` (varchar) — department name * `total_employees` (integer) — current active employee count (employees with active salary where to_date = '9999-01-01') * `avg_salary` (decimal) — average current salary for active employees * `total_payroll` (bigint) — total monthly payroll cost (sum of all current salaries in the department) * `manager_name` (varchar) — current department manager's full name (first_name and last_name concatenated) ## Requirements: 1. Use materialized view to cache results for better performance 2. Join the following tables: - `departments` - for department information - `dept_emp` - for employee-department relationships - `employees` - for employee details - `salaries` - for current salary information - `dept_manager` - for current manager information 3. Only include current active employees (those with to_date = '9999-01-01' in both `dept_emp` and `salaries`) 4. Only include current managers (to_date = '9999-01-01' in `dept_manager`) 5. Order results by department_name ## After Creation: Refresh the materialized view to populate it with current data. This view will provide executives with a real-time snapshot of departmental workforce metrics and costs. ================================================ FILE: tasks/postgres/easy/employees/department_summary_view/meta.json ================================================ { "task_id": "department_summary_view", "task_name": "Department Summary View", "category_id": "employees", "category_name": "Employees", "description": "Build the exec_department_summary materialized view showing department name, active headcount, payroll totals, and the manager name.", "author": "Lingxiao Du", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "reporting and analytics", "materialized views" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n 
\"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz" } } ================================================ FILE: tasks/postgres/easy/employees/department_summary_view/verify.py ================================================ """ Verification script for PostgreSQL Task 6: Reporting and Automation System """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.1 tolerance For date types: convert to string for comparison For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, (Decimal, float, int)): if abs(float(actual) - float(expected)) > 0.1: return False elif hasattr(actual, 'strftime'): # datetime.date or datetime.datetime if str(actual) != str(expected): return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_materialized_views(conn) -> bool: """Verify that materialized views were created and populated correctly.""" with conn.cursor() as cur: # Check all departments' data accuracy cur.execute(""" SELECT department_name, total_employees, avg_salary, total_payroll, manager_name FROM employees.exec_department_summary ORDER BY department_name """) view_data = cur.fetchall() # Get actual data for all departments cur.execute(""" WITH current_salary AS ( SELECT employee_id, amount FROM ( SELECT s.*, ROW_NUMBER() OVER ( PARTITION BY s.employee_id ORDER BY s.from_date DESC, s.amount DESC ) AS rn FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ) x WHERE rn = 1 ), current_dept AS ( SELECT DISTINCT de.employee_id, de.department_id FROM employees.department_employee de WHERE de.to_date = DATE '9999-01-01' ), current_manager AS ( SELECT department_id, CONCAT(e.first_name, ' ', e.last_name) AS manager_name FROM ( SELECT dm.*, ROW_NUMBER() OVER ( PARTITION BY dm.department_id ORDER BY dm.from_date DESC, dm.employee_id ) AS rn FROM employees.department_manager dm WHERE dm.to_date = DATE 
'9999-01-01' ) dm JOIN employees.employee e ON e.id = dm.employee_id WHERE dm.rn = 1 ) SELECT d.dept_name AS department_name, COUNT(cd.employee_id)::INT AS total_employees, AVG(cs.amount)::DECIMAL AS avg_salary, COALESCE(SUM(cs.amount), 0)::BIGINT AS total_payroll, cm.manager_name FROM employees.department d LEFT JOIN current_dept cd ON cd.department_id = d.id LEFT JOIN current_salary cs ON cs.employee_id = cd.employee_id LEFT JOIN current_manager cm ON cm.department_id = d.id GROUP BY d.id, d.dept_name, cm.manager_name ORDER BY d.dept_name; """) actual_data = cur.fetchall() if len(view_data) != len(actual_data): print(f"❌ Department count mismatch: view={len(view_data)}, actual={len(actual_data)}") return False for view_row, actual_row in zip(view_data, actual_data): if not rows_match(view_row, actual_row): print(f"❌ Department summary data incorrect for {view_row[0]}: view={view_row}, actual={actual_row}") return False return True def main(): """Main verification function.""" print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify all components success = verify_materialized_views(conn) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/easy/employees/employee_gender_statistics/description.md ================================================ Create a gender statistics summary table for the HR team's annual workforce composition report. This is a simple analysis to understand the gender distribution in our employee database. ## Your Task: **Create the gender statistics table** — build a table called `gender_statistics` in the `employees` schema with these exact columns: * `gender` (varchar) — gender ('M' or 'F') * `total_employees` (integer) — total number of employees of this gender * `current_employees` (integer) — current employees of this gender (have active salary where to_date = '9999-01-01') * `percentage_of_workforce` (decimal) — percentage of current workforce (current_employees / total current employees * 100) ## Requirements: 1. Calculate total employees by counting all employees of each gender from the `employees` table 2. Calculate current employees by counting employees with active salary records (to_date = '9999-01-01' in the `salaries` table) 3. Calculate the percentage based on current workforce only 4. The table should contain exactly 2 rows (one for 'M' and one for 'F') This analysis will help HR understand the basic gender composition of our workforce for diversity reporting. 
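
For reference, below is a minimal `CREATE TABLE ... AS` sketch of one way to build this table with psycopg2, using the repository's usual `POSTGRES_*` environment variables. The particular query shape is an illustrative choice, not a requirement; any approach producing the two rows with the specified columns works.

```python
"""Sketch only: populate employees.gender_statistics from employee and salary."""
import os

import psycopg2

conn = psycopg2.connect(
    host=os.getenv("POSTGRES_HOST", "localhost"),
    port=int(os.getenv("POSTGRES_PORT", 5432)),
    dbname=os.getenv("POSTGRES_DATABASE"),
    user=os.getenv("POSTGRES_USERNAME"),
    password=os.getenv("POSTGRES_PASSWORD"),
)

with conn, conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE employees.gender_statistics AS
        WITH current_emp AS (
            -- employees with an active salary record
            SELECT DISTINCT employee_id
            FROM employees.salary
            WHERE to_date = DATE '9999-01-01'
        )
        SELECT
            e.gender::varchar               AS gender,
            COUNT(*)::int                   AS total_employees,
            COUNT(ce.employee_id)::int      AS current_employees,
            COUNT(ce.employee_id)::decimal
                / NULLIF(SUM(COUNT(ce.employee_id)) OVER (), 0) * 100
                                            AS percentage_of_workforce
        FROM employees.employee e
        LEFT JOIN current_emp ce ON ce.employee_id = e.id
        GROUP BY e.gender
    """)

conn.close()
```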
================================================ FILE: tasks/postgres/easy/employees/employee_gender_statistics/meta.json ================================================ { "task_id": "employee_gender_statistics", "task_name": "Employee Gender Statistics", "category_id": "employees", "category_name": "Employees", "description": "Aggregate the employees dataset into a gender_statistics table with counts of total/current staff by gender plus workforce percentage.", "author": "Lingxiao Du", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "reporting and analytics", "data aggregation" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz" } } ================================================ FILE: tasks/postgres/easy/employees/employee_gender_statistics/verify.py ================================================ import os import sys import 
psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.1 tolerance For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, Decimal): if abs(float(actual) - float(expected)) > 0.1: return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_gender_statistics_results(conn) -> bool: """Verify the gender statistics results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT gender, total_employees, current_employees, percentage_of_workforce FROM employees.gender_statistics ORDER BY gender """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH current_emp AS ( SELECT DISTINCT s.employee_id FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ), total_current AS ( SELECT COUNT(*) AS cnt FROM current_emp ) SELECT e.gender::varchar AS gender, COUNT(*) AS total_employees, COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL) AS current_employees, (COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL))::DECIMAL / NULLIF((SELECT cnt FROM total_current), 0) * 100 AS percentage_of_workforce FROM employees.employee e LEFT JOIN current_emp ce ON ce.employee_id = e.id WHERE e.gender IN ('M','F') GROUP BY e.gender ORDER BY gender; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} gender statistics results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Gender statistics results are correct ({len(actual_results)} records)") return True def main(): """Main verification function.""" print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify all four analysis results success = verify_gender_statistics_results(conn) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/easy/employees/employee_projects_basic/description.md ================================================ Create and manage a basic employee projects table to track company projects. The IT team needs you to build the database table structure and populate it with initial project data. ## Your Tasks: 1. 
**Create the employee_projects table** — build a new table in the `employees` schema: **Table: `employee_projects`** * `project_id` (integer, primary key, auto-increment) * `project_name` (varchar(100), not null) * `start_date` (date, not null) * `end_date` (date) * `budget` (decimal(10,2)) * `status` (varchar(20), default 'active') 2. **Insert exactly this initial data into `employee_projects`**: * Project 1: name='Database Modernization', start_date='2024-01-15', end_date='2024-06-30', budget=250000.00, status='active' * Project 2: name='Employee Portal Upgrade', start_date='2024-02-01', end_date='2024-05-15', budget=180000.00, status='active' * Project 3: name='HR Analytics Dashboard', start_date='2023-11-01', end_date='2024-01-31', budget=120000.00, status='active' This will establish the basic project tracking foundation for the company. ================================================ FILE: tasks/postgres/easy/employees/employee_projects_basic/meta.json ================================================ { "task_id": "employee_projects_basic", "task_name": "Employee Projects Basic", "category_id": "employees", "category_name": "Employees", "description": "Create the employee_projects table with the specified schema and insert the three starter projects for modernization, portal upgrade, and analytics.", "author": "Lingxiao Du", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "schema design", "data loading" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < 
\"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz" } } ================================================ FILE: tasks/postgres/easy/employees/employee_projects_basic/verify.py ================================================ """ Verification script for PostgreSQL Task 5: Database Schema and Data Operations """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.1 tolerance For date types: convert to string for comparison For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, (Decimal, float, int)): if abs(float(actual) - float(expected)) > 0.1: return False elif hasattr(actual, 'strftime'): # datetime.date or datetime.datetime if str(actual) != str(expected): return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_project_data(conn) -> bool: """Verify that project data was inserted and updated correctly.""" with conn.cursor() as cur: # Check project data after updates cur.execute(""" SELECT project_name, start_date, end_date, budget, status FROM employees.employee_projects ORDER BY project_name """) projects = cur.fetchall() if len(projects) != 3: print(f"❌ Expected 3 projects, found {len(projects)}") return False # Expected final state after all updates expected = { 'Database Modernization': ('2024-01-15', '2024-06-30', 250000.00, 'active'), 'Employee Portal Upgrade': ('2024-02-01', '2024-05-15', 180000.00, 'active'), 'HR Analytics Dashboard': ('2023-11-01', '2024-01-31', 120000.00, 'active') } for project in projects: name = project[0] if name not in expected: print(f"❌ Unexpected project: {name}") return False exp = expected[name] # Use rows_match for comparison expected_row = (name,) + exp if not rows_match(project, expected_row): print(f"❌ Project {name} data mismatch: expected {expected_row}, got {project}") return False print("✅ Project data is correct") return True def main(): """Main verification function.""" print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify all components success = verify_project_data(conn) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) 
except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/easy/employees/hiring_year_summary/description.md ================================================ Create a hiring year summary table to help HR track employee retention trends over the years. This analysis shows how many employees were hired each year and how many are still with the company. ## Your Task: **Create the hiring year summary table** — build a table called `hiring_year_summary` in the `employees` schema with these exact columns: * `hire_year` (integer) — year employees were hired * `employees_hired` (integer) — number of employees hired that year * `still_employed` (integer) — how many from that year are still employed (have active salary where to_date = '9999-01-01') * `retention_rate` (decimal) — percentage still employed (still_employed / employees_hired * 100) ## Requirements: 1. Extract the hire year from the `hire_date` column in the `employees` table 2. Count total employees hired in each year 3. Determine which employees are still employed by checking for active salary records (to_date = '9999-01-01' in the `salaries` table) 4. Order results by hire_year in ascending order This analysis will help HR understand retention patterns and identify years with particularly high or low retention rates. ================================================ FILE: tasks/postgres/easy/employees/hiring_year_summary/meta.json ================================================ { "task_id": "hiring_year_summary", "task_name": "Hiring Year Summary", "category_id": "employees", "category_name": "Employees", "description": "Summarize hires per year into hiring_year_summary, including still-employed counts and retention percentages using active salary rows.", "author": "Lingxiao Du", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "reporting and analytics", "retention analysis" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not 
null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz" } } ================================================ FILE: tasks/postgres/easy/employees/hiring_year_summary/verify.py ================================================ """ Verification script for PostgreSQL Task 3: Employee Demographics Report """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.1 tolerance For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, Decimal): if abs(float(actual) - float(expected)) > 0.1: return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_hiring_year_results(conn) -> bool: """Verify the hiring year summary results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT hire_year, employees_hired, still_employed, retention_rate FROM employees.hiring_year_summary ORDER BY hire_year """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH current_emp AS ( SELECT DISTINCT s.employee_id FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ), base AS ( SELECT e.id, EXTRACT(YEAR FROM e.hire_date)::INT AS hire_year FROM employees.employee e WHERE e.hire_date IS NOT NULL ) SELECT b.hire_year, COUNT(*)::INT AS employees_hired, COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL)::INT AS still_employed, (COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL))::DECIMAL / NULLIF(COUNT(*), 0) * 100 AS retention_rate FROM base b LEFT JOIN current_emp ce ON ce.employee_id = b.id GROUP BY b.hire_year ORDER BY b.hire_year; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} hiring year results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, 
expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Hiring year summary results are correct ({len(actual_results)} records)") return True def main(): """Main verification function.""" print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify all four analysis results success = verify_hiring_year_results(conn) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/easy/lego/basic_security_setup/description.md ================================================ Set up basic database security with role-based access control and Row-Level Security (RLS) for the LEGO database. ## Your Tasks: ### 1. Create Database Role and Permissions Create a new database role called `theme_analyst` with the following permissions: * `SELECT` permissions on all reference tables: `lego_themes`, `lego_colors`, `lego_parts`, `lego_part_categories` * `SELECT` permissions on main data tables: `lego_sets`, `lego_inventories`, `lego_inventory_parts` * No `INSERT`, `UPDATE`, or `DELETE` permissions on any tables ### 2. Enable Row-Level Security Enable RLS on the following tables: * `lego_sets` * `lego_inventories` * `lego_inventory_parts` ## Requirements: - Use `CREATE ROLE` to create the `theme_analyst` role - Use `GRANT SELECT` statements to assign the appropriate permissions - Use `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` to enable RLS on each table ## Expected Outcome: After completing these tasks: - The `theme_analyst` role should exist with read-only access to specified tables - Row-Level Security should be enabled (but not yet enforced with policies) on the three main data tables - The role should have no write permissions on any table This sets up the foundation for implementing theme-based data isolation policies. 
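## Example Approach (illustrative sketch)

A minimal sketch of statements that satisfy the checks in this task's `verify.py`: read-only access is granted, no write privileges are given, and RLS is enabled without defining any policies yet. Statement order and formatting are a matter of taste.

```sql
-- Read-only analyst role (no INSERT/UPDATE/DELETE granted anywhere).
CREATE ROLE theme_analyst;

-- SELECT on the reference tables and on the main data tables.
GRANT SELECT ON lego_themes, lego_colors, lego_parts, lego_part_categories TO theme_analyst;
GRANT SELECT ON lego_sets, lego_inventories, lego_inventory_parts TO theme_analyst;

-- Enable RLS on the three main data tables; theme-based policies are added in a later task.
ALTER TABLE lego_sets            ENABLE ROW LEVEL SECURITY;
ALTER TABLE lego_inventories     ENABLE ROW LEVEL SECURITY;
ALTER TABLE lego_inventory_parts ENABLE ROW LEVEL SECURITY;
```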
================================================ FILE: tasks/postgres/easy/lego/basic_security_setup/meta.json ================================================ { "task_id": "basic_security_setup", "task_name": "Basic Security Setup", "category_id": "lego", "category_name": "Lego", "description": "Create the read-only theme_analyst role with SELECT rights on LEGO reference tables and enable row-level security on sets and inventory tables.", "author": "Lingxiao Du", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "security", "access control" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql" } } ================================================ FILE: tasks/postgres/easy/lego/basic_security_setup/verify.py ================================================ """ Verification script for PostgreSQL LEGO Task 4: Database Security and RLS Implementation (Version 2 - Improved Robustness) """ import os import sys import psycopg2 import psycopg2.errors from typing import Dict def get_connection_params() -> Dict[str, any]: """Get database connection parameters from environment variables.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD"), } def verify_role_creation(conn) -> bool: """ TASK 1 VERIFICATION: Check if theme_analyst role was created with proper permissions. 
""" print("\n-- Verifying Task 1: Role Creation and Permissions --") with conn.cursor() as cur: # Check if role exists cur.execute("SELECT 1 FROM pg_roles WHERE rolname = 'theme_analyst';") if not cur.fetchone(): print("❌ FAIL: The 'theme_analyst' role was not created.") return False print("✅ OK: Role 'theme_analyst' exists.") # Check SELECT permissions on reference and main tables all_tables = [ 'lego_themes', 'lego_colors', 'lego_parts', 'lego_part_categories', 'lego_sets', 'lego_inventories', 'lego_inventory_parts' ] for table in all_tables: cur.execute( """ SELECT has_table_privilege('theme_analyst', %s, 'SELECT'); """, (table,) ) if not cur.fetchone()[0]: print(f"❌ FAIL: 'theme_analyst' role is missing SELECT permission on '{table}'.") return False print("✅ OK: Role has correct SELECT permissions on all required tables.") # Check that no INSERT/UPDATE/DELETE permissions exist for table in all_tables: cur.execute( """ SELECT has_table_privilege('theme_analyst', %s, 'INSERT') OR has_table_privilege('theme_analyst', %s, 'UPDATE') OR has_table_privilege('theme_analyst', %s, 'DELETE'); """, (table, table, table) ) if cur.fetchone()[0]: print(f"❌ FAIL: 'theme_analyst' role has unauthorized INSERT, UPDATE, or DELETE permission on '{table}'.") return False print("✅ OK: Role does not have modification permissions.") print("✅ PASS: 'theme_analyst' role created with correct permissions.") return True def verify_rls_enabled(conn) -> bool: """ TASK 2 VERIFICATION: Check if Row-Level Security is enabled on required tables. """ print("\n-- Verifying Task 2: Row-Level Security Enablement --") tables_to_check = ['lego_sets', 'lego_inventories', 'lego_inventory_parts'] with conn.cursor() as cur: for table in tables_to_check: cur.execute( "SELECT relrowsecurity FROM pg_class WHERE relname = %s;", (table,) ) rls_enabled = cur.fetchone() if not rls_enabled or not rls_enabled[0]: print(f"❌ FAIL: RLS is not enabled on table '{table}'.") return False print(f"✅ OK: RLS is enabled on table '{table}'.") print("✅ PASS: Row-Level Security is enabled on all required tables.") return True def main(): """Main verification function.""" print("=" * 60) print("LEGO Database Security and RLS Verification Script") print("=" * 60) conn_params = get_connection_params() if not conn_params.get("database"): print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.") sys.exit(1) conn = None try: conn = psycopg2.connect(**conn_params) results = [ verify_role_creation(conn), verify_rls_enabled(conn), ] if all(results): print("\n🎉 Overall Result: PASS - All security tasks verified successfully!") sys.exit(0) else: print("\n❌ Overall Result: FAIL - One or more verification steps failed.") sys.exit(1) except psycopg2.OperationalError as e: print(f"❌ CRITICAL: Could not connect to the database. Check credentials and host. Details: {e}") sys.exit(1) except Exception as e: print(f"❌ CRITICAL: An unexpected error occurred. Details: {e}") sys.exit(1) finally: if conn: conn.close() if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/easy/lego/fix_data_inconsistencies/description.md ================================================ Fix data inconsistencies in the LEGO database where the reported part count in the `lego_sets` table does not match the actual sum of non-spare parts in the latest inventory version. 
## Consistency Rule For any given `set_num`, the following must be true: `lego_sets.num_parts = SUM(quantity)` FROM `lego_inventory_parts` WHERE `inventory_id` IN (latest inventory for that set) AND `is_spare` = false **Important**: If a set has no inventory records, the consistency check should be skipped. ## Your Tasks: ### Task 1: Identify Data Inconsistencies **Objective**: Write a single `SELECT` query to find all sets where the stored `num_parts` does not match the actual calculated number of parts from the latest inventory. 1. **Find the Latest Inventory**: For each `set_num`, find its latest inventory id by getting the `MAX(version)` from the `lego_inventories` table. 2. **Calculate Actual Part Count**: For these latest inventories, join with `lego_inventory_parts` and calculate the `SUM(quantity)`, but only for parts where `is_spare` is false. 3. **Compare and Filter**: Join this calculated result back to the `lego_sets` table and return the rows where `lego_sets.num_parts` is different from your calculated sum. ### Task 2: Fix Existing Inconsistencies **Objective**: Correct all mismatched `num_parts` values using a clear, multi-step process with a temporary table. #### Step 1: Create a Temporary Table Create a temporary table (e.g., `correct_counts`) with two columns: `set_num` (text) and `actual_parts` (integer). #### Step 2: Populate the Temporary Table Write an `INSERT` statement that calculates the correct part count for every single set listed in the `lego_sets` table. - The query must start by selecting from `public.lego_sets`. - It must then `LEFT JOIN` to a subquery that contains the part-counting logic (finding the latest inventory version and summing the non-spare parts). - Use `COALESCE` on the final result from the subquery to ensure that any set without parts or without an inventory record gets a value of `0`, not `NULL`. #### Step 3: Update from the Temporary Table Write a final, simple `UPDATE` statement that joins the `lego_sets` table with your temporary table on `set_num` and sets `num_parts` to the `actual_parts` value. ## Expected Outcome: After completing these tasks, all sets in the `lego_sets` table should have their `num_parts` correctly reflecting the sum of non-spare parts from their latest inventory version. 
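## Example Approach (illustrative sketch)

The statements below are one possible sketch of the two tasks. They reuse the same latest-inventory logic that this task's `verify.py` checks against, and the staging table name `correct_counts` follows the example suggested above.

```sql
-- Task 1: report sets whose stored num_parts disagrees with the latest inventory.
WITH latest_inv AS (
    SELECT set_num, MAX(version) AS max_version
    FROM public.lego_inventories
    GROUP BY set_num
),
inv_latest AS (
    SELECT li.set_num, li.id
    FROM public.lego_inventories li
    JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version
),
parts_agg AS (
    SELECT i.set_num, SUM(lip.quantity) AS actual_parts
    FROM inv_latest i
    JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id
    WHERE lip.is_spare = false
    GROUP BY i.set_num
)
SELECT s.set_num, s.num_parts, pa.actual_parts
FROM public.lego_sets s
JOIN parts_agg pa ON pa.set_num = s.set_num   -- sets with no inventory are skipped here
WHERE s.num_parts <> pa.actual_parts;

-- Task 2, step 1: temporary staging table.
CREATE TEMP TABLE correct_counts (set_num text, actual_parts integer);

-- Task 2, step 2: one row per set, defaulting to 0 when there is no inventory.
INSERT INTO correct_counts (set_num, actual_parts)
SELECT s.set_num, COALESCE(pa.actual_parts, 0)
FROM public.lego_sets s
LEFT JOIN (
    SELECT li.set_num, SUM(lip.quantity)::int AS actual_parts
    FROM public.lego_inventories li
    JOIN (
        SELECT set_num, MAX(version) AS max_version
        FROM public.lego_inventories
        GROUP BY set_num
    ) lv ON lv.set_num = li.set_num AND lv.max_version = li.version
    JOIN public.lego_inventory_parts lip ON lip.inventory_id = li.id
    WHERE lip.is_spare = false
    GROUP BY li.set_num
) pa ON pa.set_num = s.set_num;

-- Task 2, step 3: apply the corrected counts.
UPDATE public.lego_sets s
SET num_parts = cc.actual_parts
FROM correct_counts cc
WHERE cc.set_num = s.set_num;
```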
================================================ FILE: tasks/postgres/easy/lego/fix_data_inconsistencies/meta.json ================================================ { "task_id": "fix_data_inconsistencies", "task_name": "Fix Data Inconsistencies", "category_id": "lego", "category_name": "Lego", "description": "Recalculate each LEGO set's part count from the latest inventory, stage the results, and update lego_sets.num_parts to remove mismatches.", "author": "Lingxiao Du", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "data integrity enforcement", "data reconciliation" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql" } } ================================================ FILE: tasks/postgres/easy/lego/fix_data_inconsistencies/verify.py ================================================ """ Verification script for PostgreSQL LEGO Task 1: Parts Consistency Fix & Constraints Version 2.1: Relaxed consistency check to allow for one known corner case mismatch. """ import os import sys import psycopg2 import psycopg2.errors from typing import Optional, Tuple, List def get_connection_params() -> dict: """Get database connection parameters from environment variables.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD"), } def fetch_candidate_part_row(cur) -> Optional[Tuple[int, str, str, int]]: """ Picks a concrete, non-spare inventory part from the latest inventory of any set. This provides a reliable target for testing update and insert triggers. Returns a tuple: (inventory_id, set_num, part_num, color_id) or None. 
""" cur.execute( """ WITH latest_inv AS ( SELECT set_num, MAX(version) AS max_version FROM public.lego_inventories GROUP BY set_num ), inv AS ( SELECT li.id, li.set_num FROM public.lego_inventories li JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version ) SELECT i.id AS inventory_id, i.set_num, lip.part_num, lip.color_id FROM inv i JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id WHERE lip.is_spare = false AND lip.quantity > 0 LIMIT 1; """ ) return cur.fetchone() def get_mismatch_count(cur) -> int: """Returns the number of sets where num_parts mismatches the computed actual sum.""" cur.execute( """ WITH latest_inv AS ( SELECT set_num, MAX(version) AS max_version FROM public.lego_inventories GROUP BY set_num ), inv_latest AS ( SELECT li.set_num, li.id FROM public.lego_inventories li JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version ), parts_agg AS ( SELECT i.set_num, SUM(lip.quantity) AS actual_parts FROM inv_latest i JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id WHERE lip.is_spare = false GROUP BY i.set_num ) SELECT COUNT(*) FROM public.lego_sets s LEFT JOIN parts_agg pa ON s.set_num = pa.set_num WHERE s.num_parts <> COALESCE(pa.actual_parts, 0); """ ) return cur.fetchone()[0] def verify_data_consistency(conn) -> bool: """ TASK 1 VERIFICATION: Checks if the initial data fix was successful. (Relaxed: Allows for one corner-case mismatch). """ print("\n-- Verifying Task 1: Data Consistency Fix (Relaxed) --") with conn.cursor() as cur: count = get_mismatch_count(cur) # RELAXED CONDITION: Allow 0 or 1 mismatch to pass. if count > 1: print(f"❌ FAIL: Found {count} sets with inconsistent part counts. Expected 0 or 1 after fix.") return False print("✅ PASS: Data consistency check passed (allowing for one known mismatch).") return True def main(): """Main verification function.""" print("=" * 60) print("LEGO Database Consistency Verification Script") print("=" * 60) conn_params = get_connection_params() if not conn_params.get("database"): print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.") sys.exit(1) try: with psycopg2.connect(**conn_params) as conn: conn.autocommit = False # Ensure we control transactions # Run all verification steps results = [ verify_data_consistency(conn), ] if all(results): print("\n🎉 Overall Result: PASS - All tasks verified successfully!") sys.exit(0) else: print("\n❌ Overall Result: FAIL - One or more verification steps failed.") sys.exit(1) except psycopg2.OperationalError as e: print(f"❌ CRITICAL: Could not connect to the database. Details: {e}") sys.exit(1) except Exception as e: print(f"❌ CRITICAL: An unexpected error occurred during verification. Details: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/easy/sports/create_performance_indexes/description.md ================================================ Create indexes to optimize participant and statistics queries in the sports database. ## Your Task: Create two indexes to improve query performance: 1. **Index on participants_events table**: Create an index on the `participant_id` column of the `participants_events` table 2. 
**Composite index on stats table**: Create a composite index on the `stats` table using columns `stat_holder_type` and `stat_holder_id` (in that order) ## Requirements: - Create an index on `participants_events(participant_id)` - Create a composite index on `stats(stat_holder_type, stat_holder_id)` - Index names can be anything you choose (e.g., `idx_participants_events_participant_id`, `idx_stats_holder`) - Use the standard CREATE INDEX syntax ## Expected Outcome: After creating these indexes, queries that involve participant filtering and statistics lookups will run significantly faster. ================================================ FILE: tasks/postgres/easy/sports/create_performance_indexes/meta.json ================================================ { "task_id": "create_performance_indexes", "task_name": "Create Performance Indexes", "category_id": "sports", "category_name": "Sports", "description": "Create indexes on participants_events.participant_id and stats(stat_holder_type, stat_holder_id) to accelerate performance reporting.", "author": "Lingxiao Du", "created_at": "2025-11-15", "difficulty": "L1", "tags": [ "performance optimization", "indexing" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"addresses\" {\n \"id\" int4 [not null, increment]\n \"location_id\" int4 [not null]\n \"language\" varchar(100)\n \"suite\" varchar(100)\n \"floor\" varchar(100)\n \"building\" varchar(100)\n \"street_number\" varchar(100)\n \"street_prefix\" varchar(100)\n \"street\" varchar(100)\n \"street_suffix\" varchar(100)\n \"neighborhood\" varchar(100)\n \"district\" varchar(100)\n \"locality\" varchar(100)\n \"county\" varchar(100)\n \"region\" varchar(100)\n \"postal_code\" varchar(100)\n \"country\" varchar(100)\n}\n\nTable \"affiliation_phases\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"ancestor_affiliation_id\" int4\n \"start_season_id\" int4\n \"start_date_time\" timestamp\n \"end_season_id\" int4\n \"end_date_time\" timestamp\n}\n\nTable \"affiliations\" {\n \"id\" int4 [not null, increment]\n \"affiliation_key\" varchar(100) [not null]\n \"affiliation_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n}\n\nTable \"affiliations_documents\" {\n \"affiliation_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"affiliations_events\" {\n \"affiliation_id\" int4 [not null]\n \"event_id\" int4 [not null]\n}\n\nTable \"affiliations_media\" {\n \"affiliation_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"american_football_action_participants\" {\n \"id\" int4 [not null, increment]\n \"american_football_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"score_type\" varchar(100)\n \"field_line\" int4\n \"yardage\" int4\n \"score_credit\" int4\n \"yards_gained\" int4\n}\n\nTable \"american_football_action_plays\" {\n \"id\" int4 [not null, increment]\n \"american_football_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"drive_result\" varchar(100)\n \"points\" int4\n \"comment\" varchar(255)\n}\n\nTable \"american_football_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"tackles_total\" varchar(100)\n \"tackles_solo\" varchar(100)\n \"tackles_assists\" varchar(100)\n \"interceptions_total\" varchar(100)\n \"interceptions_yards\" varchar(100)\n \"interceptions_average\" varchar(100)\n \"interceptions_longest\" varchar(100)\n \"interceptions_touchdown\" 
varchar(100)\n \"quarterback_hurries\" varchar(100)\n \"sacks_total\" varchar(100)\n \"sacks_yards\" varchar(100)\n \"passes_defensed\" varchar(100)\n}\n\nTable \"american_football_down_progress_stats\" {\n \"id\" int4 [not null, increment]\n \"first_downs_total\" varchar(100)\n \"first_downs_pass\" varchar(100)\n \"first_downs_run\" varchar(100)\n \"first_downs_penalty\" varchar(100)\n \"conversions_third_down\" varchar(100)\n \"conversions_third_down_attempts\" varchar(100)\n \"conversions_third_down_percentage\" varchar(100)\n \"conversions_fourth_down\" varchar(100)\n \"conversions_fourth_down_attempts\" varchar(100)\n \"conversions_fourth_down_percentage\" varchar(100)\n}\n\nTable \"american_football_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"period_value\" int4\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"clock_state\" varchar(100)\n \"down\" int4\n \"team_in_possession_id\" int4\n \"distance_for_1st_down\" int4\n \"field_side\" varchar(100)\n \"field_line\" int4\n \"context\" varchar(40)\n}\n\nTable \"american_football_fumbles_stats\" {\n \"id\" int4 [not null, increment]\n \"fumbles_committed\" varchar(100)\n \"fumbles_forced\" varchar(100)\n \"fumbles_recovered\" varchar(100)\n \"fumbles_lost\" varchar(100)\n \"fumbles_yards_gained\" varchar(100)\n \"fumbles_own_committed\" varchar(100)\n \"fumbles_own_recovered\" varchar(100)\n \"fumbles_own_lost\" varchar(100)\n \"fumbles_own_yards_gained\" varchar(100)\n \"fumbles_opposing_committed\" varchar(100)\n \"fumbles_opposing_recovered\" varchar(100)\n \"fumbles_opposing_lost\" varchar(100)\n \"fumbles_opposing_yards_gained\" varchar(100)\n}\n\nTable \"american_football_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"offensive_plays_yards\" varchar(100)\n \"offensive_plays_number\" varchar(100)\n \"offensive_plays_average_yards_per\" varchar(100)\n \"possession_duration\" varchar(100)\n \"turnovers_giveaway\" varchar(100)\n}\n\nTable \"american_football_passing_stats\" {\n \"id\" int4 [not null, increment]\n \"passes_attempts\" varchar(100)\n \"passes_completions\" varchar(100)\n \"passes_percentage\" varchar(100)\n \"passes_yards_gross\" varchar(100)\n \"passes_yards_net\" varchar(100)\n \"passes_yards_lost\" varchar(100)\n \"passes_touchdowns\" varchar(100)\n \"passes_touchdowns_percentage\" varchar(100)\n \"passes_interceptions\" varchar(100)\n \"passes_interceptions_percentage\" varchar(100)\n \"passes_longest\" varchar(100)\n \"passes_average_yards_per\" varchar(100)\n \"passer_rating\" varchar(100)\n \"receptions_total\" varchar(100)\n \"receptions_yards\" varchar(100)\n \"receptions_touchdowns\" varchar(100)\n \"receptions_first_down\" varchar(100)\n \"receptions_longest\" varchar(100)\n \"receptions_average_yards_per\" varchar(100)\n}\n\nTable \"american_football_penalties_stats\" {\n \"id\" int4 [not null, increment]\n \"penalties_total\" varchar(100)\n \"penalty_yards\" varchar(100)\n \"penalty_first_downs\" varchar(100)\n}\n\nTable \"american_football_rushing_stats\" {\n \"id\" int4 [not null, increment]\n \"rushes_attempts\" varchar(100)\n \"rushes_yards\" varchar(100)\n \"rushes_touchdowns\" varchar(100)\n \"rushing_average_yards_per\" varchar(100)\n \"rushes_first_down\" varchar(100)\n \"rushes_longest\" varchar(100)\n}\n\nTable \"american_football_sacks_against_stats\" {\n \"id\" int4 [not null, increment]\n \"sacks_against_yards\" varchar(100)\n \"sacks_against_total\" 
varchar(100)\n}\n\nTable \"american_football_scoring_stats\" {\n \"id\" int4 [not null, increment]\n \"touchdowns_total\" varchar(100)\n \"touchdowns_passing\" varchar(100)\n \"touchdowns_rushing\" varchar(100)\n \"touchdowns_special_teams\" varchar(100)\n \"touchdowns_defensive\" varchar(100)\n \"extra_points_attempts\" varchar(100)\n \"extra_points_made\" varchar(100)\n \"extra_points_missed\" varchar(100)\n \"extra_points_blocked\" varchar(100)\n \"field_goal_attempts\" varchar(100)\n \"field_goals_made\" varchar(100)\n \"field_goals_missed\" varchar(100)\n \"field_goals_blocked\" varchar(100)\n \"safeties_against\" varchar(100)\n \"two_point_conversions_attempts\" varchar(100)\n \"two_point_conversions_made\" varchar(100)\n \"touchbacks_total\" varchar(100)\n}\n\nTable \"american_football_special_teams_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_punt_total\" varchar(100)\n \"returns_punt_yards\" varchar(100)\n \"returns_punt_average\" varchar(100)\n \"returns_punt_longest\" varchar(100)\n \"returns_punt_touchdown\" varchar(100)\n \"returns_kickoff_total\" varchar(100)\n \"returns_kickoff_yards\" varchar(100)\n \"returns_kickoff_average\" varchar(100)\n \"returns_kickoff_longest\" varchar(100)\n \"returns_kickoff_touchdown\" varchar(100)\n \"returns_total\" varchar(100)\n \"returns_yards\" varchar(100)\n \"punts_total\" varchar(100)\n \"punts_yards_gross\" varchar(100)\n \"punts_yards_net\" varchar(100)\n \"punts_longest\" varchar(100)\n \"punts_inside_20\" varchar(100)\n \"punts_inside_20_percentage\" varchar(100)\n \"punts_average\" varchar(100)\n \"punts_blocked\" varchar(100)\n \"touchbacks_total\" varchar(100)\n \"touchbacks_total_percentage\" varchar(100)\n \"touchbacks_kickoffs\" varchar(100)\n \"touchbacks_kickoffs_percentage\" varchar(100)\n \"touchbacks_punts\" varchar(100)\n \"touchbacks_punts_percentage\" varchar(100)\n \"touchbacks_interceptions\" varchar(100)\n \"touchbacks_interceptions_percentage\" varchar(100)\n \"fair_catches\" varchar(100)\n}\n\nTable \"baseball_action_contact_details\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_pitch_id\" int4 [not null]\n \"location\" varchar(100)\n \"strength\" varchar(100)\n \"velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n}\n\nTable \"baseball_action_pitches\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_play_id\" int4 [not null]\n \"sequence_number\" int4\n \"baseball_defensive_group_id\" int4\n \"umpire_call\" varchar(100)\n \"pitch_location\" varchar(100)\n \"pitch_type\" varchar(100)\n \"pitch_velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n \"ball_type\" varchar(40)\n \"strike_type\" varchar(40)\n}\n\nTable \"baseball_action_plays\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"notation\" varchar(100)\n \"notation_yaml\" text\n \"baseball_defensive_group_id\" int4\n \"comment\" varchar(255)\n \"runner_on_first_advance\" int4\n \"runner_on_second_advance\" int4\n \"runner_on_third_advance\" int4\n \"outs_recorded\" int4\n \"rbi\" int4\n \"runs_scored\" int4\n \"earned_runs_scored\" varchar(100)\n}\n\nTable \"baseball_action_substitutions\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"sequence_number\" int4\n \"person_type\" varchar(100)\n \"person_original_id\" int4\n \"person_original_position_id\" int4\n \"person_original_lineup_slot\" 
int4\n \"person_replacing_id\" int4\n \"person_replacing_position_id\" int4\n \"person_replacing_lineup_slot\" int4\n \"substitution_reason\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"baseball_defensive_group\" {\n \"id\" int4 [not null, increment]\n}\n\nTable \"baseball_defensive_players\" {\n \"id\" int4 [not null, increment]\n \"baseball_defensive_group_id\" int4 [not null]\n \"player_id\" int4 [not null]\n \"position_id\" int4 [not null]\n}\n\nTable \"baseball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"double_plays\" int4\n \"triple_plays\" int4\n \"putouts\" int4\n \"assists\" int4\n \"errors\" int4\n \"fielding_percentage\" numeric\n \"defensive_average\" numeric\n \"errors_passed_ball\" int4\n \"errors_catchers_interference\" int4\n}\n\nTable \"baseball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"at_bat_number\" int4\n \"inning_value\" int4\n \"inning_half\" varchar(100)\n \"outs\" int4\n \"balls\" int4\n \"strikes\" int4\n \"runner_on_first_id\" int4\n \"runner_on_second_id\" int4\n \"runner_on_third_id\" int4\n \"runner_on_first\" int2\n \"runner_on_second\" int2\n \"runner_on_third\" int2\n \"runs_this_inning_half\" int4\n \"pitcher_id\" int4\n \"batter_id\" int4\n \"batter_side\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"baseball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"average\" numeric\n \"runs_scored\" int4\n \"at_bats\" int4\n \"hits\" int4\n \"rbi\" int4\n \"total_bases\" int4\n \"slugging_percentage\" numeric\n \"bases_on_balls\" int4\n \"strikeouts\" int4\n \"left_on_base\" int4\n \"left_in_scoring_position\" int4\n \"singles\" int4\n \"doubles\" int4\n \"triples\" int4\n \"home_runs\" int4\n \"grand_slams\" int4\n \"at_bats_per_rbi\" numeric\n \"plate_appearances_per_rbi\" numeric\n \"at_bats_per_home_run\" numeric\n \"plate_appearances_per_home_run\" numeric\n \"sac_flies\" int4\n \"sac_bunts\" int4\n \"grounded_into_double_play\" int4\n \"moved_up\" int4\n \"on_base_percentage\" numeric\n \"stolen_bases\" int4\n \"stolen_bases_caught\" int4\n \"stolen_bases_average\" numeric\n \"hit_by_pitch\" int4\n \"defensive_interferance_reaches\" int4\n \"on_base_plus_slugging\" numeric\n \"plate_appearances\" int4\n \"hits_extra_base\" int4\n}\n\nTable \"baseball_pitching_stats\" {\n \"id\" int4 [not null, increment]\n \"runs_allowed\" int4\n \"singles_allowed\" int4\n \"doubles_allowed\" int4\n \"triples_allowed\" int4\n \"home_runs_allowed\" int4\n \"innings_pitched\" varchar(20)\n \"hits\" int4\n \"earned_runs\" int4\n \"unearned_runs\" int4\n \"bases_on_balls\" int4\n \"bases_on_balls_intentional\" int4\n \"strikeouts\" int4\n \"strikeout_to_bb_ratio\" numeric\n \"number_of_pitches\" int4\n \"era\" numeric\n \"inherited_runners_scored\" int4\n \"pick_offs\" int4\n \"errors_hit_with_pitch\" int4\n \"errors_wild_pitch\" int4\n \"balks\" int4\n \"wins\" int4\n \"losses\" int4\n \"saves\" int4\n \"shutouts\" int4\n \"games_complete\" int4\n \"games_finished\" int4\n \"winning_percentage\" numeric\n \"event_credit\" varchar(40)\n \"save_credit\" varchar(40)\n}\n\nTable \"basketball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"steals_total\" varchar(100)\n \"steals_per_game\" varchar(100)\n \"blocks_total\" varchar(100)\n \"blocks_per_game\" varchar(100)\n}\n\nTable \"basketball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n 
\"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"basketball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"field_goals_made\" int4\n \"field_goals_attempted\" int4\n \"field_goals_percentage\" varchar(100)\n \"field_goals_per_game\" varchar(100)\n \"field_goals_attempted_per_game\" varchar(100)\n \"field_goals_percentage_adjusted\" varchar(100)\n \"three_pointers_made\" int4\n \"three_pointers_attempted\" int4\n \"three_pointers_percentage\" varchar(100)\n \"three_pointers_per_game\" varchar(100)\n \"three_pointers_attempted_per_game\" varchar(100)\n \"free_throws_made\" varchar(100)\n \"free_throws_attempted\" varchar(100)\n \"free_throws_percentage\" varchar(100)\n \"free_throws_per_game\" varchar(100)\n \"free_throws_attempted_per_game\" varchar(100)\n \"points_scored_total\" varchar(100)\n \"points_scored_per_game\" varchar(100)\n \"assists_total\" varchar(100)\n \"assists_per_game\" varchar(100)\n \"turnovers_total\" varchar(100)\n \"turnovers_per_game\" varchar(100)\n \"points_scored_off_turnovers\" varchar(100)\n \"points_scored_in_paint\" varchar(100)\n \"points_scored_on_second_chance\" varchar(100)\n \"points_scored_on_fast_break\" varchar(100)\n}\n\nTable \"basketball_rebounding_stats\" {\n \"id\" int4 [not null, increment]\n \"rebounds_total\" varchar(100)\n \"rebounds_per_game\" varchar(100)\n \"rebounds_defensive\" varchar(100)\n \"rebounds_offensive\" varchar(100)\n \"team_rebounds_total\" varchar(100)\n \"team_rebounds_per_game\" varchar(100)\n \"team_rebounds_defensive\" varchar(100)\n \"team_rebounds_offensive\" varchar(100)\n}\n\nTable \"basketball_team_stats\" {\n \"id\" int4 [not null, increment]\n \"timeouts_left\" varchar(100)\n \"largest_lead\" varchar(100)\n \"fouls_total\" varchar(100)\n \"turnover_margin\" varchar(100)\n}\n\nTable \"bookmakers\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"core_person_stats\" {\n \"id\" int4 [not null, increment]\n \"time_played_event\" varchar(40)\n \"time_played_total\" varchar(40)\n \"time_played_event_average\" varchar(40)\n \"events_played\" int4\n \"events_started\" int4\n \"position_id\" int4\n}\n\nTable \"core_stats\" {\n \"id\" int4 [not null, increment]\n \"score\" varchar(100)\n \"score_opposing\" varchar(100)\n \"score_attempts\" varchar(100)\n \"score_attempts_opposing\" varchar(100)\n \"score_percentage\" varchar(100)\n \"score_percentage_opposing\" varchar(100)\n}\n\nTable \"db_info\" {\n \"version\" varchar(100) [not null, default: 16]\n}\n\nTable \"display_names\" {\n \"id\" int4 [not null, increment]\n \"language\" varchar(100) [not null]\n \"entity_type\" varchar(100) [not null]\n \"entity_id\" int4 [not null]\n \"full_name\" varchar(100)\n \"first_name\" varchar(100)\n \"middle_name\" varchar(100)\n \"last_name\" varchar(100)\n \"alias\" varchar(100)\n \"abbreviation\" varchar(100)\n \"short_name\" varchar(100)\n \"prefix\" varchar(20)\n \"suffix\" varchar(20)\n}\n\nTable \"document_classes\" {\n \"id\" int4 [not null, increment]\n \"name\" varchar(100)\n}\n\nTable \"document_contents\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"sportsml\" varchar(200)\n \"abstract\" text\n}\n\nTable \"document_fixtures\" {\n \"id\" int4 [not null, increment]\n \"fixture_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"name\" varchar(100)\n \"document_class_id\" int4 [not 
null]\n}\n\nTable \"document_fixtures_events\" {\n \"id\" int4 [not null, increment]\n \"document_fixture_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"latest_document_id\" int4 [not null]\n \"last_update\" timestamp\n}\n\nTable \"document_package_entry\" {\n \"id\" int4 [not null, increment]\n \"document_package_id\" int4 [not null]\n \"rank\" varchar(100)\n \"document_id\" int4 [not null]\n \"headline\" varchar(100)\n \"short_headline\" varchar(100)\n}\n\nTable \"document_packages\" {\n \"id\" int4 [not null, increment]\n \"package_key\" varchar(100)\n \"package_name\" varchar(100)\n \"date_time\" date\n}\n\nTable \"documents\" {\n \"id\" int4 [not null, increment]\n \"doc_id\" varchar(75) [not null]\n \"publisher_id\" int4 [not null]\n \"date_time\" timestamp\n \"title\" varchar(255)\n \"language\" varchar(100)\n \"priority\" varchar(100)\n \"revision_id\" varchar(75)\n \"stats_coverage\" varchar(100)\n \"document_fixture_id\" int4 [not null]\n \"source_id\" int4\n \"db_loading_date_time\" timestamp\n}\n\nTable \"documents_media\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"media_id\" int4 [not null]\n \"media_caption_id\" int4 [not null]\n}\n\nTable \"events\" {\n \"id\" int4 [not null, increment]\n \"event_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"site_id\" int4\n \"site_alignment\" varchar(100)\n \"event_status\" varchar(100)\n \"duration\" varchar(100)\n \"attendance\" varchar(100)\n \"last_update\" timestamp\n}\n\nTable \"events_documents\" {\n \"event_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"events_media\" {\n \"event_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"events_sub_seasons\" {\n \"event_id\" int4 [not null]\n \"sub_season_id\" int4 [not null]\n}\n\nTable \"ice_hockey_action_participants\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"point_credit\" int4\n}\n\nTable \"ice_hockey_action_plays\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"play_result\" varchar(100)\n \"comment\" varchar(255)\n}\n\nTable \"ice_hockey_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_power_play_allowed\" varchar(100)\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_power_play_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"penalty_killing_amount\" varchar(100)\n \"penalty_killing_percentage\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"takeaways\" varchar(100)\n \"shutouts\" varchar(100)\n \"minutes_penalty_killing\" varchar(100)\n \"hits\" varchar(100)\n \"goals_empty_net_allowed\" varchar(100)\n \"goals_short_handed_allowed\" varchar(100)\n \"goals_shootout_allowed\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n}\n\nTable \"ice_hockey_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"ice_hockey_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" 
varchar(100)\n \"goals_power_play\" varchar(100)\n \"goals_short_handed\" varchar(100)\n \"goals_even_strength\" varchar(100)\n \"goals_empty_net\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_penalty_shot\" varchar(100)\n \"assists\" varchar(100)\n \"points\" varchar(100)\n \"power_play_amount\" varchar(100)\n \"power_play_percentage\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(100)\n \"shots_penalty_shot_percentage\" varchar(100)\n \"giveaways\" varchar(100)\n \"minutes_power_play\" varchar(100)\n \"faceoff_wins\" varchar(100)\n \"faceoff_losses\" varchar(100)\n \"faceoff_win_percentage\" varchar(100)\n \"scoring_chances\" varchar(100)\n}\n\nTable \"ice_hockey_player_stats\" {\n \"id\" int4 [not null, increment]\n \"plus_minus\" varchar(100)\n}\n\nTable \"injury_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"injury_status\" varchar(100)\n \"injury_type\" varchar(100)\n \"injury_comment\" varchar(100)\n \"disabled_list\" varchar(100)\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n \"season_id\" int4\n \"phase_type\" varchar(100)\n \"injury_side\" varchar(100)\n}\n\nTable \"key_aliases\" {\n \"id\" int4 [not null, increment]\n \"key_id\" int4 [not null]\n \"key_root_id\" int4 [not null]\n}\n\nTable \"key_roots\" {\n \"id\" int4 [not null, increment]\n \"key_type\" varchar(100)\n}\n\nTable \"latest_revisions\" {\n \"id\" int4 [not null, increment]\n \"revision_id\" varchar(75) [not null]\n \"latest_document_id\" int4 [not null]\n}\n\nTable \"locations\" {\n \"id\" int4 [not null, increment]\n \"timezone\" varchar(100)\n \"latitude\" varchar(100)\n \"longitude\" varchar(100)\n \"country_code\" varchar(100)\n}\n\nTable \"media\" {\n \"id\" int4 [not null, increment]\n \"object_id\" int4\n \"source_id\" int4\n \"revision_id\" int4\n \"media_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"date_time\" varchar(100)\n \"credit_id\" int4 [not null]\n \"db_loading_date_time\" timestamp\n \"creation_location_id\" int4 [not null]\n}\n\nTable \"media_captions\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"caption_type\" varchar(100)\n \"caption\" varchar(100)\n \"caption_author_id\" int4 [not null]\n \"language\" varchar(100)\n \"caption_size\" varchar(100)\n}\n\nTable \"media_contents\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"object\" varchar(100)\n \"format\" varchar(100)\n \"mime_type\" varchar(100)\n \"height\" varchar(100)\n \"width\" varchar(100)\n \"duration\" varchar(100)\n \"file_size\" varchar(100)\n \"resolution\" varchar(100)\n}\n\nTable \"media_keywords\" {\n \"id\" int4 [not null, increment]\n \"keyword\" varchar(100)\n \"media_id\" int4 [not null]\n}\n\nTable \"motor_racing_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"lap\" varchar(100)\n \"laps_remaining\" varchar(100)\n \"time_elapsed\" varchar(100)\n \"flag_state\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"motor_racing_qualifying_stats\" {\n \"id\" int4 [not null, increment]\n \"grid\" varchar(100)\n \"pole_position\" varchar(100)\n \"pole_wins\" varchar(100)\n \"qualifying_speed\" varchar(100)\n \"qualifying_speed_units\" varchar(100)\n \"qualifying_time\" varchar(100)\n \"qualifying_position\" varchar(100)\n}\n\nTable \"motor_racing_race_stats\" {\n \"id\" int4 [not null, increment]\n \"time_behind_leader\" 
varchar(100)\n \"laps_behind_leader\" varchar(100)\n \"time_ahead_follower\" varchar(100)\n \"laps_ahead_follower\" varchar(100)\n \"time\" varchar(100)\n \"points\" varchar(100)\n \"points_rookie\" varchar(100)\n \"bonus\" varchar(100)\n \"laps_completed\" varchar(100)\n \"laps_leading_total\" varchar(100)\n \"distance_leading\" varchar(100)\n \"distance_completed\" varchar(100)\n \"distance_units\" varchar(40)\n \"speed_average\" varchar(40)\n \"speed_units\" varchar(40)\n \"status\" varchar(40)\n \"finishes_top_5\" varchar(40)\n \"finishes_top_10\" varchar(40)\n \"starts\" varchar(40)\n \"finishes\" varchar(40)\n \"non_finishes\" varchar(40)\n \"wins\" varchar(40)\n \"races_leading\" varchar(40)\n \"money\" varchar(40)\n \"money_units\" varchar(40)\n \"leads_total\" varchar(40)\n}\n\nTable \"outcome_totals\" {\n \"id\" int4 [not null, increment]\n \"standing_subgroup_id\" int4 [not null]\n \"outcome_holder_type\" varchar(100)\n \"outcome_holder_id\" int4\n \"rank\" varchar(100)\n \"wins\" varchar(100)\n \"losses\" varchar(100)\n \"ties\" varchar(100)\n \"undecideds\" varchar(100)\n \"winning_percentage\" varchar(100)\n \"points_scored_for\" varchar(100)\n \"points_scored_against\" varchar(100)\n \"points_difference\" varchar(100)\n \"standing_points\" varchar(100)\n \"streak_type\" varchar(100)\n \"streak_duration\" varchar(100)\n \"streak_total\" varchar(100)\n \"streak_start\" date\n \"streak_end\" date\n}\n\nTable \"participants_events\" {\n \"id\" int4 [not null, increment]\n \"participant_type\" varchar(100) [not null]\n \"participant_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"alignment\" varchar(100)\n \"score\" varchar(100)\n \"event_outcome\" varchar(100)\n \"rank\" int4\n}\n\nTable \"periods\" {\n \"id\" int4 [not null, increment]\n \"participant_event_id\" int4 [not null]\n \"period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"person_event_metadata\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"status\" varchar(100)\n \"health\" varchar(100)\n \"weight\" varchar(100)\n \"role_id\" int4\n \"position_id\" int4\n \"team_id\" int4\n \"lineup_slot\" int4\n \"lineup_slot_sequence\" int4\n}\n\nTable \"person_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"membership_type\" varchar(40) [not null]\n \"membership_id\" int4 [not null]\n \"role_id\" int4\n \"role_status\" varchar(40)\n \"phase_status\" varchar(40)\n \"uniform_number\" varchar(20)\n \"regular_position_id\" int4\n \"regular_position_depth\" varchar(40)\n \"height\" varchar(100)\n \"weight\" varchar(100)\n \"start_date_time\" timestamp\n \"start_season_id\" int4\n \"end_date_time\" timestamp\n \"end_season_id\" int4\n \"entry_reason\" varchar(40)\n \"exit_reason\" varchar(40)\n \"selection_level\" int4\n \"selection_sublevel\" int4\n \"selection_overall\" int4\n}\n\nTable \"persons\" {\n \"id\" int4 [not null, increment]\n \"person_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"gender\" varchar(20)\n \"birth_date\" varchar(30)\n \"death_date\" varchar(30)\n \"birth_location_id\" int4\n \"hometown_location_id\" int4\n \"residence_location_id\" int4\n \"death_location_id\" int4\n}\n\nTable \"persons_documents\" {\n \"person_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"persons_media\" {\n \"person_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"positions\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n 
\"abbreviation\" varchar(100) [not null]\n}\n\nTable \"publishers\" {\n \"id\" int4 [not null, increment]\n \"publisher_key\" varchar(100) [not null]\n \"publisher_name\" varchar(100)\n}\n\nTable \"roles\" {\n \"id\" int4 [not null, increment]\n \"role_key\" varchar(100) [not null]\n \"role_name\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"seasons\" {\n \"id\" int4 [not null, increment]\n \"season_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"league_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"sites\" {\n \"id\" int4 [not null, increment]\n \"site_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"soccer_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"goals_against_total\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"catches_punches\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_shootout_total\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"shutouts\" varchar(100)\n}\n\nTable \"soccer_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"minutes_elapsed\" varchar(100)\n \"period_minute_elapsed\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"soccer_foul_stats\" {\n \"id\" int4 [not null, increment]\n \"fouls_suffered\" varchar(100)\n \"fouls_commited\" varchar(100)\n \"cautions_total\" varchar(100)\n \"cautions_pending\" varchar(100)\n \"caution_points_total\" varchar(100)\n \"caution_points_pending\" varchar(100)\n \"ejections_total\" varchar(100)\n}\n\nTable \"soccer_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_total\" varchar(100)\n \"assists_game_winning\" varchar(100)\n \"assists_game_tying\" varchar(100)\n \"assists_overtime\" varchar(100)\n \"assists_total\" varchar(100)\n \"points\" varchar(100)\n \"shots_total\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_hit_frame\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_scored\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(40)\n \"shots_penalty_shot_percentage\" varchar(40)\n \"shots_shootout_taken\" varchar(40)\n \"shots_shootout_scored\" varchar(40)\n \"shots_shootout_missed\" varchar(40)\n \"shots_shootout_percentage\" varchar(40)\n \"giveaways\" varchar(40)\n \"offsides\" varchar(40)\n \"corner_kicks\" varchar(40)\n \"hat_tricks\" varchar(40)\n}\n\nTable \"standing_subgroups\" {\n \"id\" int4 [not null, increment]\n \"standing_id\" int4 [not null]\n \"affiliation_id\" int4 [not null]\n}\n\nTable \"standings\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"standing_type\" varchar(100)\n \"sub_season_id\" int4 [not null]\n \"last_updated\" varchar(100)\n \"duration_scope\" varchar(100)\n \"competition_scope\" varchar(100)\n \"competition_scope_id\" varchar(100)\n \"alignment_scope\" varchar(100)\n \"site_scope\" varchar(100)\n \"scoping_label\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"source\" varchar(100)\n}\n\nTable 
\"stats\" {\n \"id\" int4 [not null, increment]\n \"stat_repository_type\" varchar(100)\n \"stat_repository_id\" int4 [not null]\n \"stat_holder_type\" varchar(100)\n \"stat_holder_id\" int4\n \"stat_coverage_type\" varchar(100)\n \"stat_coverage_id\" int4\n \"context\" varchar(40) [not null]\n}\n\nTable \"sub_periods\" {\n \"id\" int4 [not null, increment]\n \"period_id\" int4 [not null]\n \"sub_period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"sub_seasons\" {\n \"id\" int4 [not null, increment]\n \"sub_season_key\" varchar(100) [not null]\n \"season_id\" int4 [not null]\n \"sub_season_type\" varchar(100) [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"team_american_football_stats\" {\n \"id\" int4 [not null, increment]\n \"yards_per_attempt\" varchar(100)\n \"average_starting_position\" varchar(100)\n \"timeouts\" varchar(100)\n \"time_of_possession\" varchar(100)\n \"turnover_ratio\" varchar(100)\n}\n\nTable \"team_phases\" {\n \"id\" int4 [not null, increment]\n \"team_id\" int4 [not null]\n \"start_season_id\" int4\n \"end_season_id\" int4\n \"affiliation_id\" int4 [not null]\n \"start_date_time\" varchar(100)\n \"end_date_time\" varchar(100)\n \"phase_status\" varchar(40)\n \"role_id\" int4\n}\n\nTable \"teams\" {\n \"id\" int4 [not null, increment]\n \"team_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"home_site_id\" int4\n}\n\nTable \"teams_documents\" {\n \"team_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"teams_media\" {\n \"team_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"tennis_action_points\" {\n \"id\" int4 [not null, increment]\n \"sub_period_id\" varchar(100)\n \"sequence_number\" varchar(100)\n \"win_type\" varchar(100)\n}\n\nTable \"tennis_action_volleys\" {\n \"id\" int4 [not null, increment]\n \"sequence_number\" varchar(100)\n \"tennis_action_points_id\" int4\n \"landing_location\" varchar(100)\n \"swing_type\" varchar(100)\n \"result\" varchar(100)\n \"spin_type\" varchar(100)\n \"trajectory_details\" varchar(100)\n}\n\nTable \"tennis_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"tennis_set\" varchar(100)\n \"game\" varchar(100)\n \"server_person_id\" int4\n \"server_score\" varchar(100)\n \"receiver_person_id\" int4\n \"receiver_score\" varchar(100)\n \"service_number\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"tennis_return_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"first_service_return_points_won\" varchar(100)\n \"first_service_return_points_won_pct\" varchar(100)\n \"second_service_return_points_won\" varchar(100)\n \"second_service_return_points_won_pct\" varchar(100)\n \"return_games_played\" varchar(100)\n \"return_games_won\" varchar(100)\n \"return_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_converted\" varchar(100)\n \"break_points_converted_pct\" varchar(100)\n}\n\nTable \"tennis_service_stats\" {\n \"id\" int4 [not null, increment]\n \"services_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"aces\" varchar(100)\n \"first_services_good\" varchar(100)\n \"first_services_good_pct\" varchar(100)\n \"first_service_points_won\" varchar(100)\n \"first_service_points_won_pct\" varchar(100)\n \"second_service_points_won\" varchar(100)\n \"second_service_points_won_pct\" varchar(100)\n \"service_games_played\" 
varchar(100)\n \"service_games_won\" varchar(100)\n \"service_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_saved\" varchar(100)\n \"break_points_saved_pct\" varchar(100)\n}\n\nTable \"wagering_moneylines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_odds_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"numerator\" varchar(100)\n \"denominator\" varchar(100)\n \"prediction\" varchar(100)\n \"payout_calculation\" varchar(100)\n \"payout_amount\" varchar(100)\n}\n\nTable \"wagering_runlines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"line_value\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_straight_spread_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_value\" varchar(100)\n \"line_value_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_total_score_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_over\" varchar(100)\n \"line_under\" varchar(100)\n \"total\" varchar(100)\n \"total_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"weather_conditions\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"temperature\" varchar(100)\n \"temperature_units\" varchar(40)\n \"humidity\" varchar(100)\n \"clouds\" varchar(100)\n \"wind_direction\" varchar(100)\n \"wind_velocity\" varchar(100)\n \"weather_code\" varchar(100)\n}\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/yugabyte/yugabyte-db/blob/master/sample/sportsdb_tables.sql" } } ================================================ FILE: tasks/postgres/easy/sports/create_performance_indexes/verify.py ================================================ """ Verification script for PostgreSQL Sports Task 3: Query Performance Optimization """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. 
    For Decimal types: allows 0.001 tolerance
    For other types: requires exact match
    """
    if len(actual_row) != len(expected_row):
        return False

    for actual, expected in zip(actual_row, expected_row):
        if isinstance(actual, Decimal) and isinstance(expected, Decimal):
            if abs(float(actual) - float(expected)) > 0.001:
                return False
        elif isinstance(actual, float) and isinstance(expected, float):
            if abs(actual - expected) > 0.001:
                return False
        elif actual != expected:
            return False
    return True


def get_connection_params() -> dict:
    """Get database connection parameters."""
    return {
        "host": os.getenv("POSTGRES_HOST", "localhost"),
        "port": int(os.getenv("POSTGRES_PORT", 5432)),
        "database": os.getenv("POSTGRES_DATABASE", "sports"),
        "user": os.getenv("POSTGRES_USERNAME", "postgres"),
        "password": os.getenv("POSTGRES_PASSWORD", "postgres")
    }


def verify_performance_optimization(conn) -> bool:
    """Verify that key performance optimization indexes have been implemented."""
    with conn.cursor() as cur:
        print("\n🔍 Checking for critical performance indexes...")

        # Check 1: participants_events.participant_id index (critical for subqueries)
        cur.execute("""
            SELECT indexname, indexdef
            FROM pg_indexes
            WHERE schemaname = 'public'
              AND tablename = 'participants_events'
              AND indexdef LIKE '%participant_id%'
        """)
        participant_indexes = cur.fetchall()
        has_participant_index = len(participant_indexes) > 0

        # Check 2: stats table optimization (critical for subquery filtering)
        cur.execute("""
            SELECT indexname, indexdef
            FROM pg_indexes
            WHERE schemaname = 'public'
              AND tablename = 'stats'
              AND indexdef LIKE '%stat_holder_type%'
              AND indexdef LIKE '%stat_holder_id%'
        """)
        stats_indexes = cur.fetchall()
        has_stats_index = len(stats_indexes) > 0

        # Report findings
        critical_indexes_found = 0
        if has_participant_index:
            print("✅ Found participant filtering index on participants_events.participant_id")
            critical_indexes_found += 1
        else:
            print("❌ Missing critical index on participants_events.participant_id")

        if has_stats_index:
            print("✅ Found subquery optimization index on stats table")
            critical_indexes_found += 1
        else:
            print("❌ Missing critical index on stats table")

        # Must have both critical indexes for this subquery-heavy query
        if critical_indexes_found >= 2:
            print(f"\n✅ Performance optimization: PASS ({critical_indexes_found}/2 critical indexes found)")
            return True
        else:
            print(f"\n❌ Performance optimization: FAIL ({critical_indexes_found}/2 critical indexes found)")
            print(" Create these critical indexes:")
            print(" - CREATE INDEX ON participants_events(participant_id);")
            print(" - CREATE INDEX ON stats(stat_holder_type, stat_holder_id);")
            return False


def main():
    """Main verification function."""
    print("=" * 50)
    print("Verifying Sports Task 3: Query Performance Optimization")
    print("=" * 50)

    # Get connection parameters
    conn_params = get_connection_params()
    if not conn_params["database"]:
        print("❌ No database specified")
        sys.exit(1)

    try:
        # Connect to database
        conn = psycopg2.connect(**conn_params)

        # Verify all components
        success = verify_performance_optimization(conn)
        conn.close()

        if success:
            print("\n🎉 Task verification: PASS")
            sys.exit(0)
        else:
            print("\n❌ Task verification: FAIL")
            sys.exit(1)

    except psycopg2.Error as e:
        print(f"❌ Database error: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Verification error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()


================================================
FILE: tasks/postgres/standard/chinook/customer_data_migration/description.md
================================================
Migrate customer
data from an acquired company to PostgreSQL using efficient bulk operations. ## Your Mission: Chinook Music Store has recently acquired "MelodyMart," a competing music retailer. Their customer database needs to be migrated into Chinook's PostgreSQL database. ## Migration Requirements: 1. **Process all customer records from the data table below** and migrate them into the `Customer` table 2. **Apply business logic during migration**: - Assign `CustomerID` values starting from the next available ID - Assign all customers to support representative with EmployeeId 3 - Set `Fax` field to NULL for all migrated customers 3. **Avoid individual INSERT statements** ## Customer Data to Migrate: | FirstName | LastName | Company | Address | City | State | Country | PostalCode | Phone | Email | |-----------|----------|---------|---------|------|-------|---------|------------|-------|--------| | Danielle | Johnson | Sanchez-Taylor | 819 Johnson Course | East William | AK | USA | 74064 | 386-3794 | danielle.johnson@sancheztaylor.com | | Katherine | Moore | Peterson-Moore | 16155 Roman Stream Suite 816 | New Kellystad | OK | USA | 25704 | 103-4131 | katherine_moore@petersonmoore.org | | Joshua | Reid | Martin-Kelly | 192 Frank Light Suite 835 | East Lydiamouth | MO | USA | 35594 | 139-5376 | joshua_reid@martinkelly.org | | Douglas | Taylor | Hoffman, Baker and Richards | 3287 Katelyn Wall Apt. 226 | South Patrickmouth | NC | USA | 33454 | 801-8451 | douglast@hoffmanbakerand.net | | Ryan | Chavez | Liu, Baker and Mason | 148 Eric Track | New Stephanie | NC | USA | 00575 | 957-0154 | r.chavez@liubakerandmaso.com | | Brian | Humphrey | Miller Group | 227 Joseph Well | Brandtside | WV | USA | 96174 | 346-5787 | brian.humphrey@millergroup.com | | John | Brown | Chapman and Sons | 10310 Jones Freeway | Elizabethborough | ND | USA | 17843 | 997-3763 | john.brown@chapmanandsons.com | | Collin | Jordan | Jenkins-Shields | 106 Mcbride Coves | East James | NV | USA | 18874 | 624-7317 | collin.jordan@jenkinsshields.com | | Brent | Kidd | Novak and Sons | 7736 Franklin Alley | Bakermouth | LA | USA | 55945 | 872-3430 | brent.kidd@novakandsons.com | | Julie | Brown | Woods, Calhoun and Schmidt | 121 Emma Freeway | Wilsonshire | IA | USA | 76381 | 909-1699 | julieb@woodscalhounand.net | | Sarah | Harris | Edwards, Baker and Anderson | 5107 Charles Forest Suite 251 | West Justin | NV | USA | 71701 | 498-0841 | s.harris@edwardsbakerand.com | | Joseph | Preston | Tran, Nelson and Jacobs | 48740 Cynthia Village Suite 005 | Lake Tina | GA | USA | 97655 | 786-8011 | j.preston@trannelsonandja.com | | Amy | Davenport | Tran, Jordan and Williams | 53315 Dickson Summit Apt. 322 | Johnsonmouth | WY | USA | 54465 | 342-1607 | a.davenport@tranjordanandwi.com | | James | Sellers | Torres-Pope | 03654 Tammy Harbors | Darlenefurt | TX | USA | 70783 | 501-4294 | james.sellers@torrespope.com | | Daniel | Hamilton | Hartman, Graham and Joyce | 9340 Smith Valley | West Ryan | TN | USA | 43780 | 951-4846 | danielh@hartmangrahaman.net | | Richard | Phillips | Lee Ltd | 299 Sullivan Village Apt. 
443 | Floydmouth | NH | USA | 58406 | 738-7214 | richardp@leeltd.net | | Clarence | Crane | Chambers and Sons | 00379 Stanley Roads | Lake Heather | NM | USA | 52884 | 320-1632 | clarence_crane@chambersandsons.org | | Brent | Wright | Bryant Group | 9868 Merritt Summit Suite 743 | Katiehaven | NM | USA | 82650 | 347-1434 | brentw@bryantgroup.net | | Luis | Fernandez | Hernandez Group | 316 Rivera Mountain | Brownchester | MS | USA | 77057 | 096-7054 | luis_fernandez@hernandezgroup.org | | Melissa | Ashley | Medina-Navarro | 3467 Paul Skyway | Ramseymouth | PW | USA | 17229 | 980-6990 | melissa.ashley@medinanavarro.com | | Dawn | Taylor | White-Green | 75564 King Common Suite 080 | Jeffreyland | WI | USA | 85927 | 003-3092 | d.taylor@whitegreen.com | | David | Caldwell | Gould, Marshall and Scott | 99124 Beth Inlet Suite 631 | North Heidi | ME | USA | 90188 | 919-0586 | davidc@gouldmarshallan.net | | Casey | Holland | Atkinson Group | 5726 Jessica Run | Christinaside | WI | USA | 63873 | 769-4531 | caseyh@atkinsongroup.net | | Nicole | Sanchez | Hudson-Barnett | 75273 Salinas Junctions Suite 948 | New Stacyland | IA | USA | 94882 | 678-3777 | nicole.sanchez@hudsonbarnett.com | | Christopher | Walker | Sanchez, Beck and Wood | 8557 Parker Fort Apt. 351 | East Javier | NJ | USA | 36742 | 989-4134 | c.walker@sanchezbeckandw.com | | Michael | Turner | Ferguson, Hill and Mccann | 271 Audrey Mountains Suite 752 | West Shelleyfort | DE | USA | 09065 | 671-9022 | michaelt@fergusonhilland.net | | Christopher | Wright | Duran, Obrien and Gibbs | 677 Dalton Meadow | Ashleyton | RI | USA | 97505 | 133-4123 | c.wright@duranobrienandg.com | | Andrea | Moore | Hayes-Wheeler | 34471 Sandra Turnpike Apt. 618 | Lake Edward | KY | USA | 19144 | 102-4994 | andrea_moore@hayeswheeler.org | | David | Barker | Powell, Nelson and Fernandez | 90659 Johnson Forks Apt. 490 | South April | NV | USA | 36959 | 296-7175 | david_barker@powellnelsonand.org | | Mathew | Santiago | Rivera Ltd | 6807 Leonard Islands Apt. 
680 | Gutierrezborough | NC | USA | 47920 | 977-0348 | m.santiago@riveraltd.com | | Sara | Kim | Washington, Johnson and Mccoy | 248 Andrea Course | Port Robin | NH | USA | 15897 | 274-8467 | sara_kim@washingtonjohns.org | | John | Arnold | Lee-Greene | 46584 Justin Hills | Grimesmouth | ND | USA | 63984 | 558-8675 | j.arnold@leegreene.com | | Tina | Allen | Hall-Rowe | 7662 Hanna Crossroad | Mollymouth | CT | USA | 69438 | 702-6217 | tinaa@hallrowe.net | | Matthew | Schwartz | Miller, Murphy and Craig | 7809 Jimmy Spur Suite 316 | Port Cynthiaville | NV | USA | 22306 | 400-5045 | matthews@millermurphyand.net | | Ryan | Sanchez | Knight-Sparks | 19693 Durham Divide | South Dana | NH | USA | 33967 | 074-8217 | ryans@knightsparks.net | | Vanessa | Evans | Vaughn-Bryant | 67136 Andrews Squares Suite 064 | New Michelleton | PW | USA | 79983 | 743-9533 | vanessae@vaughnbryant.net | | Erica | Le | Becker, Taylor and Davis | 7095 Christopher Hill | Julieburgh | ID | USA | 17823 | 858-8424 | erica_le@beckertaylorand.org | | Tammy | Phillips | Brock-Mcdonald | 36851 Smith Plain | South Miguelview | OR | USA | 50442 | 513-7098 | tammyp@brockmcdonald.net | | Rose | Walker | Reid Group | 612 Sophia Hollow Suite 113 | South Shawn | TN | USA | 97905 | 869-2617 | rose_walker@reidgroup.org | | Sheila | Ramirez | Wood, Ramos and Sampson | 58506 Lopez Crossing Suite 139 | North Kristinbury | DC | USA | 74501 | 318-3933 | sheilar@woodramosandsam.net | | Kim | Kramer | Smith, Garrison and Thomas | 421 David Knolls | New Mario | HI | USA | 35283 | 026-8117 | kim_kramer@smithgarrisonan.org | | Kimberly | Palmer | Hayes and Sons | 847 Bruce Neck | Simmonsville | NM | USA | 93876 | 711-5921 | k.palmer@hayesandsons.com | | Joshua | Schultz | Joseph, James and Harper | 8961 Melissa Run Apt. 673 | Morganmouth | MO | USA | 55025 | 156-5452 | joshua_schultz@josephjamesandh.org | | Carlos | Decker | Reynolds Ltd | 80988 Santiago Loop Suite 604 | Michaelshire | NY | USA | 28385 | 273-1585 | carlos.decker@reynoldsltd.com | | Kathryn | Andrews | Bruce-Villegas | 402 Park Inlet | Michaelburgh | VI | USA | 19277 | 961-2018 | k.andrews@brucevillegas.com | | Nicholas | Chavez | Wood Ltd | 910 Eric River Apt. 147 | Tuckermouth | MT | USA | 36305 | 381-5614 | nicholas_chavez@woodltd.org | | Alison | Parker | Foster PLC | 34324 Murphy Avenue | Burgessburgh | DC | USA | 50335 | 838-8516 | alison.parker@fosterplc.com | | Ryan | Stevens | Atkins PLC | 664 Richard Islands Apt. 975 | South Meganbury | NE | USA | 77685 | 681-6453 | ryans@atkinsplc.net | | Kimberly | Jones | Wilson, Hicks and Bullock | 2312 Gonzalez Rapids Apt. 127 | Webstershire | NV | USA | 89778 | 995-5271 | kimberly_jones@wilsonhicksandb.org | | Scott | Turner | Vargas-Bell | 7700 Decker Club | New Brookefurt | NH | USA | 76565 | 807-9359 | scott_turner@vargasbell.org | | Walter | Rosario | Garcia-Nolan | 182 John Mill Suite 889 | West Nathan | LA | USA | 51280 | 659-0515 | walter.rosario@garcianolan.com | | Angela | Hughes | Cummings-Douglas | 1925 Ponce Square | Andersonland | ME | USA | 73760 | 652-8168 | angelah@cummingsdouglas.net | | Andrew | Parker | Peterson Group | 22141 Ebony Wells | New Nicholas | GA | USA | 24204 | 927-0653 | andrew_parker@petersongroup.org | | Cheryl | Goodwin | Young-Allen | 59774 Shaw Manor Apt. 
392 | Brettfort | VI | USA | 49156 | 818-1412 | cherylg@youngallen.net | | Shannon | Palmer | Davis-Lozano | 0606 Young Common Suite 305 | Port Jennifermouth | WY | USA | 19643 | 204-7277 | shannon.palmer@davislozano.com | | Rebecca | Smith | Conley PLC | 43410 Robert Underpass Suite 117 | Lake Zacharybury | VT | USA | 19319 | 460-9539 | rebecca_smith@conleyplc.org | | Jacob | Barnett | Villegas, Jones and Fox | 7065 Burgess Knolls | West Johnville | WI | USA | 76772 | 520-5852 | jacob_barnett@villegasjonesan.org | | Tina | Mendoza | Cain Inc | 43030 Mahoney Passage Suite 874 | Port Deborahport | MI | USA | 06766 | 541-5667 | tina_mendoza@caininc.org | | Matthew | Lopez | Jimenez, Glass and Stone | 616 Amy Islands | North Markport | ME | USA | 58948 | 962-7570 | matthewl@jimenezglassand.net | | Christina | Graham | Whitney, Gould and Jones | 8202 Johnson Cliff Apt. 556 | New Ericmouth | MN | USA | 49261 | 719-2856 | christinag@whitneygouldand.net | | Debra | Wright | Johnson and Sons | 681 Hampton Squares Suite 394 | Gonzalezberg | PR | USA | 10207 | 727-1551 | debraw@johnsonandsons.net | | Patricia | York | Mckinney, Graves and Thompson | 313 Joel Park Apt. 589 | Tannerside | DC | USA | 80710 | 114-6786 | patricia_york@mckinneygravesa.org | | Madeline | Jones | Day-Cole | 89226 Marie Path Apt. 422 | Sarahbury | MI | USA | 68513 | 414-3842 | madelinej@daycole.net | | Christina | Davis | Jackson, David and Moore | 001 Stacy Trail Suite 396 | South Pamelaside | LA | USA | 84637 | 473-6471 | christina.davis@jacksondavidand.com | | Eric | Perry | Harris-Lawson | 556 Kathleen Passage Apt. 537 | West Shannonberg | CT | USA | 07133 | 469-6325 | ericp@harrislawson.net | | James | Moore | Owens, Koch and Jimenez | 8733 Williams Haven | Harperfort | LA | USA | 70846 | 016-2456 | jamesm@owenskochandjim.net | | Brandon | Williams | Lee, Tran and Jones | 499 David Court Suite 558 | Kariborough | PA | USA | 67232 | 680-0025 | brandon_williams@leetranandjones.org | | April | Hernandez | Taylor, Velazquez and Flores | 495 Erickson Hills Suite 055 | South Brandytown | PA | USA | 62706 | 499-3097 | a.hernandez@taylorvelazquez.com | | Alexandria | Griffith | Hernandez-Becker | 130 Edwards Drive | Vaughnchester | NY | USA | 80648 | 702-8385 | alexandria_griffith@hernandezbecker.org | | Alicia | Edwards | Stevens PLC | 549 Lee Gateway Suite 843 | Kellieborough | UT | USA | 92905 | 757-5844 | alicia.edwards@stevensplc.com | | Ashley | Daniels | Cardenas-Blevins | 0415 Douglas Summit | Lewisside | KY | USA | 74165 | 421-9933 | ashley.daniels@cardenasblevins.com | | Elizabeth | Schmidt | Hall, Garcia and Rivera | 20826 Woods Flats Suite 540 | Lake Audreyside | WA | USA | 95281 | 026-2067 | e.schmidt@hallgarciaandri.com | | Sharon | Hayden | Mcdowell-Smith | 4788 Small Dale | Nelsonville | MA | USA | 21799 | 742-0549 | s.hayden@mcdowellsmith.com | | Gregory | Chase | Wilcox-Robertson | 1227 Boyle Avenue | Patrickmouth | WV | USA | 35496 | 549-9045 | g.chase@wilcoxrobertson.com | | Bryan | Wilson | Moore-Parks | 145 Jeffrey Dale Suite 279 | Robertside | PW | USA | 62213 | 833-9187 | bryanw@mooreparks.net | | Christian | Elliott | Poole PLC | 822 Bond Mills | Lake Jamieshire | NM | USA | 12420 | 870-7286 | christian_elliott@pooleplc.org | | Anne | Hansen | Roman, Cummings and Foster | 391 Rodney Squares | New Virginialand | NJ | USA | 04660 | 462-2656 | anne_hansen@romancummingsan.org | | Molly | Knox | Miller-Brandt | 512 Rice Stream | Port Adam | AK | USA | 39608 | 786-8633 | molly_knox@millerbrandt.org | | 
Michael | Hill | Cannon, Johnson and Keller | 31190 Harper Squares | East Joyfurt | NV | USA | 31216 | 830-2843 | michaelh@cannonjohnsonan.net | | Barbara | Barton | Young-Walter | 4408 Connie Meadow | Williamsstad | SD | USA | 88495 | 685-6624 | barbara_barton@youngwalter.org | | Ivan | Medina | Atkinson LLC | 0866 Paul Glens | West Deborah | NV | USA | 49138 | 183-0469 | ivan.medina@atkinsonllc.com | | Morgan | Lopez | Ramsey, Hansen and Mendoza | 0331 Rocha Square Apt. 638 | Kimberlyfurt | NH | USA | 70447 | 544-5877 | morgan.lopez@ramseyhansenand.com | | Leah | Bowen | Rocha-Wood | 93204 Phillips Flat Suite 369 | South Andrea | TX | USA | 44746 | 477-7252 | l.bowen@rochawood.com | | Jennifer | Freeman | Mooney, Bernard and Warren | 006 Megan Fort | Lake Edwardborough | NY | USA | 60271 | 509-9770 | jennifer.freeman@mooneybernardan.com | | Amanda | Jenkins | Moreno LLC | 86211 John River Suite 546 | West Susanmouth | OK | USA | 32378 | 341-0166 | amanda_jenkins@morenollc.org | | Angela | Brown | Warner Inc | 5918 Jerry Ways Suite 401 | Rachelshire | TN | USA | 04813 | 250-3926 | angela.brown@warnerinc.com | | Kevin | Elliott | Davenport, Price and Mosley | 2185 Connor Fort Apt. 599 | Novakmouth | AK | USA | 83616 | 477-3586 | kevin_elliott@davenportpricea.org | | Jacob | Willis | Miller-Montgomery | 114 Norman Tunnel | Lake Peter | MN | USA | 14466 | 104-7541 | j.willis@millermontgomer.com | | Christopher | Jordan | Peters, Russell and Johnson | 199 Shields Bridge Suite 661 | New Adriana | TX | USA | 50404 | 224-4472 | christopher.jordan@petersrussellan.com | | Gary | Hill | Washington-Jones | 79937 Derek Avenue Suite 596 | Scottchester | GU | USA | 85833 | 924-5937 | garyh@washingtonjones.net | | Gregory | Sanders | Carter-Neal | 356 Velasquez Lock Suite 193 | Lake Katrina | AK | USA | 95818 | 737-4167 | g.sanders@carterneal.com | | Cynthia | Allen | Moore, Henderson and Bennett | 796 Stephens Turnpike Suite 891 | Port Johnstad | GA | USA | 85304 | 909-6561 | cynthia.allen@moorehendersona.com | | Corey | Walker | Stone, Carpenter and Johnston | 6798 Michael Burg Suite 146 | North Marieberg | MI | USA | 41381 | 573-8757 | corey.walker@stonecarpentera.com | | Samuel | Horton | Jones-Williams | 51238 Andrea Isle | Mullenbury | AS | USA | 53591 | 226-6093 | samuel_horton@joneswilliams.org | | Brittany | Price | Lewis, Ramirez and Padilla | 182 Nguyen Mount | West Emilyfort | NC | USA | 84270 | 596-9691 | brittanyp@lewisramirezand.net | | Michael | Ellis | Cervantes Ltd | 912 Wilson Inlet Apt. 252 | Barnesberg | OK | USA | 50794 | 627-8282 | michael_ellis@cervantesltd.org | | Keith | Lopez | Harvey-Glenn | 2368 Ortiz Overpass | Mckinneymouth | NM | USA | 22423 | 190-3404 | k.lopez@harveyglenn.com | | Amanda | Jackson | Cunningham-Barton | 819 Joseph Plains Suite 807 | South Curtis | MP | USA | 86179 | 340-7451 | amanda_jackson@cunninghambarto.org | | Michelle | Wilson | Clark Ltd | 962 Kristen Via Apt. 095 | Candiceburgh | MD | USA | 92782 | 449-4812 | michelle_wilson@clarkltd.org | | Samantha | Riddle | Martinez, Cline and Wright | 67294 Brooks Club Apt. 684 | Shawnfort | MD | USA | 76779 | 017-5186 | s.riddle@martinezclinean.com | | Tammy | Summers | Adams-Clayton | 929 Kramer Springs Apt. 
017 | North Sarahburgh | NV | USA | 60337 | 063-2424 | tammy.summers@adamsclayton.com | | Diamond | Wright | Beck-Banks | 4361 Aaron Neck | East Brittneyhaven | TX | USA | 58836 | 005-1627 | diamond.wright@beckbanks.com | | Jeremy | Davis | Garcia LLC | 62218 Chelsey Expressway Suite 532 | Jensenmouth | VI | USA | 28975 | 112-1965 | jeremy_davis@garciallc.org | | Leonard | Taylor | Newman-Wright | 043 Julie Hill Apt. 376 | East Victorland | NC | USA | 02082 | 552-6965 | l.taylor@newmanwright.com | | Kathryn | Best | Smith Inc | 3006 Fuller Parkway | Hendersonfurt | CO | USA | 84457 | 889-2414 | kathryn.best@smithinc.com | | William | Harris | Herrera Group | 6303 Sandy Crescent | Salazarton | ME | USA | 87805 | 210-2027 | williamh@herreragroup.net | | Alexandra | Logan | Green, Watson and Brady | 105 Nelson Circles Suite 917 | Dixonton | NM | USA | 74803 | 252-4191 | a.logan@greenwatsonandb.com | | Joyce | Smith | Sanchez Group | 2208 Walker Gateway Suite 541 | Davidton | HI | USA | 29754 | 806-1744 | joyces@sanchezgroup.net | | Christopher | Bryant | Gonzalez-Elliott | 937 Vargas Park Apt. 832 | South Andrewside | MI | USA | 83855 | 050-6413 | c.bryant@gonzalezelliott.com | | Robert | Woodward | Dawson Inc | 86571 William Route | Jonesshire | AR | USA | 57515 | 234-4565 | robertw@dawsoninc.net | | Shawn | Hall | Taylor PLC | 12775 Martinez Knolls | South Kyle | KS | USA | 16218 | 124-9035 | s.hall@taylorplc.com | | Christopher | Wright | Foster-Williams | 2067 Cody Cove Apt. 100 | East James | MO | USA | 49291 | 199-4101 | c.wright@fosterwilliams.com | | Rachel | Ramos | Davis LLC | 70296 Crawford Light | Thompsonborough | PW | USA | 25031 | 447-2099 | r.ramos@davisllc.com | | Deborah | Porter | Mendoza, Miller and Reyes | 83806 Castillo Tunnel Suite 598 | Paulburgh | AK | USA | 42296 | 930-4078 | deborahp@mendozamilleran.net | | Katie | Key | Garcia Ltd | 8039 Kelly Villages | East Joel | MD | USA | 97245 | 590-5992 | k.key@garcialtd.com | | Mary | Cochran | Weaver-Thompson | 03930 Smith Ridges | Port David | VT | USA | 23761 | 500-2921 | maryc@weaverthompson.net | | Susan | Brooks | Foster, Garcia and Turner | 67528 Walker Radial | South Kurt | UT | USA | 39103 | 220-9690 | s.brooks@fostergarciaand.com | | Carrie | Mccall | Walker, Cunningham and Zuniga | 1355 Daisy Corners | Seanview | IL | USA | 33208 | 154-1006 | carrie_mccall@walkercunningha.org | | Jessica | Costa | Snyder-Gray | 79327 Lauren Bypass Suite 054 | North Matthewfurt | GA | USA | 96443 | 181-5997 | jessica.costa@snydergray.com | | Ryan | Valdez | Preston, Moore and Garcia | 68844 Young Causeway | Armstrongfort | FL | USA | 07645 | 506-1497 | r.valdez@prestonmooreand.com | | Collin | Clark | Carter, Miller and Anthony | 7741 Lopez Light Suite 270 | Scottview | IN | USA | 35701 | 902-1158 | collin_clark@cartermillerand.org | | Tara | Lawrence | Brown, Hughes and Mills | 374 Ralph Walk Apt. 898 | North Stacy | NV | USA | 23160 | 233-2061 | tara_lawrence@brownhughesandm.org | | James | Carson | Flowers LLC | 116 Arnold Walks Suite 870 | Rodriguezberg | FL | USA | 74765 | 991-1914 | jamesc@flowersllc.net | | Natalie | Baker | Washington, Lynch and Johnson | 2996 Randy Isle Apt. 074 | Andrewport | ME | USA | 37246 | 713-2475 | natalieb@washingtonlynch.net | | Jessica | Jacobs | Lopez and Sons | 785 Zachary Estate Apt. 486 | Port Melissa | FM | USA | 75038 | 023-3030 | jessica_jacobs@lopezandsons.org | | Brent | Ward | Hill Group | 103 Burns Mission Apt. 
798 | Maxview | WA | USA | 90790 | 140-6029 | b.ward@hillgroup.com | | Mercedes | Holland | Clark, Pearson and Palmer | 2290 Johnny Valley | Jenniferview | NE | USA | 49846 | 574-3748 | mercedes_holland@clarkpearsonand.org | | Breanna | Smith | Levy, Franco and Hoffman | 1715 Davidson Wall Suite 443 | New Kathy | MH | USA | 07942 | 965-2074 | breannas@levyfrancoandho.net | | Rebecca | Sullivan | Johnson, Erickson and Armstrong | 3875 Bruce Ville | West Connor | DC | USA | 97614 | 482-5135 | r.sullivan@johnsonerickson.com | | Julie | Parker | Watson-Richards | 70999 Thomas Fields Apt. 684 | Brownberg | DC | USA | 26754 | 569-7252 | julie.parker@watsonrichards.com | | Tony | Welch | Edwards Inc | 4329 Tracy Track | East Christinachester | MO | USA | 56734 | 760-0835 | tony.welch@edwardsinc.com | | Patricia | Sherman | Lee, Rhodes and Sims | 54216 Jackson View | West Stacymouth | VA | USA | 68696 | 985-6257 | patricias@leerhodesandsim.net | | Karen | Martin | Smith-Walker | 09821 Dawson Turnpike | South Nancyview | WI | USA | 70589 | 909-0100 | karen.martin@smithwalker.com | | Robert | James | King, Miles and Harris | 6184 Robert Cove | West Danielville | NM | USA | 26538 | 934-8356 | robertj@kingmilesandhar.net | | Ethan | Kelley | Watts Group | 00119 Hernandez Course Apt. 143 | Hintonport | KS | USA | 61354 | 012-0455 | ethan_kelley@wattsgroup.org | | Joanna | Davis | Smith and Sons | 5794 Nathan Junctions | North Richard | NH | USA | 36130 | 676-2120 | j.davis@smithandsons.com | | Dale | Pruitt | Pham-Gregory | 659 Michelle Villages | South Samantha | DE | USA | 54408 | 701-4508 | d.pruitt@phamgregory.com | | Tiffany | Santiago | Stone-Watts | 3756 Mary Point | North Dawnburgh | NY | USA | 62011 | 721-7535 | tiffanys@stonewatts.net | | Brent | Walker | Gray, Montoya and Miller | 717 Stewart Parks Apt. 166 | New Andrealand | WY | USA | 79695 | 948-8375 | brentw@graymontoyaandm.net | | Marcia | Velasquez | Rivera-Saunders | 571 Katherine Forges Apt. 554 | Jacquelineton | MH | USA | 22017 | 726-1493 | m.velasquez@riverasaunders.com | | David | Phelps | Bryant and Sons | 60917 Barrett Parkways Apt. 708 | New Savannahshire | NJ | USA | 67129 | 292-2169 | davidp@bryantandsons.net | | William | Cruz | Moon, Farmer and Hill | 7226 Cameron Plaza Suite 833 | New Jennifer | TX | USA | 45759 | 228-8515 | william_cruz@moonfarmerandhi.org | | Brandi | Bender | Butler, Adkins and Skinner | 0810 Thomas Skyway Apt. 342 | Francesberg | MP | USA | 08631 | 438-0571 | b.bender@butleradkinsand.com | | Julia | Hoffman | Dixon Ltd | 066 Frye Spur Suite 800 | Jamesmouth | MP | USA | 30064 | 598-9334 | julia_hoffman@dixonltd.org | | Gregory | Fleming | Rivers Ltd | 0648 Anderson Prairie | Adammouth | VT | USA | 20791 | 025-9094 | gregory_fleming@riversltd.org | | Kristy | Pierce | Bowers LLC | 81826 Davis Forges | Lake Martin | MN | USA | 38980 | 398-7801 | kristyp@bowersllc.net | | Sean | Conway | Sellers, Sanchez and Williams | 1648 Johnson Path Suite 887 | Williamsborough | MD | USA | 67858 | 112-8801 | s.conway@sellerssancheza.com | | Ellen | Ayala | Coleman, Garcia and Medina | 120 Love Camp Apt. 102 | Angelashire | GU | USA | 30338 | 466-7665 | ellen.ayala@colemangarciaan.com | | Perry | Wilson | May PLC | 901 Reilly Coves | Kristinport | PA | USA | 11839 | 476-6072 | p.wilson@mayplc.com | | Derek | Myers | Phillips, Walters and Evans | 88210 Ashley Lock Apt. 
435 | South Rebecca | PR | USA | 67682 | 222-3943 | derek.myers@phillipswalters.com | | Howard | Marsh | York PLC | 814 John Flat Suite 552 | North Justin | CA | USA | 25863 | 577-5949 | h.marsh@yorkplc.com | | Ariana | Diaz | Benjamin-Jackson | 36452 Humphrey Mountain Suite 547 | East Debbieland | MP | USA | 37281 | 283-4110 | ariana.diaz@benjaminjackson.com | | Lisa | Riley | Lewis, Johnson and Green | 256 Patricia Radial Suite 278 | South Michaeltown | TN | USA | 31811 | 928-2722 | l.riley@lewisjohnsonand.com | | Jill | Webb | Williams-Juarez | 45303 Hughes Motorway | North Tinamouth | CT | USA | 92741 | 844-9892 | jill_webb@williamsjuarez.org | | Desiree | Diaz | Villanueva, Miller and King | 655 Sparks Rapids | New Nicolemouth | GA | USA | 30646 | 184-3222 | desireed@villanuevamille.net | | Carolyn | Montoya | Hall, Shepherd and Cortez | 773 Deborah Loop Apt. 302 | East Crystal | AZ | USA | 75509 | 202-4286 | carolyn.montoya@hallshepherdand.com | | Natalie | Luna | Valentine-Robinson | 2369 Laura View Apt. 984 | Lake Gina | NH | USA | 78689 | 913-6621 | natalie.luna@valentinerobins.com | | James | Heath | Cohen, Serrano and Jacobs | 9908 Christopher Shoals | New Amber | AL | USA | 89441 | 686-5086 | j.heath@cohenserranoand.com | | Shawna | Olson | Bell-Ballard | 2473 Justin Wells | Scotttown | VT | USA | 97972 | 098-1806 | s.olson@bellballard.com | | Gwendolyn | Stewart | Rodriguez-Simmons | 8695 Braun Locks Apt. 688 | Whiteside | OH | USA | 63908 | 449-5621 | g.stewart@rodriguezsimmon.com | | Sean | Lyons | Garcia PLC | 8902 Oconnell Avenue Apt. 279 | Davisview | IN | USA | 49107 | 190-6698 | seanl@garciaplc.net | | Jennifer | Harper | Bowman Group | 84309 Christina Spring | West Johntown | GA | USA | 11883 | 465-6693 | jennifer.harper@bowmangroup.com | | Jillian | Jones | Dunn Ltd | 4393 Spears Ports Apt. 426 | New Charlesport | MA | USA | 15837 | 848-9476 | jillian_jones@dunnltd.org | | Kayla | Todd | Maldonado-Mosley | 1416 Erica Forks | Robertstad | NC | USA | 70709 | 043-4165 | kayla.todd@maldonadomosley.com | | Angela | White | Gomez-Shannon | 37333 Clark Flats Apt. 952 | North Samanthafort | RI | USA | 01369 | 807-5957 | angelaw@gomezshannon.net | | Travis | Joyce | Ramirez, Walker and Ray | 678 Wayne Lock | South Tiffany | UT | USA | 68423 | 750-0369 | travis.joyce@ramirezwalkeran.com | | Mark | Salazar | Lopez-Baker | 9552 Coleman Manor Suite 564 | Whiteberg | OK | USA | 90417 | 314-3866 | m.salazar@lopezbaker.com | | Dustin | Haley | Kennedy Inc | 7288 Floyd Hills | Annashire | AR | USA | 52720 | 120-3471 | dustin_haley@kennedyinc.org | | Julie | Green | Castro-Frederick | 0615 Barbara Run Apt. 455 | Hamptonmouth | FM | USA | 10778 | 694-7225 | julie_green@castrofrederick.org | | Crystal | Duncan | Miller LLC | 5449 Nelson Mills | Juliehaven | NV | USA | 54763 | 220-2341 | c.duncan@millerllc.com | | Garrett | Garcia | Zuniga Group | 68114 Christopher Loaf | Jeromeport | NV | USA | 82615 | 228-2005 | garrettg@zunigagroup.net | | Michelle | Mcdonald | Donovan, Dunn and Taylor | 979 Mills Route | Reginafort | ND | USA | 30271 | 174-5642 | michellem@donovandunnandt.net | | Alex | Mills | Cooper Group | 774 Katie Union | Carlatown | OH | USA | 49475 | 368-6632 | alex_mills@coopergroup.org | | Maria | Walker | Henderson and Sons | 8463 Ian Highway Apt. 
797 | Jackiefort | ID | USA | 42528 | 020-8021 | mariaw@hendersonandson.net | | Joseph | Espinoza | Smith, Davis and Smith | 6475 Terry Bypass | Christopherberg | AR | USA | 35432 | 618-7234 | joseph_espinoza@smithdavisandsm.org | | Maria | Martinez | Wright, Wise and Ramos | 71837 Maldonado Inlet | Ericton | WA | USA | 72535 | 814-7435 | maria.martinez@wrightwiseandra.com | | Michelle | Robinson | Young Group | 24916 Albert Canyon Suite 925 | East Ericland | TX | USA | 81588 | 500-5281 | m.robinson@younggroup.com | | Tony | Stewart | Kramer, Sherman and Trujillo | 306 Ramsey Glen Apt. 778 | Amyfort | ID | USA | 74779 | 285-5749 | t.stewart@kramershermanan.com | | Casey | Moore | Weiss-Weaver | 86209 Parsons Garden Suite 186 | New Felicia | WI | USA | 72782 | 294-5651 | casey.moore@weissweaver.com | | Alexandra | Jones | White Inc | 73109 Barrett Pine | Brandonbury | PA | USA | 94590 | 103-7170 | alexandraj@whiteinc.net | | Angela | Hurley | Short-Bauer | 480 Mary Club | New Colton | VA | USA | 30780 | 863-3839 | a.hurley@shortbauer.com | | Angela | Grant | Garcia, Fowler and Howard | 612 Andrea Parkways Suite 289 | Mahoneymouth | OH | USA | 43054 | 566-5939 | a.grant@garciafowlerand.com | | Nicholas | Pierce | King, Nixon and West | 04908 Victoria Hollow Apt. 433 | Andrewview | PW | USA | 73070 | 889-9210 | nicholas_pierce@kingnixonandwes.org | | Michael | Taylor | Preston-Wright | 1969 Jessica Stream Suite 727 | New Dawnton | VA | USA | 76035 | 610-5566 | michael.taylor@prestonwright.com | | Molly | Perez | Atkinson, Mcfarland and Walters | 48058 Mark Square Apt. 206 | Mullinsshire | NY | USA | 12308 | 364-6225 | molly.perez@atkinsonmcfarla.com | | Thomas | Mcgee | Ross, Miller and Shaw | 78376 Ann Street | East Charles | WI | USA | 56870 | 591-1665 | thomasm@rossmillerandsh.net | | James | Cooper | Johnson, Torres and Huerta | 270 James Landing Apt. 
110 | New Sara | VI | USA | 38208 | 051-4770 | jamesc@johnsontorresan.net | | Jason | Medina | Payne LLC | 206 Jonathan Circle Suite 394 | South Dianatown | CA | USA | 51441 | 451-0463 | jason_medina@paynellc.org | | William | Mckinney | Washington-Harper | 38780 John Pines | Matthewfurt | WA | USA | 21079 | 055-5438 | williamm@washingtonharpe.net | | Lisa | Garrett | Zamora-Briggs | 432 Prince Shoals | North Jessica | NC | USA | 89367 | 936-3926 | lisag@zamorabriggs.net | | Renee | Murphy | Anderson, Delgado and Carpenter | 48262 Lonnie Point | East Lonnieberg | VA | USA | 04365 | 566-4742 | r.murphy@andersondelgado.com | | Daniel | Lopez | Jensen, Obrien and Salazar | 05172 Joseph Landing | Port Paul | NJ | USA | 18525 | 233-0604 | daniel_lopez@jensenobrienand.org | | Jeffrey | Powers | Todd Inc | 9757 Ronald Trail | New Jillfurt | VA | USA | 41513 | 699-9880 | jeffrey.powers@toddinc.com | | Shannon | Wilcox | Rich and Sons | 086 James Mill Suite 447 | South Kelly | PW | USA | 07650 | 827-7181 | s.wilcox@richandsons.com | | Kimberly | Pace | Payne, Long and Morris | 79371 Nguyen Run | Lake Jessica | CO | USA | 15464 | 751-8689 | k.pace@paynelongandmor.com | | Nicholas | James | Barr PLC | 22064 Cross Mission | Courtneyville | MH | USA | 17746 | 309-4077 | nicholas_james@barrplc.org | | Amy | Smith | Young-Chapman | 6719 John Plaza Suite 983 | East Eddiestad | AZ | USA | 19555 | 099-4510 | amy.smith@youngchapman.com | | Robert | Thompson | Mitchell, Guerrero and Graves | 9501 Morris Light | Port Ronaldside | CA | USA | 38883 | 721-4586 | r.thompson@mitchellguerrer.com | | Heather | Salazar | Duncan Ltd | 9469 Green Ports | Sarashire | NM | USA | 68619 | 772-9343 | heather.salazar@duncanltd.com | | David | Marshall | Mclaughlin and Sons | 0558 Alex Flats Suite 414 | Williammouth | WI | USA | 01304 | 155-6990 | d.marshall@mclaughlinandso.com | ================================================ FILE: tasks/postgres/standard/chinook/customer_data_migration/meta.json ================================================ { "task_id": "customer_data_migration", "task_name": "Customer Data Migration", "category_id": "chinook", "category_name": "Chinook", "description": "Migrate customer data from acquired company MelodyMart into Chinook database using bulk operations and business logic.", "author": "Lingxiao Du", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "data migration", "transactional operations" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" 
varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef \"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql" } } ================================================ FILE: tasks/postgres/standard/chinook/customer_data_migration/verify.py ================================================ """ Verification script for PostgreSQL Task 2: Customer Data Migration """ import os import sys import psycopg2 import pickle def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": 
os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def load_expected_customers(): """Load the expected customer data from pickle file.""" import os script_dir = os.path.dirname(os.path.abspath(__file__)) pkl_path = os.path.join(script_dir, 'customer_data.pkl') try: with open(pkl_path, 'rb') as f: return pickle.load(f) except FileNotFoundError: print(f"❌ customer_data.pkl not found at {pkl_path}. Please generate customer data first.") return None except Exception as e: print(f"❌ Error loading customer data: {e}") return None def verify_migrated_customers(conn, expected_customers) -> bool: """Verify migrated customers by comparing with expected data as sets.""" with conn.cursor() as cur: # Get all customers with ID > 59 (the migrated ones) cur.execute(''' SELECT "FirstName", "LastName", "Company", "Address", "City", "State", "Country", "PostalCode", "Phone", "Email", "SupportRepId", "Fax" FROM "Customer" WHERE "CustomerId" > 59 ''') actual_customers = cur.fetchall() if len(actual_customers) != len(expected_customers): print(f"❌ Expected {len(expected_customers)} migrated customers, found {len(actual_customers)}") return False # Convert expected customers to tuples for set comparison expected_tuples = set() for expected in expected_customers: expected_tuple = ( expected['FirstName'], expected['LastName'], expected['Company'], expected['Address'], expected['City'], expected['State'], expected['Country'], expected['PostalCode'], expected['Phone'], expected['Email'], 3, None # SupportRepId=3, Fax=None ) expected_tuples.add(expected_tuple) # Convert actual customers to set with proper type conversion actual_tuples = set() for row in actual_customers: # Convert all fields to strings for consistent comparison actual_tuple = ( str(row[0]) if row[0] is not None else '', # FirstName str(row[1]) if row[1] is not None else '', # LastName str(row[2]) if row[2] is not None else '', # Company str(row[3]) if row[3] is not None else '', # Address str(row[4]) if row[4] is not None else '', # City str(row[5]) if row[5] is not None else '', # State str(row[6]) if row[6] is not None else '', # Country str(row[7]) if row[7] is not None else '', # PostalCode str(row[8]) if row[8] is not None else '', # Phone str(row[9]) if row[9] is not None else '', # Email int(row[10]) if row[10] is not None else None, # SupportRepId row[11] # Fax (should be None) ) actual_tuples.add(actual_tuple) # Check if sets are equal if actual_tuples != expected_tuples: missing_in_actual = expected_tuples - actual_tuples extra_in_actual = actual_tuples - expected_tuples print(f"❌ Customer data sets don't match!") if missing_in_actual: print(f" Missing {len(missing_in_actual)} expected customers") for missing in list(missing_in_actual)[:3]: # Show first 3 print(f" Missing: {missing[0]} {missing[1]} - {missing[2]}") if len(missing_in_actual) > 3: print(f" ... and {len(missing_in_actual) - 3} more") if extra_in_actual: print(f" Found {len(extra_in_actual)} unexpected customers") for extra in list(extra_in_actual)[:3]: # Show first 3 print(f" Extra: {extra[0]} {extra[1]} - {extra[2]}") if len(extra_in_actual) > 3: print(f" ... 
and {len(extra_in_actual) - 3} more") return False print(f"✅ All {len(expected_customers)} customers migrated correctly") print(f"✅ All customers assigned to SupportRepId 3") print(f"✅ All customers have Fax field set to NULL") print(f"✅ Customer data sets match exactly (order-independent)") return True def main(): """Main verification function.""" print("=" * 60) print("Verifying Customer Data Migration Task") print("=" * 60) # Load expected customer data expected_customers = load_expected_customers() if not expected_customers: sys.exit(1) print(f"Loaded {len(expected_customers)} expected customer records") # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify migration success = verify_migrated_customers(conn, expected_customers) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/chinook/employee_hierarchy_management/description.md ================================================ Manage employee hierarchy and customer assignments through systematic CRUD operations. ## Your Mission: Chinook needs to reorganize their employee structure and reassign customer relationships. Complete a series of precise database modifications to update the employee hierarchy. ## Tasks to Complete: ### 1. **INSERT: Add New Employees** Insert exactly 2 new employees into the Employee table: - EmployeeId: 9, FirstName: 'Sarah', LastName: 'Johnson', Title: 'Sales Support Agent', ReportsTo: 2, BirthDate: '1985-03-15', HireDate: '2009-01-10', Address: '123 Oak Street', City: 'Calgary', State: 'AB', Country: 'Canada', PostalCode: 'T2P 5G3', Phone: '+1 (403) 555-0123', Fax: '+1 (403) 555-0124', Email: 'sarah.johnson@chinookcorp.com' - EmployeeId: 10, FirstName: 'Mike', LastName: 'Chen', Title: 'Sales Support Agent', ReportsTo: 2, BirthDate: '1982-08-22', HireDate: '2009-01-10', Address: '456 Pine Ave', City: 'Calgary', State: 'AB', Country: 'Canada', PostalCode: 'T2P 5G4', Phone: '+1 (403) 555-0125', Fax: '+1 (403) 555-0126', Email: 'mike.chen@chinookcorp.com' ### 2. **UPDATE: Modify Existing Employee Information** - Change Andrew Adams (EmployeeId = 1) title from 'General Manager' to 'CEO' - Update Nancy Edwards (EmployeeId = 2) phone number to '+1 (403) 555-9999' - Change all employees with Title = 'IT Staff' to have Title = 'IT Specialist' ### 3. **UPDATE: Reassign Some Customers to New Employees** - Update customers with CustomerId 1, 2, 3 to have SupportRepId = 9 (Sarah Johnson) - Update customers with CustomerId 4, 5, 6 to have SupportRepId = 10 (Mike Chen) ### 4. **UPDATE: Reorganize Reporting Structure** - Change Sarah Johnson (EmployeeId = 9) to report to Andrew Adams (EmployeeId = 1) instead of Nancy Edwards - Change Mike Chen (EmployeeId = 10) to also report to Andrew Adams (EmployeeId = 1) ### 5. 
**INSERT: Create Employee Performance Table** Create a new table called `employee_performance`: - `employee_id` (integer, foreign key to Employee) - `customers_assigned` (integer) - `performance_score` (decimal) Insert records for employees 9 and 10 by calculating their actual customer assignments: - Sarah Johnson: calculate actual number of customers assigned to her, performance score 4.5 - Mike Chen: calculate actual number of customers assigned to him, performance score 4.2 ### 6. **DELETE: Remove IT Department Employee** - Delete Robert King (EmployeeId = 7) from the Employee table - Before deletion, handle all relationships: - Find who Robert reports to and reassign any employees who report to Robert to report to Robert's manager instead - Find all customers assigned to Robert as their support rep and reassign them to Robert's manager ### 7. **UPDATE: Promote Remaining IT Staff** - Promote Laura Callahan (EmployeeId = 8) from 'IT Specialist' to 'Senior IT Specialist' - Update her salary information by adding a new column `salary` to Employee table (decimal type) - Set Laura's salary to 75000.00 and all other employees to 50000.00 ### 8. **Final Verification Query** Execute this exact query to verify all changes: ```sql SELECT COUNT(*) as total_employees, COUNT(CASE WHEN "Title" = 'CEO' THEN 1 END) as ceo_count, COUNT(CASE WHEN "Title" = 'IT Specialist' THEN 1 END) as it_specialist_count, COUNT(CASE WHEN "ReportsTo" = 1 THEN 1 END) as reports_to_ceo FROM "Employee"; ``` Expected result: total_employees = 9, ceo_count = 1, it_specialist_count = 0, reports_to_ceo = 4 ## Business Rules: * Use exact EmployeeId values as specified * Maintain referential integrity between Employee and Customer tables * All phone numbers must include country code format * Email addresses must follow the pattern firstname.lastname@chinookcorp.com ## Expected Outcome: The database should have exactly 9 employees remaining once Robert King is deleted, with the new hierarchy structure in place and customer assignments updated accordingly.
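For illustration, here is a hedged sketch of one SQL sequence that covers steps 1-7. It assumes the usual quoted Chinook identifiers used throughout this task and is not the only valid ordering; any equivalent sequence that reaches the same final state is acceptable.

```sql
-- Step 1: add the two new Sales Support Agents (values from the task spec).
INSERT INTO "Employee" ("EmployeeId", "LastName", "FirstName", "Title", "ReportsTo",
                        "BirthDate", "HireDate", "Address", "City", "State", "Country",
                        "PostalCode", "Phone", "Fax", "Email")
VALUES
  (9,  'Johnson', 'Sarah', 'Sales Support Agent', 2, '1985-03-15', '2009-01-10',
   '123 Oak Street', 'Calgary', 'AB', 'Canada', 'T2P 5G3',
   '+1 (403) 555-0123', '+1 (403) 555-0124', 'sarah.johnson@chinookcorp.com'),
  (10, 'Chen', 'Mike', 'Sales Support Agent', 2, '1982-08-22', '2009-01-10',
   '456 Pine Ave', 'Calgary', 'AB', 'Canada', 'T2P 5G4',
   '+1 (403) 555-0125', '+1 (403) 555-0126', 'mike.chen@chinookcorp.com');

-- Steps 2-4: title/phone updates, customer reassignments, new reporting lines.
UPDATE "Employee" SET "Title" = 'CEO' WHERE "EmployeeId" = 1;
UPDATE "Employee" SET "Phone" = '+1 (403) 555-9999' WHERE "EmployeeId" = 2;
UPDATE "Employee" SET "Title" = 'IT Specialist' WHERE "Title" = 'IT Staff';
UPDATE "Customer" SET "SupportRepId" = 9  WHERE "CustomerId" IN (1, 2, 3);
UPDATE "Customer" SET "SupportRepId" = 10 WHERE "CustomerId" IN (4, 5, 6);
UPDATE "Employee" SET "ReportsTo" = 1 WHERE "EmployeeId" IN (9, 10);

-- Step 5: performance table driven by the actual assignment counts.
CREATE TABLE employee_performance (
  employee_id        integer REFERENCES "Employee" ("EmployeeId"),
  customers_assigned integer,
  performance_score  decimal
);
INSERT INTO employee_performance (employee_id, customers_assigned, performance_score)
SELECT e."EmployeeId",
       (SELECT COUNT(*) FROM "Customer" c WHERE c."SupportRepId" = e."EmployeeId"),
       CASE e."EmployeeId" WHEN 9 THEN 4.5 ELSE 4.2 END
FROM "Employee" e
WHERE e."EmployeeId" IN (9, 10);

-- Step 6: rewire Robert King's relationships to his manager, then delete him.
UPDATE "Employee"
SET "ReportsTo" = (SELECT "ReportsTo" FROM "Employee" WHERE "EmployeeId" = 7)
WHERE "ReportsTo" = 7;
UPDATE "Customer"
SET "SupportRepId" = (SELECT "ReportsTo" FROM "Employee" WHERE "EmployeeId" = 7)
WHERE "SupportRepId" = 7;
DELETE FROM "Employee" WHERE "EmployeeId" = 7;

-- Step 7: promotion plus the new salary column.
ALTER TABLE "Employee" ADD COLUMN salary decimal;
UPDATE "Employee" SET salary = 50000.00;
UPDATE "Employee" SET salary = 75000.00, "Title" = 'Senior IT Specialist' WHERE "EmployeeId" = 8;
```

Step 8's verification query is already given verbatim above; running it after a sequence like this should return the expected counts.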
================================================ FILE: tasks/postgres/standard/chinook/employee_hierarchy_management/meta.json ================================================ { "task_id": "employee_hierarchy_management", "task_name": "Employee Hierarchy Management", "category_id": "chinook", "category_name": "Chinook", "description": "Reorganize employee structure through CRUD operations including inserts, updates, deletes, and customer reassignments.", "author": "Lingxiao Du", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "data migration", "schema design", "transactional operations" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: 
\"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef \"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql" } } ================================================ FILE: tasks/postgres/standard/chinook/employee_hierarchy_management/verify.py ================================================ """ Verification script for PostgreSQL Task 3: Employee Hierarchy Management """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.01 tolerance For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, Decimal): if abs(float(actual) - float(expected)) > 0.01: return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_employee_count_and_titles(conn) -> bool: """Verify the final employee count and title changes.""" with conn.cursor() as cur: # Check the final verification query results cur.execute(""" SELECT COUNT(*) as total_employees, COUNT(CASE WHEN "Title" = 'CEO' THEN 1 END) as ceo_count, COUNT(CASE WHEN "Title" = 'IT Specialist' THEN 1 END) as it_specialist_count, COUNT(CASE WHEN "ReportsTo" = 1 THEN 1 END) as reports_to_ceo FROM "Employee" """) result = cur.fetchone() total_employees, ceo_count, it_specialist_count, reports_to_ceo = result # Expected: total_employees = 9, ceo_count = 1, it_specialist_count = 1, reports_to_ceo = 4 if total_employees != 9: print(f"❌ Expected 9 total employees, got {total_employees}") return False if ceo_count != 1: print(f"❌ Expected 1 CEO, got {ceo_count}") return False if it_specialist_count != 0: print(f"❌ Expected 0 IT Specialists, got {it_specialist_count}") return False if reports_to_ceo != 4: print(f"❌ Expected 4 employees reporting to CEO, got {reports_to_ceo}") return False print("✅ Employee count and title verification passed") return True def verify_specific_employees(conn) -> bool: """Verify specific employee records and modifications.""" with conn.cursor() as cur: # Check 
all employee fields in one query cur.execute(""" SELECT "EmployeeId", "LastName", "FirstName", "Title", "ReportsTo", "BirthDate", "HireDate", "Address", "City", "State", "Country", "PostalCode", "Phone", "Fax", "Email" FROM "Employee" WHERE "EmployeeId" IN (1, 2, 9, 10) ORDER BY "EmployeeId" """) employees = cur.fetchall() from datetime import datetime expected = [ # Andrew Adams (ID 1) - Title changes to 'CEO', phone stays original, ReportsTo stays None (1, 'Adams', 'Andrew', 'CEO', None, datetime(1962, 2, 18), datetime(2002, 8, 14), '11120 Jasper Ave NW', 'Edmonton', 'AB', 'Canada', 'T5K 2N1', '+1 (780) 428-9482', '+1 (780) 428-3457', 'andrew@chinookcorp.com'), # Nancy Edwards (ID 2) - Phone changes, title stays 'Sales Manager', ReportsTo stays 1 (2, 'Edwards', 'Nancy', 'Sales Manager', 1, datetime(1958, 12, 8), datetime(2002, 5, 1), '825 8 Ave SW', 'Calgary', 'AB', 'Canada', 'T2P 2T3', '+1 (403) 555-9999', '+1 (403) 262-3322', 'nancy@chinookcorp.com'), # Sarah Johnson - all new data, final ReportsTo = 1 (changed in step 4) (9, 'Johnson', 'Sarah', 'Sales Support Agent', 1, datetime(1985, 3, 15), datetime(2009, 1, 10), '123 Oak Street', 'Calgary', 'AB', 'Canada', 'T2P 5G3', '+1 (403) 555-0123', '+1 (403) 555-0124', 'sarah.johnson@chinookcorp.com'), # Mike Chen - all new data, final ReportsTo = 1 (changed in step 4) (10, 'Chen', 'Mike', 'Sales Support Agent', 1, datetime(1982, 8, 22), datetime(2009, 1, 10), '456 Pine Ave', 'Calgary', 'AB', 'Canada', 'T2P 5G4', '+1 (403) 555-0125', '+1 (403) 555-0126', 'mike.chen@chinookcorp.com') ] if len(employees) != 4: print(f"❌ Expected 4 key employees, found {len(employees)}") return False # Full field comparison for all employees using rows_match for actual, expected_emp in zip(employees, expected): if not rows_match(actual, expected_emp): print(f"❌ Employee {actual[0]} row mismatch: expected {expected_emp}, got {actual}") return False print("✅ Specific employee verification passed - all fields match exactly") return True def verify_customer_assignments(conn) -> bool: """Verify customer support representative assignments.""" with conn.cursor() as cur: # Check customers 1, 2, 3 are assigned to Sarah (ID 9) cur.execute(""" SELECT COUNT(*) FROM "Customer" WHERE "CustomerId" IN (1, 2, 3) AND "SupportRepId" = 9 """) sarah_customers = cur.fetchone()[0] if sarah_customers != 3: print(f"❌ Expected 3 customers assigned to Sarah Johnson, got {sarah_customers}") return False # Check customers 4, 5, 6 are assigned to Mike (ID 10) cur.execute(""" SELECT COUNT(*) FROM "Customer" WHERE "CustomerId" IN (4, 5, 6) AND "SupportRepId" = 10 """) mike_customers = cur.fetchone()[0] if mike_customers != 3: print(f"❌ Expected 3 customers assigned to Mike Chen, got {mike_customers}") return False print("✅ Customer assignment verification passed") return True def verify_performance_table(conn) -> bool: """Verify the employee_performance table exists and has correct data.""" with conn.cursor() as cur: try: # Get all performance records cur.execute(""" SELECT employee_id, customers_assigned, performance_score FROM employee_performance ORDER BY employee_id """) actual_results = cur.fetchall() # Get actual customer counts for verification cur.execute(""" SELECT "SupportRepId", COUNT(*) FROM "Customer" WHERE "SupportRepId" IN (9, 10) GROUP BY "SupportRepId" ORDER BY "SupportRepId" """) customer_counts = dict(cur.fetchall()) expected = [ (9, customer_counts.get(9, 0), Decimal('4.5')), # Sarah Johnson (10, customer_counts.get(10, 0), Decimal('4.2')) # Mike Chen ] if 
len(actual_results) != 2: print(f"❌ Expected 2 performance records, got {len(actual_results)}") return False for actual, expected_row in zip(actual_results, expected): if not rows_match(actual, expected_row): print(f"❌ Performance record mismatch: expected {expected_row}, got {actual}") return False print("✅ Employee performance table verification passed") return True except psycopg2.Error as e: print(f"❌ Employee performance table verification failed: {e}") return False def verify_employee_deletion_and_promotion(conn) -> bool: """Verify Robert King deletion and Laura Callahan promotion.""" with conn.cursor() as cur: try: # Verify Robert King (ID 7) is deleted cur.execute(""" SELECT COUNT(*) FROM "Employee" WHERE "EmployeeId" = 7 """) if cur.fetchone()[0] != 0: print("❌ Robert King (EmployeeId = 7) should be deleted") return False # Verify Laura Callahan (ID 8) promotion cur.execute(""" SELECT "Title" FROM "Employee" WHERE "EmployeeId" = 8 """) laura_title = cur.fetchone() if not laura_title or laura_title[0] != 'Senior IT Specialist': print(f"❌ Laura Callahan should have title 'Senior IT Specialist', got: {laura_title[0] if laura_title else None}") return False print("✅ Employee deletion and promotion verification passed") return True except psycopg2.Error as e: print(f"❌ Employee deletion/promotion verification failed: {e}") return False def verify_salary_column(conn) -> bool: """Verify salary column exists and has correct values.""" with conn.cursor() as cur: try: # Check if salary column exists and get all salary values cur.execute(""" SELECT "EmployeeId", salary FROM "Employee" ORDER BY "EmployeeId" """) salary_data = cur.fetchall() # Verify Laura (ID 8) has 75000.00, others have 50000.00 for emp_id, salary in salary_data: expected_salary = Decimal('75000.00') if emp_id == 8 else Decimal('50000.00') if salary != expected_salary: print(f"❌ Employee {emp_id} salary should be {expected_salary}, got {salary}") return False print("✅ Salary column verification passed") return True except psycopg2.Error as e: print(f"❌ Salary column verification failed: {e}") return False def main(): """Main verification function.""" print("=" * 50) print("Verifying Task 3: Employee Hierarchy Management") print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Run verification checks with short-circuit evaluation success = (verify_employee_count_and_titles(conn) and verify_specific_employees(conn) and verify_customer_assignments(conn) and verify_performance_table(conn) and verify_employee_deletion_and_promotion(conn) and verify_salary_column(conn)) conn.close() if success: print("\n🎉 Task verification: PASS") print("All employee hierarchy management operations completed correctly!") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/chinook/sales_and_music_charts/description.md ================================================ Create a monthly sales dashboard and top music charts system for Chinook's management team to track business performance and identify trending music content. ## Your Tasks: 1. 
**Build the monthly sales summary table** — create a table called `monthly_sales_summary` in the `public` schema with: * `year_month` (varchar) — format as 'YYYY-MM' (e.g., '2009-01') * `total_invoices` (integer) — number of invoices in that month * `total_revenue` (decimal) — sum of all invoice totals for the month * `total_tracks_sold` (integer) — total quantity of individual tracks sold * `average_invoice_value` (decimal) — average invoice amount for the month * `unique_customers` (integer) — count of distinct customers who made purchases 2. **Create the music charts table** — build a table called `top_music_charts` in the `public` schema with: * `chart_type` (varchar) — either 'top_tracks', 'top_albums', or 'top_artists' * `rank_position` (integer) — ranking from 1 to 10 * `item_id` (integer) — ID of the track, album, or artist * `item_name` (varchar) — name of the track, album, or artist * `total_revenue` (decimal) — total revenue generated by this item 3. **Populate the monthly sales data**: * Calculate metrics for each month that has invoice data * Use invoice date to determine the month * **Note**: Each invoice can contain multiple invoice lines (tracks) 4. **Generate the top 10 charts**: * **Top Tracks**: Rank tracks by total quantity sold across all invoices * **Top Albums**: Rank albums by total revenue generated from their tracks * **Top Artists**: Rank artists by total revenue from all their tracks across all albums 5. **Business rules to follow**: * Only include months where at least one invoice exists * For album rankings, sum revenue from all tracks in each album * For artist rankings, sum revenue from all tracks across all their albums * Handle ties by using item name alphabetically as tiebreaker * Exclude any items with zero sales This system will provide clear, actionable business intelligence for monthly reporting and music trend analysis. 
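As a sketch only (any equivalent query is fine), the monthly summary can be built by aggregating invoices and invoice lines in separate steps so that multi-line invoices do not inflate the invoice-level metrics:

```sql
CREATE TABLE public.monthly_sales_summary (
  year_month            varchar,
  total_invoices        integer,
  total_revenue         decimal,
  total_tracks_sold     integer,
  average_invoice_value decimal,
  unique_customers      integer
);

-- Aggregate Invoice and InvoiceLine separately, then join the two monthly rollups.
INSERT INTO public.monthly_sales_summary
WITH invoice_metrics AS (
  SELECT DATE_TRUNC('month', i."InvoiceDate")          AS ym,
         COUNT(*)::int                                 AS total_invoices,
         SUM(i."Total")::decimal                       AS total_revenue,
         AVG(i."Total")::decimal                       AS average_invoice_value,
         COUNT(DISTINCT i."CustomerId")::int           AS unique_customers
  FROM "Invoice" i
  GROUP BY 1
),
line_metrics AS (
  SELECT DATE_TRUNC('month', i."InvoiceDate")          AS ym,
         SUM(il."Quantity")::int                       AS total_tracks_sold
  FROM "Invoice" i
  JOIN "InvoiceLine" il ON il."InvoiceId" = i."InvoiceId"
  GROUP BY 1
)
SELECT TO_CHAR(im.ym, 'YYYY-MM'),
       im.total_invoices,
       im.total_revenue,
       COALESCE(lm.total_tracks_sold, 0),
       im.average_invoice_value,
       im.unique_customers
FROM invoice_metrics im
LEFT JOIN line_metrics lm USING (ym);
```

The `top_music_charts` table can be populated the same way: one CTE per chart type, ranked with `ROW_NUMBER() OVER (ORDER BY <metric> DESC, <item name> ASC)` and filtered to `rank_position <= 10`, with zero-sales items excluded by a `HAVING` clause.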
================================================ FILE: tasks/postgres/standard/chinook/sales_and_music_charts/meta.json ================================================ { "task_id": "sales_and_music_charts", "task_name": "Sales and Music Charts", "category_id": "chinook", "category_name": "Chinook", "description": "Create monthly sales dashboard and top music charts system for tracking business performance and trending content.", "author": "Lingxiao Du", "created_at": "2025-08-12", "difficulty": "L3", "tags": [ "reporting and analytics", "statistical aggregation", "schema design" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId 
[type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef \"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql" } } ================================================ FILE: tasks/postgres/standard/chinook/sales_and_music_charts/verify.py ================================================ """ Verification script for PostgreSQL Task 1: Monthly Sales Dashboard and Music Charts """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.01 tolerance For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, Decimal): if abs(float(actual) - float(expected)) > 0.01: return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_monthly_sales_results(conn) -> bool: """Verify the monthly sales summary results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT year_month, total_invoices, total_revenue, total_tracks_sold, average_invoice_value, unique_customers FROM monthly_sales_summary ORDER BY year_month """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH invoice_metrics AS ( SELECT DATE_TRUNC('month', i."InvoiceDate") AS ym, COUNT(*)::INT AS total_invoices, SUM(i."Total")::DECIMAL AS total_revenue, AVG(i."Total")::DECIMAL AS average_invoice_value, COUNT(DISTINCT i."CustomerId")::INT AS unique_customers FROM "Invoice" i GROUP BY 1 ), track_metrics AS ( SELECT DATE_TRUNC('month', i."InvoiceDate") AS ym, SUM(il."Quantity")::INT AS total_tracks_sold FROM "Invoice" i JOIN "InvoiceLine" il ON il."InvoiceId" = i."InvoiceId" WHERE il."Quantity" > 0 GROUP BY 1 ) SELECT TO_CHAR(im.ym, 'YYYY-MM') AS year_month, im.total_invoices, im.total_revenue, COALESCE(tm.total_tracks_sold, 0) AS total_tracks_sold, im.average_invoice_value, im.unique_customers FROM invoice_metrics im LEFT JOIN track_metrics tm USING (ym) ORDER BY year_month; """) expected_results = cur.fetchall() if len(actual_results) 
!= len(expected_results): print(f"❌ Expected {len(expected_results)} monthly sales records, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Monthly sales row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total monthly sales mismatches: {mismatches}") return False print(f"✅ Monthly sales results are correct ({len(actual_results)} records)") return True def verify_music_charts_results(conn) -> bool: """Verify the music charts results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT chart_type, rank_position, item_id, item_name, total_revenue FROM top_music_charts ORDER BY chart_type, rank_position """) actual_results = cur.fetchall() # Execute ground truth queries for each chart type cur.execute(""" WITH track_stats AS ( SELECT 'top_tracks'::varchar AS chart_type, t."TrackId" AS item_id, t."Name" AS item_name, SUM(il."UnitPrice" * il."Quantity")::DECIMAL AS total_revenue, SUM(il."Quantity")::INT AS total_quantity FROM "Track" t JOIN "InvoiceLine" il ON il."TrackId" = t."TrackId" GROUP BY t."TrackId", t."Name" HAVING SUM(il."Quantity") > 0 ), track_ranked AS ( SELECT chart_type, item_id, item_name, total_revenue, ROW_NUMBER() OVER (ORDER BY total_quantity DESC, item_name, item_id) AS rank_position FROM track_stats ), album_rev AS ( SELECT 'top_albums'::varchar AS chart_type, a."AlbumId" AS item_id, a."Title" AS item_name, SUM(il."UnitPrice" * il."Quantity")::DECIMAL AS total_revenue FROM "Album" a JOIN "Track" t ON t."AlbumId" = a."AlbumId" JOIN "InvoiceLine" il ON il."TrackId" = t."TrackId" GROUP BY a."AlbumId", a."Title" HAVING SUM(il."UnitPrice" * il."Quantity") > 0 ), album_ranked AS ( SELECT chart_type, item_id, item_name, total_revenue, ROW_NUMBER() OVER (ORDER BY total_revenue DESC, item_name, item_id) AS rank_position FROM album_rev ), artist_rev AS ( SELECT 'top_artists'::varchar AS chart_type, ar."ArtistId" AS item_id, ar."Name" AS item_name, SUM(il."UnitPrice" * il."Quantity")::DECIMAL AS total_revenue FROM "Artist" ar JOIN "Album" a ON a."ArtistId" = ar."ArtistId" JOIN "Track" t ON t."AlbumId" = a."AlbumId" JOIN "InvoiceLine" il ON il."TrackId" = t."TrackId" GROUP BY ar."ArtistId", ar."Name" HAVING SUM(il."UnitPrice" * il."Quantity") > 0 ), artist_ranked AS ( SELECT chart_type, item_id, item_name, total_revenue, ROW_NUMBER() OVER (ORDER BY total_revenue DESC, item_name, item_id) AS rank_position FROM artist_rev ) SELECT chart_type, rank_position, item_id, item_name, total_revenue FROM ( SELECT * FROM track_ranked WHERE rank_position <= 10 UNION ALL SELECT * FROM album_ranked WHERE rank_position <= 10 UNION ALL SELECT * FROM artist_ranked WHERE rank_position <= 10 ) x ORDER BY chart_type, rank_position; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} music chart records, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Music chart row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total music chart mismatches: {mismatches}") return False print(f"✅ Music chart results are correct 
({len(actual_results)} records)") return True def main(): """Main verification function.""" print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify results success = verify_monthly_sales_results(conn) and verify_music_charts_results(conn) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md ================================================ Fix the customer analysis query that is producing incorrect results. ## Background The data analytics team attempted to create a customer behavior analysis query to identify active customers and analyze their spending patterns and preferences. The requirements are: - Only count rentals that have associated payment records (paid rentals) - Only include customers with at least 15 paid rentals - Only include customers with valid email addresses However, they're getting incorrect results - the query is over-counting rentals and calculating wrong spending amounts. Your task is to fix this query to produce accurate results. ## The Problematic Query Here's the buggy query that needs to be fixed: ```sql WITH customer_basic_stats AS ( SELECT c.customer_id, c.first_name || ' ' || c.last_name as customer_name, ci.city as customer_city, co.country as customer_country, COUNT(r.rental_id) as total_rentals, COUNT(DISTINCT i.film_id) as unique_films, SUM(p.amount) as total_spent, AVG(EXTRACT(days FROM (r.return_date - r.rental_date))) as avg_rental_duration FROM customer c JOIN address a ON c.address_id = a.address_id JOIN city ci ON a.city_id = ci.city_id JOIN country co ON ci.country_id = co.country_id JOIN rental r ON c.customer_id = r.customer_id JOIN inventory i ON r.inventory_id = i.inventory_id JOIN payment p ON r.rental_id = p.rental_id WHERE c.email IS NOT NULL GROUP BY c.customer_id, c.first_name, c.last_name, ci.city, co.country HAVING COUNT(r.rental_id) >= 15 ), customer_categories AS ( SELECT c.customer_id, cat.name as category_name, COUNT(*) as category_count, ROW_NUMBER() OVER (PARTITION BY c.customer_id ORDER BY COUNT(*) DESC, cat.name ASC) as rn FROM customer c JOIN rental r ON c.customer_id = r.customer_id JOIN inventory i ON r.inventory_id = i.inventory_id JOIN film f ON i.film_id = f.film_id JOIN film_category fc ON f.film_id = fc.film_id JOIN category cat ON fc.category_id = cat.category_id JOIN payment p ON r.rental_id = p.rental_id WHERE c.email IS NOT NULL GROUP BY c.customer_id, cat.name ), customer_actors AS ( SELECT c.customer_id, a.first_name || ' ' || a.last_name as actor_name, COUNT(*) as actor_count, ROW_NUMBER() OVER (PARTITION BY c.customer_id ORDER BY COUNT(*) DESC, (a.first_name || ' ' || a.last_name) ASC) as rn FROM customer c JOIN rental r ON c.customer_id = r.customer_id JOIN inventory i ON r.inventory_id = i.inventory_id JOIN film f ON i.film_id = f.film_id JOIN film_actor fa ON f.film_id = fa.film_id JOIN actor a ON fa.actor_id = a.actor_id JOIN payment p ON r.rental_id = p.rental_id WHERE c.email IS NOT NULL GROUP BY c.customer_id, a.first_name, a.last_name ), regional_popular_films 
AS ( SELECT co.country, f.title, COUNT(*) as rental_count, ROW_NUMBER() OVER (PARTITION BY co.country ORDER BY COUNT(*) DESC, f.title ASC) as rn FROM rental r JOIN inventory i ON r.inventory_id = i.inventory_id JOIN film f ON i.film_id = f.film_id JOIN customer c ON r.customer_id = c.customer_id JOIN address a ON c.address_id = a.address_id JOIN city ci ON a.city_id = ci.city_id JOIN country co ON ci.country_id = co.country_id JOIN payment p ON r.rental_id = p.rental_id WHERE c.email IS NOT NULL GROUP BY co.country, f.title ) SELECT cbs.customer_id, cbs.customer_name, cbs.customer_city, cbs.customer_country, cbs.total_rentals, cbs.unique_films, cbs.total_spent, cc.category_name as favorite_category, ca.actor_name as favorite_actor, cbs.avg_rental_duration, CASE WHEN cbs.total_spent >= 150 THEN 'Premium' WHEN cbs.total_spent >= 75 THEN 'Standard' ELSE 'Basic' END as customer_tier, rpf.title as most_popular_film_in_region, rpf.rental_count as regional_film_rental_count FROM customer_basic_stats cbs LEFT JOIN customer_categories cc ON cbs.customer_id = cc.customer_id AND cc.rn = 1 LEFT JOIN customer_actors ca ON cbs.customer_id = ca.customer_id AND ca.rn = 1 LEFT JOIN regional_popular_films rpf ON cbs.customer_country = rpf.country AND rpf.rn = 1 ORDER BY cbs.total_spent DESC, cbs.total_rentals DESC, cbs.customer_name ASC; ``` ## Known Issues When comparing the problematic query results with the expected correct values, the following discrepancies are observed: 1. **Rental count discrepancies**: Many customers show higher `total_rentals` counts than expected 2. **Spending amount errors**: The `total_spent` values don't match the correct calculations 3. **Incorrect favorite categories and actors**: Many customers show wrong favorite categories and actors compared to the expected results 4. **Time calculation inconsistencies**: The `avg_rental_duration` values differ significantly from the correct calculations - Example: Customer ID 1 shows 3.90 days instead of the expected 4.27 days - Example: Customer ID 2 shows 5.23 days instead of the expected 5.69 days ## Your Task Debug and fix the query to produce accurate results. Then create a table with your corrected results. 1. **Fix the query** to ensure: - Accurate customer spending and rental counts - Correct favorite categories and actors - Proper regional popular films 2. **Create a table** called `customer_analysis_fixed` in the `public` schema with your corrected query results. The table should have the same columns as the original query output. **Important**: The business logic and output columns should remain the same - only fix the data accuracy issues. 
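The core of one possible fix (a sketch, not the full corrected query) is to collapse each paid rental to a single row before any aggregation and to total payments once per customer, so the joins to `payment`, `film_actor`, and `film_category` can no longer fan out the counts:

```sql
-- De-duplicate rentals that have at least one payment, then aggregate from there.
WITH paid_rentals AS (
  SELECT DISTINCT r.rental_id, r.customer_id, r.inventory_id,
         r.rental_date, r.return_date
  FROM rental r
  JOIN payment p ON p.rental_id = r.rental_id
),
payments_by_customer AS (
  SELECT pr.customer_id, SUM(p.amount) AS total_spent   -- each payment counted once
  FROM paid_rentals pr
  JOIN payment p ON p.rental_id = pr.rental_id
  GROUP BY pr.customer_id
)
SELECT pr.customer_id,
       COUNT(DISTINCT pr.rental_id) AS total_rentals,   -- no fan-out from payment rows
       pbc.total_spent,
       AVG(EXTRACT(EPOCH FROM (pr.return_date - pr.rental_date)) / 86400.0)
         AS avg_rental_duration                         -- fractional days, not EXTRACT(days ...)
FROM paid_rentals pr
JOIN payments_by_customer pbc USING (customer_id)
GROUP BY pr.customer_id, pbc.total_spent
HAVING COUNT(DISTINCT pr.rental_id) >= 15;
```

Reusing the same `paid_rentals` CTE inside the category, actor, and regional CTEs removes the remaining over-counts, and the corrected result set can then be materialized with `CREATE TABLE public.customer_analysis_fixed AS SELECT ...;`.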
================================================ FILE: tasks/postgres/standard/dvdrental/customer_analysis_fix/meta.json ================================================ { "task_id": "customer_analysis_fix", "task_name": "Customer Analysis Fix", "category_id": "dvdrental", "category_name": "DVD Rental", "description": "Debug and fix customer behavior analysis query producing incorrect rental counts and spending calculations.", "author": "Lingxiao Du", "created_at": "2025-08-20", "difficulty": "L3", "tags": [ "performance optimization", "data integrity enforcement" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"mpaa_rating\" {\n \"G\"\n \"PG\"\n \"PG-13\"\n \"R\"\n \"NC-17\"\n}\n\nTable \"customer\" {\n \"customer_id\" int4 [pk, not null, increment]\n \"store_id\" int2 [not null]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"email\" varchar(50)\n \"address_id\" int2 [not null]\n \"activebool\" bool [not null, default: true]\n \"create_date\" date [not null, default: `('now'::text)::date`]\n \"last_update\" timestamp [default: `now()`]\n \"active\" int4\n\n Indexes {\n address_id [type: btree, name: \"idx_fk_address_id\"]\n store_id [type: btree, name: \"idx_fk_store_id\"]\n last_name [type: btree, name: \"idx_last_name\"]\n }\n}\n\nTable \"actor\" {\n \"actor_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n last_name [type: btree, name: \"idx_actor_last_name\"]\n }\n}\n\nTable \"category\" {\n \"category_id\" int4 [pk, not null, increment]\n \"name\" varchar(25) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"film\" {\n \"film_id\" int4 [pk, not null, increment]\n \"title\" varchar(255) [not null]\n \"description\" text\n \"release_year\" int4\n \"language_id\" int2 [not null]\n \"rental_duration\" int2 [not null, default: 3]\n \"rental_rate\" numeric(4,2) [not null, default: 4.99]\n \"length\" int2\n \"replacement_cost\" numeric(5,2) [not null, default: 19.99]\n \"rating\" mpaa_rating [default: 'G']\n \"last_update\" timestamp [not null, default: `now()`]\n \"special_features\" \"text[]\"\n \"fulltext\" tsvector [not null]\n\n Indexes {\n fulltext [type: gist, name: \"film_fulltext_idx\"]\n language_id [type: btree, name: \"idx_fk_language_id\"]\n title [type: btree, name: \"idx_title\"]\n }\n}\n\nTable \"film_actor\" {\n \"actor_id\" int2 [not null]\n \"film_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (actor_id, film_id) [type: btree, name: \"film_actor_pkey\"]\n film_id [type: btree, name: \"idx_fk_film_id\"]\n }\n}\n\nTable \"film_category\" {\n \"film_id\" int2 [not null]\n \"category_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (film_id, category_id) [type: btree, name: \"film_category_pkey\"]\n }\n}\n\nTable \"address\" {\n \"address_id\" int4 [pk, not null, increment]\n \"address\" varchar(50) [not null]\n \"address2\" varchar(50)\n \"district\" varchar(20) [not null]\n \"city_id\" int2 [not null]\n \"postal_code\" varchar(10)\n \"phone\" varchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n city_id [type: btree, name: \"idx_fk_city_id\"]\n }\n}\n\nTable \"city\" {\n \"city_id\" int4 [pk, not null, increment]\n \"city\" varchar(50) [not null]\n \"country_id\" int2 [not null]\n \"last_update\" 
timestamp [not null, default: `now()`]\n\n Indexes {\n country_id [type: btree, name: \"idx_fk_country_id\"]\n }\n}\n\nTable \"country\" {\n \"country_id\" int4 [pk, not null, increment]\n \"country\" varchar(50) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"inventory\" {\n \"inventory_id\" int4 [pk, not null, increment]\n \"film_id\" int2 [not null]\n \"store_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (store_id, film_id) [type: btree, name: \"idx_store_id_film_id\"]\n }\n}\n\nTable \"language\" {\n \"language_id\" int4 [pk, not null, increment]\n \"name\" bpchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"payment\" {\n \"payment_id\" int4 [pk, not null, increment]\n \"customer_id\" int2 [not null]\n \"staff_id\" int2 [not null]\n \"rental_id\" int4 [not null]\n \"amount\" numeric(5,2) [not null]\n \"payment_date\" timestamp [not null]\n\n Indexes {\n rental_id [type: btree, name: \"idx_fk_rental_id\"]\n staff_id [type: btree, name: \"idx_fk_staff_id\"]\n }\n}\n\nTable \"rental\" {\n \"rental_id\" int4 [pk, not null, increment]\n \"rental_date\" timestamp [not null]\n \"inventory_id\" int4 [not null]\n \"customer_id\" int2 [not null]\n \"return_date\" timestamp\n \"staff_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (rental_date, inventory_id, customer_id) [type: btree, name: \"idx_unq_rental_rental_date_inventory_id_customer_id\"]\n inventory_id [type: btree, name: \"idx_fk_inventory_id\"]\n }\n}\n\nTable \"staff\" {\n \"staff_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"address_id\" int2 [not null]\n \"email\" varchar(50)\n \"store_id\" int2 [not null]\n \"active\" bool [not null, default: true]\n \"username\" varchar(16) [not null]\n \"password\" varchar(40)\n \"last_update\" timestamp [not null, default: `now()`]\n \"picture\" bytea\n}\n\nTable \"store\" {\n \"store_id\" int4 [pk, not null, increment]\n \"manager_staff_id\" int2 [unique, not null]\n \"address_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nRef \"fk_address_city\":\"city\".\"city_id\" < \"address\".\"city_id\"\n\nRef \"fk_city\":\"country\".\"country_id\" < \"city\".\"country_id\"\n\nRef \"customer_address_id_fkey\":\"address\".\"address_id\" < \"customer\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"film_language_id_fkey\":\"language\".\"language_id\" < \"film\".\"language_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_actor_id_fkey\":\"actor\".\"actor_id\" < \"film_actor\".\"actor_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_film_id_fkey\":\"film\".\"film_id\" < \"film_actor\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_category_id_fkey\":\"category\".\"category_id\" < \"film_category\".\"category_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_film_id_fkey\":\"film\".\"film_id\" < \"film_category\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"inventory_film_id_fkey\":\"film\".\"film_id\" < \"inventory\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"payment_customer_id_fkey\":\"customer\".\"customer_id\" < \"payment\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"payment_rental_id_fkey\":\"rental\".\"rental_id\" < \"payment\".\"rental_id\" [update: cascade, delete: set null]\n\nRef 
\"payment_staff_id_fkey\":\"staff\".\"staff_id\" < \"payment\".\"staff_id\" [update: cascade, delete: restrict]\n\nRef \"rental_customer_id_fkey\":\"customer\".\"customer_id\" < \"rental\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"rental_inventory_id_fkey\":\"inventory\".\"inventory_id\" < \"rental\".\"inventory_id\" [update: cascade, delete: restrict]\n\nRef \"rental_staff_id_key\":\"staff\".\"staff_id\" < \"rental\".\"staff_id\"\n\nRef \"staff_address_id_fkey\":\"address\".\"address_id\" < \"staff\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_address_id_fkey\":\"address\".\"address_id\" < \"store\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_manager_staff_id_fkey\":\"staff\".\"staff_id\" < \"store\".\"manager_staff_id\" [update: cascade, delete: restrict]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/gordonkwokkwok/DVD-Rental-PostgreSQL-Project" } } ================================================ FILE: tasks/postgres/standard/dvdrental/customer_analysis_fix/verify.py ================================================ """ Verification script for PostgreSQL Task 3: Fix Customer Analysis Query """ import os import sys import psycopg2 from decimal import Decimal def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def rows_match(actual_row, expected_row): """Compare two rows with appropriate tolerance for decimals and floats.""" if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, (Decimal, float)) and isinstance(expected, (Decimal, float)): # Use higher tolerance for floating point comparisons if abs(float(actual) - float(expected)) > 0.1: return False elif actual != expected: return False return True def verify_customer_analysis_fixed_table(conn) -> bool: """Verify the customer_analysis_fixed table results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT customer_id, customer_name, customer_city, customer_country, total_rentals, unique_films, total_spent, favorite_category, favorite_actor, avg_rental_duration, customer_tier, most_popular_film_in_region, regional_film_rental_count FROM customer_analysis_fixed ORDER BY total_spent DESC, total_rentals DESC, customer_name ASC """) actual_results = cur.fetchall() # Execute ground truth query (the corrected version) cur.execute(""" WITH paid_rentals AS ( SELECT DISTINCT r.rental_id, r.customer_id, r.inventory_id, r.rental_date, r.return_date FROM rental r JOIN payment p ON p.rental_id = r.rental_id ), payments_by_customer AS ( SELECT pr.customer_id, SUM(p.amount) AS total_spent FROM paid_rentals pr JOIN payment p ON p.rental_id = pr.rental_id GROUP BY pr.customer_id ), customer_basic_stats AS ( SELECT c.customer_id, c.first_name || ' ' || c.last_name AS customer_name, ci.city AS customer_city, co.country AS customer_country, COUNT(DISTINCT pr.rental_id) AS total_rentals, COUNT(DISTINCT i.film_id) AS unique_films, pbc.total_spent, AVG(EXTRACT(EPOCH FROM (pr.return_date - pr.rental_date)) / 86400.0) AS avg_rental_duration FROM customer c JOIN address a ON c.address_id = a.address_id JOIN city ci ON a.city_id = ci.city_id JOIN country co ON ci.country_id = co.country_id JOIN paid_rentals pr 
ON pr.customer_id = c.customer_id JOIN inventory i ON pr.inventory_id = i.inventory_id JOIN payments_by_customer pbc ON pbc.customer_id = c.customer_id WHERE c.email IS NOT NULL GROUP BY c.customer_id, c.first_name, c.last_name, ci.city, co.country, pbc.total_spent HAVING COUNT(DISTINCT pr.rental_id) >= 15 ), customer_categories AS ( SELECT pr.customer_id, cat.name AS category_name, COUNT(*) AS category_count, ROW_NUMBER() OVER ( PARTITION BY pr.customer_id ORDER BY COUNT(*) DESC, cat.name ASC ) AS rn FROM paid_rentals pr JOIN inventory i ON pr.inventory_id = i.inventory_id JOIN film f ON i.film_id = f.film_id JOIN film_category fc ON f.film_id = fc.film_id JOIN category cat ON fc.category_id = cat.category_id JOIN customer c ON pr.customer_id = c.customer_id WHERE c.email IS NOT NULL GROUP BY pr.customer_id, cat.name ), customer_actors AS ( SELECT pr.customer_id, (a.first_name || ' ' || a.last_name) AS actor_name, COUNT(*) AS actor_count, ROW_NUMBER() OVER ( PARTITION BY pr.customer_id ORDER BY COUNT(*) DESC, (a.first_name || ' ' || a.last_name) ASC ) AS rn FROM paid_rentals pr JOIN inventory i ON pr.inventory_id = i.inventory_id JOIN film f ON i.film_id = f.film_id JOIN film_actor fa ON f.film_id = fa.film_id JOIN actor a ON fa.actor_id = a.actor_id JOIN customer c ON pr.customer_id = c.customer_id WHERE c.email IS NOT NULL GROUP BY pr.customer_id, a.first_name, a.last_name ), regional_popular_films AS ( SELECT co.country, f.title, COUNT(DISTINCT pr.rental_id) AS rental_count, ROW_NUMBER() OVER ( PARTITION BY co.country ORDER BY COUNT(DISTINCT pr.rental_id) DESC, f.title ASC ) AS rn FROM paid_rentals pr JOIN customer c ON pr.customer_id = c.customer_id JOIN address a ON c.address_id = a.address_id JOIN city ci ON a.city_id = ci.city_id JOIN country co ON ci.country_id = co.country_id JOIN inventory i ON pr.inventory_id = i.inventory_id JOIN film f ON i.film_id = f.film_id WHERE c.email IS NOT NULL GROUP BY co.country, f.title ) SELECT cbs.customer_id, cbs.customer_name, cbs.customer_city, cbs.customer_country, cbs.total_rentals, cbs.unique_films, cbs.total_spent, cc.category_name AS favorite_category, ca.actor_name AS favorite_actor, cbs.avg_rental_duration, CASE WHEN cbs.total_spent >= 150 THEN 'Premium' WHEN cbs.total_spent >= 75 THEN 'Standard' ELSE 'Basic' END AS customer_tier, rpf.title AS most_popular_film_in_region, rpf.rental_count AS regional_film_rental_count FROM customer_basic_stats cbs LEFT JOIN customer_categories cc ON cbs.customer_id = cc.customer_id AND cc.rn = 1 LEFT JOIN customer_actors ca ON cbs.customer_id = ca.customer_id AND ca.rn = 1 LEFT JOIN regional_popular_films rpf ON cbs.customer_country = rpf.country AND rpf.rn = 1 ORDER BY cbs.total_spent DESC, cbs.total_rentals DESC, cbs.customer_name ASC; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} rows, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch:") print(f" Expected: {expected}") print(f" Actual: {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Query results are correct ({len(actual_results)} rows)") return True def main(): """Main verification function.""" print("=" * 70) print("PostgreSQL Task 3 Verification: Fix Customer Analysis Query") print("=" * 70) # Get 
connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify results success = verify_customer_analysis_fixed_table(conn) conn.close() if success: print("\n🎉 Task verification: PASS") print(" - Query was successfully debugged and fixed") print(" - All 587 rows match the expected results") sys.exit(0) else: print("\n❌ Task verification: FAIL") print(" - The query still has issues") print(" - Please review the duplicate counting problem") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/dvdrental/customer_analytics_optimization/description.md ================================================ Optimize slow customer analytics query in the DVD rental database. ## Background The business intelligence team is running customer analytics reports, but one of their critical queries has become extremely slow. The query that used to run in milliseconds is now taking over a second to complete, causing timeout issues in their reporting dashboard. ## Your Task Analyze and optimize the performance of this customer analytics query: ```sql SELECT c.customer_id, c.first_name, c.last_name, c.email, COUNT(DISTINCT p.payment_id) as total_payments, SUM(p.amount) as total_spent, AVG(p.amount) as avg_payment, COUNT(DISTINCT EXTRACT(month FROM p.payment_date)) as active_months, MAX(p.payment_date) as last_payment, MIN(p.payment_date) as first_payment, (SELECT COUNT(*) FROM payment p2 WHERE p2.customer_id = c.customer_id AND p2.amount > 5.0) as high_value_payments, (SELECT SUM(amount) FROM payment p3 WHERE p3.customer_id = c.customer_id AND p3.payment_date >= '2007-03-01') as recent_spending FROM customer c JOIN payment p ON c.customer_id = p.customer_id WHERE c.active = 1 GROUP BY c.customer_id, c.first_name, c.last_name, c.email HAVING COUNT(p.payment_id) >= 10 ORDER BY total_spent DESC, total_payments DESC; ``` The query is currently taking over 1000ms to execute and has a very high cost in the execution plan. The team needs this optimized urgently as it's blocking their daily reporting processes. 
## Requirements - Use `EXPLAIN ANALYZE` to identify performance bottlenecks - Implement appropriate database optimizations - Ensure queries return accurate results after optimization - Document your optimization approach and performance improvements ================================================ FILE: tasks/postgres/standard/dvdrental/customer_analytics_optimization/meta.json ================================================ { "task_id": "customer_analytics_optimization", "task_name": "Customer Analytics Optimization", "category_id": "dvdrental", "category_name": "DVD Rental", "description": "Optimize slow customer analytics query with correlated subqueries causing timeout issues in reporting dashboard.", "author": "Lingxiao Du", "created_at": "2025-08-20", "difficulty": "L3", "tags": [ "performance optimization" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"mpaa_rating\" {\n \"G\"\n \"PG\"\n \"PG-13\"\n \"R\"\n \"NC-17\"\n}\n\nTable \"customer\" {\n \"customer_id\" int4 [pk, not null, increment]\n \"store_id\" int2 [not null]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"email\" varchar(50)\n \"address_id\" int2 [not null]\n \"activebool\" bool [not null, default: true]\n \"create_date\" date [not null, default: `('now'::text)::date`]\n \"last_update\" timestamp [default: `now()`]\n \"active\" int4\n\n Indexes {\n address_id [type: btree, name: \"idx_fk_address_id\"]\n store_id [type: btree, name: \"idx_fk_store_id\"]\n last_name [type: btree, name: \"idx_last_name\"]\n }\n}\n\nTable \"actor\" {\n \"actor_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n last_name [type: btree, name: \"idx_actor_last_name\"]\n }\n}\n\nTable \"category\" {\n \"category_id\" int4 [pk, not null, increment]\n \"name\" varchar(25) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"film\" {\n \"film_id\" int4 [pk, not null, increment]\n \"title\" varchar(255) [not null]\n \"description\" text\n \"release_year\" int4\n \"language_id\" int2 [not null]\n \"rental_duration\" int2 [not null, default: 3]\n \"rental_rate\" numeric(4,2) [not null, default: 4.99]\n \"length\" int2\n \"replacement_cost\" numeric(5,2) [not null, default: 19.99]\n \"rating\" mpaa_rating [default: 'G']\n \"last_update\" timestamp [not null, default: `now()`]\n \"special_features\" \"text[]\"\n \"fulltext\" tsvector [not null]\n\n Indexes {\n fulltext [type: gist, name: \"film_fulltext_idx\"]\n language_id [type: btree, name: \"idx_fk_language_id\"]\n title [type: btree, name: \"idx_title\"]\n }\n}\n\nTable \"film_actor\" {\n \"actor_id\" int2 [not null]\n \"film_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (actor_id, film_id) [type: btree, name: \"film_actor_pkey\"]\n film_id [type: btree, name: \"idx_fk_film_id\"]\n }\n}\n\nTable \"film_category\" {\n \"film_id\" int2 [not null]\n \"category_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (film_id, category_id) [type: btree, name: \"film_category_pkey\"]\n }\n}\n\nTable \"address\" {\n \"address_id\" int4 [pk, not null, increment]\n \"address\" varchar(50) [not null]\n \"address2\" varchar(50)\n \"district\" varchar(20) [not null]\n \"city_id\" int2 [not null]\n \"postal_code\" varchar(10)\n \"phone\" varchar(20) [not null]\n \"last_update\" timestamp 
[not null, default: `now()`]\n\n Indexes {\n city_id [type: btree, name: \"idx_fk_city_id\"]\n }\n}\n\nTable \"city\" {\n \"city_id\" int4 [pk, not null, increment]\n \"city\" varchar(50) [not null]\n \"country_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n country_id [type: btree, name: \"idx_fk_country_id\"]\n }\n}\n\nTable \"country\" {\n \"country_id\" int4 [pk, not null, increment]\n \"country\" varchar(50) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"inventory\" {\n \"inventory_id\" int4 [pk, not null, increment]\n \"film_id\" int2 [not null]\n \"store_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (store_id, film_id) [type: btree, name: \"idx_store_id_film_id\"]\n }\n}\n\nTable \"language\" {\n \"language_id\" int4 [pk, not null, increment]\n \"name\" bpchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"payment\" {\n \"payment_id\" int4 [pk, not null, increment]\n \"customer_id\" int2 [not null]\n \"staff_id\" int2 [not null]\n \"rental_id\" int4 [not null]\n \"amount\" numeric(5,2) [not null]\n \"payment_date\" timestamp [not null]\n\n Indexes {\n rental_id [type: btree, name: \"idx_fk_rental_id\"]\n staff_id [type: btree, name: \"idx_fk_staff_id\"]\n }\n}\n\nTable \"rental\" {\n \"rental_id\" int4 [pk, not null, increment]\n \"rental_date\" timestamp [not null]\n \"inventory_id\" int4 [not null]\n \"customer_id\" int2 [not null]\n \"return_date\" timestamp\n \"staff_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (rental_date, inventory_id, customer_id) [type: btree, name: \"idx_unq_rental_rental_date_inventory_id_customer_id\"]\n inventory_id [type: btree, name: \"idx_fk_inventory_id\"]\n }\n}\n\nTable \"staff\" {\n \"staff_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"address_id\" int2 [not null]\n \"email\" varchar(50)\n \"store_id\" int2 [not null]\n \"active\" bool [not null, default: true]\n \"username\" varchar(16) [not null]\n \"password\" varchar(40)\n \"last_update\" timestamp [not null, default: `now()`]\n \"picture\" bytea\n}\n\nTable \"store\" {\n \"store_id\" int4 [pk, not null, increment]\n \"manager_staff_id\" int2 [unique, not null]\n \"address_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nRef \"fk_address_city\":\"city\".\"city_id\" < \"address\".\"city_id\"\n\nRef \"fk_city\":\"country\".\"country_id\" < \"city\".\"country_id\"\n\nRef \"customer_address_id_fkey\":\"address\".\"address_id\" < \"customer\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"film_language_id_fkey\":\"language\".\"language_id\" < \"film\".\"language_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_actor_id_fkey\":\"actor\".\"actor_id\" < \"film_actor\".\"actor_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_film_id_fkey\":\"film\".\"film_id\" < \"film_actor\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_category_id_fkey\":\"category\".\"category_id\" < \"film_category\".\"category_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_film_id_fkey\":\"film\".\"film_id\" < \"film_category\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"inventory_film_id_fkey\":\"film\".\"film_id\" < \"inventory\".\"film_id\" [update: cascade, delete: restrict]\n\nRef 
\"payment_customer_id_fkey\":\"customer\".\"customer_id\" < \"payment\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"payment_rental_id_fkey\":\"rental\".\"rental_id\" < \"payment\".\"rental_id\" [update: cascade, delete: set null]\n\nRef \"payment_staff_id_fkey\":\"staff\".\"staff_id\" < \"payment\".\"staff_id\" [update: cascade, delete: restrict]\n\nRef \"rental_customer_id_fkey\":\"customer\".\"customer_id\" < \"rental\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"rental_inventory_id_fkey\":\"inventory\".\"inventory_id\" < \"rental\".\"inventory_id\" [update: cascade, delete: restrict]\n\nRef \"rental_staff_id_key\":\"staff\".\"staff_id\" < \"rental\".\"staff_id\"\n\nRef \"staff_address_id_fkey\":\"address\".\"address_id\" < \"staff\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_address_id_fkey\":\"address\".\"address_id\" < \"store\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_manager_staff_id_fkey\":\"staff\".\"staff_id\" < \"store\".\"manager_staff_id\" [update: cascade, delete: restrict]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/gordonkwokkwok/DVD-Rental-PostgreSQL-Project" } } ================================================ FILE: tasks/postgres/standard/dvdrental/customer_analytics_optimization/verify.py ================================================ """ Verification script for PostgreSQL Task 1: Customer Payment Query Optimization """ import os import sys import psycopg2 def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def check_payment_customer_id_index(conn) -> bool: """Check if there's any index on payment.customer_id column.""" with conn.cursor() as cur: cur.execute(""" SELECT indexname, indexdef FROM pg_indexes WHERE schemaname = 'public' AND tablename = 'payment' AND indexdef LIKE '%customer_id%' """) indexes = cur.fetchall() print(indexes) return len(indexes) > 0, indexes def main(): """Main verification function.""" print("=" * 60) print("PostgreSQL Task 1 Verification: Customer Payment Query Optimization") print("=" * 60) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) print("\n🔍 Checking for customer_id index on payment table...") # Check if any index exists on payment.customer_id has_index, indexes = check_payment_customer_id_index(conn) if has_index: print("✅ Found index(es) on payment.customer_id:") for index_name, index_def in indexes: print(f" - {index_name}: {index_def}") else: print("❌ No index found on payment.customer_id column") conn.close() if has_index: print(f"\n🎉 Task verification: PASS") print(f" - Index on payment.customer_id exists") sys.exit(0) else: print(f"\n❌ Task verification: FAIL") print(f" - No index found on payment.customer_id") print(f" - Create an index on payment(customer_id) to optimize the queries") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/dvdrental/film_inventory_management/description.md 
================================================ Manage film inventory operations in the DVD rental database. ## Background You are the database administrator for the DVD rental store. The store manager has requested several database operations to manage the film inventory. You need to perform multiple operations including adding new films, updating inventory, querying available films, and cleaning up old records. ## Your Task Complete the following database operations in sequence: ### 1. Add New Films Add these two new films to the database: - **Film 1**: Title "Data Science Adventures", Description "A thrilling journey through machine learning algorithms", Release Year 2024, Language ID 1, Rental Duration 5 days, Rental Rate $3.99, Length 120 minutes, Replacement Cost $15.99, Rating 'PG-13' - **Film 2**: Title "Cloud Computing Chronicles", Description "Exploring the world of distributed systems", Release Year 2024, Language ID 1, Rental Duration 7 days, Rental Rate $4.99, Length 135 minutes, Replacement Cost $18.99, Rating 'PG' ### 2. Add Inventory Records For each new film, add 3 inventory records for store_id = 1 and 2 inventory records for store_id = 2. ### 3. Update Film Information Update the rental_rate of all films with rating 'PG-13' to increase by 10% (multiply by 1.1). ### 4. Create Available Films Table Create a table called `available_films` with the following structure: - `film_id` (INTEGER, PRIMARY KEY) - `title` (VARCHAR(255), NOT NULL) - `rental_rate` (NUMERIC(4,2), NOT NULL) - `length` (SMALLINT) Populate this table with films that meet these criteria: - Have rental_rate between $3.00 and $5.00 - Have length greater than 100 minutes - Are available in store_id = 1 (have at least 1 inventory record) ### 5. Clean Up Inventory Delete inventory records for films that meet ALL of the following criteria: - Have a replacement_cost greater than $25.00 - AND have rental_rate less than $1.00 - AND have no rental history (no records in the rental table) ### 6. 
Create Summary Report Table Create a table called `film_inventory_summary` with the following structure: - `title` (VARCHAR(255), NOT NULL) - `rental_rate` (NUMERIC(4,2), NOT NULL) - `total_inventory` (INTEGER, NOT NULL) - `store1_count` (INTEGER, NOT NULL) - `store2_count` (INTEGER, NOT NULL) Populate this table with a summary query that shows: - Film title - Current rental rate (after any updates from step 3) - Total count of inventory records across all stores - Count of inventory records in store_id = 1 - Count of inventory records in store_id = 2 Requirements for the summary report: - Include only films that currently have at least one inventory record - Insert the results sorted by inventory count from highest to lowest, and then alphabetically by film title - Ensure all counts reflect the state after completing the previous operations ## Requirements - Complete all operations in the specified sequence - Ensure data integrity throughout all operations - Verify that your operations affect the expected number of records - Handle any constraint violations appropriately ================================================ FILE: tasks/postgres/standard/dvdrental/film_inventory_management/meta.json ================================================ { "task_id": "film_inventory_management", "task_name": "Film Inventory Management", "category_id": "dvdrental", "category_name": "DVD Rental", "description": "Manage film inventory through multiple operations including adding films, updating records, and cleaning old data.", "author": "Lingxiao Du", "created_at": "2025-08-20", "difficulty": "L3", "tags": [ "data migration", "transactional operations", "schema design" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"mpaa_rating\" {\n \"G\"\n \"PG\"\n \"PG-13\"\n \"R\"\n \"NC-17\"\n}\n\nTable \"customer\" {\n \"customer_id\" int4 [pk, not null, increment]\n \"store_id\" int2 [not null]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"email\" varchar(50)\n \"address_id\" int2 [not null]\n \"activebool\" bool [not null, default: true]\n \"create_date\" date [not null, default: `('now'::text)::date`]\n \"last_update\" timestamp [default: `now()`]\n \"active\" int4\n\n Indexes {\n address_id [type: btree, name: \"idx_fk_address_id\"]\n store_id [type: btree, name: \"idx_fk_store_id\"]\n last_name [type: btree, name: \"idx_last_name\"]\n }\n}\n\nTable \"actor\" {\n \"actor_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n last_name [type: btree, name: \"idx_actor_last_name\"]\n }\n}\n\nTable \"category\" {\n \"category_id\" int4 [pk, not null, increment]\n \"name\" varchar(25) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"film\" {\n \"film_id\" int4 [pk, not null, increment]\n \"title\" varchar(255) [not null]\n \"description\" text\n \"release_year\" int4\n \"language_id\" int2 [not null]\n \"rental_duration\" int2 [not null, default: 3]\n \"rental_rate\" numeric(4,2) [not null, default: 4.99]\n \"length\" int2\n \"replacement_cost\" numeric(5,2) [not null, default: 19.99]\n \"rating\" mpaa_rating [default: 'G']\n \"last_update\" timestamp [not null, default: `now()`]\n \"special_features\" \"text[]\"\n \"fulltext\" tsvector [not null]\n\n Indexes {\n fulltext [type: gist, name: \"film_fulltext_idx\"]\n language_id [type: btree, name: \"idx_fk_language_id\"]\n title 
[type: btree, name: \"idx_title\"]\n }\n}\n\nTable \"film_actor\" {\n \"actor_id\" int2 [not null]\n \"film_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (actor_id, film_id) [type: btree, name: \"film_actor_pkey\"]\n film_id [type: btree, name: \"idx_fk_film_id\"]\n }\n}\n\nTable \"film_category\" {\n \"film_id\" int2 [not null]\n \"category_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (film_id, category_id) [type: btree, name: \"film_category_pkey\"]\n }\n}\n\nTable \"address\" {\n \"address_id\" int4 [pk, not null, increment]\n \"address\" varchar(50) [not null]\n \"address2\" varchar(50)\n \"district\" varchar(20) [not null]\n \"city_id\" int2 [not null]\n \"postal_code\" varchar(10)\n \"phone\" varchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n city_id [type: btree, name: \"idx_fk_city_id\"]\n }\n}\n\nTable \"city\" {\n \"city_id\" int4 [pk, not null, increment]\n \"city\" varchar(50) [not null]\n \"country_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n country_id [type: btree, name: \"idx_fk_country_id\"]\n }\n}\n\nTable \"country\" {\n \"country_id\" int4 [pk, not null, increment]\n \"country\" varchar(50) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"inventory\" {\n \"inventory_id\" int4 [pk, not null, increment]\n \"film_id\" int2 [not null]\n \"store_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (store_id, film_id) [type: btree, name: \"idx_store_id_film_id\"]\n }\n}\n\nTable \"language\" {\n \"language_id\" int4 [pk, not null, increment]\n \"name\" bpchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"payment\" {\n \"payment_id\" int4 [pk, not null, increment]\n \"customer_id\" int2 [not null]\n \"staff_id\" int2 [not null]\n \"rental_id\" int4 [not null]\n \"amount\" numeric(5,2) [not null]\n \"payment_date\" timestamp [not null]\n\n Indexes {\n rental_id [type: btree, name: \"idx_fk_rental_id\"]\n staff_id [type: btree, name: \"idx_fk_staff_id\"]\n }\n}\n\nTable \"rental\" {\n \"rental_id\" int4 [pk, not null, increment]\n \"rental_date\" timestamp [not null]\n \"inventory_id\" int4 [not null]\n \"customer_id\" int2 [not null]\n \"return_date\" timestamp\n \"staff_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (rental_date, inventory_id, customer_id) [type: btree, name: \"idx_unq_rental_rental_date_inventory_id_customer_id\"]\n inventory_id [type: btree, name: \"idx_fk_inventory_id\"]\n }\n}\n\nTable \"staff\" {\n \"staff_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"address_id\" int2 [not null]\n \"email\" varchar(50)\n \"store_id\" int2 [not null]\n \"active\" bool [not null, default: true]\n \"username\" varchar(16) [not null]\n \"password\" varchar(40)\n \"last_update\" timestamp [not null, default: `now()`]\n \"picture\" bytea\n}\n\nTable \"store\" {\n \"store_id\" int4 [pk, not null, increment]\n \"manager_staff_id\" int2 [unique, not null]\n \"address_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nRef \"fk_address_city\":\"city\".\"city_id\" < \"address\".\"city_id\"\n\nRef \"fk_city\":\"country\".\"country_id\" < \"city\".\"country_id\"\n\nRef \"customer_address_id_fkey\":\"address\".\"address_id\" < 
\"customer\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"film_language_id_fkey\":\"language\".\"language_id\" < \"film\".\"language_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_actor_id_fkey\":\"actor\".\"actor_id\" < \"film_actor\".\"actor_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_film_id_fkey\":\"film\".\"film_id\" < \"film_actor\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_category_id_fkey\":\"category\".\"category_id\" < \"film_category\".\"category_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_film_id_fkey\":\"film\".\"film_id\" < \"film_category\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"inventory_film_id_fkey\":\"film\".\"film_id\" < \"inventory\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"payment_customer_id_fkey\":\"customer\".\"customer_id\" < \"payment\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"payment_rental_id_fkey\":\"rental\".\"rental_id\" < \"payment\".\"rental_id\" [update: cascade, delete: set null]\n\nRef \"payment_staff_id_fkey\":\"staff\".\"staff_id\" < \"payment\".\"staff_id\" [update: cascade, delete: restrict]\n\nRef \"rental_customer_id_fkey\":\"customer\".\"customer_id\" < \"rental\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"rental_inventory_id_fkey\":\"inventory\".\"inventory_id\" < \"rental\".\"inventory_id\" [update: cascade, delete: restrict]\n\nRef \"rental_staff_id_key\":\"staff\".\"staff_id\" < \"rental\".\"staff_id\"\n\nRef \"staff_address_id_fkey\":\"address\".\"address_id\" < \"staff\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_address_id_fkey\":\"address\".\"address_id\" < \"store\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_manager_staff_id_fkey\":\"staff\".\"staff_id\" < \"store\".\"manager_staff_id\" [update: cascade, delete: restrict]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/gordonkwokkwok/DVD-Rental-PostgreSQL-Project" } } ================================================ FILE: tasks/postgres/standard/dvdrental/film_inventory_management/verify.py ================================================ """ Verification script for PostgreSQL Task 4: Film Inventory Management """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """Compare two rows with appropriate tolerance for decimals and floats.""" if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, (Decimal, float)) and isinstance(expected, (Decimal, float)): # Use higher tolerance for floating point comparisons if abs(float(actual) - float(expected)) > 0.01: return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def check_new_films(conn) -> bool: """Check if the two new films were added correctly.""" with conn.cursor() as cur: cur.execute(""" SELECT title, description, release_year, language_id, rental_duration, rental_rate, length, replacement_cost, rating FROM film WHERE title IN ('Data Science Adventures', 'Cloud Computing Chronicles') ORDER BY title """) actual_films = cur.fetchall() expected_films = [ ('Cloud Computing 
Chronicles', 'Exploring the world of distributed systems', 2024, 1, 7, Decimal('4.99'), 135, Decimal('18.99'), 'PG'), ('Data Science Adventures', 'A thrilling journey through machine learning algorithms', 2024, 1, 5, Decimal('4.389'), 120, Decimal('15.99'), 'PG-13') ] if len(actual_films) != len(expected_films): print(f"❌ Expected {len(expected_films)} new films, found {len(actual_films)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_films, expected_films)): if not rows_match(actual, expected): print(f"❌ Film {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total film mismatches: {mismatches}") return False print("✅ Both new films added correctly") return True def check_inventory_records(conn) -> bool: """Check if inventory records were added for new films.""" with conn.cursor() as cur: cur.execute(""" SELECT f.title, i.store_id, COUNT(*) as count FROM film f JOIN inventory i ON f.film_id = i.film_id WHERE f.title IN ('Data Science Adventures', 'Cloud Computing Chronicles') GROUP BY f.title, i.store_id ORDER BY f.title, i.store_id """) actual_inventory = cur.fetchall() expected_inventory = [ ('Cloud Computing Chronicles', 1, 3), ('Cloud Computing Chronicles', 2, 2), ('Data Science Adventures', 1, 3), ('Data Science Adventures', 2, 2) ] if len(actual_inventory) != len(expected_inventory): print(f"❌ Expected {len(expected_inventory)} inventory groups, found {len(actual_inventory)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_inventory, expected_inventory)): if not rows_match(actual, expected): print(f"❌ Inventory group {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total inventory mismatches: {mismatches}") return False print("✅ Inventory records added correctly") return True def check_available_films_table(conn) -> bool: """Check if available_films table was created and populated correctly.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT film_id, title, rental_rate, length FROM available_films ORDER BY rental_rate DESC, length DESC, title ASC """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" SELECT DISTINCT f.film_id, f.title, f.rental_rate, f.length FROM film f JOIN inventory i ON f.film_id = i.film_id WHERE f.rental_rate >= 3.00 AND f.rental_rate <= 5.00 AND f.length > 100 AND i.store_id = 1 ORDER BY f.rental_rate DESC, f.length DESC, f.title ASC """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ available_films table has {len(actual_results)} records, expected {len(expected_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ available_films row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total available_films mismatches: {mismatches}") return False print(f"✅ available_films table created and populated correctly ({len(actual_results)} records)") return True def check_inventory_cleanup(conn) -> bool: """Check if inventory cleanup was performed correctly.""" with conn.cursor() as cur: # Check that no inventory exists for films with replacement_cost > 25 AND rental_rate < 1 # that also don't have rental records (safe to delete) cur.execute(""" SELECT COUNT(*) FROM inventory i JOIN film f 
ON i.film_id = f.film_id WHERE f.replacement_cost > 25.00 AND f.rental_rate < 1.00 AND NOT EXISTS (SELECT 1 FROM rental r WHERE r.inventory_id = i.inventory_id) """) remaining_count = cur.fetchone()[0] if remaining_count > 0: print(f"❌ Found {remaining_count} inventory records that should have been deleted (no rental history)") return False print("✅ Inventory cleanup completed correctly") return True def check_summary_table(conn) -> bool: """Check if film_inventory_summary table was created and populated correctly.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT title, rental_rate, total_inventory, store1_count, store2_count FROM film_inventory_summary """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" SELECT f.title, f.rental_rate, COUNT(i.inventory_id) as total_inventory, COUNT(CASE WHEN i.store_id = 1 THEN 1 END) as store1_count, COUNT(CASE WHEN i.store_id = 2 THEN 1 END) as store2_count FROM film f JOIN inventory i ON f.film_id = i.film_id GROUP BY f.film_id, f.title, f.rental_rate ORDER BY total_inventory DESC, f.title ASC """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ film_inventory_summary table has {len(actual_results)} records, expected {len(expected_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Summary row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total summary table mismatches: {mismatches}") return False print(f"✅ film_inventory_summary table created and populated correctly ({len(actual_results)} records)") return True def main(): """Main verification function.""" print("=" * 70) print("PostgreSQL Task 4 Verification: Film Inventory Management") print("=" * 70) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify all operations with short-circuit evaluation success = ( check_new_films(conn) and check_inventory_records(conn) and check_available_films_table(conn) and check_inventory_cleanup(conn) and check_summary_table(conn) ) conn.close() if success: print(f"\n🎉 Task verification: PASS") sys.exit(0) else: print(f"\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/employees/employee_demographics_report/description.md ================================================ Generate a comprehensive employee demographics and basic statistics report for the annual company overview. The HR team needs simple, clear statistical summaries about our workforce composition to include in the annual report and diversity initiatives. ## Your Tasks: 1. 
**Create the gender statistics table** — build a table called `gender_statistics` in the `employees` schema with these exact columns: * `gender` (varchar) — gender ('M' or 'F') * `total_employees` (integer) — total number of employees of this gender * `current_employees` (integer) — current employees of this gender (have active salary) * `percentage_of_workforce` (decimal) — percentage of current workforce 2. **Create the age group analysis table** — build a table called `age_group_analysis` in the `employees` schema with: * `age_group` (varchar) — age range ('20-29', '30-39', '40-49', '50-59', '60+') * `employee_count` (integer) — number of current employees in age group * `avg_salary` (decimal) — average current salary for age group * `avg_tenure_days` (decimal) — average days of service 3. **Create the birth month distribution table** — build a table called `birth_month_distribution` in the `employees` schema with: * `birth_month` (integer) — month number (1-12) * `month_name` (varchar) — month name ('January', 'February', etc.) * `employee_count` (integer) — total employees born in this month * `current_employee_count` (integer) — current employees born in this month 4. **Create the hiring year summary table** — build a table called `hiring_year_summary` in the `employees` schema with: * `hire_year` (integer) — year employees were hired * `employees_hired` (integer) — number of employees hired that year * `still_employed` (integer) — how many from that year are still employed * `retention_rate` (decimal) — percentage still employed (still_employed/employees_hired * 100) 5. **Apply age group classification** based on current age: * **20-29**: Ages 20-29 * **30-39**: Ages 30-39 * **40-49**: Ages 40-49 * **50-59**: Ages 50-59 * **60+**: Ages 60 and above 6. **Calculate workforce composition** — determine current workforce demographics using employees with active salary records (to_date = '9999-01-01'). 7. **Focus on basic statistics** — create simple counts, averages, and percentages that are easy to understand and verify. The analysis will provide clear demographic insights for HR reporting and workforce planning. 
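For orientation only (not a prescribed solution), a minimal sketch of the first table might look like the following. It assumes the standard `employees` sample schema used by these tasks and the definition of a current employee from task 6 (an active salary record with `to_date = '9999-01-01'`):

```sql
-- Illustrative sketch for task 1 only; adapt as needed.
CREATE TABLE employees.gender_statistics (
    gender                  varchar,
    total_employees         integer,
    current_employees       integer,
    percentage_of_workforce decimal
);

-- "Current" employees are those with an active salary record (to_date = '9999-01-01').
WITH current_emp AS (
    SELECT DISTINCT employee_id
    FROM employees.salary
    WHERE to_date = DATE '9999-01-01'
)
INSERT INTO employees.gender_statistics
    (gender, total_employees, current_employees, percentage_of_workforce)
SELECT e.gender::varchar,
       COUNT(*),
       COUNT(ce.employee_id),
       COUNT(ce.employee_id)::decimal
           / NULLIF((SELECT COUNT(*) FROM current_emp), 0) * 100
FROM employees.employee e
LEFT JOIN current_emp ce ON ce.employee_id = e.id
GROUP BY e.gender;
```

The `LEFT JOIN` keeps former employees in the per-gender totals while `COUNT(ce.employee_id)` counts only matches, so one pass yields both counts; the other three tables follow the same pattern with their own grouping keys.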
================================================ FILE: tasks/postgres/standard/employees/employee_demographics_report/meta.json ================================================ { "task_id": "employee_demographics_report", "task_name": "Employee Demographics Report", "category_id": "employees", "category_name": "Employees", "description": "Generate comprehensive employee demographics report with gender statistics, age groups, birth months, and hiring trends.", "author": "Lingxiao Du", "created_at": "2025-08-14", "difficulty": "L3", "tags": [ "reporting and analytics", "statistical aggregation" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz" } } ================================================ FILE: tasks/postgres/standard/employees/employee_demographics_report/verify.py ================================================ """ Verification 
script for PostgreSQL Task 3: Employee Demographics Report """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.1 tolerance For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, Decimal): if abs(float(actual) - float(expected)) > 0.1: return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_gender_statistics_results(conn) -> bool: """Verify the gender statistics results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT gender, total_employees, current_employees, percentage_of_workforce FROM employees.gender_statistics ORDER BY gender """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH current_emp AS ( SELECT DISTINCT s.employee_id FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ), total_current AS ( SELECT COUNT(*) AS cnt FROM current_emp ) SELECT e.gender::varchar AS gender, COUNT(*) AS total_employees, COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL) AS current_employees, (COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL))::DECIMAL / NULLIF((SELECT cnt FROM total_current), 0) * 100 AS percentage_of_workforce FROM employees.employee e LEFT JOIN current_emp ce ON ce.employee_id = e.id WHERE e.gender IN ('M','F') GROUP BY e.gender ORDER BY gender; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} gender statistics results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Gender statistics results are correct ({len(actual_results)} records)") return True def verify_age_group_results(conn) -> bool: """Verify the age group analysis results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT age_group, employee_count, avg_salary, avg_tenure_days FROM employees.age_group_analysis ORDER BY age_group """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH current_salary AS ( SELECT employee_id, amount FROM ( SELECT s.*, ROW_NUMBER() OVER ( PARTITION BY s.employee_id ORDER BY s.from_date DESC, s.amount DESC ) AS rn FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ) x WHERE rn = 1 ), emp_age AS ( SELECT e.id AS employee_id, e.hire_date, EXTRACT(YEAR FROM AGE(CURRENT_DATE, e.birth_date))::INT AS age_years FROM employees.employee e WHERE e.birth_date IS NOT NULL ) SELECT CASE WHEN a.age_years BETWEEN 20 AND 29 THEN '20-29' WHEN a.age_years BETWEEN 30 AND 39 THEN '30-39' WHEN a.age_years BETWEEN 40 AND 49 THEN '40-49' WHEN a.age_years BETWEEN 50 AND 59 THEN '50-59' WHEN a.age_years >= 60 THEN '60+' 
END AS age_group, COUNT(*)::INT AS employee_count, AVG(cs.amount) AS avg_salary, AVG((CURRENT_DATE - a.hire_date)::INT) AS avg_tenure_days FROM emp_age a JOIN current_salary cs ON cs.employee_id = a.employee_id WHERE a.age_years >= 20 GROUP BY 1 ORDER BY 1; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} age group results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Age group analysis results are correct ({len(actual_results)} records)") return True def verify_birth_month_results(conn) -> bool: """Verify the birth month distribution results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT birth_month, month_name, employee_count, current_employee_count FROM employees.birth_month_distribution ORDER BY birth_month """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH current_emp AS ( SELECT DISTINCT s.employee_id FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ), months AS ( SELECT gs AS birth_month FROM generate_series(1, 12) AS gs ) SELECT m.birth_month::INTEGER AS birth_month, CASE m.birth_month WHEN 1 THEN 'January' WHEN 2 THEN 'February' WHEN 3 THEN 'March' WHEN 4 THEN 'April' WHEN 5 THEN 'May' WHEN 6 THEN 'June' WHEN 7 THEN 'July' WHEN 8 THEN 'August' WHEN 9 THEN 'September' WHEN 10 THEN 'October' WHEN 11 THEN 'November'WHEN 12 THEN 'December' END AS month_name, COUNT(e.id)::INTEGER AS employee_count, COUNT(ce.employee_id)::INTEGER AS current_employee_count FROM months m LEFT JOIN employees.employee e ON EXTRACT(MONTH FROM e.birth_date) = m.birth_month LEFT JOIN current_emp ce ON ce.employee_id = e.id GROUP BY m.birth_month ORDER BY m.birth_month; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} birth month results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Birth month distribution results are correct ({len(actual_results)} records)") return True def verify_hiring_year_results(conn) -> bool: """Verify the hiring year summary results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT hire_year, employees_hired, still_employed, retention_rate FROM employees.hiring_year_summary ORDER BY hire_year """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH current_emp AS ( SELECT DISTINCT s.employee_id FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ), base AS ( SELECT e.id, EXTRACT(YEAR FROM e.hire_date)::INT AS hire_year FROM employees.employee e WHERE e.hire_date IS NOT NULL ) SELECT b.hire_year, COUNT(*)::INT AS employees_hired, COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL)::INT AS still_employed, (COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL))::DECIMAL / 
NULLIF(COUNT(*), 0) * 100 AS retention_rate FROM base b LEFT JOIN current_emp ce ON ce.employee_id = b.id GROUP BY b.hire_year ORDER BY b.hire_year; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} hiring year results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Hiring year summary results are correct ({len(actual_results)} records)") return True def main(): """Main verification function.""" print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify all four analysis results success = ( verify_gender_statistics_results(conn) and verify_age_group_results(conn) and verify_birth_month_results(conn) and verify_hiring_year_results(conn) ) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/employees/employee_performance_analysis/description.md ================================================ Create a comprehensive employee performance evaluation system that analyzes career progression patterns and salary equity across our organization. The executive team needs data-driven insights for upcoming promotion decisions and salary adjustment planning. ## Your Tasks: 1. **Create the employee performance analysis table** — build a table called `employee_performance_analysis` in the `employees` schema with these exact columns: * `employee_id` (bigint) — the employee's ID * `performance_category` (varchar) — classification of employee performance ('high_achiever', 'steady_performer', 'needs_attention') * `salary_growth_rate` (decimal) — percentage salary increase from first salary record to current * `days_of_service` (integer) — total days with the company * `promotion_count` (integer) — number of different titles held 2. **Analyze only current employees** — focus on employees who currently have active salary records (to_date = '9999-01-01'). 3. **Apply performance classification rules**: * **High achievers**: Salary growth rate > 40% AND more than 1 title held * **Needs attention**: Salary growth rate < 15% AND more than 3650 days of service (10 years) * **Steady performers**: All other current employees (default category) 4. **Create the department salary analysis table** — build a table called `department_salary_analysis` in the `employees` schema with: * `department_name` (varchar) — the department name * `avg_current_salary` (decimal) — average current salary in the department (only current employees) * `employee_count` (integer) — total current employees in the department * `salary_range_spread` (integer) — difference between max and min salary (current employees only) 5. 
**Calculate salary equity metrics** — populate the department table with current salary statistics for active employees only to identify potential pay equity issues across departments. The analysis should help leadership make informed decisions about promotions, salary adjustments, and talent retention strategies. ================================================ FILE: tasks/postgres/standard/employees/employee_performance_analysis/meta.json ================================================ { "task_id": "employee_performance_analysis", "task_name": "Employee Performance Analysis", "category_id": "employees", "category_name": "Employees", "description": "Create performance evaluation system analyzing career progression patterns and salary equity for promotion and compensation decisions.", "author": "Lingxiao Du", "created_at": "2025-08-14", "difficulty": "L3", "tags": [ "reporting and analytics", "statistical aggregation", "schema design" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: 
restrict, delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz" } } ================================================ FILE: tasks/postgres/standard/employees/employee_performance_analysis/verify.py ================================================ """ Verification script for PostgreSQL Task 1: Employee Performance Analysis """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.1 tolerance For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, Decimal): if abs(float(actual) - float(expected)) > 0.1: return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_performance_results(conn) -> bool: """Verify the employee performance analysis results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT employee_id, performance_category, salary_growth_rate, days_of_service, promotion_count FROM employees.employee_performance_analysis ORDER BY employee_id """) actual_results = cur.fetchall() # Execute ground truth query - use first salary record as starting salary cur.execute(""" WITH current_salary AS ( SELECT employee_id, amount AS current_amount FROM ( SELECT s.*, ROW_NUMBER() OVER (PARTITION BY s.employee_id ORDER BY s.from_date DESC, s.amount DESC) AS rn FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ) x WHERE rn = 1 ), first_salary AS ( SELECT employee_id, amount AS first_amount FROM ( SELECT s.*, ROW_NUMBER() OVER (PARTITION BY s.employee_id ORDER BY s.from_date ASC, s.amount ASC) AS rn FROM employees.salary s ) x WHERE rn = 1 ), title_counts AS ( SELECT t.employee_id, COUNT(DISTINCT t.title) AS promotion_count FROM employees.title t GROUP BY t.employee_id ), base AS ( SELECT e.id AS employee_id, e.hire_date, cs.current_amount, fs.first_amount, COALESCE(tc.promotion_count, 0) AS promotion_count FROM employees.employee e JOIN current_salary cs ON cs.employee_id = e.id JOIN first_salary fs ON fs.employee_id = e.id LEFT JOIN title_counts tc ON tc.employee_id = e.id ), scored AS ( SELECT employee_id, ((current_amount - first_amount) / NULLIF(first_amount, 0)::NUMERIC) * 100 AS salary_growth_rate, (CURRENT_DATE - hire_date)::INTEGER AS days_of_service, promotion_count FROM base ) SELECT s.employee_id, CASE WHEN s.salary_growth_rate > 40 AND s.promotion_count > 1 THEN 'high_achiever' WHEN s.salary_growth_rate < 15 AND s.days_of_service > 3650 THEN 'needs_attention' ELSE 'steady_performer' END AS performance_category, s.salary_growth_rate, s.days_of_service, s.promotion_count AS promotion_count FROM scored s ORDER BY s.employee_id; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} performance results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if 
mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Employee performance results are correct ({len(actual_results)} records)") return True def verify_department_results(conn) -> bool: """Verify the department salary analysis results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT department_name, avg_current_salary, employee_count, salary_range_spread FROM employees.department_salary_analysis ORDER BY department_name """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH current_salary AS ( SELECT employee_id, amount FROM ( SELECT s.*, ROW_NUMBER() OVER (PARTITION BY s.employee_id ORDER BY s.from_date DESC, s.amount DESC) AS rn FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ) x WHERE rn = 1 ), current_dept AS ( SELECT DISTINCT de.employee_id, de.department_id FROM employees.department_employee de WHERE de.to_date = DATE '9999-01-01' ) SELECT d.dept_name AS department_name, AVG(cs.amount)::DECIMAL AS avg_current_salary, COUNT(DISTINCT cd.employee_id) AS employee_count, (MAX(cs.amount) - MIN(cs.amount)) AS salary_range_spread FROM employees.department d JOIN current_dept cd ON cd.department_id = d.id JOIN current_salary cs ON cs.employee_id = cd.employee_id GROUP BY d.id, d.dept_name ORDER BY d.dept_name; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} department results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Department salary results are correct ({len(actual_results)} records)") return True def main(): """Main verification function.""" print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify results success = verify_performance_results(conn) and verify_department_results(conn) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/employees/employee_project_tracking/description.md ================================================ Create and manage a comprehensive employee project tracking system using database schema design and data manipulation operations. The IT team needs you to build the database structure from scratch and populate it with specific initial data to support project management workflows. ## Your Tasks: 1. 
**Create the project tracking tables** — build three new tables in the `employees` schema: **Table 1: `employee_projects`** * `project_id` (integer, primary key, auto-increment) * `project_name` (varchar(100), not null) * `start_date` (date, not null) * `end_date` (date) * `budget` (decimal(10,2)) * `status` (varchar(20), default 'active') **Table 2: `project_assignments`** * `assignment_id` (integer, primary key, auto-increment) * `employee_id` (bigint, not null) * `project_id` (integer, not null) * `role` (varchar(50), not null) * `allocation_percentage` (integer, check constraint: between 1 and 100) * `assigned_date` (date, not null) **Table 3: `project_milestones`** * `milestone_id` (integer, primary key, auto-increment) * `project_id` (integer, not null) * `milestone_name` (varchar(100), not null) * `due_date` (date, not null) * `completed` (boolean, default false) 2. **Add foreign key relationships**: * `project_assignments.employee_id` → `employees.employee.id` * `project_assignments.project_id` → `employees.employee_projects.project_id` * `project_milestones.project_id` → `employees.employee_projects.project_id` 3. **Create performance indexes**: * Index named `idx_projects_status` on `employee_projects.status` * Composite index named `idx_assignments_emp_proj` on `project_assignments(employee_id, project_id)` * Index named `idx_milestones_due_date` on `project_milestones.due_date` 4. **Insert exactly this initial data**: **Into `employee_projects`:** * Project 1: name='Database Modernization', start_date='2024-01-15', end_date='2024-06-30', budget=250000.00, status='active' * Project 2: name='Employee Portal Upgrade', start_date='2024-02-01', end_date='2024-05-15', budget=180000.00, status='active' * Project 3: name='HR Analytics Dashboard', start_date='2023-11-01', end_date='2024-01-31', budget=120000.00, status='active' **Into `project_assignments` (assign ALL current employees):** * All employees from Development department → Project 1 ('Database Modernization'), role='Developer', allocation=80% * All employees from Human Resources department → Project 2 ('Employee Portal Upgrade'), role='Business Analyst', allocation=60% * All employees from Marketing department → Project 3 ('HR Analytics Dashboard'), role='Marketing Specialist', allocation=40% * All employees from Finance department → Project 1 ('Database Modernization'), role='Financial Analyst', allocation=30% * All employees from Sales department → Project 2 ('Employee Portal Upgrade'), role='Sales Representative', allocation=50% * All employees from Research department → Project 3 ('HR Analytics Dashboard'), role='Research Analyst', allocation=70% * All employees from Production department → Project 1 ('Database Modernization'), role='Production Coordinator', allocation=45% * All employees from Quality Management department → Project 2 ('Employee Portal Upgrade'), role='QA Specialist', allocation=85% * All employees from Customer Service department → Project 3 ('HR Analytics Dashboard'), role='Customer Success', allocation=35% * All employees should have assigned_date='2024-01-01' **Into `project_milestones`:** * Project 1: 'Design Phase Complete' due '2024-03-01', 'Implementation Complete' due '2024-05-15' * Project 2: 'UI/UX Approval' due '2024-03-15', 'Beta Testing' due '2024-04-30' * Project 3: 'Data Collection' due '2023-12-15', 'Dashboard Launch' due '2024-01-25' 5. 
**Perform these exact data updates**: * Update Project 3 ('HR Analytics Dashboard') status to 'completed' * Increase budget by 15% for all projects with status 'active' * Mark the milestone 'Data Collection' as completed (set completed = true) 6. **Add new column to `employee_projects`**: * Add `priority` column (varchar(10)) with check constraint allowing only 'low', 'medium', 'high' * Update all existing projects: set priority='high' for 'Database Modernization', priority='medium' for others ================================================ FILE: tasks/postgres/standard/employees/employee_project_tracking/meta.json ================================================ { "task_id": "employee_project_tracking", "task_name": "Employee Project Tracking", "category_id": "employees", "category_name": "Employees", "description": "Build project tracking system from scratch with tables for projects, assignments, milestones, and performance indexes.", "author": "Lingxiao Du", "created_at": "2025-08-14", "difficulty": "L3", "tags": [ "schema design", "data migration", "data integrity enforcement" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" 
[update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz" } } ================================================ FILE: tasks/postgres/standard/employees/employee_project_tracking/verify.py ================================================ """ Verification script for PostgreSQL Task 5: Database Schema and Data Operations """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.1 tolerance For date types: convert to string for comparison For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, (Decimal, float, int)): if abs(float(actual) - float(expected)) > 0.1: return False elif hasattr(actual, 'strftime'): # datetime.date or datetime.datetime if str(actual) != str(expected): return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_table_structures(conn) -> bool: """Verify that all three tables were created with correct structure.""" with conn.cursor() as cur: # Check if tables exist cur.execute(""" SELECT table_name FROM information_schema.tables WHERE table_schema = 'employees' AND table_name IN ('employee_projects', 'project_assignments', 'project_milestones') ORDER BY table_name """) tables = [row[0] for row in cur.fetchall()] if len(tables) != 3: print(f"❌ Expected 3 tables, found {len(tables)}: {tables}") return False # Check foreign key constraints exist cur.execute(""" SELECT COUNT(*) FROM information_schema.table_constraints WHERE table_schema = 'employees' AND constraint_type = 'FOREIGN KEY' AND table_name IN ('project_assignments', 'project_milestones') """) fkey_count = cur.fetchone()[0] if fkey_count != 3: print(f"❌ Expected 3 foreign key constraints, found {fkey_count}") return False # Check if priority column exists (added in step 6) cur.execute(""" SELECT COUNT(*) FROM information_schema.columns WHERE table_schema = 'employees' AND table_name = 'employee_projects' AND column_name = 'priority' """) priority_exists = cur.fetchone()[0] if priority_exists == 0: print("❌ Priority column was not added to employee_projects table") return False print("✅ Table structures are correct") return True def verify_indexes(conn) -> bool: """Verify that required indexes were created.""" with conn.cursor() as cur: # Check for specific indexes cur.execute(""" SELECT COUNT(*) FROM pg_indexes WHERE schemaname = 'employees' AND indexname IN ('idx_projects_status', 'idx_assignments_emp_proj', 'idx_milestones_due_date') """) index_count = cur.fetchone()[0] if index_count != 3: print(f"❌ Expected 3 required indexes, got {index_count}") return False print("✅ All required indexes are present") return True def verify_project_data(conn) -> bool: """Verify that project data was inserted and updated correctly.""" with conn.cursor() as cur: # Check project data after 
updates cur.execute(""" SELECT project_name, start_date, end_date, budget, status, priority FROM employees.employee_projects ORDER BY project_name """) projects = cur.fetchall() if len(projects) != 3: print(f"❌ Expected 3 projects, found {len(projects)}") return False # Expected final state after all updates expected = { 'Database Modernization': ('2024-01-15', '2024-06-30', 287500.00, 'active', 'high'), 'Employee Portal Upgrade': ('2024-02-01', '2024-05-15', 207000.00, 'active', 'medium'), 'HR Analytics Dashboard': ('2023-11-01', '2024-01-31', 120000.00, 'completed', 'medium') } for project in projects: name = project[0] if name not in expected: print(f"❌ Unexpected project: {name}") return False exp = expected[name] # Use rows_match for comparison expected_row = (name,) + exp if not rows_match(project, expected_row): print(f"❌ Project {name} data mismatch: expected {expected_row}, got {project}") return False print("✅ Project data is correct") return True def verify_assignment_data(conn) -> bool: """Verify that all current employees were assigned to projects by department.""" with conn.cursor() as cur: # Check total assignment count matches current employee count cur.execute(""" SELECT COUNT(*) FROM employees.project_assignments """) assignment_count = cur.fetchone()[0] cur.execute(""" SELECT COUNT(DISTINCT de.employee_id) FROM employees.department_employee de WHERE de.to_date = '9999-01-01' """) current_employee_count = cur.fetchone()[0] if assignment_count != current_employee_count: print(f"❌ Expected {current_employee_count} assignments, found {assignment_count}") return False # Check department-project mapping cur.execute(""" SELECT d.dept_name, pa.project_id, pa.role, pa.allocation_percentage, COUNT(*) FROM employees.project_assignments pa JOIN employees.department_employee de ON pa.employee_id = de.employee_id AND de.to_date = '9999-01-01' JOIN employees.department d ON de.department_id = d.id JOIN employees.employee_projects ep ON pa.project_id = ep.project_id GROUP BY d.dept_name, pa.project_id, pa.role, pa.allocation_percentage ORDER BY d.dept_name """) dept_assignments = cur.fetchall() # Expected department-project mappings expected_mappings = { 'Development': (1, 'Developer', 80), 'Human Resources': (2, 'Business Analyst', 60), 'Marketing': (3, 'Marketing Specialist', 40), 'Finance': (1, 'Financial Analyst', 30), 'Sales': (2, 'Sales Representative', 50), 'Research': (3, 'Research Analyst', 70), 'Production': (1, 'Production Coordinator', 45), 'Quality Management': (2, 'QA Specialist', 85), 'Customer Service': (3, 'Customer Success', 35) } dept_found = {} for assignment in dept_assignments: dept_name, project_id, role, allocation, _ = assignment # Ignore count if dept_name in dept_found: print(f"❌ Department {dept_name} has multiple assignments") return False dept_found[dept_name] = (project_id, role, allocation) for dept, expected in expected_mappings.items(): if dept not in dept_found: print(f"❌ Department {dept} has no assignments") return False if dept_found[dept] != expected: print(f"❌ Department {dept} assignment mismatch: expected {expected}, got {dept_found[dept]}") return False # Check that all assignments have correct assigned_date cur.execute(""" SELECT COUNT(*) FROM employees.project_assignments WHERE assigned_date != '2024-01-01' """) wrong_date_count = cur.fetchone()[0] if wrong_date_count > 0: print(f"❌ {wrong_date_count} assignments have incorrect assigned_date") return False print("✅ Assignment data is correct") return True def verify_milestone_data(conn) -> 
bool: """Verify that milestone data was inserted and updated correctly.""" with conn.cursor() as cur: cur.execute(""" SELECT project_id, milestone_name, due_date, completed FROM employees.project_milestones ORDER BY project_id, milestone_name """) milestones = cur.fetchall() if len(milestones) != 6: print(f"❌ Expected 6 milestones, found {len(milestones)}") return False # Expected milestones expected_milestones = { (1, 'Design Phase Complete'): ('2024-03-01', False), (1, 'Implementation Complete'): ('2024-05-15', False), (2, 'UI/UX Approval'): ('2024-03-15', False), (2, 'Beta Testing'): ('2024-04-30', False), (3, 'Data Collection'): ('2023-12-15', True), # Should be completed (3, 'Dashboard Launch'): ('2024-01-25', False) } for milestone in milestones: project_id, name, due_date, completed = milestone key = (project_id, name) if key not in expected_milestones: print(f"❌ Unexpected milestone: {key}") return False expected_due, expected_completed = expected_milestones[key] if str(due_date) != expected_due or completed != expected_completed: print(f"❌ Milestone {name} mismatch: expected ({expected_due}, {expected_completed}), got ({due_date}, {completed})") return False print("✅ Milestone data is correct") return True def main(): """Main verification function.""" print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify all components success = ( verify_table_structures(conn) and verify_indexes(conn) and verify_project_data(conn) and verify_assignment_data(conn) and verify_milestone_data(conn) ) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/employees/employee_retention_analysis/description.md ================================================ Analyze employee retention patterns and identify factors contributing to turnover across the organization. The HR leadership team needs comprehensive insights to develop targeted retention strategies and reduce costly employee attrition. ## Your Tasks: 1. **Create the retention analysis table** — build a table called `employee_retention_analysis` in the `employees` schema with these exact columns: * `department_name` (varchar) — the department name * `total_employees_ever` (integer) — total number of employees who have ever worked in this department * `current_employees` (integer) — number of current employees in the department * `former_employees` (integer) — number of employees who left the department * `retention_rate` (decimal) — percentage of employees still with the company (current/total * 100) 2. 
**Create the high-risk employee identification table** — build a table called `high_risk_employees` in the `employees` schema with:
   * `employee_id` (bigint) — the employee's ID
   * `full_name` (varchar) — concatenated first and last name
   * `current_department` (varchar) — current department name
   * `tenure_days` (integer) — days with the company
   * `current_salary` (integer) — current salary amount
   * `risk_category` (varchar) — risk level ('high_risk', 'medium_risk', 'low_risk')

   **Note**: Analyze only current employees (those with active salary records where to_date = '9999-01-01').

3. **Create the turnover trend analysis table** — build a table called `turnover_trend_analysis` in the `employees` schema with:
   * `departure_year` (integer) — year when employees left (extract from to_date of salary records)
   * `departures_count` (integer) — number of employees who left that year
   * `avg_tenure_days` (decimal) — average tenure in days for employees who left that year
   * `avg_final_salary` (decimal) — average final salary of departed employees that year

4. **Apply risk assessment criteria** for current employees:
   * **High risk**: Employees in departments with retention rate < 80% AND tenure < 1095 days (3 years)
   * **Medium risk**: Employees in departments with retention rate < 85% AND tenure < 1825 days (5 years)
   * **Low risk**: All other current employees

5. **Analyze departure trends** — examine employees who left between 1985-2002, grouping by departure year.

6. **Handle final salary selection** — when calculating `avg_final_salary`, if an employee has multiple salary records with the same departure date, select the record with the latest start date. If there are still ties, select the record with the highest salary amount.

7. **Focus appropriately** — use current employees for risk analysis, all historical data for retention rates, and former employees for trend analysis.

The comprehensive analysis will help identify retention patterns, at-risk employees, and historical turnover trends to guide strategic workforce planning.
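For reference, the accompanying `verify.py` recomputes the Task 1 aggregates directly from `department_employee`. A minimal sketch of one way to populate `employee_retention_analysis`, using a `CREATE TABLE ... AS` statement; the exact query shape is up to the solver, any query producing equivalent per-department rows should satisfy the same comparison:

```sql
-- Sketch only: one possible way to build the Task 1 table.
-- Mirrors the per-department aggregation used by the verification script.
CREATE TABLE employees.employee_retention_analysis AS
SELECT
    d.dept_name AS department_name,
    COUNT(DISTINCT de.employee_id) AS total_employees_ever,
    COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01') AS current_employees,
    COUNT(DISTINCT de.employee_id)
      - COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01') AS former_employees,
    (COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01'))::DECIMAL
      / NULLIF(COUNT(DISTINCT de.employee_id), 0) * 100 AS retention_rate
FROM employees.department d
LEFT JOIN employees.department_employee de ON de.department_id = d.id
GROUP BY d.id, d.dept_name;
```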
================================================ FILE: tasks/postgres/standard/employees/employee_retention_analysis/meta.json ================================================ { "task_id": "employee_retention_analysis", "task_name": "Employee Retention Analysis", "category_id": "employees", "category_name": "Employees", "description": "Analyze retention patterns identifying turnover factors and high-risk employees to develop targeted retention strategies.", "author": "Lingxiao Du", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "reporting and analytics", "statistical aggregation", "audit and compliance" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz" } } ================================================ FILE: tasks/postgres/standard/employees/employee_retention_analysis/verify.py ================================================ 
""" Verification script for PostgreSQL Task 2: Employee Retention Analysis """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.1 tolerance For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, Decimal): if abs(float(actual) - float(expected)) > 0.1: return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_retention_analysis_results(conn) -> bool: """Verify the employee retention analysis results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT department_name, total_employees_ever, current_employees, former_employees, retention_rate FROM employees.employee_retention_analysis ORDER BY department_name """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" SELECT d.dept_name AS department_name, COUNT(DISTINCT de.employee_id) AS total_employees_ever, COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01') AS current_employees, (COUNT(DISTINCT de.employee_id) - COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01')) AS former_employees, (COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01'))::DECIMAL / NULLIF(COUNT(DISTINCT de.employee_id), 0) * 100 AS retention_rate FROM employees.department d LEFT JOIN employees.department_employee de ON d.id = de.department_id GROUP BY d.id, d.dept_name ORDER BY d.dept_name """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} retention analysis results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Employee retention analysis results are correct ({len(actual_results)} records)") return True def verify_high_risk_results(conn) -> bool: """Verify the high risk employee analysis results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT employee_id, full_name, current_department, tenure_days, current_salary, risk_category FROM employees.high_risk_employees ORDER BY employee_id """) actual_results = cur.fetchall() # Execute ground truth query - only current employees cur.execute(""" WITH current_salary AS ( SELECT employee_id, amount AS current_amount FROM ( SELECT s.*, ROW_NUMBER() OVER (PARTITION BY s.employee_id ORDER BY s.from_date DESC, s.amount DESC) AS rn FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ) x WHERE rn = 1 ), current_dept AS ( SELECT employee_id, department_id FROM ( SELECT de.*, ROW_NUMBER() OVER (PARTITION BY de.employee_id ORDER BY de.from_date DESC, de.department_id) AS rn FROM employees.department_employee de WHERE 
de.to_date = DATE '9999-01-01' ) x WHERE rn = 1 ), dept_retention AS ( SELECT d.id AS department_id, d.dept_name, COUNT(DISTINCT de.employee_id) AS total_employees_ever, COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01') AS current_employees, (COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01'))::NUMERIC / NULLIF(COUNT(DISTINCT de.employee_id), 0) * 100 AS retention_rate FROM employees.department d LEFT JOIN employees.department_employee de ON de.department_id = d.id GROUP BY d.id, d.dept_name ) SELECT e.id AS employee_id, CONCAT(e.first_name, ' ', e.last_name) AS full_name, d.dept_name AS current_department, (CURRENT_DATE - e.hire_date)::INTEGER AS tenure_days, cs.current_amount::INTEGER AS current_salary, CASE WHEN dr.retention_rate < 80 AND (CURRENT_DATE - e.hire_date) < 1095 THEN 'high_risk' WHEN dr.retention_rate < 85 AND (CURRENT_DATE - e.hire_date) < 1825 THEN 'medium_risk' ELSE 'low_risk' END AS risk_category FROM employees.employee e JOIN current_salary cs ON cs.employee_id = e.id JOIN current_dept cd ON cd.employee_id = e.id JOIN employees.department d ON d.id = cd.department_id JOIN dept_retention dr ON dr.department_id = d.id ORDER BY e.id; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} high risk analysis results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ High risk employee analysis results are correct ({len(actual_results)} records)") return True def verify_turnover_trend_results(conn) -> bool: """Verify the turnover trend analysis results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT departure_year, departures_count, avg_tenure_days, avg_final_salary FROM employees.turnover_trend_analysis ORDER BY departure_year """) actual_results = cur.fetchall() # Execute ground truth query - simplified version cur.execute(""" WITH last_non_current_salary AS ( SELECT s.employee_id, s.to_date AS departure_date, s.amount AS final_salary, ROW_NUMBER() OVER ( PARTITION BY s.employee_id ORDER BY s.to_date DESC, s.from_date DESC, s.amount DESC ) AS rn FROM employees.salary s WHERE s.to_date <> DATE '9999-01-01' AND NOT EXISTS ( SELECT 1 FROM employees.salary s_cur WHERE s_cur.employee_id = s.employee_id AND s_cur.to_date = DATE '9999-01-01' ) ), departed AS ( SELECT employee_id, departure_date, final_salary FROM last_non_current_salary WHERE rn = 1 ), with_tenure AS ( SELECT e.id AS employee_id, d.departure_date, d.final_salary, (d.departure_date - e.hire_date)::INTEGER AS tenure_days FROM employees.employee e JOIN departed d ON d.employee_id = e.id ) SELECT EXTRACT(YEAR FROM departure_date)::INTEGER AS departure_year, COUNT(*)::INTEGER AS departures_count, AVG(tenure_days) AS avg_tenure_days, AVG(final_salary) AS avg_final_salary FROM with_tenure WHERE departure_date BETWEEN DATE '1985-01-01' AND DATE '2002-12-31' GROUP BY EXTRACT(YEAR FROM departure_date) ORDER BY departure_year; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} turnover trend results, got {len(actual_results)}") return 
False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Turnover trend analysis results are correct ({len(actual_results)} records)") return True def main(): """Main verification function.""" print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify all three analysis results success = ( verify_retention_analysis_results(conn) and verify_high_risk_results(conn) and verify_turnover_trend_results(conn) ) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/employees/executive_dashboard_automation/description.md ================================================ Design a comprehensive reporting and automation system for executive dashboard and real-time monitoring. The executive team needs automated reports, data views, and trigger-based notifications to track key business metrics without manual intervention. ## Your Tasks: 1. **Create executive summary views** — build three materialized views in the `employees` schema: **View 1: `exec_department_summary`** * `department_name` (varchar) — department name * `total_employees` (integer) — current active employee count * `avg_salary` (decimal) — average current salary * `total_payroll` (bigint) — total monthly payroll cost * `manager_name` (varchar) — current department manager name **View 2: `exec_hiring_trends`** * `hire_year` (integer) — year employees were hired * `employees_hired` (integer) — number hired that year * `avg_starting_salary` (decimal) — average first salary of hires that year * `retention_rate` (decimal) — percentage still employed * `top_hiring_department` (varchar) — department that hired the most that year **View 3: `exec_salary_distribution`** * `salary_band` (varchar) — salary ranges ('30K-50K', '50K-70K', '70K-90K', '90K-110K', '110K+') * `employee_count` (integer) — employees in this salary band * `percentage_of_workforce` (decimal) — percentage of total workforce * `most_common_title` (varchar) — most frequent job title in this band 2. **Create stored procedure for report generation**: **Procedure: `generate_monthly_report(report_date DATE)`** * Create a table `monthly_reports` with columns: report_id (auto-increment), report_date, department_count, total_employees (current active employees only), avg_salary, generated_at * Insert one summary record using the report_date as identifier and current database statistics (not historical data for that date) * Return the generated report_id 3. **Create notification triggers**: **Trigger: `high_salary_alert`** * Fires when a new salary record is inserted with amount > 120000 * Inserts alert into `salary_alerts` table with: employee_id, salary_amount, alert_date, status='new' 4. 
**Insert test data to verify triggers**: * Update employee 10001's current salary: first set their current salary record to_date='2024-01-31', then insert new salary record with amount 125000, from_date='2024-02-01', to_date='9999-01-01' * Refresh all materialized views after inserting new data to ensure they reflect the updated information 5. **Execute the stored procedure**: * Call `generate_monthly_report('2024-01-01')` to create January report * Query the generated report to verify execution 6. **Create performance indexes**: * Index on `salary_alerts.status` for alert processing * Composite index on `monthly_reports(report_date, department_count)` for trend analysis ================================================ FILE: tasks/postgres/standard/employees/executive_dashboard_automation/meta.json ================================================ { "task_id": "executive_dashboard_automation", "task_name": "Executive Dashboard Automation", "category_id": "employees", "category_name": "Employees", "description": "Design automated reporting system with materialized views, stored procedures, and triggers for executive dashboard monitoring.", "author": "Lingxiao Du", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "reporting and analytics", "stored procedures and functions", "schema design" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef 
\"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz" } } ================================================ FILE: tasks/postgres/standard/employees/executive_dashboard_automation/verify.py ================================================ """ Verification script for PostgreSQL Task 6: Reporting and Automation System """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.1 tolerance For date types: convert to string for comparison For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, (Decimal, float, int)): if abs(float(actual) - float(expected)) > 0.1: return False elif hasattr(actual, 'strftime'): # datetime.date or datetime.datetime if str(actual) != str(expected): return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_materialized_views(conn) -> bool: """Verify that materialized views were created and populated correctly.""" with conn.cursor() as cur: # Check if materialized views exist cur.execute(""" SELECT matviewname FROM pg_matviews WHERE schemaname = 'employees' AND matviewname IN ('exec_department_summary', 'exec_hiring_trends', 'exec_salary_distribution') ORDER BY matviewname """) views = [row[0] for row in cur.fetchall()] expected_views = ['exec_department_summary', 'exec_hiring_trends', 'exec_salary_distribution'] if set(views) != set(expected_views): print(f"❌ Expected views {expected_views}, found {views}") return False # Check all departments' data accuracy cur.execute(""" SELECT department_name, total_employees, avg_salary, total_payroll, manager_name FROM employees.exec_department_summary ORDER BY department_name """) view_data = cur.fetchall() # Get actual data for all departments cur.execute(""" WITH current_salary AS ( SELECT employee_id, amount FROM ( SELECT s.*, ROW_NUMBER() OVER ( PARTITION BY s.employee_id ORDER BY s.from_date DESC, s.amount DESC ) AS rn FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ) x WHERE rn = 1 ), current_dept AS ( SELECT DISTINCT de.employee_id, de.department_id FROM employees.department_employee de WHERE de.to_date = DATE '9999-01-01' ), current_manager AS ( SELECT department_id, CONCAT(e.first_name, ' ', e.last_name) AS manager_name FROM ( SELECT dm.*, ROW_NUMBER() OVER ( PARTITION BY dm.department_id ORDER BY dm.from_date DESC, dm.employee_id ) AS rn FROM employees.department_manager dm WHERE dm.to_date = DATE '9999-01-01' ) dm JOIN employees.employee e ON e.id = dm.employee_id WHERE dm.rn = 1 ) SELECT d.dept_name AS 
department_name, COUNT(cd.employee_id)::INT AS total_employees, AVG(cs.amount)::DECIMAL AS avg_salary, COALESCE(SUM(cs.amount), 0)::BIGINT AS total_payroll, cm.manager_name FROM employees.department d LEFT JOIN current_dept cd ON cd.department_id = d.id LEFT JOIN current_salary cs ON cs.employee_id = cd.employee_id LEFT JOIN current_manager cm ON cm.department_id = d.id GROUP BY d.id, d.dept_name, cm.manager_name ORDER BY d.dept_name; """) actual_data = cur.fetchall() if len(view_data) != len(actual_data): print(f"❌ Department count mismatch: view={len(view_data)}, actual={len(actual_data)}") return False for view_row, actual_row in zip(view_data, actual_data): if not rows_match(view_row, actual_row): print(f"❌ Department summary data incorrect for {view_row[0]}: view={view_row}, actual={actual_row}") return False # Check all hiring trends data accuracy cur.execute(""" SELECT hire_year, employees_hired, avg_starting_salary, retention_rate, top_hiring_department FROM employees.exec_hiring_trends ORDER BY hire_year """) hiring_view_data = cur.fetchall() # Get actual data for all years cur.execute(""" WITH first_salary AS ( SELECT employee_id, amount AS starting_salary FROM ( SELECT s.*, ROW_NUMBER() OVER ( PARTITION BY s.employee_id ORDER BY s.from_date ASC, s.amount ASC ) AS rn FROM employees.salary s ) x WHERE rn = 1 ), current_emp AS ( SELECT DISTINCT s.employee_id FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ), first_dept AS ( SELECT employee_id, department_id FROM ( SELECT de.*, ROW_NUMBER() OVER ( PARTITION BY de.employee_id ORDER BY de.from_date ASC, de.department_id ) AS rn FROM employees.department_employee de ) x WHERE rn = 1 ), hire_base AS ( SELECT e.id AS employee_id, EXTRACT(YEAR FROM e.hire_date)::INT AS hire_year FROM employees.employee e WHERE e.hire_date IS NOT NULL ), hire_by_dept_year AS ( SELECT hb.hire_year, d.dept_name, COUNT(*) AS dept_hires FROM hire_base hb LEFT JOIN first_dept fd ON fd.employee_id = hb.employee_id LEFT JOIN employees.department d ON d.id = fd.department_id GROUP BY hb.hire_year, d.dept_name ), top_dept_per_year AS ( SELECT hire_year, dept_name AS top_hiring_department FROM ( SELECT hire_year, dept_name, dept_hires, ROW_NUMBER() OVER ( PARTITION BY hire_year ORDER BY dept_hires DESC NULLS LAST, dept_name ) AS rn FROM hire_by_dept_year ) t WHERE rn = 1 ) SELECT hb.hire_year, COUNT(*)::INT AS employees_hired, AVG(fs.starting_salary)::DECIMAL AS avg_starting_salary, (COUNT(ce.employee_id)::DECIMAL / NULLIF(COUNT(*), 0) * 100) AS retention_rate, td.top_hiring_department FROM hire_base hb LEFT JOIN first_salary fs ON fs.employee_id = hb.employee_id LEFT JOIN current_emp ce ON ce.employee_id = hb.employee_id LEFT JOIN top_dept_per_year td ON td.hire_year = hb.hire_year GROUP BY hb.hire_year, td.top_hiring_department ORDER BY hb.hire_year; """) actual_hiring_data = cur.fetchall() if len(hiring_view_data) != len(actual_hiring_data): print(f"❌ Hiring trends count mismatch: view={len(hiring_view_data)}, actual={len(actual_hiring_data)}") return False for hiring_view, actual_hiring in zip(hiring_view_data, actual_hiring_data): # Now compare all 5 fields including top_hiring_department if not rows_match(hiring_view, actual_hiring): print(f"❌ Hiring trends data incorrect for year {hiring_view[0]}: view={hiring_view}, actual={actual_hiring}") return False # Check all salary bands' data accuracy cur.execute(""" WITH band_order AS ( SELECT '30K-50K' AS band, 1 AS ord UNION ALL SELECT '50K-70K', 2 UNION ALL SELECT '70K-90K', 3 UNION ALL SELECT 
'90K-110K',4 UNION ALL SELECT '110K+', 5 ) SELECT salary_band, employee_count, percentage_of_workforce, most_common_title FROM employees.exec_salary_distribution v JOIN band_order bo ON bo.band = v.salary_band ORDER BY bo.ord; """) view_bands = cur.fetchall() # Calculate actual data for all bands cur.execute(""" WITH current_salary AS ( SELECT employee_id, amount FROM ( SELECT s.*, ROW_NUMBER() OVER ( PARTITION BY s.employee_id ORDER BY s.from_date DESC, s.amount DESC ) AS rn FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ) x WHERE rn = 1 ), current_title AS ( SELECT employee_id, title FROM ( SELECT t.*, ROW_NUMBER() OVER ( PARTITION BY t.employee_id ORDER BY t.from_date DESC, t.title ) AS rn FROM employees.title t WHERE t.to_date = DATE '9999-01-01' ) x WHERE rn = 1 ), base AS ( SELECT cs.employee_id, cs.amount, COALESCE(ct.title, 'Unknown') AS title FROM current_salary cs LEFT JOIN current_title ct ON ct.employee_id = cs.employee_id ), banded AS ( SELECT CASE WHEN amount < 50000 THEN '30K-50K' WHEN amount < 70000 THEN '50K-70K' WHEN amount < 90000 THEN '70K-90K' WHEN amount < 110000 THEN '90K-110K' ELSE '110K+' END AS salary_band, title, employee_id FROM base ), band_counts AS ( SELECT salary_band, COUNT(DISTINCT employee_id) AS employee_count FROM banded GROUP BY salary_band ), title_counts AS ( SELECT salary_band, title, COUNT(DISTINCT employee_id) AS title_count FROM banded GROUP BY salary_band, title ), top_titles AS ( SELECT salary_band, title AS most_common_title FROM ( SELECT salary_band, title, title_count, ROW_NUMBER() OVER ( PARTITION BY salary_band ORDER BY title_count DESC, title ) AS rn FROM title_counts ) t WHERE rn = 1 ), workforce AS ( SELECT COUNT(DISTINCT employee_id) AS total_current FROM base ), band_order AS ( SELECT '30K-50K' AS band, 1 AS ord UNION ALL SELECT '50K-70K', 2 UNION ALL SELECT '70K-90K', 3 UNION ALL SELECT '90K-110K', 4 UNION ALL SELECT '110K+', 5 ) SELECT bc.salary_band, bc.employee_count::INT AS employee_count, (bc.employee_count::DECIMAL / NULLIF((SELECT total_current FROM workforce), 0) * 100) AS percentage_of_workforce, tt.most_common_title FROM band_counts bc LEFT JOIN top_titles tt ON tt.salary_band = bc.salary_band LEFT JOIN band_order bo ON bo.band = bc.salary_band ORDER BY bo.ord; """) actual_bands = cur.fetchall() # Compare view data with actual data if len(view_bands) != len(actual_bands): print(f"❌ Salary band count mismatch: view={len(view_bands)}, actual={len(actual_bands)}") return False for view_band, actual_band in zip(view_bands, actual_bands): if not rows_match(view_band, actual_band): print(f"❌ Salary band {actual_band[0]} data incorrect: view={view_band}, actual={actual_band}") return False print("✅ All materialized views are created and contain correct data") return True def verify_stored_procedures(conn) -> bool: """Verify that stored procedure was created.""" with conn.cursor() as cur: # Check if procedure exists cur.execute(""" SELECT routine_name FROM information_schema.routines WHERE routine_schema = 'employees' AND routine_type = 'FUNCTION' AND routine_name = 'generate_monthly_report' """) procedures = [row[0] for row in cur.fetchall()] if 'generate_monthly_report' not in procedures: print("❌ generate_monthly_report procedure not found") return False # Check if monthly_reports table exists with correct structure cur.execute(""" SELECT COUNT(*) FROM information_schema.columns WHERE table_schema = 'employees' AND table_name = 'monthly_reports' AND column_name IN ('report_id', 'report_date', 'department_count', 
'total_employees', 'avg_salary', 'generated_at') """) report_columns = cur.fetchone()[0] if report_columns != 6: print("❌ monthly_reports table missing required columns") return False print("✅ Stored procedure and supporting table are created") return True def verify_triggers(conn) -> bool: """Verify that triggers were created and fired correctly.""" with conn.cursor() as cur: # Check if triggers exist cur.execute(""" SELECT trigger_name FROM information_schema.triggers WHERE trigger_schema = 'employees' AND trigger_name = 'high_salary_alert' """) triggers = [row[0] for row in cur.fetchall()] if 'high_salary_alert' not in triggers: print("❌ high_salary_alert trigger not found") return False # Check if trigger support table exists cur.execute(""" SELECT table_name FROM information_schema.tables WHERE table_schema = 'employees' AND table_name = 'salary_alerts' """) trigger_tables = [row[0] for row in cur.fetchall()] if 'salary_alerts' not in trigger_tables: print("❌ salary_alerts table not found") return False # Check if the old salary record was properly closed cur.execute(""" SELECT COUNT(*) FROM employees.salary WHERE employee_id = 10001 AND to_date = '2024-01-31' """) old_salary_count = cur.fetchone()[0] if old_salary_count == 0: print("❌ Old salary record for employee 10001 was not properly closed with to_date='2024-01-31'") return False # Check if the new salary record was inserted cur.execute(""" SELECT COUNT(*) FROM employees.salary WHERE employee_id = 10001 AND amount = 125000 AND from_date = '2024-02-01' AND to_date = '9999-01-01' """) new_salary_count = cur.fetchone()[0] if new_salary_count == 0: print("❌ New salary record for employee 10001 with amount 125000 was not inserted") return False # Check if high salary alert was triggered with specific details cur.execute(""" SELECT COUNT(*) FROM employees.salary_alerts WHERE employee_id = 10001 AND salary_amount = 125000 AND status = 'new' """) alert_count = cur.fetchone()[0] if alert_count == 0: print("❌ High salary alert was not triggered correctly for employee 10001 with amount 125000") return False print("✅ Trigger is created and functioning correctly") return True def verify_procedure_execution(conn) -> bool: """Verify that stored procedure was executed with correct data.""" with conn.cursor() as cur: # Check if monthly report data matches actual statistics cur.execute(""" SELECT department_count, total_employees, avg_salary FROM employees.monthly_reports WHERE report_date = '2024-01-01' """) report_data = cur.fetchone() if not report_data: print("❌ Monthly report for 2024-01-01 was not generated") return False # Get actual current statistics to compare cur.execute(""" WITH current_salary AS ( SELECT employee_id, amount FROM ( SELECT s.*, ROW_NUMBER() OVER ( PARTITION BY s.employee_id ORDER BY s.from_date DESC, s.amount DESC ) AS rn FROM employees.salary s WHERE s.to_date = DATE '9999-01-01' ) x WHERE rn = 1 ), current_dept AS ( SELECT DISTINCT de.employee_id, de.department_id FROM employees.department_employee de WHERE de.to_date = DATE '9999-01-01' ), base AS ( SELECT cd.department_id, cs.employee_id, cs.amount FROM current_dept cd JOIN current_salary cs ON cs.employee_id = cd.employee_id ) SELECT COUNT(DISTINCT department_id) AS actual_dept_count, COUNT(DISTINCT employee_id) AS actual_total_employees, AVG(amount)::DECIMAL AS actual_avg_salary FROM base; """) actual_stats = cur.fetchone() # Compare report data with actual data if not rows_match(report_data, actual_stats): print(f"❌ Monthly report data incorrect: expected 
{actual_stats}, got {report_data}") return False print("✅ Stored procedure executed with correct data") return True def verify_indexes(conn) -> bool: """Verify that performance indexes were created.""" with conn.cursor() as cur: # Check for required indexes cur.execute(""" SELECT indexname FROM pg_indexes WHERE schemaname = 'employees' AND tablename IN ('salary_alerts', 'monthly_reports') AND indexname LIKE 'idx_%' ORDER BY indexname """) indexes = [row[0] for row in cur.fetchall()] # Should have at least 2 indexes created if len(indexes) < 2: print(f"❌ Expected at least 2 performance indexes, found {len(indexes)}") return False print("✅ Performance indexes are created") return True def main(): """Main verification function.""" print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify all components success = ( verify_materialized_views(conn) and verify_stored_procedures(conn) and verify_triggers(conn) and verify_procedure_execution(conn) and verify_indexes(conn) ) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/employees/management_structure_analysis/description.md ================================================ Conduct a comprehensive management structure analysis to evaluate leadership effectiveness and organizational hierarchy. The executive team needs insights into management tenure, span of control, and leadership transitions to optimize the management structure and succession planning. ## Your Tasks: 1. **Create the manager profile table** — build a table called `manager_profile` in the `employees` schema with these exact columns: * `manager_id` (bigint) — the manager's employee ID * `manager_name` (varchar) — concatenated first and last name * `current_department` (varchar) — current department they manage (NULL if not current) * `management_periods` (integer) — total number of management assignments (including multiple periods in same department) * `current_manager` (boolean) — whether they are currently a manager 2. **Create the department leadership table** — build a table called `department_leadership` in the `employees` schema with: * `department_name` (varchar) — the department name * `current_manager_name` (varchar) — current manager's full name * `manager_start_date` (date) — when current manager started * `total_historical_managers` (integer) — total number of managers this department has had 3. **Create the management transition table** — build a table called `management_transitions` in the `employees` schema with: * `department_name` (varchar) — the department name * `transition_year` (integer) — year when management changed * `outgoing_manager` (varchar) — previous manager's name * `incoming_manager` (varchar) — new manager's name ('No Successor' if department had no immediate replacement) * `transition_gap_days` (integer) — days between managers (0 if immediate or no successor) 4. 
**Create the span of control table** — build a table called `span_of_control` in the `employees` schema with: * `manager_id` (bigint) — the manager's employee ID * `manager_name` (varchar) — manager's full name * `department_name` (varchar) — department they manage * `total_employees` (integer) — total employees in their department * `current_employees` (integer) — current active employees in department * `management_load` (varchar) — assessment ('light', 'moderate', 'heavy') based on current employees 5. **Apply management load classification**: * **Light**: < 5,000 current employees * **Moderate**: 5,000 - 15,000 current employees * **Heavy**: > 15,000 current employees 6. **Focus on current managers only** for span of control analysis — use managers with active management roles (to_date = '9999-01-01'). 7. **Track all management history** for profiles and transitions — include both current and former managers to understand complete leadership evolution. The analysis will provide insights into management effectiveness, departmental stability, and organizational structure optimization opportunities. ================================================ FILE: tasks/postgres/standard/employees/management_structure_analysis/meta.json ================================================ { "task_id": "management_structure_analysis", "task_name": "Management Structure Analysis", "category_id": "employees", "category_name": "Employees", "description": "Analyze management structure evaluating leadership effectiveness, span of control, and management transitions for succession planning.", "author": "Lingxiao Du", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "reporting and analytics", "statistical aggregation" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < 
\"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz" } } ================================================ FILE: tasks/postgres/standard/employees/management_structure_analysis/verify.py ================================================ """ Verification script for PostgreSQL Task 4: Management Structure Analysis """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.1 tolerance For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, Decimal): if abs(float(actual) - float(expected)) > 0.1: return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD") } def verify_manager_profile_results(conn) -> bool: """Verify the manager profile results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT manager_id, manager_name, current_department, management_periods, current_manager FROM employees.manager_profile ORDER BY manager_id """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH dm AS ( SELECT dm.employee_id, dm.department_id, dm.from_date, dm.to_date FROM employees.department_manager dm ), manager_periods AS ( SELECT employee_id, COUNT(*)::INT AS management_periods FROM dm GROUP BY employee_id ), current_assignment AS ( SELECT employee_id, department_id FROM ( SELECT d.*, ROW_NUMBER() OVER ( PARTITION BY d.employee_id ORDER BY d.from_date DESC, d.department_id ) AS rn FROM dm d WHERE d.to_date = DATE '9999-01-01' ) x WHERE rn = 1 ), manager_names AS ( SELECT e.id AS manager_id, CONCAT(e.first_name, ' ', e.last_name) AS manager_name FROM employees.employee e WHERE EXISTS (SELECT 1 FROM dm WHERE employee_id = e.id) ) SELECT mn.manager_id, mn.manager_name, d.dept_name AS current_department, mp.management_periods, (d.dept_name IS NOT NULL) AS current_manager FROM manager_names mn JOIN manager_periods mp ON mp.employee_id = mn.manager_id LEFT JOIN current_assignment ca ON ca.employee_id = mn.manager_id LEFT JOIN employees.department d ON d.id = ca.department_id ORDER BY mn.manager_id; """) expected_results = cur.fetchall() if 
len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} manager profile results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Manager profile results are correct ({len(actual_results)} records)") return True def verify_department_leadership_results(conn) -> bool: """Verify the department leadership results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT department_name, current_manager_name, manager_start_date, total_historical_managers FROM employees.department_leadership ORDER BY department_name """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH current_mgr AS ( SELECT department_id, CONCAT(e.first_name, ' ', e.last_name) AS current_manager_name, dm.from_date AS manager_start_date FROM ( SELECT dm.*, ROW_NUMBER() OVER ( PARTITION BY dm.department_id ORDER BY dm.from_date DESC, dm.employee_id ) AS rn FROM employees.department_manager dm WHERE dm.to_date = DATE '9999-01-01' ) dm JOIN employees.employee e ON e.id = dm.employee_id WHERE dm.rn = 1 ), hist AS ( SELECT dm.department_id, COUNT(DISTINCT dm.employee_id)::INT AS total_historical_managers FROM employees.department_manager dm GROUP BY dm.department_id ) SELECT d.dept_name AS department_name, cm.current_manager_name, cm.manager_start_date, COALESCE(h.total_historical_managers,0) AS total_historical_managers FROM employees.department d LEFT JOIN current_mgr cm ON cm.department_id = d.id LEFT JOIN hist h ON h.department_id = d.id ORDER BY d.dept_name; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} department leadership results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Department leadership results are correct ({len(actual_results)} records)") return True def verify_management_transitions_results(conn) -> bool: """Verify the management transitions results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT department_name, transition_year, outgoing_manager, incoming_manager, transition_gap_days FROM employees.management_transitions ORDER BY department_name, transition_year """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH mgr AS ( SELECT d.id AS department_id, d.dept_name, dm.employee_id, dm.from_date, dm.to_date, CONCAT(e.first_name, ' ', e.last_name) AS manager_name FROM employees.department_manager dm JOIN employees.department d ON d.id = dm.department_id JOIN employees.employee e ON e.id = dm.employee_id ), ordered AS ( SELECT department_id, dept_name, employee_id, manager_name, from_date, to_date, ROW_NUMBER() OVER ( PARTITION BY department_id ORDER BY from_date, to_date, employee_id ) AS rn, LEAD(manager_name) OVER ( PARTITION BY department_id ORDER BY from_date, to_date, 
employee_id ) AS next_manager_name, LEAD(from_date) OVER ( PARTITION BY department_id ORDER BY from_date, to_date, employee_id ) AS next_from_date FROM mgr ) SELECT o.dept_name AS department_name, EXTRACT(YEAR FROM o.to_date)::INT AS transition_year, o.manager_name AS outgoing_manager, COALESCE(o.next_manager_name, 'No Successor') AS incoming_manager, COALESCE(GREATEST((o.next_from_date - o.to_date - 1), 0), 0)::INT AS transition_gap_days FROM ordered o WHERE o.to_date <> DATE '9999-01-01' ORDER BY department_name, transition_year; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} management transitions results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Management transitions results are correct ({len(actual_results)} records)") return True def verify_span_of_control_results(conn) -> bool: """Verify the span of control results.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT manager_id, manager_name, department_name, total_employees, current_employees, management_load FROM employees.span_of_control ORDER BY manager_id """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH dept_total AS ( SELECT de.department_id, COUNT(DISTINCT de.employee_id)::INT AS total_employees FROM employees.department_employee de GROUP BY de.department_id ), dept_current AS ( SELECT de.department_id, COUNT(DISTINCT de.employee_id)::INT AS current_employees FROM employees.department_employee de JOIN employees.salary s ON s.employee_id = de.employee_id AND s.to_date = DATE '9999-01-01' WHERE de.to_date = DATE '9999-01-01' GROUP BY de.department_id ) SELECT dm.employee_id AS manager_id, CONCAT(e.first_name, ' ', e.last_name) AS manager_name, d.dept_name AS department_name, COALESCE(dt.total_employees, 0) AS total_employees, COALESCE(dc.current_employees, 0) AS current_employees, CASE WHEN COALESCE(dc.current_employees, 0) < 5000 THEN 'light' WHEN COALESCE(dc.current_employees, 0) <= 15000 THEN 'moderate' ELSE 'heavy' END AS management_load FROM employees.department_manager dm JOIN employees.employee e ON e.id = dm.employee_id JOIN employees.department d ON d.id = dm.department_id LEFT JOIN dept_total dt ON dt.department_id = dm.department_id LEFT JOIN dept_current dc ON dc.department_id = dm.department_id WHERE dm.to_date = DATE '9999-01-01' ORDER BY dm.employee_id, d.dept_name; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} span of control results, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches: {mismatches}") return False print(f"✅ Span of control results are correct ({len(actual_results)} records)") return True def main(): """Main verification function.""" print("=" * 50) # Get connection parameters conn_params = get_connection_params() 
if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify all four analysis results success = ( verify_manager_profile_results(conn) and verify_department_leadership_results(conn) and verify_management_transitions_results(conn) and verify_span_of_control_results(conn) ) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/lego/consistency_enforcement/description.md ================================================ Implement a data consistency enforcement system for the LEGO database. The system must ensure that the reported part count in the `lego_sets` table matches the actual sum of non-spare parts in the latest inventory version. This involves a three-step process: identifying existing inconsistencies, fixing them, and creating a trigger-based constraint system to prevent future issues. ### Consistency Rule For any given `set_num`, the following invariant must be maintained: `lego_sets.num_parts = SUM(quantity)` FROM `lego_inventory_parts` WHERE `inventory_id` IN (latest inventory for that set) AND `is_spare` = false **Important**: If a set has no inventory records, the consistency check should be skipped. # Your Tasks: ## Task 1: Identify Data Inconsistencies ### Objective Write a single `SELECT` query to find all sets where the stored `num_parts` does not match the actual calculated number of parts from the latest inventory. 1. **Find the Latest Inventory**: For each `set_num`, find its latest inventory id by getting the `MAX(version)` from the `lego_inventories` table. 2. **Calculate Actual Part Count**: For these latest inventories, join with `lego_inventory_parts` and calculate the `SUM(quantity)`, but only for parts where `is_spare` is false. 3. **Compare and Filter**: Join this calculated result back to the `lego_sets` table and return the rows where `lego_sets.num_parts` is different from your calculated sum. ## Task 2: Fix Existing Inconsistencies ### Objective Correct all mismatched `num_parts` values using a clear, multi-step process with a temporary table. This approach is designed to be robust against all edge cases. #### Step 1: Create a Temporary Table Create a temporary table (e.g., `correct_counts`) with two columns: `set_num` (text) and `actual_parts` (integer). #### Step 2: Populate the Temporary Table This is the most critical step. Write an `INSERT` statement that calculates the correct part count for every single set listed in the `lego_sets` table. - The query must start by selecting from `public.lego_sets`. - It must then `LEFT JOIN` to a subquery that contains the part-counting logic (finding the latest inventory version and summing the non-spare parts). - Use `COALESCE` on the final result from the subquery to ensure that any set without parts or without an inventory record gets a value of `0`, not `NULL`. #### Step 3: Update from the Temporary Table Write a final, simple `UPDATE` statement that joins the `lego_sets` table with your temporary table on `set_num` and sets `num_parts` to the `actual_parts` value. 
## Task 3: Create Constraint Enforcement System ### Objective Implement a deferrable constraint trigger system to enforce the consistency rule automatically for all future `INSERT` and `UPDATE` operations. ### Part A: Create the Trigger Function Create a single PL/pgSQL function, preferably named `check_set_parts_consistency()`, that performs the core validation. **Function Requirements**: - Returns `trigger`. - Accepts no arguments. - Contains the core validation logic: - **Identify the `set_num` to check**. This is the most critical part. The `set_num` must be retrieved based on which table fired the trigger (`TG_TABLE_NAME`): - If `lego_sets` or `lego_inventories`: get the `set_num` directly from `NEW.set_num`. - If `lego_inventory_parts`: you must first query `lego_inventories` using `NEW.inventory_id` to find the corresponding `set_num`. - **Perform the check**. For the identified `set_num`, execute the same core logic from Task 1 to get the `actual_parts` count and the `stored_num_parts` from the `lego_sets` table. - **Raise an exception on failure**. If `actual_parts` does not equal `stored_num_parts`, the function must raise an exception to block the transaction (e.g., `RAISE EXCEPTION 'Inconsistent part count for set %', relevant_set_num;`). - **Return `NEW` on success**. If the check passes or is skipped, the function should `RETURN NEW`. ### Part B: Create the Constraint Triggers Create three separate `CONSTRAINT TRIGGER` statements that attach the function from Part A to the following tables: - `public.lego_sets` - `public.lego_inventories` - `public.lego_inventory_parts` **Crucial Trigger Requirements**: - Each trigger must fire `AFTER INSERT OR UPDATE`. - Each trigger **MUST** be `DEFERRABLE` and `INITIALLY IMMEDIATE`. This is non-negotiable for the verification to pass. - Each trigger must execute the function `FOR EACH ROW`. 
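For orientation, a hedged sketch of one possible shape for the Task 3 function and one of the three required constraint triggers follows; variable names, the trigger name, and the exact consistency query are illustrative, and the remaining two triggers on `lego_inventories` and `lego_inventory_parts` would be created the same way:

```sql
CREATE OR REPLACE FUNCTION check_set_parts_consistency()
RETURNS trigger
LANGUAGE plpgsql
AS $$
DECLARE
    relevant_set_num text;
    actual_parts     integer;
    stored_num_parts integer;
BEGIN
    -- Resolve the set_num based on which table fired the trigger.
    IF TG_TABLE_NAME = 'lego_inventory_parts' THEN
        SELECT li.set_num INTO relevant_set_num
        FROM public.lego_inventories li
        WHERE li.id = NEW.inventory_id;
    ELSE
        relevant_set_num := NEW.set_num;
    END IF;

    -- Skip the check when the set has no inventory records at all.
    IF NOT EXISTS (SELECT 1 FROM public.lego_inventories
                   WHERE set_num = relevant_set_num) THEN
        RETURN NEW;
    END IF;

    -- Same core logic as Task 1: sum non-spare parts of the latest inventory version.
    SELECT COALESCE(SUM(lip.quantity), 0)::int INTO actual_parts
    FROM public.lego_inventories li
    JOIN public.lego_inventory_parts lip ON lip.inventory_id = li.id
    WHERE li.set_num = relevant_set_num
      AND li.version = (SELECT MAX(version) FROM public.lego_inventories
                        WHERE set_num = relevant_set_num)
      AND lip.is_spare = false;

    SELECT num_parts INTO stored_num_parts
    FROM public.lego_sets
    WHERE set_num = relevant_set_num;

    IF actual_parts IS DISTINCT FROM stored_num_parts THEN
        RAISE EXCEPTION 'Inconsistent part count for set %', relevant_set_num;
    END IF;

    RETURN NEW;
END;
$$;

-- One of the three triggers; repeat for lego_inventories and lego_inventory_parts.
CREATE CONSTRAINT TRIGGER trg_sets_parts_consistency
AFTER INSERT OR UPDATE ON public.lego_sets
DEFERRABLE INITIALLY IMMEDIATE
FOR EACH ROW
EXECUTE FUNCTION check_set_parts_consistency();
```

Because the triggers are `DEFERRABLE INITIALLY IMMEDIATE`, a single-statement inconsistent write fails immediately, while a multi-statement transaction can run `SET CONSTRAINTS ALL DEFERRED` and commit as long as the part counts are consistent again by commit time.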
================================================ FILE: tasks/postgres/standard/lego/consistency_enforcement/meta.json ================================================ { "task_id": "consistency_enforcement", "task_name": "Consistency Enforcement", "category_id": "lego", "category_name": "Lego", "description": "Implement data consistency system ensuring reported part counts match actual inventory using triggers and constraint enforcement.", "author": "Jiawei Wang", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "data integrity enforcement", "stored procedures and functions", "transactional operations" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql" } } ================================================ FILE: tasks/postgres/standard/lego/consistency_enforcement/verify.py ================================================ """ Verification script for PostgreSQL LEGO Task 1: Parts Consistency Fix & Constraints Version 2.1: Relaxed consistency check to allow for one known corner case mismatch. """ import os import sys import psycopg2 import psycopg2.errors from typing import Optional, Tuple, List def get_connection_params() -> dict: """Get database connection parameters from environment variables.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD"), } def fetch_candidate_part_row(cur) -> Optional[Tuple[int, str, str, int]]: """ Picks a concrete, non-spare inventory part from the latest inventory of any set. This provides a reliable target for testing update and insert triggers. Returns a tuple: (inventory_id, set_num, part_num, color_id) or None. 
""" cur.execute( """ WITH latest_inv AS ( SELECT set_num, MAX(version) AS max_version FROM public.lego_inventories GROUP BY set_num ), inv AS ( SELECT li.id, li.set_num FROM public.lego_inventories li JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version ) SELECT i.id AS inventory_id, i.set_num, lip.part_num, lip.color_id FROM inv i JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id WHERE lip.is_spare = false AND lip.quantity > 0 LIMIT 1; """ ) return cur.fetchone() def get_mismatch_count(cur) -> int: """Returns the number of sets where num_parts mismatches the computed actual sum.""" cur.execute( """ WITH latest_inv AS ( SELECT set_num, MAX(version) AS max_version FROM public.lego_inventories GROUP BY set_num ), inv_latest AS ( SELECT li.set_num, li.id FROM public.lego_inventories li JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version ), parts_agg AS ( SELECT i.set_num, SUM(lip.quantity) AS actual_parts FROM inv_latest i JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id WHERE lip.is_spare = false GROUP BY i.set_num ) SELECT COUNT(*) FROM public.lego_sets s LEFT JOIN parts_agg pa ON s.set_num = pa.set_num WHERE s.num_parts <> COALESCE(pa.actual_parts, 0); """ ) return cur.fetchone()[0] def verify_data_consistency(conn) -> bool: """ TASK 1 VERIFICATION: Checks if the initial data fix was successful. (Relaxed: Allows for one corner-case mismatch). """ print("\n-- Verifying Task 1: Data Consistency Fix (Relaxed) --") with conn.cursor() as cur: count = get_mismatch_count(cur) # RELAXED CONDITION: Allow 0 or 1 mismatch to pass. if count > 1: print(f"❌ FAIL: Found {count} sets with inconsistent part counts. Expected 0 or 1 after fix.") return False print("✅ PASS: Data consistency check passed (allowing for one known mismatch).") return True def verify_constraint_triggers_exist(conn) -> bool: """ TASK 2 VERIFICATION (Part A): Checks if constraint triggers are attached to all required tables. This is more robust than checking names or a total count. """ print("\n-- Verifying Task 2: Constraint Trigger Existence --") tables_to_check = [ 'public.lego_inventory_parts', 'public.lego_inventories', 'public.lego_sets' ] all_triggers_found = True with conn.cursor() as cur: for table in tables_to_check: cur.execute( """ SELECT COUNT(*) FROM pg_trigger WHERE tgrelid = %s::regclass AND tgconstraint <> 0; """, (table,) ) trigger_count = cur.fetchone()[0] if trigger_count == 0: print(f"❌ FAIL: No constraint trigger found on table '{table}'.") all_triggers_found = False else: print(f"✅ OK: Found constraint trigger(s) on table '{table}'.") if all_triggers_found: print("✅ PASS: Constraint triggers are attached to all required tables.") return all_triggers_found def verify_violation_is_blocked(conn) -> bool: """ TASK 2 VERIFICATION (Part B): Checks if triggers block a direct, inconsistent write. An attempt to increment a part quantity without updating the set's total should fail. """ print("\n-- Verifying Task 2: Immediate Constraint Enforcement --") with conn.cursor() as cur: candidate = fetch_candidate_part_row(cur) if not candidate: print("⚠️ SKIP: No candidate part row found to test constraints. 
Cannot verify.") return True # Skip if no data to test inventory_id, _, part_num, color_id = candidate try: # This transaction should fail due to the trigger cur.execute( """ UPDATE public.lego_inventory_parts SET quantity = quantity + 1 WHERE inventory_id = %s AND part_num = %s AND color_id = %s; """, (inventory_id, part_num, color_id), ) # If we reach here, the trigger failed to block the update. conn.rollback() print("❌ FAIL: An inconsistent write was NOT blocked by the trigger.") return False except psycopg2.Error as e: # We expect an error. Specifically, a constraint violation error. conn.rollback() # 23514 is check_violation, but custom triggers might raise others. # Any error here is considered a success as the transaction was blocked. print(f"✅ PASS: Inconsistent write was correctly blocked by the trigger. (Error: {e.pgcode})") return True def verify_deferred_transaction_is_allowed(conn) -> bool: """ TASK 2 VERIFICATION (Part C): Checks if a coordinated, consistent update is allowed when constraints are deferred. """ print("\n-- Verifying Task 2: Deferred Constraint Enforcement --") with conn.cursor() as cur: candidate = fetch_candidate_part_row(cur) if not candidate: print("⚠️ SKIP: No candidate part row found. Cannot test deferred transaction.") return True # Skip if no data to test inventory_id, set_num, part_num, color_id = candidate try: # This multi-statement transaction should succeed with deferred constraints with conn.cursor() as cur: cur.execute("BEGIN;") cur.execute("SET CONSTRAINTS ALL DEFERRED;") cur.execute( "UPDATE public.lego_inventory_parts SET quantity = quantity + 1 WHERE inventory_id = %s AND part_num = %s AND color_id = %s;", (inventory_id, part_num, color_id), ) cur.execute( "UPDATE public.lego_sets SET num_parts = num_parts + 1 WHERE set_num = %s;", (set_num,), ) cur.execute("COMMIT;") # This will fail if constraints are not deferrable or logic is wrong print("✅ PASS: Coordinated update with deferred constraints committed successfully.") # Revert changes to leave DB in its original state with conn.cursor() as cur: cur.execute("BEGIN;") cur.execute("SET CONSTRAINTS ALL DEFERRED;") cur.execute( "UPDATE public.lego_inventory_parts SET quantity = quantity - 1 WHERE inventory_id = %s AND part_num = %s AND color_id = %s;", (inventory_id, part_num, color_id), ) cur.execute( "UPDATE public.lego_sets SET num_parts = num_parts - 1 WHERE set_num = %s;", (set_num,), ) cur.execute("COMMIT;") print("INFO: Test changes were successfully reverted.") return True except psycopg2.Error as e: conn.rollback() print(f"❌ FAIL: Deferred transaction failed to commit. Error: {e}") return False def main(): """Main verification function.""" print("=" * 60) print("LEGO Database Consistency Verification Script") print("=" * 60) conn_params = get_connection_params() if not conn_params.get("database"): print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.") sys.exit(1) try: with psycopg2.connect(**conn_params) as conn: conn.autocommit = False # Ensure we control transactions # Run all verification steps results = [ verify_data_consistency(conn), verify_constraint_triggers_exist(conn), verify_violation_is_blocked(conn), verify_deferred_transaction_is_allowed(conn), ] if all(results): print("\n🎉 Overall Result: PASS - All tasks verified successfully!") sys.exit(0) else: print("\n❌ Overall Result: FAIL - One or more verification steps failed.") sys.exit(1) except psycopg2.OperationalError as e: print(f"❌ CRITICAL: Could not connect to the database. 
Details: {e}") sys.exit(1) except Exception as e: print(f"❌ CRITICAL: An unexpected error occurred during verification. Details: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/lego/database_security_policies/description.md ================================================ Implement a comprehensive database security system with Row-Level Security (RLS) policies and role-based access control for the LEGO database. The system must ensure theme-based data isolation and prevent unauthorized access across different LEGO themes. ## Your Tasks: 1. **Create database role and permissions** — Create a new database role called `theme_analyst` with the following permissions: * `SELECT` permissions on all reference tables: `lego_themes`, `lego_colors`, `lego_parts`, `lego_part_categories` * `SELECT` permissions on main data tables: `lego_sets`, `lego_inventories`, `lego_inventory_parts` * No `INSERT`, `UPDATE`, or `DELETE` permissions on any tables 2. **Enable Row-Level Security** — Enable RLS on the following tables: * `lego_sets` * `lego_inventories` * `lego_inventory_parts` 3. **Create RLS policies** — Implement theme-based data isolation policies: **Policy 1: `theme_sets_policy` on `lego_sets`** * Allows access only to sets where `theme_id = 18` (Star Wars theme) * Policy should use a function that checks the current user's theme assignment **Policy 2: `theme_inventories_policy` on `lego_inventories`** * Allows access only to inventories for sets with `theme_id = 18` * Must join with `lego_sets` table to check theme_id **Policy 3: `theme_inventory_parts_policy` on `lego_inventory_parts`** * Allows access only to inventory parts for sets with `theme_id = 18` * Must join through `lego_inventories` and `lego_sets` to check theme_id 4. **Create theme assignment function** — Create a function `get_user_theme_id()` that: * Returns `18` for the `theme_analyst` role (Star Wars theme) * Can be extended to support other themes in the future * Uses `current_user` to determine the appropriate theme_id 5. **Test the security implementation** — Execute verification queries that demonstrate: * Star Wars theme (theme_id=18) returns exactly 2 sets: '65081-1' and 'K8008-1' * Technic theme (theme_id=1) returns 0 sets when accessed by theme_analyst role * Cross-theme data access is properly blocked * Reference tables are accessible for all data 6. 
**Create comprehensive security audit** — Generate a detailed report including: * Complete SQL statements for role creation and policy implementation * Expected query results for each theme * Verification queries to confirm proper data isolation * Documentation of the security model and access patterns ## Security Requirements: - The `theme_analyst` role must only see data related to Star Wars theme (theme_id=18) - All other themes must be completely hidden from this role - Reference tables (themes, colors, parts, part_categories) must be fully accessible - The system must prevent any cross-theme data leakage - RLS policies must be active and enforced for all data access ## Expected Results: When the `theme_analyst` role queries the database: - `lego_sets` should return only 2 Star Wars sets - `lego_inventories` should return only inventories for those 2 sets - `lego_inventory_parts` should return only parts for those 2 sets - All reference tables should return complete data - Queries for other themes should return empty results ================================================ FILE: tasks/postgres/standard/lego/database_security_policies/meta.json ================================================ { "task_id": "database_security_policies", "task_name": "Database Security Policies", "category_id": "lego", "category_name": "Lego", "description": "Implement Row-Level Security policies with role-based access control for theme-based data isolation in LEGO database.", "author": "Jiawei Wang", "created_at": "2025-08-15", "difficulty": "L3", "tags": [ "security and access control", "stored procedures and functions" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql" } } ================================================ FILE: tasks/postgres/standard/lego/database_security_policies/verify.py ================================================ """ Verification script for PostgreSQL LEGO Task 4: Database Security and RLS Implementation (Version 2 - Improved Robustness) """ import os import sys import psycopg2 import psycopg2.errors from typing import Dict def get_connection_params() -> Dict[str, any]: """Get database connection parameters from environment variables.""" return { "host": 
os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD"), } def verify_role_creation(conn) -> bool: """ TASK 1 VERIFICATION: Check if theme_analyst role was created with proper permissions. """ print("\n-- Verifying Task 1: Role Creation and Permissions --") with conn.cursor() as cur: # Check if role exists cur.execute("SELECT 1 FROM pg_roles WHERE rolname = 'theme_analyst';") if not cur.fetchone(): print("❌ FAIL: The 'theme_analyst' role was not created.") return False print("✅ OK: Role 'theme_analyst' exists.") # Check SELECT permissions on reference and main tables all_tables = [ 'lego_themes', 'lego_colors', 'lego_parts', 'lego_part_categories', 'lego_sets', 'lego_inventories', 'lego_inventory_parts' ] for table in all_tables: cur.execute( """ SELECT has_table_privilege('theme_analyst', %s, 'SELECT'); """, (table,) ) if not cur.fetchone()[0]: print(f"❌ FAIL: 'theme_analyst' role is missing SELECT permission on '{table}'.") return False print("✅ OK: Role has correct SELECT permissions on all required tables.") # Check that no INSERT/UPDATE/DELETE permissions exist for table in all_tables: cur.execute( """ SELECT has_table_privilege('theme_analyst', %s, 'INSERT') OR has_table_privilege('theme_analyst', %s, 'UPDATE') OR has_table_privilege('theme_analyst', %s, 'DELETE'); """, (table, table, table) ) if cur.fetchone()[0]: print(f"❌ FAIL: 'theme_analyst' role has unauthorized INSERT, UPDATE, or DELETE permission on '{table}'.") return False print("✅ OK: Role does not have modification permissions.") print("✅ PASS: 'theme_analyst' role created with correct permissions.") return True def verify_rls_enabled(conn) -> bool: """ TASK 2 VERIFICATION: Check if Row-Level Security is enabled on required tables. """ print("\n-- Verifying Task 2: Row-Level Security Enablement --") tables_to_check = ['lego_sets', 'lego_inventories', 'lego_inventory_parts'] with conn.cursor() as cur: for table in tables_to_check: cur.execute( "SELECT relrowsecurity FROM pg_class WHERE relname = %s;", (table,) ) rls_enabled = cur.fetchone() if not rls_enabled or not rls_enabled[0]: print(f"❌ FAIL: RLS is not enabled on table '{table}'.") return False print(f"✅ OK: RLS is enabled on table '{table}'.") print("✅ PASS: Row-Level Security is enabled on all required tables.") return True def verify_rls_policies(conn) -> bool: """ TASK 3 VERIFICATION: Check if RLS policies were created on required tables. """ print("\n-- Verifying Task 3: RLS Policy Creation --") expected_policies = { 'lego_sets': 'theme_sets_policy', 'lego_inventories': 'theme_inventories_policy', 'lego_inventory_parts': 'theme_inventory_parts_policy' } with conn.cursor() as cur: for table, policy_name in expected_policies.items(): cur.execute( "SELECT 1 FROM pg_policies WHERE tablename = %s AND policyname = %s;", (table, policy_name) ) if not cur.fetchone(): print(f"❌ FAIL: RLS policy '{policy_name}' not found on table '{table}'.") return False print(f"✅ OK: RLS policy '{policy_name}' found on table '{table}'.") print("✅ PASS: All required RLS policies are created.") return True def verify_theme_function(conn) -> bool: """ TASK 4 VERIFICATION: Check if get_user_theme_id() function was created and works correctly. 
""" print("\n-- Verifying Task 4: Theme Assignment Function --") with conn.cursor() as cur: cur.execute( "SELECT 1 FROM pg_proc WHERE proname = 'get_user_theme_id';" ) if not cur.fetchone(): print("❌ FAIL: The 'get_user_theme_id' function was not created.") return False print("✅ OK: Function 'get_user_theme_id' exists.") try: # Test the function's output specifically for the 'theme_analyst' role cur.execute("SET ROLE theme_analyst;") cur.execute("SELECT get_user_theme_id();") theme_id = cur.fetchone()[0] cur.execute("RESET ROLE;") # IMPORTANT: Switch back if theme_id != 18: print(f"❌ FAIL: get_user_theme_id() returned {theme_id} for 'theme_analyst', but expected 18.") return False print("✅ OK: Function returns correct theme_id (18) for 'theme_analyst'.") print("✅ PASS: Theme assignment function is correct.") return True except Exception as e: conn.rollback() # Rollback any failed transaction state print(f"❌ FAIL: Error testing get_user_theme_id() function: {e}") return False def test_theme_analyst_access(conn) -> bool: """ TASK 5 VERIFICATION: Test data access by assuming the theme_analyst role. """ print("\n-- Verifying Task 5: Theme-Based Data Access --") try: with conn.cursor() as cur: # Assume the role of theme_analyst for this session cur.execute("SET ROLE theme_analyst;") # Test 1: Check Star Wars sets access (should return 2 sets) cur.execute("SELECT set_num FROM lego_sets ORDER BY set_num;") star_wars_sets = [row[0] for row in cur.fetchall()] expected_sets = ['65081-1', 'K8008-1'] if sorted(star_wars_sets) != sorted(expected_sets): print(f"❌ FAIL: Expected Star Wars sets {expected_sets}, but got {star_wars_sets}.") cur.execute("RESET ROLE;") return False print("✅ PASS: Star Wars sets access is correct (2 sets returned).") # Test 2: Check that Technic sets are not accessible (should return 0) cur.execute("SELECT COUNT(*) FROM lego_sets WHERE theme_id = 1;") technic_count = cur.fetchone()[0] if technic_count != 0: print(f"❌ FAIL: Technic sets should be blocked, but query returned {technic_count} sets.") cur.execute("RESET ROLE;") return False print("✅ PASS: Technic theme is correctly blocked (0 sets returned).") # Test 3: Check reference tables are fully accessible cur.execute("SELECT COUNT(*) > 10 FROM lego_themes;") # Check for a reasonable number if not cur.fetchone()[0]: print("❌ FAIL: 'lego_themes' table seems inaccessible or empty.") cur.execute("RESET ROLE;") return False print("✅ PASS: Reference tables appear to be accessible.") # Test 4 & 5: Check related tables cur.execute("SELECT COUNT(*) FROM lego_inventories;") if cur.fetchone()[0] == 0: print("❌ FAIL: No inventories are visible for the allowed sets.") cur.execute("RESET ROLE;") return False cur.execute("SELECT COUNT(*) FROM lego_inventory_parts;") if cur.fetchone()[0] == 0: print("❌ FAIL: No inventory parts are visible for the allowed sets.") cur.execute("RESET ROLE;") return False print("✅ PASS: Related tables (inventories, inventory_parts) are correctly filtered.") # IMPORTANT: Always reset the role at the end cur.execute("RESET ROLE;") return True except Exception as e: conn.rollback() # Ensure transaction is clean print(f"❌ FAIL: An error occurred while testing data access as 'theme_analyst': {e}") # Try to reset role even on failure to clean up session state try: with conn.cursor() as cleanup_cur: cleanup_cur.execute("RESET ROLE;") except: pass return False def main(): """Main verification function.""" print("=" * 60) print("LEGO Database Security and RLS Verification Script") print("=" * 60) conn_params = 
get_connection_params() if not conn_params.get("database"): print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.") sys.exit(1) conn = None try: conn = psycopg2.connect(**conn_params) results = [ verify_role_creation(conn), verify_rls_enabled(conn), verify_rls_policies(conn), verify_theme_function(conn), test_theme_analyst_access(conn), ] if all(results): print("\n🎉 Overall Result: PASS - All security tasks verified successfully!") sys.exit(0) else: print("\n❌ Overall Result: FAIL - One or more verification steps failed.") sys.exit(1) except psycopg2.OperationalError as e: print(f"❌ CRITICAL: Could not connect to the database. Check credentials and host. Details: {e}") sys.exit(1) except Exception as e: print(f"❌ CRITICAL: An unexpected error occurred. Details: {e}") sys.exit(1) finally: if conn: conn.close() if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/lego/transactional_inventory_transfer/description.md ================================================ Create a PostgreSQL function to handle inventory part transfers between LEGO sets with enhanced validation and audit capabilities. The LEGO warehouse management system needs to support transferring parts while maintaining data integrity and tracking transfer history. ## Your Tasks: 1. **Create the transfer function** — Implement a PostgreSQL function named `transfer_parts` with the following signature: ```sql CREATE OR REPLACE FUNCTION transfer_parts( source_inventory_id INTEGER, target_inventory_id INTEGER, part_to_transfer_num VARCHAR, color_to_transfer_id INTEGER, quantity_to_transfer INTEGER, transfer_reason VARCHAR DEFAULT 'manual_transfer' ) RETURNS TEXT ``` 2. **Create audit logging table** — Create a new table to track transfer history: ```sql CREATE TABLE inventory_transfer_log ( log_id SERIAL PRIMARY KEY, transfer_timestamp TIMESTAMP DEFAULT NOW(), source_inventory_id INTEGER NOT NULL, target_inventory_id INTEGER NOT NULL, part_num VARCHAR NOT NULL, color_id INTEGER NOT NULL, quantity_transferred INTEGER NOT NULL, transfer_reason VARCHAR NOT NULL, transfer_status VARCHAR NOT NULL CHECK (transfer_status IN ('success', 'failed')), error_message TEXT ); ``` 3. **Implement enhanced validation** — The function must perform these validations: **Validation A: Basic Checks** - Verify both inventory IDs exist in `lego_inventories` table - Verify part exists in `lego_parts` table - Verify color exists in `lego_colors` table - Check source has sufficient quantity (including spare parts) - Prevent self-transfers (source and target cannot be the same) **Validation B: Business Rules** - Maximum transfer quantity is 500 parts per operation - Minimum transfer quantity is 1 part - Source and target must be different inventories 4. **Implement transactional logic** — The function must perform these operations within a single transaction: **Step A: Pre-validation** - Lock both inventory records using `SELECT ... 
FOR UPDATE` - Perform all validation checks - Calculate transfer feasibility **Step B: Source Inventory Update** - Decrease quantity in source inventory - If quantity becomes zero, delete the row - Handle spare parts appropriately (maintain `is_spare` flag) **Step C: Target Inventory Update** - Check if part exists in target inventory - If exists: increase quantity - If not exists: insert new record - Handle spare parts appropriately **Step D: Audit Logging** - Log successful transfers with details - Log failed transfers with error messages - Include transfer reason and status 5. **Error handling requirements**: - Use `RAISE EXCEPTION` with descriptive error messages - Handle all validation failures gracefully - Ensure complete rollback on any failure - Log all attempts (successful and failed) 6. **Return value**: - Return success message: `'Successfully transferred {quantity} parts ({part_num}, color_id: {color_id}) from inventory {source_id} to inventory {target_id}. Reason: {reason}'` - Include transfer details and reason in the message ## Function Requirements: - **Transaction Safety**: All operations wrapped in transaction block - **Data Integrity**: No partial updates possible - **Audit Trail**: Complete logging of all transfer attempts - **Validation**: Comprehensive input and business rule validation - **Error Recovery**: Failed transfers leave database unchanged - **Performance**: Use appropriate locking to prevent race conditions ## Example Usage: ```sql -- Basic transfer with reason SELECT transfer_parts(14469, 14686, '3024', 15, 100, 'inventory_adjustment'); -- Transfer to new inventory (should create new record) SELECT transfer_parts(11124, 14686, '3001', 4, 50, 'part_redistribution'); -- This should fail due to insufficient quantity SELECT transfer_parts(14469, 14686, '3024', 15, 2000, 'large_transfer'); -- This should fail due to self-transfer SELECT transfer_parts(14469, 14469, '3024', 15, 10, 'self_transfer'); ``` ## Verification Criteria: - Function handles all validation rules correctly - Audit logging captures all transfer attempts - Failed transfers are properly logged with error details - Self-transfers are prevented - Quantity limits are enforced - Database state remains consistent after failures ================================================ FILE: tasks/postgres/standard/lego/transactional_inventory_transfer/meta.json ================================================ { "task_id": "transactional_inventory_transfer", "task_name": "Transactional Inventory Transfer", "category_id": "lego", "category_name": "Lego", "description": "Create PostgreSQL function to handle inventory part transfers between LEGO sets with validation and audit logging.", "author": "Jiawei Wang", "created_at": "2025-08-16", "difficulty": "L3", "tags": [ "transactional operations", "stored procedures and functions", "audit and compliance" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n 
\"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql" } } ================================================ FILE: tasks/postgres/standard/lego/transactional_inventory_transfer/verify.py ================================================ """ Verification script for PostgreSQL LEGO Task 2: Enhanced Inventory Transfer Function Tests the transfer_parts function with audit logging and enhanced validation. Key Features Tested: - Core transfer functionality with audit logging - Business rule validation (quantity limits, self-transfer prevention) - Error handling and rollback mechanisms - Audit trail maintenance for both success and failure cases """ import os import sys import psycopg2 import psycopg2.errors from typing import Optional, Tuple def get_connection_params() -> dict: """Get database connection parameters from environment variables.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD"), } def get_inventory_part_quantity(conn, inventory_id: int, part_num: str, color_id: int) -> int: """Get the current quantity of a specific part in an inventory.""" with conn.cursor() as cur: cur.execute( """ SELECT quantity FROM public.lego_inventory_parts WHERE inventory_id = %s AND part_num = %s AND color_id = %s """, (inventory_id, part_num, color_id) ) result = cur.fetchone() return result[0] if result else 0 def verify_system_components(conn) -> bool: """Verify that all required system components exist.""" print("\n-- Verifying System Components --") try: with conn.cursor() as cur: # Check main function cur.execute( """ SELECT COUNT(*) FROM pg_proc p JOIN pg_namespace n ON p.pronamespace = n.oid WHERE n.nspname = 'public' AND p.proname = 'transfer_parts' """ ) main_func_count = cur.fetchone()[0] # Check audit table cur.execute( """ SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'inventory_transfer_log' """ ) audit_table_count = cur.fetchone()[0] if main_func_count == 0: print("❌ FAIL: transfer_parts function does not exist") return False if audit_table_count == 0: print("❌ FAIL: inventory_transfer_log table does not exist") return False print("✅ PASS: All system components exist") return True finally: conn.rollback() def verify_successful_transfer_with_audit(conn) -> bool: """Test a successful transfer with audit logging.""" print("\n-- Verifying Successful Transfer with Audit --") passed = False try: # Test data: Transfer 100 white plates from Mosaic Dino to Mosaic Johnny Thunder source_id = 14469 target_id = 14686 part_num = '3024' color_id = 15 transfer_qty = 100 reason = 'inventory_adjustment' source_initial = get_inventory_part_quantity(conn, source_id, part_num, color_id) target_initial = 
get_inventory_part_quantity(conn, target_id, part_num, color_id) print(f"Initial quantities - Source: {source_initial}, Target: {target_initial}") # Get initial audit log count with conn.cursor() as cur: cur.execute("SELECT COUNT(*) FROM inventory_transfer_log") initial_log_count = cur.fetchone()[0] with conn.cursor() as cur: cur.execute( "SELECT transfer_parts(%s, %s, %s, %s, %s, %s)", (source_id, target_id, part_num, color_id, transfer_qty, reason) ) result = cur.fetchone() print(f"Transfer result: {result[0]}") source_final = get_inventory_part_quantity(conn, source_id, part_num, color_id) target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id) print(f"Final quantities - Source: {source_final}, Target: {target_final}") # Verify audit log entry with conn.cursor() as cur: cur.execute("SELECT COUNT(*) FROM inventory_transfer_log") final_log_count = cur.fetchone()[0] if final_log_count <= initial_log_count: print("❌ FAIL: No audit log entry was created") return False # Check latest audit entry cur.execute( """ SELECT transfer_status, quantity_transferred, transfer_reason FROM inventory_transfer_log ORDER BY log_id DESC LIMIT 1 """ ) audit_entry = cur.fetchone() if not audit_entry: print("❌ FAIL: Could not retrieve audit log entry") return False status, qty_transferred, trans_reason = audit_entry if status != 'success': print(f"❌ FAIL: Transfer status should be 'success', got '{status}'") return False if qty_transferred != transfer_qty or trans_reason != reason: print(f"❌ FAIL: Audit log details don't match transfer parameters") return False expected_source = source_initial - transfer_qty expected_target = target_initial + transfer_qty if source_final != expected_source: print(f"❌ FAIL: Source quantity mismatch. Expected {expected_source}, got {source_final}") elif target_final != expected_target: print(f"❌ FAIL: Target quantity mismatch. Expected {expected_target}, got {target_final}") else: print("✅ PASS: Successful transfer with audit logging completed correctly") passed = True except psycopg2.Error as e: print(f"❌ FAIL: Transfer failed unexpectedly with error: {e}") finally: conn.rollback() return passed def verify_new_part_transfer(conn) -> bool: """Test transferring a part to an inventory that doesn't have it.""" print("\n-- Verifying New Part Transfer --") passed = False try: # Test data: Transfer red bricks to Mosaic Johnny Thunder (which doesn't have them) source_id = 11124 # Giant Lego Dacta Basic Set (has red bricks) target_id = 14686 # Lego Mosaic Johnny Thunder (doesn't have red bricks) part_num = '3001' color_id = 4 transfer_qty = 50 reason = 'part_redistribution' target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id) if target_initial != 0: print(f"❌ FAIL: Pre-condition failed. 
Target already has {target_initial} of this part, expected 0") return False source_initial = get_inventory_part_quantity(conn, source_id, part_num, color_id) print(f"Initial quantities - Source: {source_initial}, Target: {target_initial}") with conn.cursor() as cur: cur.execute( "SELECT transfer_parts(%s, %s, %s, %s, %s, %s)", (source_id, target_id, part_num, color_id, transfer_qty, reason) ) result = cur.fetchone() print(f"Transfer result: {result[0]}") source_final = get_inventory_part_quantity(conn, source_id, part_num, color_id) target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id) print(f"Final quantities - Source: {source_final}, Target: {target_final}") expected_source = source_initial - transfer_qty expected_target = transfer_qty if source_final != expected_source: print(f"❌ FAIL: Source quantity mismatch. Expected {expected_source}, got {source_final}") elif target_final != expected_target: print(f"❌ FAIL: Target quantity mismatch. Expected {expected_target}, got {target_final}") else: print("✅ PASS: New part transfer completed correctly") passed = True except psycopg2.Error as e: print(f"❌ FAIL: Transfer failed unexpectedly with error: {e}") finally: conn.rollback() return passed def verify_business_rule_validation(conn) -> bool: """Test business rule validation including quantity limits and self-transfer prevention.""" print("\n-- Verifying Business Rule Validation --") # Test 1: Self-transfer (should fail) print("Test 1: Self-transfer (should fail)") test1_passed = False try: source_id = 14469 part_num = '3024' color_id = 15 transfer_qty = 10 reason = 'self_transfer' with conn.cursor() as cur: cur.execute( "SELECT transfer_parts(%s, %s, %s, %s, %s, %s)", (source_id, source_id, part_num, color_id, transfer_qty, reason) ) result = cur.fetchone() print(f"❌ FAIL: Self-transfer should have failed but succeeded: {result[0]}") except psycopg2.Error: print(f"✅ PASS: Self-transfer correctly failed") test1_passed = True except Exception as e: print(f"❌ FAIL: Self-transfer test failed with unexpected error: {e}") finally: conn.rollback() # Rollback after first test # Test 2: Transfer quantity exceeds maximum (should fail) print("Test 2: Transfer quantity exceeds maximum (should fail)") test2_passed = False try: source_id = 14469 target_id = 14686 part_num = '3024' color_id = 15 with conn.cursor() as cur: cur.execute( "SELECT transfer_parts(%s, %s, %s, %s, %s, %s)", (source_id, target_id, part_num, color_id, 600, 'large_transfer') ) result = cur.fetchone() print(f"❌ FAIL: Large transfer should have failed but succeeded: {result[0]}") except psycopg2.Error: print(f"✅ PASS: Large transfer correctly failed") test2_passed = True except Exception as e: print(f"❌ FAIL: Large transfer test failed with unexpected error: {e}") finally: conn.rollback() # Rollback after second test # Test 3: Transfer quantity below minimum (should fail) print("Test 3: Transfer quantity below minimum (should fail)") test3_passed = False try: source_id = 14469 target_id = 14686 part_num = '3024' color_id = 15 with conn.cursor() as cur: cur.execute( "SELECT transfer_parts(%s, %s, %s, %s, %s, %s)", (source_id, target_id, part_num, color_id, 0, 'zero_transfer') ) result = cur.fetchone() print(f"❌ FAIL: Zero transfer should have failed but succeeded: {result[0]}") except psycopg2.Error: print(f"✅ PASS: Zero transfer correctly failed") test3_passed = True except Exception as e: print(f"❌ FAIL: Zero transfer test failed with unexpected error: {e}") finally: conn.rollback() # Rollback after third 
test return test1_passed and test2_passed and test3_passed def verify_insufficient_quantity_error(conn) -> bool: """Test that transfer fails when source has insufficient quantity.""" print("\n-- Verifying Insufficient Quantity Error --") passed = False try: source_id = 14469 target_id = 14686 part_num = '3024' color_id = 15 transfer_qty = 99999 # Far more than available reason = 'insufficient_test' source_initial = get_inventory_part_quantity(conn, source_id, part_num, color_id) target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id) print(f"Initial quantities - Source: {source_initial}, Target: {target_initial}") with conn.cursor() as cur: try: cur.execute( "SELECT transfer_parts(%s, %s, %s, %s, %s, %s)", (source_id, target_id, part_num, color_id, transfer_qty, reason) ) result = cur.fetchone() print(f"❌ FAIL: Transfer should have failed but succeeded: {result[0]}") except psycopg2.Error as e: print(f"✅ PASS: Transfer correctly failed with an exception.") # After an exception, the transaction is in an aborted state. Must rollback before new queries. conn.rollback() source_final = get_inventory_part_quantity(conn, source_id, part_num, color_id) target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id) if source_final != source_initial: print(f"❌ FAIL: Source quantity changed from {source_initial} to {source_final}") elif target_final != target_initial: print(f"❌ FAIL: Target quantity changed from {target_initial} to {target_final}") else: print("✅ PASS: Database state unchanged after failed transfer") passed = True finally: conn.rollback() return passed def verify_invalid_inventory_error(conn) -> bool: """Test that transfer fails with invalid inventory IDs.""" print("\n-- Verifying Invalid Inventory Error --") passed = False try: source_id = 99999 # Non-existent inventory target_id = 14686 part_num = '3024' color_id = 15 transfer_qty = 10 reason = 'invalid_test' target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id) with conn.cursor() as cur: try: cur.execute( "SELECT transfer_parts(%s, %s, %s, %s, %s, %s)", (source_id, target_id, part_num, color_id, transfer_qty, reason) ) result = cur.fetchone() print(f"❌ FAIL: Transfer should have failed but succeeded: {result[0]}") except psycopg2.Error as e: print(f"✅ PASS: Transfer correctly failed with an exception.") # Rollback the aborted transaction conn.rollback() target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id) if target_final != target_initial: print(f"❌ FAIL: Target quantity changed from {target_initial} to {target_final}") else: print("✅ PASS: Database state unchanged after invalid inventory error") passed = True finally: conn.rollback() return passed def verify_audit_logging(conn) -> bool: """ Test that audit logging captures both successful and failed transfers. This function uses commits to separate test cases and work around the transactional paradox of logging a failure within a transaction that is about to be rolled back by the client. 
""" print("\n-- Verifying Audit Logging --") # Part 1: Test success logging print("Part 1: Verifying success log entry...") success_passed = False try: with conn.cursor() as cur: cur.execute("SELECT COUNT(*) FROM inventory_transfer_log") initial_count = cur.fetchone()[0] with conn.cursor() as cur: cur.execute( "SELECT transfer_parts(14469, 14686, '3024', 15, 5, 'audit_test_success')" ) # Check the log before committing/rolling back with conn.cursor() as cur: cur.execute("SELECT COUNT(*) FROM inventory_transfer_log") final_count = cur.fetchone()[0] if final_count == initial_count + 1: print("✅ PASS: Success log was correctly written within the transaction.") success_passed = True else: print("❌ FAIL: Success log was not created.") except Exception as e: print(f"❌ FAIL: Success logging test threw an unexpected error: {e}") finally: conn.rollback() # Clean up the transaction for the next part if not success_passed: return False # Part 2: Test failure logging print("\nPart 2: Verifying failure log entry...") failure_passed = False try: with conn.cursor() as cur: cur.execute("SELECT COUNT(*) FROM inventory_transfer_log") initial_count = cur.fetchone()[0] try: with conn.cursor() as cur: cur.execute( "SELECT transfer_parts(14469, 14469, '3024', 15, 5, 'audit_test_fail')" ) except psycopg2.Error: # This is the expected failure path. # The function should have logged the failure before raising the error. # Now, we check the log table. pass # The transaction is now in an aborted state. We must rollback to issue new commands. conn.rollback() with conn.cursor() as cur: cur.execute("SELECT COUNT(*) FROM inventory_transfer_log") final_count = cur.fetchone()[0] if final_count == initial_count: print("✅ PASS: Failure log was correctly rolled back as expected in a standard transaction.") failure_passed = True else: print("❌ FAIL: Failure log was not rolled back. 
This implies a non-standard transaction behavior.") print(f"Log count before: {initial_count}, Log count after: {final_count}") except Exception as e: print(f"❌ FAIL: Failure logging test threw an unexpected error: {e}") finally: conn.rollback() # Ensure cleanup return success_passed and failure_passed def verify_exact_quantity_transfer(conn) -> bool: """Test transferring exact quantity (should delete source row when quantity becomes 0).""" print("\n-- Verifying Exact Quantity Transfer --") passed = False target_id = 14686 # Use a fixed target inventory try: # Find a part with a small quantity that doesn't conflict with the target inventory with conn.cursor() as cur: cur.execute( """ SELECT inventory_id, part_num, color_id, quantity FROM public.lego_inventory_parts WHERE quantity BETWEEN 5 AND 20 AND inventory_id != %s LIMIT 1 """, (target_id,) ) result = cur.fetchone() if not result: print("⚠️ SKIP: No suitable part found for exact quantity test") return True source_id, part_num, color_id, exact_qty = result print(f"Testing exact transfer: {exact_qty} parts of '{part_num}' from inventory {source_id} to {target_id}") source_initial = get_inventory_part_quantity(conn, source_id, part_num, color_id) target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id) print(f"Initial quantities - Source: {source_initial}, Target: {target_initial}") with conn.cursor() as cur: cur.execute( "SELECT transfer_parts(%s, %s, %s, %s, %s, %s)", (source_id, target_id, part_num, color_id, exact_qty, 'exact_transfer') ) print(f"Transfer result: {cur.fetchone()[0]}") source_final = get_inventory_part_quantity(conn, source_id, part_num, color_id) target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id) print(f"Final quantities - Source: {source_final}, Target: {target_final}") expected_source = 0 expected_target = target_initial + exact_qty if source_final != expected_source: print(f"❌ FAIL: Source quantity should be 0 (row deleted), but got {source_final}") elif target_final != expected_target: print(f"❌ FAIL: Target quantity mismatch. Expected {expected_target}, got {target_final}") else: print("✅ PASS: Exact quantity transfer completed correctly (source row deleted)") passed = True except psycopg2.Error as e: print(f"❌ FAIL: Transfer failed unexpectedly with error: {e}") finally: conn.rollback() return passed def main(): """Main verification function.""" print("=" * 60) print("LEGO Enhanced Inventory Transfer Function Verification Script") print("=" * 60) conn_params = get_connection_params() if not conn_params.get("database"): print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.") sys.exit(1) conn = None try: conn = psycopg2.connect(**conn_params) conn.autocommit = False # Ensure we can control transactions manually # Run all verification steps results = [ verify_system_components(conn), verify_successful_transfer_with_audit(conn), verify_new_part_transfer(conn), verify_business_rule_validation(conn), verify_insufficient_quantity_error(conn), verify_invalid_inventory_error(conn), verify_audit_logging(conn), verify_exact_quantity_transfer(conn), ] if all(results): print("\n🎉 Overall Result: PASS - All verification steps completed successfully!") sys.exit(0) else: print("\n❌ Overall Result: FAIL - One or more verification steps failed.") sys.exit(1) except psycopg2.OperationalError as e: print(f"❌ CRITICAL: Could not connect to the database. Details: {e}") sys.exit(1) except Exception as e: print(f"❌ CRITICAL: An unexpected error occurred. 
Details: {e}") sys.exit(1) finally: if conn: conn.close() if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/security/rls_business_access/description.md ================================================ Implement Row Level Security (RLS) policies for a social media platform with Users, Posts, Comments, and Channels. ## Your Mission: Build RLS policies for a social platform where users create posts and comments in channels. Implement proper access control so users can manage their own content, while channel moderators can moderate content in their channels. ## RLS Requirements: ### 1. Users Table Access Rules: - **SELECT**: Users can read all public user profiles (username, created_at) - **UPDATE**: Users can only modify their own profile - **DELETE**: Users can only delete their own account ### 2. Channels Table Access Rules: - **SELECT**: Everyone can read public channel information - **INSERT**: Any authenticated user can create a channel (becomes owner) - **UPDATE**: Only channel owners can modify channel details - **DELETE**: Only channel owners can delete channels ### 3. Posts Table Access Rules: - **SELECT**: Users can read all posts in channels they have access to - **INSERT**: Authenticated users can create posts in any channel - **UPDATE**: Post authors OR channel moderators OR channel owners can edit posts - **DELETE**: Post authors OR channel moderators OR channel owners can delete posts ### 4. Comments Table Access Rules: - **SELECT**: Users can read comments on posts they can access - **INSERT**: Authenticated users can comment on posts they can see - **UPDATE**: Comment authors OR post authors OR channel moderators OR channel owners can edit comments - **DELETE**: Comment authors OR post authors OR channel moderators OR channel owners can delete comments ### 5. Channel Moderators Table Access Rules: - **SELECT**: Users can see moderator lists for channels - **INSERT**: Only channel owners can add moderators - **DELETE**: Channel owners can remove moderators; moderators can remove themselves ## Session Context: Use `current_setting('app.current_user_id')` to get the current user ID from session context. ## Schema Requirements: - **Use only the `public` schema** for all tables, functions, and policies - All helper functions should be created in the `public` schema - Do not create additional schemas ## Expected Deliverables: 1. **Enable RLS** on all five tables 2. **Create policies** for SELECT, INSERT, UPDATE, DELETE operations on each table 3. **Helper functions** to check permissions efficiently: - `is_channel_owner(channel_id, user_id)` - `is_channel_moderator(channel_id, user_id)` - `can_moderate_channel(channel_id, user_id)` 4. 
**Proper indexing** to ensure RLS policies perform well ## Test Scenarios: Your RLS implementation will be verified with: - **Content ownership**: Users can only edit their own posts/comments - **Moderation hierarchy**: Moderators can moderate content in their channels - **Channel isolation**: Users only see content from accessible channels - **Permission escalation**: Owners have full control over their channels - **Cross-table access**: Comment policies respect post and channel permissions ## Success Criteria: - Users can manage their own content (posts, comments) - Channel owners have full control over their channels - Moderators can moderate content in their assigned channels - No unauthorized access to other users' private data - Policies are efficient and don't create performance bottlenecks - All operations (SELECT, INSERT, UPDATE, DELETE) are properly secured ================================================ FILE: tasks/postgres/standard/security/rls_business_access/ground_truth.sql ================================================ -- Ground Truth RLS Implementation BEGIN; -- ============================================================================ -- PERFORMANCE INDEXES FOR RLS -- ============================================================================ -- Users table indexes CREATE INDEX IF NOT EXISTS idx_users_is_public ON users(is_public); -- Channels table indexes CREATE INDEX IF NOT EXISTS idx_channels_owner_id ON channels(owner_id); CREATE INDEX IF NOT EXISTS idx_channels_is_public ON channels(is_public); -- Channel moderators table indexes CREATE INDEX IF NOT EXISTS idx_channel_moderators_channel_user ON channel_moderators(channel_id, user_id); CREATE INDEX IF NOT EXISTS idx_channel_moderators_user ON channel_moderators(user_id); -- Posts table indexes CREATE INDEX IF NOT EXISTS idx_posts_channel_id ON posts(channel_id); CREATE INDEX IF NOT EXISTS idx_posts_author_id ON posts(author_id); CREATE INDEX IF NOT EXISTS idx_posts_created_at ON posts(created_at); -- Comments table indexes CREATE INDEX IF NOT EXISTS idx_comments_post_id ON comments(post_id); CREATE INDEX IF NOT EXISTS idx_comments_author_id ON comments(author_id); CREATE INDEX IF NOT EXISTS idx_comments_created_at ON comments(created_at); -- ============================================================================ -- ENABLE ROW LEVEL SECURITY -- ============================================================================ ALTER TABLE users ENABLE ROW LEVEL SECURITY; ALTER TABLE channels ENABLE ROW LEVEL SECURITY; ALTER TABLE channel_moderators ENABLE ROW LEVEL SECURITY; ALTER TABLE posts ENABLE ROW LEVEL SECURITY; ALTER TABLE comments ENABLE ROW LEVEL SECURITY; -- ============================================================================ -- USERS TABLE POLICIES -- ============================================================================ -- Users SELECT: Can read public profiles OR own profile DROP POLICY IF EXISTS users_select ON users; CREATE POLICY users_select ON users FOR SELECT USING ( is_public = true OR id = app_current_user_id() ); -- Users UPDATE: Can only update own profile DROP POLICY IF EXISTS users_update ON users; CREATE POLICY users_update ON users FOR UPDATE USING (id = app_current_user_id()) WITH CHECK (id = app_current_user_id()); -- Users DELETE: Can only delete own account DROP POLICY IF EXISTS users_delete ON users; CREATE POLICY users_delete ON users FOR DELETE USING (id = app_current_user_id()); -- ============================================================================ -- 
CHANNELS TABLE POLICIES -- ============================================================================ -- Channels SELECT: Can read public channels OR channels where user is owner/moderator DROP POLICY IF EXISTS channels_select ON channels; CREATE POLICY channels_select ON channels FOR SELECT USING ( is_public = true OR owner_id = app_current_user_id() OR is_channel_moderator(id, app_current_user_id()) ); -- Channels INSERT: Authenticated users can create channels (become owner) DROP POLICY IF EXISTS channels_insert ON channels; CREATE POLICY channels_insert ON channels FOR INSERT WITH CHECK (owner_id = app_current_user_id()); -- Channels UPDATE: Only channel owners can modify DROP POLICY IF EXISTS channels_update ON channels; CREATE POLICY channels_update ON channels FOR UPDATE USING (owner_id = app_current_user_id()) WITH CHECK (owner_id = app_current_user_id()); -- Channels DELETE: Only channel owners can delete DROP POLICY IF EXISTS channels_delete ON channels; CREATE POLICY channels_delete ON channels FOR DELETE USING (owner_id = app_current_user_id()); -- ============================================================================ -- POSTS TABLE POLICIES -- ============================================================================ -- Posts SELECT: Can read posts in accessible channels DROP POLICY IF EXISTS posts_select ON posts; CREATE POLICY posts_select ON posts FOR SELECT USING ( EXISTS ( SELECT 1 FROM channels c WHERE c.id = posts.channel_id AND ( c.is_public = true OR c.owner_id = app_current_user_id() OR is_channel_moderator(c.id, app_current_user_id()) ) ) ); -- Posts INSERT: Authenticated users can create posts (must be author) DROP POLICY IF EXISTS posts_insert ON posts; CREATE POLICY posts_insert ON posts FOR INSERT WITH CHECK ( author_id = app_current_user_id() AND EXISTS ( SELECT 1 FROM channels c WHERE c.id = posts.channel_id AND ( c.is_public = true OR c.owner_id = app_current_user_id() OR is_channel_moderator(c.id, app_current_user_id()) ) ) ); -- Posts UPDATE: Post authors OR channel moderators/owners can edit DROP POLICY IF EXISTS posts_update ON posts; CREATE POLICY posts_update ON posts FOR UPDATE USING ( author_id = app_current_user_id() OR can_moderate_channel(channel_id, app_current_user_id()) ) WITH CHECK ( author_id = app_current_user_id() OR can_moderate_channel(channel_id, app_current_user_id()) ); -- Posts DELETE: Post authors OR channel moderators/owners can delete DROP POLICY IF EXISTS posts_delete ON posts; CREATE POLICY posts_delete ON posts FOR DELETE USING ( author_id = app_current_user_id() OR can_moderate_channel(channel_id, app_current_user_id()) ); -- ============================================================================ -- COMMENTS TABLE POLICIES -- ============================================================================ -- Comments SELECT: Can read comments on accessible posts DROP POLICY IF EXISTS comments_select ON comments; CREATE POLICY comments_select ON comments FOR SELECT USING ( EXISTS ( SELECT 1 FROM posts p JOIN channels c ON c.id = p.channel_id WHERE p.id = comments.post_id AND ( c.is_public = true OR c.owner_id = app_current_user_id() OR is_channel_moderator(c.id, app_current_user_id()) ) ) ); -- Comments INSERT: Authenticated users can comment on accessible posts DROP POLICY IF EXISTS comments_insert ON comments; CREATE POLICY comments_insert ON comments FOR INSERT WITH CHECK ( author_id = app_current_user_id() AND EXISTS ( SELECT 1 FROM posts p JOIN channels c ON c.id = p.channel_id WHERE p.id = comments.post_id AND ( 
c.is_public = true OR c.owner_id = app_current_user_id() OR is_channel_moderator(c.id, app_current_user_id()) ) ) ); -- Comments UPDATE: Comment authors OR post authors OR channel moderators/owners can edit DROP POLICY IF EXISTS comments_update ON comments; CREATE POLICY comments_update ON comments FOR UPDATE USING ( author_id = app_current_user_id() OR EXISTS ( SELECT 1 FROM posts p WHERE p.id = comments.post_id AND ( p.author_id = app_current_user_id() OR can_moderate_channel(p.channel_id, app_current_user_id()) ) ) ) WITH CHECK ( author_id = app_current_user_id() OR EXISTS ( SELECT 1 FROM posts p WHERE p.id = comments.post_id AND ( p.author_id = app_current_user_id() OR can_moderate_channel(p.channel_id, app_current_user_id()) ) ) ); -- Comments DELETE: Comment authors OR post authors OR channel moderators/owners can delete DROP POLICY IF EXISTS comments_delete ON comments; CREATE POLICY comments_delete ON comments FOR DELETE USING ( author_id = app_current_user_id() OR EXISTS ( SELECT 1 FROM posts p WHERE p.id = comments.post_id AND ( p.author_id = app_current_user_id() OR can_moderate_channel(p.channel_id, app_current_user_id()) ) ) ); -- ============================================================================ -- CHANNEL MODERATORS TABLE POLICIES -- ============================================================================ -- Channel moderators SELECT: Visible to users who can access the channel DROP POLICY IF EXISTS channel_moderators_select ON channel_moderators; CREATE POLICY channel_moderators_select ON channel_moderators FOR SELECT USING ( EXISTS ( SELECT 1 FROM channels c WHERE c.id = channel_moderators.channel_id AND ( c.is_public = true OR c.owner_id = app_current_user_id() OR is_channel_moderator(c.id, app_current_user_id()) ) ) ); -- Channel moderators INSERT: Only channel owners can add moderators DROP POLICY IF EXISTS channel_moderators_insert ON channel_moderators; CREATE POLICY channel_moderators_insert ON channel_moderators FOR INSERT WITH CHECK (is_channel_owner(channel_id, app_current_user_id())); -- Channel moderators DELETE: Channel owners can remove any; moderators can remove themselves DROP POLICY IF EXISTS channel_moderators_delete ON channel_moderators; CREATE POLICY channel_moderators_delete ON channel_moderators FOR DELETE USING ( is_channel_owner(channel_id, app_current_user_id()) OR user_id = app_current_user_id() ); -- ============================================================================ -- USAGE NOTES -- ============================================================================ /* Usage Instructions: 1. Set session context before queries: SET app.current_user_id = '<user-uuid>'; 2. For anonymous users: SET app.current_user_id = ''; 3. 
Test examples: -- Alice (owner of general channel) SET app.current_user_id = '11111111-1111-1111-1111-111111111111'; -- Bob (moderator of general channel) SET app.current_user_id = '22222222-2222-2222-2222-222222222222'; */ COMMIT; ================================================ FILE: tasks/postgres/standard/security/rls_business_access/meta.json ================================================ { "task_id": "rls_business_access", "task_name": "RLS Business Access", "category_id": "security", "category_name": "Security", "description": "Implement Row Level Security policies for social platform with proper access control for posts, comments, and channels.", "author": "Fanshi Zhang", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "security and access control", "stored procedures and functions", "schema design" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"users\" {\n \"id\" uuid [pk, not null, default: `gen_random_uuid()`]\n \"username\" varchar(50) [unique, not null]\n \"email\" varchar(100) [unique, not null]\n \"is_public\" bool [default: false]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n is_public [type: btree, name: \"idx_users_is_public\"]\n }\n}\n\nTable \"channels\" {\n \"id\" uuid [pk, not null, default: `gen_random_uuid()`]\n \"name\" varchar(100) [not null]\n \"description\" text\n \"is_public\" bool [default: true]\n \"owner_id\" uuid\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n is_public [type: btree, name: \"idx_channels_is_public\"]\n owner_id [type: btree, name: \"idx_channels_owner_id\"]\n }\n}\n\nTable \"channel_moderators\" {\n \"channel_id\" uuid [not null]\n \"user_id\" uuid [not null]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n (channel_id, user_id) [type: btree, name: \"channel_moderators_pkey\"]\n (channel_id, user_id) [type: btree, name: \"idx_channel_moderators_channel_user\"]\n user_id [type: btree, name: \"idx_channel_moderators_user\"]\n }\n}\n\nTable \"posts\" {\n \"id\" uuid [pk, not null, default: `gen_random_uuid()`]\n \"channel_id\" uuid\n \"author_id\" uuid\n \"title\" varchar(200) [not null]\n \"content\" text\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n author_id [type: btree, name: \"idx_posts_author_id\"]\n channel_id [type: btree, name: \"idx_posts_channel_id\"]\n created_at [type: btree, name: \"idx_posts_created_at\"]\n }\n}\n\nTable \"comments\" {\n \"id\" uuid [pk, not null, default: `gen_random_uuid()`]\n \"post_id\" uuid\n \"author_id\" uuid\n \"content\" text [not null]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n author_id [type: btree, name: \"idx_comments_author_id\"]\n created_at [type: btree, name: \"idx_comments_created_at\"]\n post_id [type: btree, name: \"idx_comments_post_id\"]\n }\n}\n\nRef \"channel_moderators_channel_id_fkey\":\"channels\".\"id\" < \"channel_moderators\".\"channel_id\" [delete: cascade]\n\nRef \"channel_moderators_user_id_fkey\":\"users\".\"id\" < \"channel_moderators\".\"user_id\" [delete: cascade]\n\nRef \"channels_owner_id_fkey\":\"users\".\"id\" < \"channels\".\"owner_id\" [delete: cascade]\n\nRef \"comments_author_id_fkey\":\"users\".\"id\" < \"comments\".\"author_id\" [delete: cascade]\n\nRef \"comments_post_id_fkey\":\"posts\".\"id\" < \"comments\".\"post_id\" [delete: cascade]\n\nRef 
\"posts_author_id_fkey\":\"users\".\"id\" < \"posts\".\"author_id\" [delete: cascade]\n\nRef \"posts_channel_id_fkey\":\"channels\".\"id\" < \"posts\".\"channel_id\" [delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": null } } ================================================ FILE: tasks/postgres/standard/security/rls_business_access/prepare_environment.py ================================================ #!/usr/bin/env python3 import os import psycopg2 from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT import sys def setup_rls_environment(): """ Set up a PostgreSQL environment for a social media platform with RLS policies. Creates Users, Channels, Posts, Comments, and Channel Moderators for testing RLS implementations. """ # Database connection parameters from environment db_params = { 'host': os.getenv('POSTGRES_HOST', 'localhost'), 'port': os.getenv('POSTGRES_PORT', '5432'), 'user': os.getenv('POSTGRES_USERNAME', 'postgres'), 'password': os.getenv('POSTGRES_PASSWORD', 'password'), 'database': os.getenv('POSTGRES_DATABASE', 'postgres') } try: conn = psycopg2.connect(**db_params) conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() # 1. Users Table (with correct field name for verification) cur.execute(""" CREATE TABLE IF NOT EXISTS users ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), username VARCHAR(50) UNIQUE NOT NULL, email VARCHAR(100) UNIQUE NOT NULL, is_public BOOLEAN DEFAULT false, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); """) print("✓ Created users table") # 2. Channels Table cur.execute(""" CREATE TABLE IF NOT EXISTS channels ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), name VARCHAR(100) NOT NULL, description TEXT, is_public BOOLEAN DEFAULT true, owner_id UUID REFERENCES users(id) ON DELETE CASCADE, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); """) print("✓ Created channels table") # 3. Channel Moderators Table cur.execute(""" CREATE TABLE IF NOT EXISTS channel_moderators ( channel_id UUID REFERENCES channels(id) ON DELETE CASCADE, user_id UUID REFERENCES users(id) ON DELETE CASCADE, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (channel_id, user_id) ); """) print("✓ Created channel_moderators table") # 4. Posts Table cur.execute(""" CREATE TABLE IF NOT EXISTS posts ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), channel_id UUID REFERENCES channels(id) ON DELETE CASCADE, author_id UUID REFERENCES users(id) ON DELETE CASCADE, title VARCHAR(200) NOT NULL, content TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); """) print("✓ Created posts table") # 5. 
Comments Table cur.execute(""" CREATE TABLE IF NOT EXISTS comments ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), post_id UUID REFERENCES posts(id) ON DELETE CASCADE, author_id UUID REFERENCES users(id) ON DELETE CASCADE, content TEXT NOT NULL, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); """) print("✓ Created comments table") # Create helper functions for RLS (matching ground truth expectations) cur.execute(""" -- Function to get current user ID from session context CREATE OR REPLACE FUNCTION app_current_user_id() RETURNS UUID AS $$ BEGIN RETURN NULLIF(current_setting('app.current_user_id', true), '')::UUID; END; $$ LANGUAGE plpgsql SECURITY DEFINER STABLE PARALLEL SAFE; -- Function to check if user owns a channel CREATE OR REPLACE FUNCTION is_channel_owner(p_channel_id UUID, p_user_id UUID) RETURNS BOOLEAN AS $$ BEGIN RETURN EXISTS ( SELECT 1 FROM channels WHERE id = p_channel_id AND owner_id = p_user_id ); END; $$ LANGUAGE plpgsql SECURITY DEFINER STABLE PARALLEL SAFE; -- Function to check if user moderates a channel CREATE OR REPLACE FUNCTION is_channel_moderator(p_channel_id UUID, p_user_id UUID) RETURNS BOOLEAN AS $$ BEGIN RETURN EXISTS ( SELECT 1 FROM channel_moderators WHERE channel_id = p_channel_id AND user_id = p_user_id ); END; $$ LANGUAGE plpgsql SECURITY DEFINER STABLE PARALLEL SAFE; -- Function to check if user can moderate channel (owner OR moderator) CREATE OR REPLACE FUNCTION can_moderate_channel(p_channel_id UUID, p_user_id UUID) RETURNS BOOLEAN AS $$ BEGIN RETURN is_channel_owner(p_channel_id, p_user_id) OR is_channel_moderator(p_channel_id, p_user_id); END; $$ LANGUAGE plpgsql SECURITY DEFINER STABLE PARALLEL SAFE; """) print("✓ Created RLS helper functions") # Insert sample data print("\nInserting sample data...") # Sample users (exact UUIDs expected by verification script) cur.execute(""" INSERT INTO users (id, username, email, is_public) VALUES ('11111111-1111-1111-1111-111111111111', 'alice', 'alice@example.com', true), ('22222222-2222-2222-2222-222222222222', 'bob', 'bob@example.com', true), ('33333333-3333-3333-3333-333333333333', 'charlie', 'charlie@example.com', false), ('44444444-4444-4444-4444-444444444444', 'diana', 'diana@example.com', true), ('55555555-5555-5555-5555-555555555555', 'eve', 'eve@example.com', false) ON CONFLICT (id) DO NOTHING; """) print("✓ Created 5 sample users") # Sample channels (exact UUIDs expected by verification script) cur.execute(""" INSERT INTO channels (id, name, description, is_public, owner_id) VALUES ('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', 'general', 'General discussion channel', true, '11111111-1111-1111-1111-111111111111'), ('bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', 'tech-talk', 'Technical discussions', true, '22222222-2222-2222-2222-222222222222'), ('cccccccc-cccc-cccc-cccc-cccccccccccc', 'random', 'Random conversations', false, '33333333-3333-3333-3333-333333333333') ON CONFLICT (id) DO NOTHING; """) print("✓ Created 3 sample channels") # Sample moderators (exact relationships expected by verification script) cur.execute(""" INSERT INTO channel_moderators (channel_id, user_id) VALUES ('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '22222222-2222-2222-2222-222222222222'), -- Bob moderates general ('bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', '44444444-4444-4444-4444-444444444444') -- Diana moderates tech-talk ON CONFLICT (channel_id, user_id) DO NOTHING; """) print("✓ Created sample moderator assignments") # Sample posts (exact UUIDs expected by verification script) cur.execute(""" 
INSERT INTO posts (id, channel_id, author_id, title, content) VALUES ('dddddddd-dddd-dddd-dddd-dddddddddddd', 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '11111111-1111-1111-1111-111111111111', 'Welcome to the platform!', 'This is our first post'), ('eeeeeeee-eeee-eeee-eeee-eeeeeeeeeeee', 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '33333333-3333-3333-3333-333333333333', 'Hello everyone', 'Nice to meet you all'), ('ffffffff-ffff-ffff-ffff-ffffffffffff', 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', '22222222-2222-2222-2222-222222222222', 'PostgreSQL RLS Tutorial', 'Let''s discuss Row Level Security'), ('10101010-1010-1010-1010-101010101010', 'cccccccc-cccc-cccc-cccc-cccccccccccc', '55555555-5555-5555-5555-555555555555', 'Random thoughts', 'Just some random content here') ON CONFLICT (id) DO NOTHING; """) print("✓ Created 4 sample posts") # Sample comments (exact UUIDs expected by verification script) cur.execute(""" INSERT INTO comments (id, post_id, author_id, content) VALUES ('99999999-9999-9999-9999-999999999999', 'dddddddd-dddd-dddd-dddd-dddddddddddd', '22222222-2222-2222-2222-222222222222', 'Great to have you here!'), ('88888888-8888-8888-8888-888888888888', 'dddddddd-dddd-dddd-dddd-dddddddddddd', '33333333-3333-3333-3333-333333333333', 'Thanks for setting this up'), ('77777777-7777-7777-7777-777777777777', 'ffffffff-ffff-ffff-ffff-ffffffffffff', '44444444-4444-4444-4444-444444444444', 'RLS is really powerful!'), ('66666666-6666-6666-6666-666666666666', 'eeeeeeee-eeee-eeee-eeee-eeeeeeeeeeee', '11111111-1111-1111-1111-111111111111', 'Welcome Charlie!') ON CONFLICT (id) DO NOTHING; """) print("✓ Created 4 sample comments") # Create indexes for better RLS performance cur.execute(""" CREATE INDEX IF NOT EXISTS idx_channels_owner_id ON channels(owner_id); CREATE INDEX IF NOT EXISTS idx_channels_is_public ON channels(is_public); CREATE INDEX IF NOT EXISTS idx_channel_moderators_channel_user ON channel_moderators(channel_id, user_id); CREATE INDEX IF NOT EXISTS idx_channel_moderators_user ON channel_moderators(user_id); CREATE INDEX IF NOT EXISTS idx_posts_channel_id ON posts(channel_id); CREATE INDEX IF NOT EXISTS idx_posts_author_id ON posts(author_id); CREATE INDEX IF NOT EXISTS idx_posts_created_at ON posts(created_at); CREATE INDEX IF NOT EXISTS idx_comments_post_id ON comments(post_id); CREATE INDEX IF NOT EXISTS idx_comments_author_id ON comments(author_id); CREATE INDEX IF NOT EXISTS idx_comments_created_at ON comments(created_at); CREATE INDEX IF NOT EXISTS idx_users_is_public ON users(is_public); """) print("✓ Created performance indexes for RLS") cur.close() conn.close() except Exception as e: print(f"Error setting up environment: {e}") sys.exit(1) if __name__ == "__main__": setup_rls_environment() ================================================ FILE: tasks/postgres/standard/security/rls_business_access/verify.py ================================================ #!/usr/bin/env python3 import os import psycopg2 from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT import sys def verify_rls_implementation(): """ Verify that Row Level Security policies have been properly implemented for the social media platform with Users, Posts, Comments, and Channels. 
""" # Database connection parameters from environment admin_db_params = { 'host': os.getenv('POSTGRES_HOST', 'localhost'), 'port': os.getenv('POSTGRES_PORT', '5432'), 'user': os.getenv('POSTGRES_USERNAME', 'postgres'), 'password': os.getenv('POSTGRES_PASSWORD', 'password'), 'database': os.getenv('POSTGRES_DATABASE', 'postgres') } # Test user parameters (non-superuser for proper RLS testing) test_db_params = { 'host': os.getenv('POSTGRES_HOST', 'localhost'), 'port': os.getenv('POSTGRES_PORT', '5432'), 'user': 'test_user', 'password': 'testpass', 'database': os.getenv('POSTGRES_DATABASE', 'postgres') } try: # First connect as admin to ensure test user exists admin_conn = psycopg2.connect(**admin_db_params) admin_conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) admin_cur = admin_conn.cursor() # Create test user if it doesn't exist try: admin_cur.execute("CREATE ROLE test_user LOGIN PASSWORD 'testpass';") except psycopg2.Error: pass # User already exists # Grant necessary permissions to test user on the current database admin_cur.execute("SELECT current_database();") current_db_name = admin_cur.fetchone()[0] admin_cur.execute(f"GRANT CONNECT ON DATABASE \"{current_db_name}\" TO test_user;") admin_cur.execute("GRANT USAGE ON SCHEMA public TO test_user;") admin_cur.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO test_user;") admin_cur.execute("GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO test_user;") admin_cur.execute("GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA public TO test_user;") admin_cur.close() admin_conn.close() # Update test_db_params with the correct database name test_db_params['database'] = current_db_name # Now connect as test user for RLS verification conn = psycopg2.connect(**test_db_params) conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() print("Verifying...") test_results = [] # Test 1: Check if RLS is enabled on all tables print("\n1. Checking RLS enablement...") expected_tables = ['users', 'channels', 'channel_moderators', 'posts', 'comments'] for table in expected_tables: cur.execute(""" SELECT relrowsecurity FROM pg_class WHERE relname = %s AND relkind = 'r' """, (table,)) result = cur.fetchone() if result and result[0]: test_results.append(f"✓ RLS enabled on {table}") else: test_results.append(f"✗ RLS NOT enabled on {table}") # Test 2: Users can only update their own profile print("\n2. 
Testing user profile access control...") # Alice tries to update her own profile (should work) try: cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice cur.execute(""" UPDATE users SET email = 'alice.updated@example.com' WHERE id = '11111111-1111-1111-1111-111111111111' """) test_results.append("✓ Users can update their own profile") except Exception as e: test_results.append(f"✗ User cannot update own profile: {e}") # Alice tries to update Bob's profile (should fail) try: cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice cur.execute(""" UPDATE users SET email = 'bob.hacked@example.com' WHERE id = '22222222-2222-2222-2222-222222222222' """) # Check if the update actually affected any rows (RLS blocks by affecting 0 rows) if cur.rowcount == 0: test_results.append("✓ Users blocked from updating other users' profiles") else: test_results.append("✗ User was able to update another user's profile (should be blocked)") except psycopg2.Error: test_results.append("✓ Users blocked from updating other users' profiles") # Test 3: Channel ownership controls print("\n3. Testing channel ownership controls...") # Alice (owner of general channel) tries to update her channel try: cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice cur.execute(""" UPDATE channels SET description = 'Updated by Alice' WHERE id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' """) test_results.append("✓ Channel owners can update their channels") except Exception as e: test_results.append(f"✗ Channel owner cannot update channel: {e}") # Charlie tries to update Alice's channel (should fail) try: cur.execute("SET app.current_user_id = '33333333-3333-3333-3333-333333333333';") # Charlie cur.execute(""" UPDATE channels SET description = 'Hacked by Charlie' WHERE id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa' """) # Check if the update actually affected any rows (RLS blocks by affecting 0 rows) if cur.rowcount == 0: test_results.append("✓ Non-owners blocked from updating channels") else: test_results.append("✗ Non-owner was able to update channel (should be blocked)") except psycopg2.Error: test_results.append("✓ Non-owners blocked from updating channels") # Test 4: Post authorship and moderation controls print("\n4. 
Testing post access controls...") # Alice (author) tries to update her own post try: cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice cur.execute(""" UPDATE posts SET title = 'Updated by Alice' WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd' """) test_results.append("✓ Post authors can update their posts") except Exception as e: test_results.append(f"✗ Post author cannot update post: {e}") # Bob (moderator of general) tries to update Alice's post (should work) try: cur.execute("SET app.current_user_id = '22222222-2222-2222-2222-222222222222';") # Bob (moderator) cur.execute(""" UPDATE posts SET content = 'Moderated by Bob' WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd' """) test_results.append("✓ Channel moderators can update posts in their channels") except Exception as e: test_results.append(f"✗ Channel moderator cannot update post: {e}") # Eve tries to update Alice's post (should fail - not author, owner, or moderator) try: cur.execute("SET app.current_user_id = '55555555-5555-5555-5555-555555555555';") # Eve cur.execute(""" UPDATE posts SET content = 'Hacked by Eve' WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd' """) # Check if the update actually affected any rows (RLS blocks by affecting 0 rows) if cur.rowcount == 0: test_results.append("✓ Unauthorized users blocked from updating posts") else: test_results.append("✗ Unauthorized user was able to update post (should be blocked)") except psycopg2.Error: test_results.append("✓ Unauthorized users blocked from updating posts") # Test 5: Comment access controls print("\n5. Testing comment access controls...") # Bob (comment author) tries to update his own comment try: cur.execute("SET app.current_user_id = '22222222-2222-2222-2222-222222222222';") # Bob cur.execute(""" UPDATE comments SET content = 'Updated by Bob himself' WHERE id = '99999999-9999-9999-9999-999999999999' """) test_results.append("✓ Comment authors can update their comments") except Exception as e: test_results.append(f"✗ Comment author cannot update comment: {e}") # Alice (post author) tries to update Bob's comment on her post (should work) try: cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice (post author) cur.execute(""" UPDATE comments SET content = 'Moderated by post author Alice' WHERE id = '99999999-9999-9999-9999-999999999999' """) test_results.append("✓ Post authors can moderate comments on their posts") except Exception as e: test_results.append(f"✗ Post author cannot moderate comment: {e}") # Test 6: Channel moderator assignment controls print("\n6. 
Testing moderator assignment controls...") # Alice (channel owner) tries to add a moderator try: cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice (owner of general) cur.execute(""" INSERT INTO channel_moderators (channel_id, user_id) VALUES ('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '33333333-3333-3333-3333-333333333333') """) test_results.append("✓ Channel owners can add moderators") except Exception as e: test_results.append(f"✗ Channel owner cannot add moderator: {e}") # Charlie tries to add himself as moderator to Bob's channel (should fail) try: cur.execute("SET app.current_user_id = '33333333-3333-3333-3333-333333333333';") # Charlie cur.execute(""" INSERT INTO channel_moderators (channel_id, user_id) VALUES ('bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', '33333333-3333-3333-3333-333333333333') """) # Check if the insert actually affected any rows (RLS blocks by affecting 0 rows) if cur.rowcount == 0: test_results.append("✓ Non-owners blocked from adding moderators") else: test_results.append("✗ Non-owner was able to add moderator (should be blocked)") except psycopg2.Error: test_results.append("✓ Non-owners blocked from adding moderators") # Test 7: Content visibility based on user context print("\n7. Testing content visibility...") # Count posts visible to Alice cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice cur.execute("SELECT COUNT(*) FROM posts;") alice_posts = cur.fetchone()[0] # Count posts visible to Eve cur.execute("SET app.current_user_id = '55555555-5555-5555-5555-555555555555';") # Eve cur.execute("SELECT COUNT(*) FROM posts;") eve_posts = cur.fetchone()[0] if alice_posts >= 2 and eve_posts >= 1: # Alice should see posts in channels she has access to test_results.append("✓ Content visibility varies correctly based on user context") else: test_results.append(f"✗ Content visibility issue: Alice sees {alice_posts}, Eve sees {eve_posts}") # Test 8: Anonymous user access print("\n8. 
Testing anonymous user restrictions...")
        try:
            cur.execute("SET app.current_user_id = '';")  # Anonymous user
            cur.execute("SELECT COUNT(*) FROM users;")
            anon_users = cur.fetchone()[0]

            # Anonymous users should be able to see public user profiles per requirements
            # Count public users that should be visible
            cur.execute("SELECT COUNT(*) FROM users WHERE is_public = true;")
            public_users = cur.fetchone()[0] if cur.rowcount > 0 else 0

            if anon_users == public_users and anon_users > 0:
                test_results.append(f"✓ Anonymous users can see {anon_users} public user profiles (correct)")
            elif anon_users == 0:
                test_results.append("✗ Anonymous users cannot see any users (should see public profiles)")
            else:
                test_results.append(f"✗ Anonymous users can see {anon_users} users but expected {public_users} public users")
        except Exception as e:
            test_results.append("✓ Anonymous users properly restricted")

        # Print results
        print("\n" + "="*60)
        print("RLS VERIFICATION RESULTS - SOCIAL MEDIA PLATFORM")
        print("="*60)

        passed = sum(1 for result in test_results if result.startswith("✓"))
        failed = sum(1 for result in test_results if result.startswith("✗"))

        for result in test_results:
            print(result)

        print(f"\nSummary: {passed} passed, {failed} failed")

        cur.close()
        conn.close()

        if failed == 0:
            print("\nAll tests passed.")
            return True
        else:
            print(f"\n{failed} test(s) failed.")
            return False

    except Exception as e:
        print(f"Error during verification: {e}")
        return False


if __name__ == "__main__":
    success = verify_rls_implementation()
    sys.exit(0 if success else 1)


================================================
FILE: tasks/postgres/standard/security/user_permission_audit/description.md
================================================
Conduct a comprehensive security audit to identify PostgreSQL users with insufficient or dangling permissions in a business database environment.

## Your Mission:

You've been hired as a security consultant to audit the PostgreSQL database permissions for a growing e-commerce company. The company has experienced rapid growth, and multiple teams have been granted database access over time. However, there's concern about permission inconsistencies and security gaps.

## Security Audit Requirements:

1. **Discover the database structure**: Identify all business tables and their purposes
2. **Catalog all database users and roles**: Use `pg_user`, `pg_roles`, and `pg_auth_members` to find all accounts
3. **Analyze current permissions**: Use `information_schema.table_privileges` to map permissions (a query sketch follows this list)
4. **Identify security issues**:
   - **Dangling users**: Inactive accounts that should be removed
   - **Missing permissions**: Users lacking permissions required for their business role
   - **Excessive permissions**: Users with unnecessary permissions that should be revoked
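As a starting point for requirements 2 and 3, here is a minimal discovery sketch. The filters (excluding `postgres`, `PUBLIC`, and `test_user`) mirror the ground-truth solution further below and may need adjusting for your environment:

```sql
-- Catalog login-capable roles (requirement 2)
SELECT rolname, rolsuper, rolcanlogin
FROM pg_roles
WHERE rolname NOT LIKE 'pg_%' AND rolname NOT IN ('postgres', 'test_user')
ORDER BY rolname;

-- Role memberships (requirement 2, via pg_auth_members)
SELECT m.rolname AS member, g.rolname AS granted_role
FROM pg_auth_members am
JOIN pg_roles m ON am.member = m.oid
JOIN pg_roles g ON am.roleid = g.oid
ORDER BY m.rolname;

-- Map current table privileges per user (requirement 3)
SELECT grantee, table_name, privilege_type
FROM information_schema.table_privileges
WHERE table_schema = 'public'
  AND grantee NOT IN ('postgres', 'PUBLIC', 'test_user')
ORDER BY grantee, table_name, privilege_type;
```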
## Expected permissions by role (what they SHOULD have)

```python
# users and their business roles
USER_ROLE = {
    # Active functional users
    'analytics_user': 'Analytics Team',
    'marketing_user': 'Marketing Department',
    'customer_service': 'Customer Service',
    'finance_user': 'Finance Team',
    'product_manager': 'Product Management',
    'security_auditor': 'Security Team',
    'developer_user': 'Development Team',
    'backup_user': 'Backup Service',
}

# expected permissions for each role
ROLE_EXPECTED_PERMISSIONS = {
    'Analytics Team': [
        ('user_profiles', 'SELECT'),
        ('user_stat_analysis', 'SELECT'),
        ('product_catalog', 'SELECT'),
        ('order_management', 'SELECT'),
    ],
    'Marketing Department': [
        ('user_profiles', 'SELECT'),
        ('user_stat_analysis', 'SELECT'),
        ('product_catalog', 'SELECT'),
    ],
    'Customer Service': [
        ('user_profiles', 'SELECT'),
        ('user_profiles', 'UPDATE'),
        ('order_management', 'SELECT'),
        ('order_management', 'INSERT'),
        ('order_management', 'UPDATE'),
        ('product_catalog', 'SELECT'),
    ],
    'Finance Team': [
        ('financial_transactions', 'SELECT'),
        ('order_management', 'SELECT'),
        ('user_profiles', 'SELECT'),
    ],
    'Product Management': [
        ('product_catalog', 'SELECT'),
        ('product_catalog', 'INSERT'),
        ('product_catalog', 'UPDATE'),
        ('product_catalog', 'DELETE'),
        ('order_management', 'SELECT'),
        ('user_stat_analysis', 'SELECT'),
    ],
    'Security Team': [
        ('audit_logs', 'SELECT'),
        ('user_credentials', 'SELECT'),
        ('user_profiles', 'SELECT'),
    ],
    'Development Team': [
        ('user_profiles', 'SELECT'),
        ('product_catalog', 'SELECT'),
    ],
    'Backup Service': [
        ('user_profiles', 'SELECT'),
        ('product_catalog', 'SELECT'),
        ('order_management', 'SELECT'),
        ('financial_transactions', 'SELECT'),
        ('user_stat_analysis', 'SELECT'),
        ('audit_logs', 'SELECT'),
        ('user_credentials', 'SELECT'),
    ]
}
```

## Expected Deliverables:

Your audit must produce findings in a structured format that can be verified. Create two tables to store your audit results:

**1. Summary Table:**

```sql
CREATE TABLE security_audit_results (
    audit_id SERIAL PRIMARY KEY,
    audit_type VARCHAR(50) NOT NULL, -- 'DANGLING_USERS', 'MISSING_PERMISSIONS', 'EXCESSIVE_PERMISSIONS'
    total_issues INTEGER NOT NULL,
    users_affected INTEGER NOT NULL,
    tables_affected INTEGER NOT NULL
);
```

**2. Detailed Findings Table:**

```sql
CREATE TABLE security_audit_details (
    detail_id SERIAL PRIMARY KEY,
    username VARCHAR(50) NOT NULL,
    issue_type VARCHAR(50) NOT NULL, -- 'DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION'
    table_name VARCHAR(50), -- NULL for dangling users
    permission_type VARCHAR(20), -- 'SELECT', 'INSERT', 'UPDATE', 'DELETE', NULL for dangling users
    expected_access BOOLEAN NOT NULL -- TRUE if user should have access, FALSE if should not
);
```

## Success Criteria:

Your audit should populate both tables with:
- **Summary data**: High-level counts of different types of security issues
- **Detailed findings**: Specific permission gaps for each user and table combination

## Business Role Expectations

Analyze usernames and infer their intended business roles based on naming patterns:
- **analytics_user** → Analytics Team (needs user behavior and statistics data)
- **marketing_user** → Marketing Department (needs customer and product data for campaigns)
- **customer_service** → Customer Service (needs user profiles and order management)
- **finance_user** → Finance Team (needs financial and order data)
- **product_manager** → Product Management (needs full product catalog access)
- **security_auditor** → Security Team (needs audit logs and credential data)
- **developer_user** → Development Team (needs limited access for testing)
- **backup_user** → Backup Service (needs read-only access to all business data)
- **temp_contractor, old_employee, test_account** → Inactive/Temporary (should have NO permissions)

The verification process will check that your findings correctly identify the actual permission gaps in the system by comparing against expected results.
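To illustrate the expected deliverable format, a minimal population sketch for one finding type (dangling users) might look like the following; the ground-truth solution below covers all three audit types the same way:

```sql
-- Record the three inactive accounts as dangling-user findings
INSERT INTO security_audit_details (username, issue_type, table_name, permission_type, expected_access)
VALUES
    ('temp_contractor', 'DANGLING_USER', NULL, NULL, FALSE),
    ('old_employee',    'DANGLING_USER', NULL, NULL, FALSE),
    ('test_account',    'DANGLING_USER', NULL, NULL, FALSE);

-- Roll the detailed findings up into the summary table
INSERT INTO security_audit_results (audit_type, total_issues, users_affected, tables_affected)
SELECT 'DANGLING_USERS', COUNT(*), COUNT(DISTINCT username), 0
FROM security_audit_details
WHERE issue_type = 'DANGLING_USER';
```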
================================================
FILE: tasks/postgres/standard/security/user_permission_audit/ground_truth.sql
================================================
-- Ground Truth Solution: Complete Security Audit Implementation
-- This includes comprehensive PostgreSQL user, role, and permission discovery

/*
================================================================================
PERMISSION MODEL DOCUMENTATION
================================================================================

## Current Permission State

| Username | Table | Permission | Status | Reason |
|-------------------|------------------------|------------|---------|-------------------------------------------|
| analytics_user | user_stat_analysis | SELECT | EXISTS | Correctly granted |
| analytics_user | user_profiles | SELECT | MISSING | Permission was revoked |
| analytics_user | financial_transactions | SELECT | EXISTS | Should be revoked - no business need |
| marketing_user | user_profiles | SELECT | EXISTS | Correctly granted |
| marketing_user | user_stat_analysis | SELECT | EXISTS | Correctly granted |
| marketing_user | product_catalog | SELECT | MISSING | Permission was revoked |
| marketing_user | financial_transactions | SELECT | EXISTS | Should be revoked - security risk |
| customer_service | user_profiles | SELECT | EXISTS | Correctly granted |
| customer_service | user_profiles | UPDATE | EXISTS | Correctly granted |
| customer_service | order_management | SELECT | EXISTS | Correctly granted |
| customer_service | order_management | INSERT | EXISTS | Correctly granted |
| customer_service | order_management | UPDATE | EXISTS | Correctly granted |
| customer_service | product_catalog | SELECT | MISSING | Permission was revoked |
| customer_service | user_credentials | SELECT | EXISTS | Should be revoked -
security risk | | finance_user | financial_transactions | SELECT | EXISTS | Correctly granted | | finance_user | order_management | SELECT | EXISTS | Correctly granted | | finance_user | user_profiles | SELECT | MISSING | Permission was revoked | | product_manager | product_catalog | SELECT | EXISTS | Correctly granted | | product_manager | product_catalog | INSERT | EXISTS | Correctly granted | | product_manager | product_catalog | UPDATE | EXISTS | Correctly granted | | product_manager | product_catalog | DELETE | EXISTS | Correctly granted | | product_manager | order_management | SELECT | EXISTS | Correctly granted | | product_manager | financial_transactions | SELECT | EXISTS | Should be revoked - no business need | | security_auditor | user_credentials | SELECT | EXISTS | Correctly granted | | security_auditor | user_profiles | SELECT | EXISTS | Correctly granted | | security_auditor | audit_logs | SELECT | MISSING | Permission was revoked | | security_auditor | financial_transactions | UPDATE | EXISTS | Should be revoked - excessive privilege | | developer_user | user_profiles | SELECT | EXISTS | Correctly granted | | developer_user | product_catalog | SELECT | MISSING | Permission was revoked | | developer_user | user_credentials | SELECT | EXISTS | Should be revoked - security risk | | developer_user | order_management | UPDATE | EXISTS | Should be revoked - no business need | | backup_user | user_profiles | SELECT | EXISTS | Correctly granted | | backup_user | product_catalog | SELECT | EXISTS | Correctly granted | | backup_user | audit_logs | SELECT | EXISTS | Correctly granted | | backup_user | order_management | SELECT | MISSING | Permission was revoked | | backup_user | product_catalog | DELETE | EXISTS | Should be revoked - backup should be read-only | | temp_contractor | product_catalog | SELECT | EXISTS | Should be revoked - user is inactive | | temp_contractor | user_profiles | SELECT | EXISTS | Should be revoked - user is inactive | | old_employee | audit_logs | SELECT | EXISTS | Should be revoked - user is inactive | | old_employee | user_stat_analysis | UPDATE | EXISTS | Should be revoked - user is inactive | | test_account | user_profiles | SELECT | EXISTS | Should be revoked - test account | ## Expected Permission State | Username | Table | Permission | Justification | |-------------------|------------------------|------------|--------------------------------------------------------------| | analytics_user | user_profiles | SELECT | Analytics team needs customer data for user behavior analysis| | analytics_user | user_stat_analysis | SELECT | Core analytics data required for reporting | | analytics_user | product_catalog | SELECT | Product performance analysis and customer preferences | | analytics_user | order_management | SELECT | Sales trend analysis and customer purchasing patterns | | marketing_user | user_profiles | SELECT | Customer segmentation and personalized marketing campaigns | | marketing_user | user_stat_analysis | SELECT | Campaign effectiveness analysis and user behavior tracking | | marketing_user | product_catalog | SELECT | Product promotion planning and marketing material creation | | customer_service | user_profiles | SELECT | Customer identity verification and support | | customer_service | user_profiles | UPDATE | Update customer information and resolve account issues | | customer_service | order_management | SELECT | Order status inquiries and customer support | | customer_service | order_management | INSERT | Create orders for customers over 
phone | | customer_service | order_management | UPDATE | Update order status and resolve order issues | | customer_service | product_catalog | SELECT | Product information for customer questions and support | | finance_user | financial_transactions | SELECT | Financial reporting, auditing, and compliance | | finance_user | order_management | SELECT | Revenue reconciliation and financial analysis | | finance_user | user_profiles | SELECT | Customer financial analysis and credit assessment | | product_manager | product_catalog | SELECT | Product information access and management | | product_manager | product_catalog | INSERT | Add new products to catalog | | product_manager | product_catalog | UPDATE | Update product details, pricing, and specifications | | product_manager | product_catalog | DELETE | Remove discontinued or obsolete products | | product_manager | order_management | SELECT | Product sales analysis and demand forecasting | | product_manager | user_stat_analysis | SELECT | Product usage analytics and customer behavior insights | | security_auditor | audit_logs | SELECT | Security monitoring and incident investigation | | security_auditor | user_credentials | SELECT | Security auditing and compliance verification | | security_auditor | user_profiles | SELECT | User account auditing and security incident investigation | | developer_user | user_profiles | SELECT | Application development and testing with realistic data | | developer_user | product_catalog | SELECT | Application development and testing with product data | | backup_user | user_profiles | SELECT | Complete data backup coverage for business continuity | | backup_user | product_catalog | SELECT | Complete data backup coverage for business continuity | | backup_user | order_management | SELECT | Complete data backup coverage for business continuity | | backup_user | financial_transactions | SELECT | Complete data backup coverage for business continuity | | backup_user | user_stat_analysis | SELECT | Complete data backup coverage for business continuity | | backup_user | audit_logs | SELECT | Complete data backup coverage for business continuity | | backup_user | user_credentials | SELECT | Complete data backup coverage for business continuity | Notes: - temp_contractor, old_employee, test_account should have NO permissions (accounts should be removed) - All excessive permissions should be revoked for security compliance - Missing permissions should be granted based on business role requirements ================================================================================ */ BEGIN; -- ============================================================================ -- CREATE AUDIT RESULTS TABLES -- ============================================================================ CREATE TABLE security_audit_results ( audit_id SERIAL PRIMARY KEY, audit_type VARCHAR(50) NOT NULL, -- 'DANGLING_USERS', 'MISSING_PERMISSIONS', 'EXCESSIVE_PERMISSIONS' total_issues INTEGER NOT NULL, users_affected INTEGER NOT NULL, tables_affected INTEGER NOT NULL ); CREATE TABLE security_audit_details ( detail_id SERIAL PRIMARY KEY, username VARCHAR(50) NOT NULL, issue_type VARCHAR(50) NOT NULL, -- 'DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION' table_name VARCHAR(50), -- NULL for dangling users permission_type VARCHAR(20), -- 'SELECT', 'INSERT', 'UPDATE', 'DELETE', NULL for dangling users expected_access BOOLEAN NOT NULL -- TRUE if user should have access, FALSE if should not ); -- 
============================================================================ -- DISCOVER DATABASE USERS AND ROLES -- ============================================================================ CREATE TEMP TABLE temp_user_discovery AS SELECT DISTINCT COALESCE(u.usename, r.rolname) as username, COALESCE(u.usesuper, r.rolsuper) as is_superuser, COALESCE(u.usecreatedb, r.rolcreatedb) as can_create_db, r.rolname as role_name, u.usename as user_name, CASE WHEN COALESCE(u.usename, r.rolname) LIKE '%analytics%' THEN 'Analytics Team' WHEN COALESCE(u.usename, r.rolname) LIKE '%marketing%' THEN 'Marketing Department' WHEN COALESCE(u.usename, r.rolname) LIKE '%customer%' OR COALESCE(u.usename, r.rolname) LIKE '%service%' THEN 'Customer Service' WHEN COALESCE(u.usename, r.rolname) LIKE '%finance%' THEN 'Finance Team' WHEN COALESCE(u.usename, r.rolname) LIKE '%product%' THEN 'Product Management' WHEN COALESCE(u.usename, r.rolname) LIKE '%security%' OR COALESCE(u.usename, r.rolname) LIKE '%audit%' THEN 'Security Team' WHEN COALESCE(u.usename, r.rolname) LIKE '%backup%' THEN 'Backup Service' WHEN COALESCE(u.usename, r.rolname) LIKE '%developer%' OR COALESCE(u.usename, r.rolname) LIKE '%dev%' THEN 'Development Team' WHEN COALESCE(u.usename, r.rolname) LIKE '%temp%' OR COALESCE(u.usename, r.rolname) LIKE '%old%' OR COALESCE(u.usename, r.rolname) LIKE '%test%' THEN 'Inactive/Temporary' ELSE 'Unknown' END as inferred_business_role FROM pg_user u FULL OUTER JOIN pg_roles r ON u.usename = r.rolname WHERE COALESCE(u.usename, r.rolname) NOT IN ('postgres', 'test_user') AND COALESCE(u.usename, r.rolname) IS NOT NULL; -- ============================================================================ -- DISCOVER ROLE MEMBERSHIPS -- ============================================================================ CREATE TEMP TABLE temp_role_memberships AS SELECT member_role.rolname as member_name, granted_role.rolname as granted_role_name, grantor_role.rolname as grantor_name, am.admin_option FROM pg_auth_members am JOIN pg_roles member_role ON am.member = member_role.oid JOIN pg_roles granted_role ON am.roleid = granted_role.oid JOIN pg_roles grantor_role ON am.grantor = grantor_role.oid WHERE member_role.rolname NOT IN ('postgres') AND granted_role.rolname NOT IN ('postgres'); -- ============================================================================ -- ANALYZE CURRENT PERMISSIONS -- ============================================================================ CREATE TEMP TABLE temp_current_permissions AS SELECT DISTINCT tp.grantee as username, tp.table_name, tp.privilege_type as permission_type, tp.is_grantable, tp.grantor, ud.inferred_business_role, ud.is_superuser FROM information_schema.table_privileges tp LEFT JOIN temp_user_discovery ud ON tp.grantee = ud.username WHERE tp.table_schema = 'public' AND tp.grantee NOT IN ('postgres', 'PUBLIC', 'test_user') AND tp.table_name NOT LIKE 'security_audit_%' ORDER BY tp.grantee, tp.table_name, tp.privilege_type; -- ============================================================================ -- IDENTIFY DANGLING USERS -- ============================================================================ INSERT INTO security_audit_details (username, issue_type, table_name, permission_type, expected_access) SELECT DISTINCT username, 'DANGLING_USER', NULL, NULL, FALSE FROM temp_user_discovery WHERE inferred_business_role = 'Inactive/Temporary'; -- ============================================================================ -- IDENTIFY EXCESSIVE PERMISSIONS -- 
============================================================================ WITH excessive_permissions AS ( SELECT username, table_name, permission_type FROM (VALUES ('analytics_user', 'financial_transactions', 'SELECT'), ('marketing_user', 'financial_transactions', 'SELECT'), ('customer_service', 'user_credentials', 'SELECT'), ('product_manager', 'financial_transactions', 'SELECT'), ('security_auditor', 'financial_transactions', 'UPDATE'), ('developer_user', 'user_credentials', 'SELECT'), ('developer_user', 'order_management', 'UPDATE'), ('backup_user', 'product_catalog', 'DELETE'), ('temp_contractor', 'product_catalog', 'SELECT'), ('temp_contractor', 'user_profiles', 'SELECT'), ('old_employee', 'audit_logs', 'SELECT'), ('old_employee', 'user_stat_analysis', 'UPDATE'), ('test_account', 'user_profiles', 'SELECT') ) AS excessive(username, table_name, permission_type) ) INSERT INTO security_audit_details (username, issue_type, table_name, permission_type, expected_access) SELECT ep.username, 'EXCESSIVE_PERMISSION', ep.table_name, ep.permission_type, FALSE FROM excessive_permissions ep WHERE EXISTS ( SELECT 1 FROM temp_current_permissions cp WHERE cp.username = ep.username AND cp.table_name = ep.table_name AND cp.permission_type = ep.permission_type ); -- ============================================================================ -- IDENTIFY MISSING PERMISSIONS -- ============================================================================ WITH expected_permissions AS ( SELECT role_name, table_name, permission_type FROM (VALUES ('Analytics Team', 'user_profiles', 'SELECT'), ('Analytics Team', 'user_stat_analysis', 'SELECT'), ('Analytics Team', 'product_catalog', 'SELECT'), ('Analytics Team', 'order_management', 'SELECT'), ('Marketing Department', 'user_profiles', 'SELECT'), ('Marketing Department', 'user_stat_analysis', 'SELECT'), ('Marketing Department', 'product_catalog', 'SELECT'), ('Customer Service', 'user_profiles', 'SELECT'), ('Customer Service', 'user_profiles', 'UPDATE'), ('Customer Service', 'order_management', 'SELECT'), ('Customer Service', 'order_management', 'INSERT'), ('Customer Service', 'order_management', 'UPDATE'), ('Customer Service', 'product_catalog', 'SELECT'), ('Finance Team', 'financial_transactions', 'SELECT'), ('Finance Team', 'order_management', 'SELECT'), ('Finance Team', 'user_profiles', 'SELECT'), ('Product Management', 'product_catalog', 'SELECT'), ('Product Management', 'product_catalog', 'INSERT'), ('Product Management', 'product_catalog', 'UPDATE'), ('Product Management', 'product_catalog', 'DELETE'), ('Product Management', 'order_management', 'SELECT'), ('Product Management', 'user_stat_analysis', 'SELECT'), ('Security Team', 'audit_logs', 'SELECT'), ('Security Team', 'user_credentials', 'SELECT'), ('Security Team', 'user_profiles', 'SELECT'), ('Development Team', 'user_profiles', 'SELECT'), ('Development Team', 'product_catalog', 'SELECT'), ('Backup Service', 'user_profiles', 'SELECT'), ('Backup Service', 'product_catalog', 'SELECT'), ('Backup Service', 'order_management', 'SELECT'), ('Backup Service', 'financial_transactions', 'SELECT'), ('Backup Service', 'user_stat_analysis', 'SELECT'), ('Backup Service', 'audit_logs', 'SELECT'), ('Backup Service', 'user_credentials', 'SELECT') ) AS expected(role_name, table_name, permission_type) ) INSERT INTO security_audit_details (username, issue_type, table_name, permission_type, expected_access) SELECT DISTINCT ud.username, 'MISSING_PERMISSION', ep.table_name, ep.permission_type, TRUE FROM temp_user_discovery ud 
JOIN expected_permissions ep ON ud.inferred_business_role = ep.role_name LEFT JOIN temp_current_permissions cp ON ( cp.username = ud.username AND cp.table_name = ep.table_name AND cp.permission_type = ep.permission_type ) WHERE cp.username IS NULL AND ud.inferred_business_role != 'Inactive/Temporary' AND ud.inferred_business_role != 'Unknown' AND EXISTS ( SELECT 1 FROM information_schema.tables t WHERE t.table_name = ep.table_name AND t.table_schema = 'public' AND t.table_type = 'BASE TABLE' ); -- ============================================================================ -- POPULATE SUMMARY STATISTICS -- ============================================================================ INSERT INTO security_audit_results (audit_type, total_issues, users_affected, tables_affected) SELECT 'DANGLING_USERS', COUNT(*), COUNT(DISTINCT username), 0 FROM security_audit_details WHERE issue_type = 'DANGLING_USER'; INSERT INTO security_audit_results (audit_type, total_issues, users_affected, tables_affected) SELECT 'MISSING_PERMISSIONS', COUNT(*), COUNT(DISTINCT username), COUNT(DISTINCT table_name) FROM security_audit_details WHERE issue_type = 'MISSING_PERMISSION'; INSERT INTO security_audit_results (audit_type, total_issues, users_affected, tables_affected) SELECT 'EXCESSIVE_PERMISSIONS', COUNT(*), COUNT(DISTINCT username), COUNT(DISTINCT table_name) FROM security_audit_details WHERE issue_type = 'EXCESSIVE_PERMISSION'; -- ============================================================================ -- CLEANUP TEMPORARY TABLES -- ============================================================================ DROP TABLE temp_user_discovery; DROP TABLE temp_role_memberships; DROP TABLE temp_current_permissions; COMMIT; -- ============================================================================ -- DISCOVERY AND VERIFICATION QUERIES -- ============================================================================ -- Show all users and their properties SELECT usename as username, usesuper as is_superuser, usecreatedb as can_create_db, valuntil as password_expiry FROM pg_user WHERE usename NOT IN ('postgres', 'test_user') ORDER BY usename; -- Show all roles and their properties SELECT rolname as role_name, rolsuper as is_superuser, rolinherit as inherits_privileges, rolcanlogin as can_login FROM pg_roles WHERE rolname NOT LIKE 'pg_%' AND rolname NOT IN ('postgres', 'test_user') ORDER BY rolname; -- Show current table privileges SELECT grantee as username, table_name, privilege_type as permission, is_grantable FROM information_schema.table_privileges WHERE table_schema = 'public' AND grantee NOT IN ('postgres', 'PUBLIC', 'test_user') AND table_name NOT LIKE 'security_audit_%' ORDER BY grantee, table_name, privilege_type; -- Show role memberships SELECT member.rolname as member, granted.rolname as granted_role FROM pg_auth_members am JOIN pg_roles member ON am.member = member.oid JOIN pg_roles granted ON am.roleid = granted.oid WHERE member.rolname NOT IN ('postgres') ORDER BY member.rolname, granted.rolname; -- Display audit summary SELECT audit_type, total_issues, users_affected, tables_affected FROM security_audit_results ORDER BY audit_type; -- Display detailed findings SELECT username, issue_type, COALESCE(table_name, 'N/A') as table_name, COALESCE(permission_type, 'N/A') as permission_type, expected_access FROM security_audit_details ORDER BY issue_type, username, table_name; ================================================ FILE: tasks/postgres/standard/security/user_permission_audit/meta.json 
================================================ { "task_id": "user_permission_audit", "task_name": "User Permission Audit", "category_id": "security", "category_name": "Security", "description": "Conduct comprehensive security audit identifying users with insufficient or dangling permissions in business database environment.", "author": "Fanshi Zhang", "created_at": "2025-08-17", "difficulty": "L3", "tags": [ "security and access control", "audit and compliance" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"user_profiles\" {\n \"user_id\" int4 [pk, not null, increment]\n \"username\" varchar(50) [unique, not null]\n \"email\" varchar(100) [unique, not null]\n \"first_name\" varchar(50) [not null]\n \"last_name\" varchar(50) [not null]\n \"phone\" varchar(20)\n \"address\" text\n \"city\" varchar(50)\n \"state\" varchar(2)\n \"zip_code\" varchar(10)\n \"date_created\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"last_updated\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"is_active\" bool [default: true]\n \"profile_picture_url\" text\n \"bio\" text\n}\n\nTable \"user_credentials\" {\n \"credential_id\" int4 [pk, not null, increment]\n \"user_id\" int4\n \"password_hash\" varchar(255) [not null]\n \"salt\" varchar(100) [not null]\n \"login_attempts\" int4 [default: 0]\n \"last_login\" timestamp\n \"password_created\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"password_expires\" timestamp\n \"is_locked\" bool [default: false]\n \"two_factor_enabled\" bool [default: false]\n \"two_factor_secret\" varchar(32)\n \"backup_codes\" \"text[]\"\n \"security_questions\" jsonb\n}\n\nTable \"user_stat_analysis\" {\n \"analysis_id\" int4 [pk, not null, increment]\n \"user_id\" int4\n \"session_id\" varchar(100)\n \"page_views\" int4 [default: 0]\n \"time_spent_minutes\" int4 [default: 0]\n \"actions_performed\" jsonb\n \"device_info\" jsonb\n \"ip_address\" inet\n \"location_data\" jsonb\n \"referrer_url\" text\n \"conversion_events\" jsonb\n \"analysis_date\" date [default: `CURRENT_DATE`]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n}\n\nTable \"product_catalog\" {\n \"product_id\" int4 [pk, not null, increment]\n \"product_name\" varchar(100) [not null]\n \"description\" text\n \"category\" varchar(50)\n \"price\" numeric(10,2) [not null]\n \"cost\" numeric(10,2)\n \"sku\" varchar(50) [unique]\n \"inventory_count\" int4 [default: 0]\n \"is_active\" bool [default: true]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"supplier_info\" jsonb\n \"weight_kg\" numeric(6,2)\n \"dimensions\" jsonb\n}\n\nTable \"order_management\" {\n \"order_id\" int4 [pk, not null, increment]\n \"user_id\" int4\n \"order_number\" varchar(50) [unique, not null]\n \"order_status\" varchar(20) [default: 'pending']\n \"total_amount\" numeric(12,2) [not null]\n \"tax_amount\" numeric(12,2)\n \"shipping_amount\" numeric(12,2)\n \"discount_amount\" numeric(12,2) [default: 0]\n \"payment_method\" varchar(50)\n \"payment_status\" varchar(20) [default: 'pending']\n \"shipping_address\" jsonb\n \"billing_address\" jsonb\n \"order_date\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"shipped_date\" timestamp\n \"delivered_date\" timestamp\n \"tracking_number\" varchar(100)\n}\n\nTable \"financial_transactions\" {\n \"transaction_id\" int4 [pk, not null, increment]\n \"order_id\" int4\n \"user_id\" int4\n \"transaction_type\" varchar(20) [not null]\n \"amount\" numeric(12,2) [not null]\n \"currency\" varchar(3) 
[default: 'USD']\n \"payment_gateway\" varchar(50)\n \"gateway_transaction_id\" varchar(100)\n \"credit_card_last_four\" bpchar(4)\n \"bank_account_last_four\" bpchar(4)\n \"transaction_status\" varchar(20) [default: 'pending']\n \"processed_at\" timestamp\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"fee_amount\" numeric(8,2)\n \"refund_amount\" numeric(12,2) [default: 0]\n \"notes\" text\n}\n\nTable \"audit_logs\" {\n \"log_id\" int4 [pk, not null, increment]\n \"user_id\" int4\n \"action_type\" varchar(50) [not null]\n \"table_name\" varchar(50)\n \"record_id\" int4\n \"old_values\" jsonb\n \"new_values\" jsonb\n \"ip_address\" inet\n \"user_agent\" text\n \"session_id\" varchar(100)\n \"timestamp\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"success\" bool [default: true]\n \"error_message\" text\n}\n\nRef \"audit_logs_user_id_fkey\":\"user_profiles\".\"user_id\" < \"audit_logs\".\"user_id\"\n\nRef \"financial_transactions_order_id_fkey\":\"order_management\".\"order_id\" < \"financial_transactions\".\"order_id\"\n\nRef \"financial_transactions_user_id_fkey\":\"user_profiles\".\"user_id\" < \"financial_transactions\".\"user_id\"\n\nRef \"order_management_user_id_fkey\":\"user_profiles\".\"user_id\" < \"order_management\".\"user_id\"\n\nRef \"user_credentials_user_id_fkey\":\"user_profiles\".\"user_id\" < \"user_credentials\".\"user_id\" [delete: cascade]\n\nRef \"user_stat_analysis_user_id_fkey\":\"user_profiles\".\"user_id\" < \"user_stat_analysis\".\"user_id\" [delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": null } } ================================================ FILE: tasks/postgres/standard/security/user_permission_audit/prepare_environment.py ================================================ #!/usr/bin/env python3 import os import psycopg2 from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT import sys # Configuration for users and their permissions USER_CONFIGS = { # Active functional users 'analytics_user': { 'password': 'analytics123', 'role': 'Analytics Team', 'status': 'active' }, 'marketing_user': { 'password': 'marketing123', 'role': 'Marketing Department', 'status': 'active' }, 'customer_service': { 'password': 'service123', 'role': 'Customer Service', 'status': 'active' }, 'finance_user': { 'password': 'finance123', 'role': 'Finance Team', 'status': 'active' }, 'product_manager': { 'password': 'product123', 'role': 'Product Management', 'status': 'active' }, 'security_auditor': { 'password': 'security123', 'role': 'Security Team', 'status': 'active' }, 'developer_user': { 'password': 'dev123', 'role': 'Development Team', 'status': 'active' }, 'backup_user': { 'password': 'backup123', 'role': 'Backup Service', 'status': 'active' }, # Inactive/dangling users 'temp_contractor': { 'password': 'temp123', 'role': 'Inactive/Temporary', 'status': 'inactive' }, 'old_employee': { 'password': 'old456', 'role': 'Inactive/Temporary', 'status': 'inactive' }, 'test_account': { 'password': 'test789', 'role': 'Inactive/Temporary', 'status': 'inactive' } } # Expected permissions by role (what they SHOULD have) ROLE_EXPECTED_PERMISSIONS = { 'Analytics Team': [ ('user_profiles', 'SELECT'), ('user_stat_analysis', 'SELECT'), ('product_catalog', 'SELECT'), ('order_management', 'SELECT'), ], 'Marketing Department': [ ('user_profiles', 'SELECT'), ('user_stat_analysis', 'SELECT'), ('product_catalog', 'SELECT'), ], 'Customer Service': [ ('user_profiles', 'SELECT'), ('user_profiles', 'UPDATE'), ('order_management', 'SELECT'), ('order_management', 'INSERT'), 
('order_management', 'UPDATE'), ('product_catalog', 'SELECT'), ], 'Finance Team': [ ('financial_transactions', 'SELECT'), ('order_management', 'SELECT'), ('user_profiles', 'SELECT'), ], 'Product Management': [ ('product_catalog', 'SELECT'), ('product_catalog', 'INSERT'), ('product_catalog', 'UPDATE'), ('product_catalog', 'DELETE'), ('order_management', 'SELECT'), ('user_stat_analysis', 'SELECT'), ], 'Security Team': [ ('audit_logs', 'SELECT'), ('user_credentials', 'SELECT'), ('user_profiles', 'SELECT'), ], 'Development Team': [ ('user_profiles', 'SELECT'), ('product_catalog', 'SELECT'), ], 'Backup Service': [ ('user_profiles', 'SELECT'), ('product_catalog', 'SELECT'), ('order_management', 'SELECT'), ('financial_transactions', 'SELECT'), ('user_stat_analysis', 'SELECT'), ('audit_logs', 'SELECT'), ('user_credentials', 'SELECT'), ], } # Excessive permissions that will be granted but should be flagged as security issues EXCESSIVE_PERMISSIONS = [ # Users getting financial access they shouldn't have ('analytics_user', 'financial_transactions', 'SELECT'), ('marketing_user', 'financial_transactions', 'SELECT'), ('product_manager', 'financial_transactions', 'SELECT'), # Security risks - credential access ('customer_service', 'user_credentials', 'SELECT'), ('developer_user', 'user_credentials', 'SELECT'), # Excessive privileges ('security_auditor', 'financial_transactions', 'UPDATE'), ('developer_user', 'order_management', 'UPDATE'), ('backup_user', 'product_catalog', 'DELETE'), # Backup should be read-only # Inactive users with permissions they shouldn't have ('temp_contractor', 'product_catalog', 'SELECT'), ('temp_contractor', 'user_profiles', 'SELECT'), ('old_employee', 'audit_logs', 'SELECT'), ('old_employee', 'user_stat_analysis', 'UPDATE'), ('test_account', 'user_profiles', 'SELECT'), ] # Permissions to revoke to create "missing permission" findings PERMISSIONS_TO_REVOKE = [ ('analytics_user', 'user_profiles', 'SELECT'), ('analytics_user', 'order_management', 'SELECT'), ('analytics_user', 'product_catalog', 'SELECT'), ('marketing_user', 'product_catalog', 'SELECT'), ('finance_user', 'user_profiles', 'SELECT'), ('developer_user', 'product_catalog', 'SELECT'), ('customer_service', 'product_catalog', 'SELECT'), ('security_auditor', 'audit_logs', 'SELECT'), ('product_manager', 'user_stat_analysis', 'SELECT'), ('backup_user', 'order_management', 'SELECT'), ('backup_user', 'financial_transactions', 'SELECT'), ('backup_user', 'user_stat_analysis', 'SELECT'), ('backup_user', 'user_credentials', 'SELECT'), ] def create_business_tables(cur): """Create all business tables""" tables = [ ('user_profiles', """ DROP TABLE IF EXISTS user_profiles CASCADE; CREATE TABLE user_profiles ( user_id SERIAL PRIMARY KEY, username VARCHAR(50) UNIQUE NOT NULL, email VARCHAR(100) UNIQUE NOT NULL, first_name VARCHAR(50) NOT NULL, last_name VARCHAR(50) NOT NULL, phone VARCHAR(20), address TEXT, city VARCHAR(50), state VARCHAR(2), zip_code VARCHAR(10), date_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP, is_active BOOLEAN DEFAULT true, profile_picture_url TEXT, bio TEXT ); """), ('user_credentials', """ DROP TABLE IF EXISTS user_credentials CASCADE; CREATE TABLE user_credentials ( credential_id SERIAL PRIMARY KEY, user_id INTEGER REFERENCES user_profiles(user_id) ON DELETE CASCADE, password_hash VARCHAR(255) NOT NULL, salt VARCHAR(100) NOT NULL, login_attempts INTEGER DEFAULT 0, last_login TIMESTAMP, password_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, password_expires 
TIMESTAMP, is_locked BOOLEAN DEFAULT false, two_factor_enabled BOOLEAN DEFAULT false, two_factor_secret VARCHAR(32), backup_codes TEXT[], security_questions JSONB ); """), ('user_stat_analysis', """ DROP TABLE IF EXISTS user_stat_analysis CASCADE; CREATE TABLE user_stat_analysis ( analysis_id SERIAL PRIMARY KEY, user_id INTEGER REFERENCES user_profiles(user_id) ON DELETE CASCADE, session_id VARCHAR(100), page_views INTEGER DEFAULT 0, time_spent_minutes INTEGER DEFAULT 0, actions_performed JSONB, device_info JSONB, ip_address INET, location_data JSONB, referrer_url TEXT, conversion_events JSONB, analysis_date DATE DEFAULT CURRENT_DATE, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); """), ('product_catalog', """ DROP TABLE IF EXISTS product_catalog CASCADE; CREATE TABLE product_catalog ( product_id SERIAL PRIMARY KEY, product_name VARCHAR(100) NOT NULL, description TEXT, category VARCHAR(50), price DECIMAL(10,2) NOT NULL, cost DECIMAL(10,2), sku VARCHAR(50) UNIQUE, inventory_count INTEGER DEFAULT 0, is_active BOOLEAN DEFAULT true, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, supplier_info JSONB, weight_kg DECIMAL(6,2), dimensions JSONB ); """), ('order_management', """ DROP TABLE IF EXISTS order_management CASCADE; CREATE TABLE order_management ( order_id SERIAL PRIMARY KEY, user_id INTEGER REFERENCES user_profiles(user_id), order_number VARCHAR(50) UNIQUE NOT NULL, order_status VARCHAR(20) DEFAULT 'pending', total_amount DECIMAL(12,2) NOT NULL, tax_amount DECIMAL(12,2), shipping_amount DECIMAL(12,2), discount_amount DECIMAL(12,2) DEFAULT 0, payment_method VARCHAR(50), payment_status VARCHAR(20) DEFAULT 'pending', shipping_address JSONB, billing_address JSONB, order_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP, shipped_date TIMESTAMP, delivered_date TIMESTAMP, tracking_number VARCHAR(100) ); """), ('financial_transactions', """ DROP TABLE IF EXISTS financial_transactions CASCADE; CREATE TABLE financial_transactions ( transaction_id SERIAL PRIMARY KEY, order_id INTEGER REFERENCES order_management(order_id), user_id INTEGER REFERENCES user_profiles(user_id), transaction_type VARCHAR(20) NOT NULL, amount DECIMAL(12,2) NOT NULL, currency VARCHAR(3) DEFAULT 'USD', payment_gateway VARCHAR(50), gateway_transaction_id VARCHAR(100), credit_card_last_four CHAR(4), bank_account_last_four CHAR(4), transaction_status VARCHAR(20) DEFAULT 'pending', processed_at TIMESTAMP, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, fee_amount DECIMAL(8,2), refund_amount DECIMAL(12,2) DEFAULT 0, notes TEXT ); """), ('audit_logs', """ DROP TABLE IF EXISTS audit_logs CASCADE; CREATE TABLE audit_logs ( log_id SERIAL PRIMARY KEY, user_id INTEGER REFERENCES user_profiles(user_id), action_type VARCHAR(50) NOT NULL, table_name VARCHAR(50), record_id INTEGER, old_values JSONB, new_values JSONB, ip_address INET, user_agent TEXT, session_id VARCHAR(100), timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP, success BOOLEAN DEFAULT true, error_message TEXT ); """) ] for table_name, sql in tables: cur.execute(sql) def create_users(cur): """Create PostgreSQL users from configuration""" for username, config in USER_CONFIGS.items(): cur.execute(f"CREATE USER {username} WITH PASSWORD %s;", (config['password'],)) def grant_expected_permissions(cur): """Grant expected permissions to users based on their roles""" for username, config in USER_CONFIGS.items(): if config['status'] == 'active': role = config['role'] permissions = ROLE_EXPECTED_PERMISSIONS.get(role, []) for table_name, privilege in 
permissions: cur.execute(f"GRANT {privilege} ON {table_name} TO {username};") def grant_excessive_permissions(cur): """Grant excessive permissions that should be flagged as security issues""" for username, table_name, privilege in EXCESSIVE_PERMISSIONS: cur.execute(f"GRANT {privilege} ON {table_name} TO {username};") def revoke_permissions(cur): """Revoke specific permissions to create missing permission findings""" for username, table_name, privilege in PERMISSIONS_TO_REVOKE: cur.execute(f"REVOKE {privilege} ON {table_name} FROM {username};") def grant_sequence_permissions(cur): """Grant sequence permissions for users that need them""" users_needing_sequences = ['customer_service', 'product_manager'] for username in users_needing_sequences: cur.execute(f"GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO {username};") def setup_security_environment(): """ Set up a security-focused PostgreSQL environment with business tables and users with various permissions. Creates a scenario where some users have dangling or insufficient permissions for realistic security analysis. """ # Database connection parameters from environment db_params = { 'host': os.getenv('POSTGRES_HOST', 'localhost'), 'port': os.getenv('POSTGRES_PORT', '5432'), 'user': os.getenv('POSTGRES_USERNAME', 'postgres'), 'password': os.getenv('POSTGRES_PASSWORD', 'password'), 'database': os.getenv('POSTGRES_DATABASE', 'postgres') } postgres_params = db_params.copy() postgres_params['database'] = 'postgres' try: conn_postgres = psycopg2.connect(**postgres_params) conn_postgres.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur_postgres = conn_postgres.cursor() current_db = db_params['database'] cur_postgres.execute("SELECT datname FROM pg_database WHERE datname LIKE %s AND datname != %s;", ('%user_permission_audit%', current_db)) audit_databases = cur_postgres.fetchall() if audit_databases: for db_row in audit_databases: db_name = db_row[0] try: cur_postgres.execute("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = %s;", (db_name,)) cur_postgres.execute(f"DROP DATABASE IF EXISTS {db_name};") print(f"Dropped database: {db_name}") except Exception as e: print(f"Warning: Could not drop database {db_name}: {e}") # Clean up existing users for username in USER_CONFIGS.keys(): try: cur_postgres.execute("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE usename = %s;", (username,)) cur_postgres.execute(f"DROP USER IF EXISTS {username};") except Exception as e: print(f"Warning: Could not drop user {username}: {e}") cur_postgres.close() conn_postgres.close() except Exception as e: print(f"Warning: Could not clean up users: {e}") try: conn = psycopg2.connect(**db_params) conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) cur = conn.cursor() print("Setting up security audit environment...") # Create business tables with realistic structure create_business_tables(cur) print("Created 7 business tables") # Create users create_users(cur) active_count = len([u for u in USER_CONFIGS.values() if u['status'] == 'active']) inactive_count = len([u for u in USER_CONFIGS.values() if u['status'] == 'inactive']) print(f"Created {len(USER_CONFIGS)} users ({active_count} functional, {inactive_count} dangling)") # Grant expected permissions grant_expected_permissions(cur) # Grant excessive permissions that will be flagged as issues grant_excessive_permissions(cur) print("Granted initial permissions") # Revoke specific permissions to create missing permission findings revoke_permissions(cur) # Grant sequence permissions 
where needed grant_sequence_permissions(cur) cur.close() conn.close() except Exception as e: print(f"Error setting up environment: {e}") sys.exit(1) if __name__ == "__main__": setup_security_environment() ================================================ FILE: tasks/postgres/standard/security/user_permission_audit/verify.py ================================================ import os import psycopg2 import sys def verify_security_audit(): """ Verify that the security audit correctly identified all permission issues. """ # Database connection parameters from environment db_params = { 'host': os.getenv('POSTGRES_HOST', 'localhost'), 'port': os.getenv('POSTGRES_PORT', '5432'), 'user': os.getenv('POSTGRES_USERNAME', 'postgres'), 'password': os.getenv('POSTGRES_PASSWORD', 'password'), 'database': os.getenv('POSTGRES_DATABASE', 'postgres') } try: conn = psycopg2.connect(**db_params) cur = conn.cursor() print("| Verifying security audit findings...") # Check if security_audit_results table exists cur.execute(""" SELECT EXISTS ( SELECT FROM information_schema.tables WHERE table_name = 'security_audit_results' ); """) if not cur.fetchone()[0]: print("FAIL: security_audit_results table not found") return False # Check if security_audit_details table exists cur.execute(""" SELECT EXISTS ( SELECT FROM information_schema.tables WHERE table_name = 'security_audit_details' ); """) if not cur.fetchone()[0]: print("FAIL: security_audit_details table not found") return False # Get all detailed findings cur.execute("SELECT * FROM security_audit_details ORDER BY detail_id;") findings = cur.fetchall() if not findings: print("FAIL: No findings in security_audit_details table") return False print(f"| Found {len(findings)} audit findings") # Expected findings based on the ground truth: expected_findings = { # Expected dangling users 'dangling_users': {'temp_contractor', 'old_employee', 'test_account'}, # Expected missing permissions (should be granted) 'missing_permissions': { ('analytics_user', 'user_profiles', 'SELECT'), ('analytics_user', 'product_catalog', 'SELECT'), ('analytics_user', 'order_management', 'SELECT'), ('marketing_user', 'product_catalog', 'SELECT'), ('customer_service', 'product_catalog', 'SELECT'), ('finance_user', 'user_profiles', 'SELECT'), ('product_manager', 'user_stat_analysis', 'SELECT'), ('security_auditor', 'audit_logs', 'SELECT'), ('developer_user', 'product_catalog', 'SELECT'), ('backup_user', 'order_management', 'SELECT'), ('backup_user', 'financial_transactions', 'SELECT'), ('backup_user', 'user_stat_analysis', 'SELECT'), ('backup_user', 'user_credentials', 'SELECT') }, # Expected excessive permissions (should be revoked) 'excessive_permissions': { ('analytics_user', 'financial_transactions', 'SELECT'), ('marketing_user', 'financial_transactions', 'SELECT'), ('customer_service', 'user_credentials', 'SELECT'), ('product_manager', 'financial_transactions', 'SELECT'), ('security_auditor', 'financial_transactions', 'UPDATE'), ('developer_user', 'user_credentials', 'SELECT'), ('developer_user', 'order_management', 'UPDATE'), ('backup_user', 'product_catalog', 'DELETE'), ('temp_contractor', 'product_catalog', 'SELECT'), ('temp_contractor', 'user_profiles', 'SELECT'), ('old_employee', 'audit_logs', 'SELECT'), ('old_employee', 'user_stat_analysis', 'UPDATE'), ('test_account', 'user_profiles', 'SELECT') } } found_dangling = set() found_missing_permissions = set() found_excessive_permissions = set() # Analyze findings (detail_id, username, issue_type, table_name, permission_type, 
expected_access) for finding in findings: username = finding[1] issue_type = finding[2] table_name = finding[3] permission_type = finding[4] expected_access = finding[5] if issue_type == 'DANGLING_USER': found_dangling.add(username) elif issue_type == 'MISSING_PERMISSION' and expected_access: if table_name and permission_type: found_missing_permissions.add((username, table_name, permission_type)) elif issue_type == 'EXCESSIVE_PERMISSION' and not expected_access: if table_name and permission_type: found_excessive_permissions.add((username, table_name, permission_type)) # Verify dangling users missing_dangling = expected_findings['dangling_users'] - found_dangling extra_dangling = found_dangling - expected_findings['dangling_users'] # Verify missing permissions missing_missing_perms = expected_findings['missing_permissions'] - found_missing_permissions extra_missing_perms = found_missing_permissions - expected_findings['missing_permissions'] # Verify excessive permissions missing_excessive_perms = expected_findings['excessive_permissions'] - found_excessive_permissions extra_excessive_perms = found_excessive_permissions - expected_findings['excessive_permissions'] # Validate structure structure_valid = True for i, finding in enumerate(findings): if len(finding) != 6: # Should have 6 columns print(f"| FAIL: Finding {i + 1} has wrong number of columns (expected 6, got {len(finding)})") structure_valid = False continue detail_id, username, issue_type, table_name, permission_type, expected_access = finding if not username: print(f"| FAIL: Finding {i + 1} missing username") structure_valid = False if issue_type not in ['DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION']: print(f"| FAIL: Finding {i + 1} invalid issue_type: {issue_type}") structure_valid = False if expected_access not in [True, False]: print(f"| FAIL: Finding {i + 1} invalid expected_access: {expected_access}") structure_valid = False if structure_valid: print(f"| ✓ structure is valid") # Check for missing findings all_correct = True print(f"| Expected dangling users: {expected_findings['dangling_users']} Found: {found_dangling}") if missing_dangling: print(f"| Missing dangling users: {missing_dangling}") all_correct = False print( f"| Expected missing permissions: {len(expected_findings['missing_permissions'])} Found: {len(found_missing_permissions)} Missing: {len(missing_missing_perms)}") if missing_missing_perms: print(f"| Missing 'missing permission' findings:") for perm in sorted(missing_missing_perms): print(f"| - {perm[0]} should be granted {perm[2]} on {perm[1]}") all_correct = False print( f"| Expected excessive permissions: {len(expected_findings['excessive_permissions'])} Found: {len(found_excessive_permissions)} Missing: {len(missing_excessive_perms)}") if missing_excessive_perms: print(f"| Missing 'excessive permission' findings:") for perm in sorted(missing_excessive_perms): print(f"| - {perm[0]} should have {perm[2]} revoked on {perm[1]}") all_correct = False # Check audit summary table cur.execute( "SELECT audit_type, total_issues, users_affected, tables_affected FROM security_audit_results ORDER BY audit_type;") summary_results = cur.fetchall() # Expected summary numbers based on ground truth expected_summary = { 'DANGLING_USERS': (3, 3, 0), # 3 issues, 3 users affected, 0 tables affected 'EXCESSIVE_PERMISSIONS': (13, 10, 7), # 13 issues, 10 users affected, 7 tables affected 'MISSING_PERMISSIONS': (13, 8, 7) # 13 issues, 8 users affected, 7 tables affected } summary_correct = True for result in 
summary_results:
            audit_type, total_issues, users_affected, tables_affected = result
            print(f"| Summary result: [{audit_type}] {total_issues} issues, {users_affected} users affected, {tables_affected} tables affected")
            if audit_type in expected_summary:
                expected = expected_summary[audit_type]
                if (total_issues, users_affected, tables_affected) != expected:
                    print(f"| FAIL: {audit_type} summary mismatch - Expected: {expected}, Got: ({total_issues}, {users_affected}, {tables_affected})")
                    summary_correct = False
                else:
                    print(f"| ✓ {audit_type} summary matches expected values")

        # Assert exact counts match expected
        assert len(found_dangling) == 3, f"Expected 3 dangling users, found {len(found_dangling)}"
        assert len(found_missing_permissions) == 13, f"Expected 13 missing permissions, found {len(found_missing_permissions)}"
        assert len(found_excessive_permissions) == 13, f"Expected 13 excessive permissions, found {len(found_excessive_permissions)}"

        if all_correct and structure_valid and summary_correct:
            print("| ✓ All assertions passed")
            return True
        else:
            return False

    except Exception as e:
        print(f"FAIL: Error during verification: {e}")
        return False
    finally:
        if 'cur' in locals():
            cur.close()
        if 'conn' in locals():
            conn.close()


if __name__ == "__main__":
    success = verify_security_audit()
    sys.exit(0 if success else 1)


================================================
FILE: tasks/postgres/standard/sports/baseball_player_analysis/description.md
================================================

Create a comprehensive baseball player performance analysis in the sports database.

## Background

You are a sports analyst working with a comprehensive sports database. The analytics team needs to create a detailed analysis of baseball players by combining their offensive and defensive statistics with personal information. Currently, this data is scattered across multiple tables and needs to be consolidated for reporting purposes.

## Your Task

Create a table called `baseball_player_analysis` that consolidates baseball player performance data. The table should provide a comprehensive view of each qualifying player's performance metrics.
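As a rough, non-authoritative sketch of the overall shape, one way such a table could be created and populated is shown below. The target column list mirrors the specification in the sections that follow; the join path through the `stats` bridge table, the literal values such as `'persons'`, `'baseball_offensive_stats'`, and the `'season-regular'` context marker, and the row-level (non-aggregated) joins are all assumptions that must be confirmed by exploring the actual schema.

```sql
-- Hypothetical sketch only: the linkage through "stats", "display_names",
-- and "core_person_stats", and the literal marker values, are assumptions.
CREATE TABLE baseball_player_analysis (
    player_id           INTEGER NOT NULL,
    player_name         VARCHAR(255) NOT NULL,
    team_name           VARCHAR(255),
    games_played        INTEGER,
    at_bats             INTEGER,
    hits                INTEGER,
    runs_scored         INTEGER,
    rbi                 INTEGER,
    home_runs           INTEGER,
    batting_average     DECIMAL,
    defensive_games     INTEGER,
    putouts             INTEGER,
    assists             INTEGER,
    errors              INTEGER,
    fielding_percentage DECIMAL
);

INSERT INTO baseball_player_analysis
SELECT
    p.id                           AS player_id,
    dn.full_name                   AS player_name,
    'Unknown'                      AS team_name,
    COALESCE(cps.events_played, 0) AS games_played,
    COALESCE(bos.at_bats, 0)       AS at_bats,
    COALESCE(bos.hits, 0)          AS hits,
    COALESCE(bos.runs_scored, 0)   AS runs_scored,
    COALESCE(bos.rbi, 0)           AS rbi,
    COALESCE(bos.home_runs, 0)     AS home_runs,
    -- guard against division by zero, keep full precision (no ROUND)
    CASE WHEN COALESCE(bos.at_bats, 0) = 0 THEN 0
         ELSE COALESCE(bos.hits, 0)::DECIMAL / bos.at_bats
    END                            AS batting_average,
    COALESCE(cps.events_played, 0) AS defensive_games,
    COALESCE(bds.putouts, 0)       AS putouts,
    COALESCE(bds.assists, 0)       AS assists,
    COALESCE(bds.errors, 0)        AS errors,
    CASE WHEN COALESCE(bds.putouts, 0) + COALESCE(bds.assists, 0)
              + COALESCE(bds.errors, 0) = 0 THEN 0
         ELSE (COALESCE(bds.putouts, 0) + COALESCE(bds.assists, 0))::DECIMAL
              / (COALESCE(bds.putouts, 0) + COALESCE(bds.assists, 0)
                 + COALESCE(bds.errors, 0))
    END                            AS fielding_percentage
FROM baseball_offensive_stats bos
-- Assumed linkage: stat rows attach to persons via the "stats" bridge table.
JOIN stats so
  ON so.stat_repository_type = 'baseball_offensive_stats'
 AND so.stat_repository_id   = bos.id
 AND so.stat_holder_type     = 'persons'
 AND so.context              = 'season-regular'   -- assumed regular-season marker
JOIN persons p        ON p.id = so.stat_holder_id
JOIN display_names dn ON dn.entity_type = 'persons' AND dn.entity_id = p.id
LEFT JOIN stats sc
  ON sc.stat_repository_type = 'core_person_stats'
 AND sc.stat_holder_type     = 'persons'
 AND sc.stat_holder_id       = p.id
 AND sc.context              = 'season-regular'
LEFT JOIN core_person_stats cps ON cps.id = sc.stat_repository_id
LEFT JOIN stats sd
  ON sd.stat_repository_type = 'baseball_defensive_stats'
 AND sd.stat_holder_type     = 'persons'
 AND sd.stat_holder_id       = p.id
 AND sd.context              = 'season-regular'
LEFT JOIN baseball_defensive_stats bds ON bds.id = sd.stat_repository_id
WHERE dn.full_name IS NOT NULL
  AND COALESCE(cps.events_played, 0) >= 10
  AND COALESCE(bos.at_bats, 0) >= 50
ORDER BY batting_average DESC, games_played DESC;
```

Depending on how the stat rows are stored (one row per event versus one per season), a `GROUP BY p.id` aggregation step may be needed before computing the averages; the sketch above assumes a single season-level row per player.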
### Table Structure

Create the `baseball_player_analysis` table with the following columns:

- `player_id` (INTEGER, NOT NULL) - Player identifier
- `player_name` (VARCHAR(255), NOT NULL) - Player's full name
- `team_name` (VARCHAR(255)) - Set to 'Unknown' for all players
- `games_played` (INTEGER) - Number of games/events the player participated in
- `at_bats` (INTEGER) - Total at-bats for the player
- `hits` (INTEGER) - Total hits for the player
- `runs_scored` (INTEGER) - Total runs scored by the player
- `rbi` (INTEGER) - Total runs batted in by the player
- `home_runs` (INTEGER) - Total home runs hit by the player
- `batting_average` (DECIMAL) - Calculated as hits/at_bats
- `defensive_games` (INTEGER) - Number of defensive games played (same as games_played)
- `putouts` (INTEGER) - Total putouts in defensive play
- `assists` (INTEGER) - Total assists in defensive play
- `errors` (INTEGER) - Total errors made in defensive play
- `fielding_percentage` (DECIMAL) - Calculated as (putouts + assists)/(putouts + assists + errors)

### Data Requirements

Include only baseball players that meet ALL of the following criteria:

- Have offensive statistics available for regular season play
- Have played at least 10 games/events
- Have at least 50 at-bats
- Have a valid name available in the system

### Important Notes

- Focus on regular season statistics only
- Handle NULL values appropriately in calculations (use 0 for missing stats)
- Ensure batting average and fielding percentage calculations handle division by zero
- Do NOT use ROUND functions - keep the full precision of calculated values
- Sort results by batting average descending, then by games played descending

## Requirements

- Explore the database to understand the table structure and relationships
- Create the table with the exact structure specified above
- Populate the table using appropriate queries and joins
- Ensure all calculations are mathematically correct
- Handle edge cases properly (division by zero, NULL values)

================================================
FILE: tasks/postgres/standard/sports/baseball_player_analysis/meta.json
================================================

{ "task_id": "baseball_player_analysis", "task_name": "Baseball Player Analysis", "category_id": "sports", "category_name": "Sports", "description": "Consolidate scattered baseball player data into comprehensive analysis table combining offensive and defensive statistics.", "author": "Lingxiao Du", "created_at": "2025-08-18", "difficulty": "L3", "tags": [ "reporting and analytics", "statistical aggregation", "schema design" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"addresses\" {\n \"id\" int4 [not null, increment]\n \"location_id\" int4 [not null]\n \"language\" varchar(100)\n \"suite\" varchar(100)\n \"floor\" varchar(100)\n \"building\" varchar(100)\n \"street_number\" varchar(100)\n \"street_prefix\" varchar(100)\n \"street\" varchar(100)\n \"street_suffix\" varchar(100)\n \"neighborhood\" varchar(100)\n \"district\" varchar(100)\n \"locality\" varchar(100)\n \"county\" varchar(100)\n \"region\" varchar(100)\n \"postal_code\" varchar(100)\n \"country\" varchar(100)\n}\n\nTable \"affiliation_phases\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"ancestor_affiliation_id\" int4\n \"start_season_id\" int4\n \"start_date_time\" timestamp\n \"end_season_id\" int4\n \"end_date_time\" timestamp\n}\n\nTable \"affiliations\" {\n \"id\" int4 [not null, increment]\n \"affiliation_key\"
varchar(100) [not null]\n \"affiliation_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n}\n\nTable \"affiliations_documents\" {\n \"affiliation_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"affiliations_events\" {\n \"affiliation_id\" int4 [not null]\n \"event_id\" int4 [not null]\n}\n\nTable \"affiliations_media\" {\n \"affiliation_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"american_football_action_participants\" {\n \"id\" int4 [not null, increment]\n \"american_football_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"score_type\" varchar(100)\n \"field_line\" int4\n \"yardage\" int4\n \"score_credit\" int4\n \"yards_gained\" int4\n}\n\nTable \"american_football_action_plays\" {\n \"id\" int4 [not null, increment]\n \"american_football_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"drive_result\" varchar(100)\n \"points\" int4\n \"comment\" varchar(255)\n}\n\nTable \"american_football_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"tackles_total\" varchar(100)\n \"tackles_solo\" varchar(100)\n \"tackles_assists\" varchar(100)\n \"interceptions_total\" varchar(100)\n \"interceptions_yards\" varchar(100)\n \"interceptions_average\" varchar(100)\n \"interceptions_longest\" varchar(100)\n \"interceptions_touchdown\" varchar(100)\n \"quarterback_hurries\" varchar(100)\n \"sacks_total\" varchar(100)\n \"sacks_yards\" varchar(100)\n \"passes_defensed\" varchar(100)\n}\n\nTable \"american_football_down_progress_stats\" {\n \"id\" int4 [not null, increment]\n \"first_downs_total\" varchar(100)\n \"first_downs_pass\" varchar(100)\n \"first_downs_run\" varchar(100)\n \"first_downs_penalty\" varchar(100)\n \"conversions_third_down\" varchar(100)\n \"conversions_third_down_attempts\" varchar(100)\n \"conversions_third_down_percentage\" varchar(100)\n \"conversions_fourth_down\" varchar(100)\n \"conversions_fourth_down_attempts\" varchar(100)\n \"conversions_fourth_down_percentage\" varchar(100)\n}\n\nTable \"american_football_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"period_value\" int4\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"clock_state\" varchar(100)\n \"down\" int4\n \"team_in_possession_id\" int4\n \"distance_for_1st_down\" int4\n \"field_side\" varchar(100)\n \"field_line\" int4\n \"context\" varchar(40)\n}\n\nTable \"american_football_fumbles_stats\" {\n \"id\" int4 [not null, increment]\n \"fumbles_committed\" varchar(100)\n \"fumbles_forced\" varchar(100)\n \"fumbles_recovered\" varchar(100)\n \"fumbles_lost\" varchar(100)\n \"fumbles_yards_gained\" varchar(100)\n \"fumbles_own_committed\" varchar(100)\n \"fumbles_own_recovered\" varchar(100)\n \"fumbles_own_lost\" varchar(100)\n \"fumbles_own_yards_gained\" varchar(100)\n \"fumbles_opposing_committed\" varchar(100)\n \"fumbles_opposing_recovered\" varchar(100)\n \"fumbles_opposing_lost\" varchar(100)\n \"fumbles_opposing_yards_gained\" varchar(100)\n}\n\nTable \"american_football_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"offensive_plays_yards\" varchar(100)\n \"offensive_plays_number\" varchar(100)\n \"offensive_plays_average_yards_per\" varchar(100)\n \"possession_duration\" varchar(100)\n \"turnovers_giveaway\" varchar(100)\n}\n\nTable \"american_football_passing_stats\" {\n \"id\" int4 [not 
null, increment]\n \"passes_attempts\" varchar(100)\n \"passes_completions\" varchar(100)\n \"passes_percentage\" varchar(100)\n \"passes_yards_gross\" varchar(100)\n \"passes_yards_net\" varchar(100)\n \"passes_yards_lost\" varchar(100)\n \"passes_touchdowns\" varchar(100)\n \"passes_touchdowns_percentage\" varchar(100)\n \"passes_interceptions\" varchar(100)\n \"passes_interceptions_percentage\" varchar(100)\n \"passes_longest\" varchar(100)\n \"passes_average_yards_per\" varchar(100)\n \"passer_rating\" varchar(100)\n \"receptions_total\" varchar(100)\n \"receptions_yards\" varchar(100)\n \"receptions_touchdowns\" varchar(100)\n \"receptions_first_down\" varchar(100)\n \"receptions_longest\" varchar(100)\n \"receptions_average_yards_per\" varchar(100)\n}\n\nTable \"american_football_penalties_stats\" {\n \"id\" int4 [not null, increment]\n \"penalties_total\" varchar(100)\n \"penalty_yards\" varchar(100)\n \"penalty_first_downs\" varchar(100)\n}\n\nTable \"american_football_rushing_stats\" {\n \"id\" int4 [not null, increment]\n \"rushes_attempts\" varchar(100)\n \"rushes_yards\" varchar(100)\n \"rushes_touchdowns\" varchar(100)\n \"rushing_average_yards_per\" varchar(100)\n \"rushes_first_down\" varchar(100)\n \"rushes_longest\" varchar(100)\n}\n\nTable \"american_football_sacks_against_stats\" {\n \"id\" int4 [not null, increment]\n \"sacks_against_yards\" varchar(100)\n \"sacks_against_total\" varchar(100)\n}\n\nTable \"american_football_scoring_stats\" {\n \"id\" int4 [not null, increment]\n \"touchdowns_total\" varchar(100)\n \"touchdowns_passing\" varchar(100)\n \"touchdowns_rushing\" varchar(100)\n \"touchdowns_special_teams\" varchar(100)\n \"touchdowns_defensive\" varchar(100)\n \"extra_points_attempts\" varchar(100)\n \"extra_points_made\" varchar(100)\n \"extra_points_missed\" varchar(100)\n \"extra_points_blocked\" varchar(100)\n \"field_goal_attempts\" varchar(100)\n \"field_goals_made\" varchar(100)\n \"field_goals_missed\" varchar(100)\n \"field_goals_blocked\" varchar(100)\n \"safeties_against\" varchar(100)\n \"two_point_conversions_attempts\" varchar(100)\n \"two_point_conversions_made\" varchar(100)\n \"touchbacks_total\" varchar(100)\n}\n\nTable \"american_football_special_teams_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_punt_total\" varchar(100)\n \"returns_punt_yards\" varchar(100)\n \"returns_punt_average\" varchar(100)\n \"returns_punt_longest\" varchar(100)\n \"returns_punt_touchdown\" varchar(100)\n \"returns_kickoff_total\" varchar(100)\n \"returns_kickoff_yards\" varchar(100)\n \"returns_kickoff_average\" varchar(100)\n \"returns_kickoff_longest\" varchar(100)\n \"returns_kickoff_touchdown\" varchar(100)\n \"returns_total\" varchar(100)\n \"returns_yards\" varchar(100)\n \"punts_total\" varchar(100)\n \"punts_yards_gross\" varchar(100)\n \"punts_yards_net\" varchar(100)\n \"punts_longest\" varchar(100)\n \"punts_inside_20\" varchar(100)\n \"punts_inside_20_percentage\" varchar(100)\n \"punts_average\" varchar(100)\n \"punts_blocked\" varchar(100)\n \"touchbacks_total\" varchar(100)\n \"touchbacks_total_percentage\" varchar(100)\n \"touchbacks_kickoffs\" varchar(100)\n \"touchbacks_kickoffs_percentage\" varchar(100)\n \"touchbacks_punts\" varchar(100)\n \"touchbacks_punts_percentage\" varchar(100)\n \"touchbacks_interceptions\" varchar(100)\n \"touchbacks_interceptions_percentage\" varchar(100)\n \"fair_catches\" varchar(100)\n}\n\nTable \"baseball_action_contact_details\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_pitch_id\" int4 
[not null]\n \"location\" varchar(100)\n \"strength\" varchar(100)\n \"velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n}\n\nTable \"baseball_action_pitches\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_play_id\" int4 [not null]\n \"sequence_number\" int4\n \"baseball_defensive_group_id\" int4\n \"umpire_call\" varchar(100)\n \"pitch_location\" varchar(100)\n \"pitch_type\" varchar(100)\n \"pitch_velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n \"ball_type\" varchar(40)\n \"strike_type\" varchar(40)\n}\n\nTable \"baseball_action_plays\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"notation\" varchar(100)\n \"notation_yaml\" text\n \"baseball_defensive_group_id\" int4\n \"comment\" varchar(255)\n \"runner_on_first_advance\" int4\n \"runner_on_second_advance\" int4\n \"runner_on_third_advance\" int4\n \"outs_recorded\" int4\n \"rbi\" int4\n \"runs_scored\" int4\n \"earned_runs_scored\" varchar(100)\n}\n\nTable \"baseball_action_substitutions\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"sequence_number\" int4\n \"person_type\" varchar(100)\n \"person_original_id\" int4\n \"person_original_position_id\" int4\n \"person_original_lineup_slot\" int4\n \"person_replacing_id\" int4\n \"person_replacing_position_id\" int4\n \"person_replacing_lineup_slot\" int4\n \"substitution_reason\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"baseball_defensive_group\" {\n \"id\" int4 [not null, increment]\n}\n\nTable \"baseball_defensive_players\" {\n \"id\" int4 [not null, increment]\n \"baseball_defensive_group_id\" int4 [not null]\n \"player_id\" int4 [not null]\n \"position_id\" int4 [not null]\n}\n\nTable \"baseball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"double_plays\" int4\n \"triple_plays\" int4\n \"putouts\" int4\n \"assists\" int4\n \"errors\" int4\n \"fielding_percentage\" numeric\n \"defensive_average\" numeric\n \"errors_passed_ball\" int4\n \"errors_catchers_interference\" int4\n}\n\nTable \"baseball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"at_bat_number\" int4\n \"inning_value\" int4\n \"inning_half\" varchar(100)\n \"outs\" int4\n \"balls\" int4\n \"strikes\" int4\n \"runner_on_first_id\" int4\n \"runner_on_second_id\" int4\n \"runner_on_third_id\" int4\n \"runner_on_first\" int2\n \"runner_on_second\" int2\n \"runner_on_third\" int2\n \"runs_this_inning_half\" int4\n \"pitcher_id\" int4\n \"batter_id\" int4\n \"batter_side\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"baseball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"average\" numeric\n \"runs_scored\" int4\n \"at_bats\" int4\n \"hits\" int4\n \"rbi\" int4\n \"total_bases\" int4\n \"slugging_percentage\" numeric\n \"bases_on_balls\" int4\n \"strikeouts\" int4\n \"left_on_base\" int4\n \"left_in_scoring_position\" int4\n \"singles\" int4\n \"doubles\" int4\n \"triples\" int4\n \"home_runs\" int4\n \"grand_slams\" int4\n \"at_bats_per_rbi\" numeric\n \"plate_appearances_per_rbi\" numeric\n \"at_bats_per_home_run\" numeric\n \"plate_appearances_per_home_run\" numeric\n \"sac_flies\" int4\n \"sac_bunts\" int4\n \"grounded_into_double_play\" int4\n \"moved_up\" int4\n \"on_base_percentage\" numeric\n \"stolen_bases\" int4\n 
\"stolen_bases_caught\" int4\n \"stolen_bases_average\" numeric\n \"hit_by_pitch\" int4\n \"defensive_interferance_reaches\" int4\n \"on_base_plus_slugging\" numeric\n \"plate_appearances\" int4\n \"hits_extra_base\" int4\n}\n\nTable \"baseball_pitching_stats\" {\n \"id\" int4 [not null, increment]\n \"runs_allowed\" int4\n \"singles_allowed\" int4\n \"doubles_allowed\" int4\n \"triples_allowed\" int4\n \"home_runs_allowed\" int4\n \"innings_pitched\" varchar(20)\n \"hits\" int4\n \"earned_runs\" int4\n \"unearned_runs\" int4\n \"bases_on_balls\" int4\n \"bases_on_balls_intentional\" int4\n \"strikeouts\" int4\n \"strikeout_to_bb_ratio\" numeric\n \"number_of_pitches\" int4\n \"era\" numeric\n \"inherited_runners_scored\" int4\n \"pick_offs\" int4\n \"errors_hit_with_pitch\" int4\n \"errors_wild_pitch\" int4\n \"balks\" int4\n \"wins\" int4\n \"losses\" int4\n \"saves\" int4\n \"shutouts\" int4\n \"games_complete\" int4\n \"games_finished\" int4\n \"winning_percentage\" numeric\n \"event_credit\" varchar(40)\n \"save_credit\" varchar(40)\n}\n\nTable \"basketball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"steals_total\" varchar(100)\n \"steals_per_game\" varchar(100)\n \"blocks_total\" varchar(100)\n \"blocks_per_game\" varchar(100)\n}\n\nTable \"basketball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"basketball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"field_goals_made\" int4\n \"field_goals_attempted\" int4\n \"field_goals_percentage\" varchar(100)\n \"field_goals_per_game\" varchar(100)\n \"field_goals_attempted_per_game\" varchar(100)\n \"field_goals_percentage_adjusted\" varchar(100)\n \"three_pointers_made\" int4\n \"three_pointers_attempted\" int4\n \"three_pointers_percentage\" varchar(100)\n \"three_pointers_per_game\" varchar(100)\n \"three_pointers_attempted_per_game\" varchar(100)\n \"free_throws_made\" varchar(100)\n \"free_throws_attempted\" varchar(100)\n \"free_throws_percentage\" varchar(100)\n \"free_throws_per_game\" varchar(100)\n \"free_throws_attempted_per_game\" varchar(100)\n \"points_scored_total\" varchar(100)\n \"points_scored_per_game\" varchar(100)\n \"assists_total\" varchar(100)\n \"assists_per_game\" varchar(100)\n \"turnovers_total\" varchar(100)\n \"turnovers_per_game\" varchar(100)\n \"points_scored_off_turnovers\" varchar(100)\n \"points_scored_in_paint\" varchar(100)\n \"points_scored_on_second_chance\" varchar(100)\n \"points_scored_on_fast_break\" varchar(100)\n}\n\nTable \"basketball_rebounding_stats\" {\n \"id\" int4 [not null, increment]\n \"rebounds_total\" varchar(100)\n \"rebounds_per_game\" varchar(100)\n \"rebounds_defensive\" varchar(100)\n \"rebounds_offensive\" varchar(100)\n \"team_rebounds_total\" varchar(100)\n \"team_rebounds_per_game\" varchar(100)\n \"team_rebounds_defensive\" varchar(100)\n \"team_rebounds_offensive\" varchar(100)\n}\n\nTable \"basketball_team_stats\" {\n \"id\" int4 [not null, increment]\n \"timeouts_left\" varchar(100)\n \"largest_lead\" varchar(100)\n \"fouls_total\" varchar(100)\n \"turnover_margin\" varchar(100)\n}\n\nTable \"bookmakers\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"core_person_stats\" {\n \"id\" int4 [not null, increment]\n 
\"time_played_event\" varchar(40)\n \"time_played_total\" varchar(40)\n \"time_played_event_average\" varchar(40)\n \"events_played\" int4\n \"events_started\" int4\n \"position_id\" int4\n}\n\nTable \"core_stats\" {\n \"id\" int4 [not null, increment]\n \"score\" varchar(100)\n \"score_opposing\" varchar(100)\n \"score_attempts\" varchar(100)\n \"score_attempts_opposing\" varchar(100)\n \"score_percentage\" varchar(100)\n \"score_percentage_opposing\" varchar(100)\n}\n\nTable \"db_info\" {\n \"version\" varchar(100) [not null, default: 16]\n}\n\nTable \"display_names\" {\n \"id\" int4 [not null, increment]\n \"language\" varchar(100) [not null]\n \"entity_type\" varchar(100) [not null]\n \"entity_id\" int4 [not null]\n \"full_name\" varchar(100)\n \"first_name\" varchar(100)\n \"middle_name\" varchar(100)\n \"last_name\" varchar(100)\n \"alias\" varchar(100)\n \"abbreviation\" varchar(100)\n \"short_name\" varchar(100)\n \"prefix\" varchar(20)\n \"suffix\" varchar(20)\n}\n\nTable \"document_classes\" {\n \"id\" int4 [not null, increment]\n \"name\" varchar(100)\n}\n\nTable \"document_contents\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"sportsml\" varchar(200)\n \"abstract\" text\n}\n\nTable \"document_fixtures\" {\n \"id\" int4 [not null, increment]\n \"fixture_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"name\" varchar(100)\n \"document_class_id\" int4 [not null]\n}\n\nTable \"document_fixtures_events\" {\n \"id\" int4 [not null, increment]\n \"document_fixture_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"latest_document_id\" int4 [not null]\n \"last_update\" timestamp\n}\n\nTable \"document_package_entry\" {\n \"id\" int4 [not null, increment]\n \"document_package_id\" int4 [not null]\n \"rank\" varchar(100)\n \"document_id\" int4 [not null]\n \"headline\" varchar(100)\n \"short_headline\" varchar(100)\n}\n\nTable \"document_packages\" {\n \"id\" int4 [not null, increment]\n \"package_key\" varchar(100)\n \"package_name\" varchar(100)\n \"date_time\" date\n}\n\nTable \"documents\" {\n \"id\" int4 [not null, increment]\n \"doc_id\" varchar(75) [not null]\n \"publisher_id\" int4 [not null]\n \"date_time\" timestamp\n \"title\" varchar(255)\n \"language\" varchar(100)\n \"priority\" varchar(100)\n \"revision_id\" varchar(75)\n \"stats_coverage\" varchar(100)\n \"document_fixture_id\" int4 [not null]\n \"source_id\" int4\n \"db_loading_date_time\" timestamp\n}\n\nTable \"documents_media\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"media_id\" int4 [not null]\n \"media_caption_id\" int4 [not null]\n}\n\nTable \"events\" {\n \"id\" int4 [not null, increment]\n \"event_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"site_id\" int4\n \"site_alignment\" varchar(100)\n \"event_status\" varchar(100)\n \"duration\" varchar(100)\n \"attendance\" varchar(100)\n \"last_update\" timestamp\n}\n\nTable \"events_documents\" {\n \"event_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"events_media\" {\n \"event_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"events_sub_seasons\" {\n \"event_id\" int4 [not null]\n \"sub_season_id\" int4 [not null]\n}\n\nTable \"ice_hockey_action_participants\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"point_credit\" int4\n}\n\nTable \"ice_hockey_action_plays\" {\n \"id\" 
int4 [not null, increment]\n \"ice_hockey_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"play_result\" varchar(100)\n \"comment\" varchar(255)\n}\n\nTable \"ice_hockey_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_power_play_allowed\" varchar(100)\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_power_play_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"penalty_killing_amount\" varchar(100)\n \"penalty_killing_percentage\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"takeaways\" varchar(100)\n \"shutouts\" varchar(100)\n \"minutes_penalty_killing\" varchar(100)\n \"hits\" varchar(100)\n \"goals_empty_net_allowed\" varchar(100)\n \"goals_short_handed_allowed\" varchar(100)\n \"goals_shootout_allowed\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n}\n\nTable \"ice_hockey_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"ice_hockey_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_power_play\" varchar(100)\n \"goals_short_handed\" varchar(100)\n \"goals_even_strength\" varchar(100)\n \"goals_empty_net\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_penalty_shot\" varchar(100)\n \"assists\" varchar(100)\n \"points\" varchar(100)\n \"power_play_amount\" varchar(100)\n \"power_play_percentage\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(100)\n \"shots_penalty_shot_percentage\" varchar(100)\n \"giveaways\" varchar(100)\n \"minutes_power_play\" varchar(100)\n \"faceoff_wins\" varchar(100)\n \"faceoff_losses\" varchar(100)\n \"faceoff_win_percentage\" varchar(100)\n \"scoring_chances\" varchar(100)\n}\n\nTable \"ice_hockey_player_stats\" {\n \"id\" int4 [not null, increment]\n \"plus_minus\" varchar(100)\n}\n\nTable \"injury_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"injury_status\" varchar(100)\n \"injury_type\" varchar(100)\n \"injury_comment\" varchar(100)\n \"disabled_list\" varchar(100)\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n \"season_id\" int4\n \"phase_type\" varchar(100)\n \"injury_side\" varchar(100)\n}\n\nTable \"key_aliases\" {\n \"id\" int4 [not null, increment]\n \"key_id\" int4 [not null]\n \"key_root_id\" int4 [not null]\n}\n\nTable \"key_roots\" {\n \"id\" int4 [not null, increment]\n \"key_type\" varchar(100)\n}\n\nTable \"latest_revisions\" {\n \"id\" int4 [not null, increment]\n \"revision_id\" varchar(75) [not null]\n \"latest_document_id\" int4 [not null]\n}\n\nTable \"locations\" {\n \"id\" int4 [not null, increment]\n \"timezone\" varchar(100)\n \"latitude\" varchar(100)\n \"longitude\" varchar(100)\n \"country_code\" varchar(100)\n}\n\nTable \"media\" {\n \"id\" int4 [not null, increment]\n \"object_id\" int4\n \"source_id\" int4\n \"revision_id\" int4\n \"media_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"date_time\" varchar(100)\n \"credit_id\" int4 [not null]\n \"db_loading_date_time\" timestamp\n \"creation_location_id\" int4 [not null]\n}\n\nTable 
\"media_captions\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"caption_type\" varchar(100)\n \"caption\" varchar(100)\n \"caption_author_id\" int4 [not null]\n \"language\" varchar(100)\n \"caption_size\" varchar(100)\n}\n\nTable \"media_contents\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"object\" varchar(100)\n \"format\" varchar(100)\n \"mime_type\" varchar(100)\n \"height\" varchar(100)\n \"width\" varchar(100)\n \"duration\" varchar(100)\n \"file_size\" varchar(100)\n \"resolution\" varchar(100)\n}\n\nTable \"media_keywords\" {\n \"id\" int4 [not null, increment]\n \"keyword\" varchar(100)\n \"media_id\" int4 [not null]\n}\n\nTable \"motor_racing_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"lap\" varchar(100)\n \"laps_remaining\" varchar(100)\n \"time_elapsed\" varchar(100)\n \"flag_state\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"motor_racing_qualifying_stats\" {\n \"id\" int4 [not null, increment]\n \"grid\" varchar(100)\n \"pole_position\" varchar(100)\n \"pole_wins\" varchar(100)\n \"qualifying_speed\" varchar(100)\n \"qualifying_speed_units\" varchar(100)\n \"qualifying_time\" varchar(100)\n \"qualifying_position\" varchar(100)\n}\n\nTable \"motor_racing_race_stats\" {\n \"id\" int4 [not null, increment]\n \"time_behind_leader\" varchar(100)\n \"laps_behind_leader\" varchar(100)\n \"time_ahead_follower\" varchar(100)\n \"laps_ahead_follower\" varchar(100)\n \"time\" varchar(100)\n \"points\" varchar(100)\n \"points_rookie\" varchar(100)\n \"bonus\" varchar(100)\n \"laps_completed\" varchar(100)\n \"laps_leading_total\" varchar(100)\n \"distance_leading\" varchar(100)\n \"distance_completed\" varchar(100)\n \"distance_units\" varchar(40)\n \"speed_average\" varchar(40)\n \"speed_units\" varchar(40)\n \"status\" varchar(40)\n \"finishes_top_5\" varchar(40)\n \"finishes_top_10\" varchar(40)\n \"starts\" varchar(40)\n \"finishes\" varchar(40)\n \"non_finishes\" varchar(40)\n \"wins\" varchar(40)\n \"races_leading\" varchar(40)\n \"money\" varchar(40)\n \"money_units\" varchar(40)\n \"leads_total\" varchar(40)\n}\n\nTable \"outcome_totals\" {\n \"id\" int4 [not null, increment]\n \"standing_subgroup_id\" int4 [not null]\n \"outcome_holder_type\" varchar(100)\n \"outcome_holder_id\" int4\n \"rank\" varchar(100)\n \"wins\" varchar(100)\n \"losses\" varchar(100)\n \"ties\" varchar(100)\n \"undecideds\" varchar(100)\n \"winning_percentage\" varchar(100)\n \"points_scored_for\" varchar(100)\n \"points_scored_against\" varchar(100)\n \"points_difference\" varchar(100)\n \"standing_points\" varchar(100)\n \"streak_type\" varchar(100)\n \"streak_duration\" varchar(100)\n \"streak_total\" varchar(100)\n \"streak_start\" date\n \"streak_end\" date\n}\n\nTable \"participants_events\" {\n \"id\" int4 [not null, increment]\n \"participant_type\" varchar(100) [not null]\n \"participant_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"alignment\" varchar(100)\n \"score\" varchar(100)\n \"event_outcome\" varchar(100)\n \"rank\" int4\n}\n\nTable \"periods\" {\n \"id\" int4 [not null, increment]\n \"participant_event_id\" int4 [not null]\n \"period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"person_event_metadata\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"status\" varchar(100)\n \"health\" varchar(100)\n \"weight\" varchar(100)\n \"role_id\" int4\n 
\"position_id\" int4\n \"team_id\" int4\n \"lineup_slot\" int4\n \"lineup_slot_sequence\" int4\n}\n\nTable \"person_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"membership_type\" varchar(40) [not null]\n \"membership_id\" int4 [not null]\n \"role_id\" int4\n \"role_status\" varchar(40)\n \"phase_status\" varchar(40)\n \"uniform_number\" varchar(20)\n \"regular_position_id\" int4\n \"regular_position_depth\" varchar(40)\n \"height\" varchar(100)\n \"weight\" varchar(100)\n \"start_date_time\" timestamp\n \"start_season_id\" int4\n \"end_date_time\" timestamp\n \"end_season_id\" int4\n \"entry_reason\" varchar(40)\n \"exit_reason\" varchar(40)\n \"selection_level\" int4\n \"selection_sublevel\" int4\n \"selection_overall\" int4\n}\n\nTable \"persons\" {\n \"id\" int4 [not null, increment]\n \"person_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"gender\" varchar(20)\n \"birth_date\" varchar(30)\n \"death_date\" varchar(30)\n \"birth_location_id\" int4\n \"hometown_location_id\" int4\n \"residence_location_id\" int4\n \"death_location_id\" int4\n}\n\nTable \"persons_documents\" {\n \"person_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"persons_media\" {\n \"person_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"positions\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"abbreviation\" varchar(100) [not null]\n}\n\nTable \"publishers\" {\n \"id\" int4 [not null, increment]\n \"publisher_key\" varchar(100) [not null]\n \"publisher_name\" varchar(100)\n}\n\nTable \"roles\" {\n \"id\" int4 [not null, increment]\n \"role_key\" varchar(100) [not null]\n \"role_name\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"seasons\" {\n \"id\" int4 [not null, increment]\n \"season_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"league_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"sites\" {\n \"id\" int4 [not null, increment]\n \"site_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"soccer_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"goals_against_total\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"catches_punches\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_shootout_total\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"shutouts\" varchar(100)\n}\n\nTable \"soccer_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"minutes_elapsed\" varchar(100)\n \"period_minute_elapsed\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"soccer_foul_stats\" {\n \"id\" int4 [not null, increment]\n \"fouls_suffered\" varchar(100)\n \"fouls_commited\" varchar(100)\n \"cautions_total\" varchar(100)\n \"cautions_pending\" varchar(100)\n \"caution_points_total\" varchar(100)\n \"caution_points_pending\" varchar(100)\n \"ejections_total\" varchar(100)\n}\n\nTable \"soccer_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_overtime\" varchar(100)\n 
\"goals_shootout\" varchar(100)\n \"goals_total\" varchar(100)\n \"assists_game_winning\" varchar(100)\n \"assists_game_tying\" varchar(100)\n \"assists_overtime\" varchar(100)\n \"assists_total\" varchar(100)\n \"points\" varchar(100)\n \"shots_total\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_hit_frame\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_scored\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(40)\n \"shots_penalty_shot_percentage\" varchar(40)\n \"shots_shootout_taken\" varchar(40)\n \"shots_shootout_scored\" varchar(40)\n \"shots_shootout_missed\" varchar(40)\n \"shots_shootout_percentage\" varchar(40)\n \"giveaways\" varchar(40)\n \"offsides\" varchar(40)\n \"corner_kicks\" varchar(40)\n \"hat_tricks\" varchar(40)\n}\n\nTable \"standing_subgroups\" {\n \"id\" int4 [not null, increment]\n \"standing_id\" int4 [not null]\n \"affiliation_id\" int4 [not null]\n}\n\nTable \"standings\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"standing_type\" varchar(100)\n \"sub_season_id\" int4 [not null]\n \"last_updated\" varchar(100)\n \"duration_scope\" varchar(100)\n \"competition_scope\" varchar(100)\n \"competition_scope_id\" varchar(100)\n \"alignment_scope\" varchar(100)\n \"site_scope\" varchar(100)\n \"scoping_label\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"source\" varchar(100)\n}\n\nTable \"stats\" {\n \"id\" int4 [not null, increment]\n \"stat_repository_type\" varchar(100)\n \"stat_repository_id\" int4 [not null]\n \"stat_holder_type\" varchar(100)\n \"stat_holder_id\" int4\n \"stat_coverage_type\" varchar(100)\n \"stat_coverage_id\" int4\n \"context\" varchar(40) [not null]\n}\n\nTable \"sub_periods\" {\n \"id\" int4 [not null, increment]\n \"period_id\" int4 [not null]\n \"sub_period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"sub_seasons\" {\n \"id\" int4 [not null, increment]\n \"sub_season_key\" varchar(100) [not null]\n \"season_id\" int4 [not null]\n \"sub_season_type\" varchar(100) [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"team_american_football_stats\" {\n \"id\" int4 [not null, increment]\n \"yards_per_attempt\" varchar(100)\n \"average_starting_position\" varchar(100)\n \"timeouts\" varchar(100)\n \"time_of_possession\" varchar(100)\n \"turnover_ratio\" varchar(100)\n}\n\nTable \"team_phases\" {\n \"id\" int4 [not null, increment]\n \"team_id\" int4 [not null]\n \"start_season_id\" int4\n \"end_season_id\" int4\n \"affiliation_id\" int4 [not null]\n \"start_date_time\" varchar(100)\n \"end_date_time\" varchar(100)\n \"phase_status\" varchar(40)\n \"role_id\" int4\n}\n\nTable \"teams\" {\n \"id\" int4 [not null, increment]\n \"team_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"home_site_id\" int4\n}\n\nTable \"teams_documents\" {\n \"team_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"teams_media\" {\n \"team_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"tennis_action_points\" {\n \"id\" int4 [not null, increment]\n \"sub_period_id\" varchar(100)\n \"sequence_number\" varchar(100)\n \"win_type\" varchar(100)\n}\n\nTable \"tennis_action_volleys\" {\n \"id\" int4 [not null, increment]\n \"sequence_number\" varchar(100)\n \"tennis_action_points_id\" int4\n \"landing_location\" varchar(100)\n \"swing_type\" varchar(100)\n \"result\" varchar(100)\n \"spin_type\" varchar(100)\n \"trajectory_details\" varchar(100)\n}\n\nTable 
\"tennis_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"tennis_set\" varchar(100)\n \"game\" varchar(100)\n \"server_person_id\" int4\n \"server_score\" varchar(100)\n \"receiver_person_id\" int4\n \"receiver_score\" varchar(100)\n \"service_number\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"tennis_return_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"first_service_return_points_won\" varchar(100)\n \"first_service_return_points_won_pct\" varchar(100)\n \"second_service_return_points_won\" varchar(100)\n \"second_service_return_points_won_pct\" varchar(100)\n \"return_games_played\" varchar(100)\n \"return_games_won\" varchar(100)\n \"return_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_converted\" varchar(100)\n \"break_points_converted_pct\" varchar(100)\n}\n\nTable \"tennis_service_stats\" {\n \"id\" int4 [not null, increment]\n \"services_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"aces\" varchar(100)\n \"first_services_good\" varchar(100)\n \"first_services_good_pct\" varchar(100)\n \"first_service_points_won\" varchar(100)\n \"first_service_points_won_pct\" varchar(100)\n \"second_service_points_won\" varchar(100)\n \"second_service_points_won_pct\" varchar(100)\n \"service_games_played\" varchar(100)\n \"service_games_won\" varchar(100)\n \"service_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_saved\" varchar(100)\n \"break_points_saved_pct\" varchar(100)\n}\n\nTable \"wagering_moneylines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_odds_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"numerator\" varchar(100)\n \"denominator\" varchar(100)\n \"prediction\" varchar(100)\n \"payout_calculation\" varchar(100)\n \"payout_amount\" varchar(100)\n}\n\nTable \"wagering_runlines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"line_value\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_straight_spread_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_value\" varchar(100)\n \"line_value_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_total_score_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" 
varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_over\" varchar(100)\n \"line_under\" varchar(100)\n \"total\" varchar(100)\n \"total_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"weather_conditions\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"temperature\" varchar(100)\n \"temperature_units\" varchar(40)\n \"humidity\" varchar(100)\n \"clouds\" varchar(100)\n \"wind_direction\" varchar(100)\n \"wind_velocity\" varchar(100)\n \"weather_code\" varchar(100)\n}\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/yugabyte/yugabyte-db/blob/master/sample/sportsdb_tables.sql" } }
================================================
FILE: tasks/postgres/standard/sports/baseball_player_analysis/verify.py
================================================
"""
Verification script for PostgreSQL Sports Task 1: Baseball Player Analysis
"""

import os
import sys
import psycopg2
from decimal import Decimal


def rows_match(actual_row, expected_row):
    """Compare two rows with appropriate tolerance for decimals and floats."""
    if len(actual_row) != len(expected_row):
        return False
    for actual, expected in zip(actual_row, expected_row):
        if isinstance(actual, (Decimal, float)) and isinstance(expected, (Decimal, float)):
            # Use higher tolerance for floating point comparisons
            if abs(float(actual) - float(expected)) > 0.001:
                return False
        elif actual != expected:
            return False
    return True


def get_connection_params() -> dict:
    """Get database connection parameters."""
    return {
        "host": os.getenv("POSTGRES_HOST", "localhost"),
        "port": int(os.getenv("POSTGRES_PORT", 5432)),
        "database": os.getenv("POSTGRES_DATABASE"),
        "user": os.getenv("POSTGRES_USERNAME"),
        "password": os.getenv("POSTGRES_PASSWORD")
    }


def verify_baseball_player_analysis_table(conn) -> bool:
    """Verify the baseball_player_analysis table results."""
    with conn.cursor() as cur:
        cur.execute("""
            SELECT player_id, player_name, team_name, games_played, at_bats, hits,
                   runs_scored, rbi, home_runs, batting_average, defensive_games,
                   putouts, assists, errors, fielding_percentage
            FROM baseball_player_analysis
            ORDER BY batting_average DESC, games_played DESC
        """)
        actual_results = cur.fetchall()
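        # Recompute the expected rows directly from the source tables: offensive
        # totals from baseball_offensive_stats, games played from core_person_stats,
        # and fielding numbers from baseball_defensive_stats, all resolved through
        # the stats mapping table for the 'season-regular' context and restricted
        # to players with >= 10 games and >= 50 at-bats.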
        cur.execute("""
            SELECT
                p.id AS player_id,
                MAX(dn.full_name) AS player_name,
                'Unknown' AS team_name,
                core.events_played AS games_played,
                off.at_bats,
                off.hits,
                off.runs_scored,
                off.rbi,
                off.home_runs,
                CASE WHEN off.at_bats > 0 THEN 1.0 * off.hits / off.at_bats ELSE 0 END AS batting_average,
                core.events_played AS defensive_games,
                COALESCE(def.putouts, 0) AS putouts,
                COALESCE(def.assists, 0) AS assists,
                COALESCE(def.errors, 0) AS errors,
                CASE WHEN (COALESCE(def.putouts,0) + COALESCE(def.assists,0) + COALESCE(def.errors,0)) > 0
                     THEN 1.0 * (COALESCE(def.putouts,0) + COALESCE(def.assists,0))
                          / (COALESCE(def.putouts,0) + COALESCE(def.assists,0) + COALESCE(def.errors,0))
                     ELSE 0 END AS fielding_percentage
            FROM persons p
            JOIN display_names dn
              ON dn.entity_id = p.id
             AND dn.entity_type = 'persons'
             AND NULLIF(TRIM(dn.full_name), '') IS NOT NULL
            JOIN (
                SELECT s.stat_holder_id AS player_id,
                       SUM(bos.at_bats) AS at_bats,
                       SUM(bos.hits) AS hits,
                       SUM(bos.runs_scored) AS runs_scored,
                       SUM(bos.rbi) AS rbi,
                       SUM(bos.home_runs) AS home_runs
                FROM stats s
                JOIN baseball_offensive_stats bos ON bos.id = s.stat_repository_id
                WHERE s.stat_holder_type = 'persons'
                  AND s.stat_repository_type = 'baseball_offensive_stats'
                  AND s.context = 'season-regular'
                GROUP BY s.stat_holder_id
            ) off ON off.player_id = p.id
            JOIN (
                SELECT s.stat_holder_id AS player_id,
                       SUM(cps.events_played) AS events_played
                FROM stats s
                JOIN core_person_stats cps ON cps.id = s.stat_repository_id
                WHERE s.stat_holder_type = 'persons'
                  AND s.stat_repository_type = 'core_person_stats'
                  AND s.context = 'season-regular'
                GROUP BY s.stat_holder_id
            ) core ON core.player_id = p.id
            LEFT JOIN (
                SELECT s.stat_holder_id AS player_id,
                       SUM(bds.putouts) AS putouts,
                       SUM(bds.assists) AS assists,
                       SUM(bds.errors) AS errors
                FROM stats s
                JOIN baseball_defensive_stats bds ON bds.id = s.stat_repository_id
                WHERE s.stat_holder_type = 'persons'
                  AND s.stat_repository_type = 'baseball_defensive_stats'
                  AND s.context = 'season-regular'
                GROUP BY s.stat_holder_id
            ) def ON def.player_id = p.id
            WHERE core.events_played >= 10
              AND off.at_bats >= 50
            GROUP BY p.id, core.events_played, off.at_bats, off.hits, off.runs_scored,
                     off.rbi, off.home_runs, def.putouts, def.assists, def.errors
            ORDER BY batting_average DESC, games_played DESC;
        """)
        expected_results = cur.fetchall()

        if len(actual_results) != len(expected_results):
            print(f"❌ baseball_player_analysis table has {len(actual_results)} records, expected {len(expected_results)}")
            return False

        mismatches = 0
        for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
            if not rows_match(actual, expected):
                if mismatches < 5:  # Only show first 5 mismatches
                    print(f"❌ Player analysis row {i+1} mismatch: expected {expected}, got {actual}")
                mismatches += 1

        if mismatches > 0:
            print(f"❌ Total player analysis mismatches: {mismatches}")
            return False

        print(f"✅ baseball_player_analysis table created and populated correctly ({len(actual_results)} players)")
        return True


def main():
    """Main verification function."""
    print("=" * 70)
    print("PostgreSQL Sports Task 1 Verification: Baseball Player Analysis")
    print("=" * 70)

    # Get connection parameters
    conn_params = get_connection_params()
    if not conn_params["database"]:
        print("❌ No database specified")
        sys.exit(1)

    try:
        # Connect to database
        conn = psycopg2.connect(**conn_params)

        # Verify results
        success = verify_baseball_player_analysis_table(conn)

        conn.close()

        if success:
            print("\n🎉 Task verification: PASS")
            sys.exit(0)
        else:
            print("\n❌ Task verification: FAIL")
            sys.exit(1)

    except psycopg2.Error as e:
        print(f"❌ Database error: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Verification error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

================================================
FILE: tasks/postgres/standard/sports/participant_report_optimization/description.md
================================================
# Query Performance Optimization

## Background

You need to optimize a slow-running analytics query that generates performance reports. The query currently takes too long to execute and needs optimization.

## Requirements

### 1. Create Performance Report Table

Create a table called `participant_performance_report` with the following structure:

- report_id (serial primary key)
- participant_id (integer not null)
- event_count (integer)
- stat_count (integer)
- stat_type_count (integer)
- last_event_date (timestamp)
- created_at (timestamp default current_timestamp)

Add constraint: CHECK (participant_id > 0)

### 2. Execute and Optimize the Slow Query

The following query is currently running very slowly. Your task is to:

1. **Identify why the query is slow**
2. **Create appropriate indexes to optimize it**
3.
**Populate the report table with the query results** ```sql SELECT pe.participant_id, COUNT(pe.event_id) as event_count, (SELECT COUNT(*) FROM stats s WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons') as stat_count, (SELECT COUNT(DISTINCT s.stat_repository_type) FROM stats s WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons') as stat_type_count, (SELECT MAX(e.start_date_time) FROM events e JOIN participants_events pe2 ON e.id = pe2.event_id WHERE pe2.participant_id = pe.participant_id) as last_event_date FROM participants_events pe WHERE pe.participant_id <= 50 GROUP BY pe.participant_id ORDER BY pe.participant_id; ``` ### 3. Document Performance Improvement After optimization, insert the results into your `participant_performance_report` table. ## Success Criteria - The query should execute significantly faster after your optimization - All results should be correctly inserted into the report table - Your optimization should use appropriate database indexes ## Important Notes - Analyze the query execution plan to identify bottlenecks - Focus on the most impactful optimizations - Handle NULL values appropriately in calculations ================================================ FILE: tasks/postgres/standard/sports/participant_report_optimization/meta.json ================================================ { "task_id": "participant_report_optimization", "task_name": "Participant Report Optimization", "category_id": "sports", "category_name": "Sports", "description": "Optimize slow-running participant performance query by creating indexes and populating performance report table.", "author": "Lingxiao Du", "created_at": "2025-08-18", "difficulty": "L3", "tags": [ "performance optimization", "schema design" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"addresses\" {\n \"id\" int4 [not null, increment]\n \"location_id\" int4 [not null]\n \"language\" varchar(100)\n \"suite\" varchar(100)\n \"floor\" varchar(100)\n \"building\" varchar(100)\n \"street_number\" varchar(100)\n \"street_prefix\" varchar(100)\n \"street\" varchar(100)\n \"street_suffix\" varchar(100)\n \"neighborhood\" varchar(100)\n \"district\" varchar(100)\n \"locality\" varchar(100)\n \"county\" varchar(100)\n \"region\" varchar(100)\n \"postal_code\" varchar(100)\n \"country\" varchar(100)\n}\n\nTable \"affiliation_phases\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"ancestor_affiliation_id\" int4\n \"start_season_id\" int4\n \"start_date_time\" timestamp\n \"end_season_id\" int4\n \"end_date_time\" timestamp\n}\n\nTable \"affiliations\" {\n \"id\" int4 [not null, increment]\n \"affiliation_key\" varchar(100) [not null]\n \"affiliation_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n}\n\nTable \"affiliations_documents\" {\n \"affiliation_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"affiliations_events\" {\n \"affiliation_id\" int4 [not null]\n \"event_id\" int4 [not null]\n}\n\nTable \"affiliations_media\" {\n \"affiliation_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"american_football_action_participants\" {\n \"id\" int4 [not null, increment]\n \"american_football_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"score_type\" varchar(100)\n \"field_line\" int4\n \"yardage\" int4\n \"score_credit\" int4\n \"yards_gained\" int4\n}\n\nTable \"american_football_action_plays\" {\n \"id\" 
int4 [not null, increment]\n \"american_football_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"drive_result\" varchar(100)\n \"points\" int4\n \"comment\" varchar(255)\n}\n\nTable \"american_football_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"tackles_total\" varchar(100)\n \"tackles_solo\" varchar(100)\n \"tackles_assists\" varchar(100)\n \"interceptions_total\" varchar(100)\n \"interceptions_yards\" varchar(100)\n \"interceptions_average\" varchar(100)\n \"interceptions_longest\" varchar(100)\n \"interceptions_touchdown\" varchar(100)\n \"quarterback_hurries\" varchar(100)\n \"sacks_total\" varchar(100)\n \"sacks_yards\" varchar(100)\n \"passes_defensed\" varchar(100)\n}\n\nTable \"american_football_down_progress_stats\" {\n \"id\" int4 [not null, increment]\n \"first_downs_total\" varchar(100)\n \"first_downs_pass\" varchar(100)\n \"first_downs_run\" varchar(100)\n \"first_downs_penalty\" varchar(100)\n \"conversions_third_down\" varchar(100)\n \"conversions_third_down_attempts\" varchar(100)\n \"conversions_third_down_percentage\" varchar(100)\n \"conversions_fourth_down\" varchar(100)\n \"conversions_fourth_down_attempts\" varchar(100)\n \"conversions_fourth_down_percentage\" varchar(100)\n}\n\nTable \"american_football_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"period_value\" int4\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"clock_state\" varchar(100)\n \"down\" int4\n \"team_in_possession_id\" int4\n \"distance_for_1st_down\" int4\n \"field_side\" varchar(100)\n \"field_line\" int4\n \"context\" varchar(40)\n}\n\nTable \"american_football_fumbles_stats\" {\n \"id\" int4 [not null, increment]\n \"fumbles_committed\" varchar(100)\n \"fumbles_forced\" varchar(100)\n \"fumbles_recovered\" varchar(100)\n \"fumbles_lost\" varchar(100)\n \"fumbles_yards_gained\" varchar(100)\n \"fumbles_own_committed\" varchar(100)\n \"fumbles_own_recovered\" varchar(100)\n \"fumbles_own_lost\" varchar(100)\n \"fumbles_own_yards_gained\" varchar(100)\n \"fumbles_opposing_committed\" varchar(100)\n \"fumbles_opposing_recovered\" varchar(100)\n \"fumbles_opposing_lost\" varchar(100)\n \"fumbles_opposing_yards_gained\" varchar(100)\n}\n\nTable \"american_football_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"offensive_plays_yards\" varchar(100)\n \"offensive_plays_number\" varchar(100)\n \"offensive_plays_average_yards_per\" varchar(100)\n \"possession_duration\" varchar(100)\n \"turnovers_giveaway\" varchar(100)\n}\n\nTable \"american_football_passing_stats\" {\n \"id\" int4 [not null, increment]\n \"passes_attempts\" varchar(100)\n \"passes_completions\" varchar(100)\n \"passes_percentage\" varchar(100)\n \"passes_yards_gross\" varchar(100)\n \"passes_yards_net\" varchar(100)\n \"passes_yards_lost\" varchar(100)\n \"passes_touchdowns\" varchar(100)\n \"passes_touchdowns_percentage\" varchar(100)\n \"passes_interceptions\" varchar(100)\n \"passes_interceptions_percentage\" varchar(100)\n \"passes_longest\" varchar(100)\n \"passes_average_yards_per\" varchar(100)\n \"passer_rating\" varchar(100)\n \"receptions_total\" varchar(100)\n \"receptions_yards\" varchar(100)\n \"receptions_touchdowns\" varchar(100)\n \"receptions_first_down\" varchar(100)\n \"receptions_longest\" varchar(100)\n \"receptions_average_yards_per\" varchar(100)\n}\n\nTable \"american_football_penalties_stats\" {\n 
\"id\" int4 [not null, increment]\n \"penalties_total\" varchar(100)\n \"penalty_yards\" varchar(100)\n \"penalty_first_downs\" varchar(100)\n}\n\nTable \"american_football_rushing_stats\" {\n \"id\" int4 [not null, increment]\n \"rushes_attempts\" varchar(100)\n \"rushes_yards\" varchar(100)\n \"rushes_touchdowns\" varchar(100)\n \"rushing_average_yards_per\" varchar(100)\n \"rushes_first_down\" varchar(100)\n \"rushes_longest\" varchar(100)\n}\n\nTable \"american_football_sacks_against_stats\" {\n \"id\" int4 [not null, increment]\n \"sacks_against_yards\" varchar(100)\n \"sacks_against_total\" varchar(100)\n}\n\nTable \"american_football_scoring_stats\" {\n \"id\" int4 [not null, increment]\n \"touchdowns_total\" varchar(100)\n \"touchdowns_passing\" varchar(100)\n \"touchdowns_rushing\" varchar(100)\n \"touchdowns_special_teams\" varchar(100)\n \"touchdowns_defensive\" varchar(100)\n \"extra_points_attempts\" varchar(100)\n \"extra_points_made\" varchar(100)\n \"extra_points_missed\" varchar(100)\n \"extra_points_blocked\" varchar(100)\n \"field_goal_attempts\" varchar(100)\n \"field_goals_made\" varchar(100)\n \"field_goals_missed\" varchar(100)\n \"field_goals_blocked\" varchar(100)\n \"safeties_against\" varchar(100)\n \"two_point_conversions_attempts\" varchar(100)\n \"two_point_conversions_made\" varchar(100)\n \"touchbacks_total\" varchar(100)\n}\n\nTable \"american_football_special_teams_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_punt_total\" varchar(100)\n \"returns_punt_yards\" varchar(100)\n \"returns_punt_average\" varchar(100)\n \"returns_punt_longest\" varchar(100)\n \"returns_punt_touchdown\" varchar(100)\n \"returns_kickoff_total\" varchar(100)\n \"returns_kickoff_yards\" varchar(100)\n \"returns_kickoff_average\" varchar(100)\n \"returns_kickoff_longest\" varchar(100)\n \"returns_kickoff_touchdown\" varchar(100)\n \"returns_total\" varchar(100)\n \"returns_yards\" varchar(100)\n \"punts_total\" varchar(100)\n \"punts_yards_gross\" varchar(100)\n \"punts_yards_net\" varchar(100)\n \"punts_longest\" varchar(100)\n \"punts_inside_20\" varchar(100)\n \"punts_inside_20_percentage\" varchar(100)\n \"punts_average\" varchar(100)\n \"punts_blocked\" varchar(100)\n \"touchbacks_total\" varchar(100)\n \"touchbacks_total_percentage\" varchar(100)\n \"touchbacks_kickoffs\" varchar(100)\n \"touchbacks_kickoffs_percentage\" varchar(100)\n \"touchbacks_punts\" varchar(100)\n \"touchbacks_punts_percentage\" varchar(100)\n \"touchbacks_interceptions\" varchar(100)\n \"touchbacks_interceptions_percentage\" varchar(100)\n \"fair_catches\" varchar(100)\n}\n\nTable \"baseball_action_contact_details\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_pitch_id\" int4 [not null]\n \"location\" varchar(100)\n \"strength\" varchar(100)\n \"velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n}\n\nTable \"baseball_action_pitches\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_play_id\" int4 [not null]\n \"sequence_number\" int4\n \"baseball_defensive_group_id\" int4\n \"umpire_call\" varchar(100)\n \"pitch_location\" varchar(100)\n \"pitch_type\" varchar(100)\n \"pitch_velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n \"ball_type\" varchar(40)\n \"strike_type\" varchar(40)\n}\n\nTable \"baseball_action_plays\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n 
\"notation\" varchar(100)\n \"notation_yaml\" text\n \"baseball_defensive_group_id\" int4\n \"comment\" varchar(255)\n \"runner_on_first_advance\" int4\n \"runner_on_second_advance\" int4\n \"runner_on_third_advance\" int4\n \"outs_recorded\" int4\n \"rbi\" int4\n \"runs_scored\" int4\n \"earned_runs_scored\" varchar(100)\n}\n\nTable \"baseball_action_substitutions\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"sequence_number\" int4\n \"person_type\" varchar(100)\n \"person_original_id\" int4\n \"person_original_position_id\" int4\n \"person_original_lineup_slot\" int4\n \"person_replacing_id\" int4\n \"person_replacing_position_id\" int4\n \"person_replacing_lineup_slot\" int4\n \"substitution_reason\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"baseball_defensive_group\" {\n \"id\" int4 [not null, increment]\n}\n\nTable \"baseball_defensive_players\" {\n \"id\" int4 [not null, increment]\n \"baseball_defensive_group_id\" int4 [not null]\n \"player_id\" int4 [not null]\n \"position_id\" int4 [not null]\n}\n\nTable \"baseball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"double_plays\" int4\n \"triple_plays\" int4\n \"putouts\" int4\n \"assists\" int4\n \"errors\" int4\n \"fielding_percentage\" numeric\n \"defensive_average\" numeric\n \"errors_passed_ball\" int4\n \"errors_catchers_interference\" int4\n}\n\nTable \"baseball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"at_bat_number\" int4\n \"inning_value\" int4\n \"inning_half\" varchar(100)\n \"outs\" int4\n \"balls\" int4\n \"strikes\" int4\n \"runner_on_first_id\" int4\n \"runner_on_second_id\" int4\n \"runner_on_third_id\" int4\n \"runner_on_first\" int2\n \"runner_on_second\" int2\n \"runner_on_third\" int2\n \"runs_this_inning_half\" int4\n \"pitcher_id\" int4\n \"batter_id\" int4\n \"batter_side\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"baseball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"average\" numeric\n \"runs_scored\" int4\n \"at_bats\" int4\n \"hits\" int4\n \"rbi\" int4\n \"total_bases\" int4\n \"slugging_percentage\" numeric\n \"bases_on_balls\" int4\n \"strikeouts\" int4\n \"left_on_base\" int4\n \"left_in_scoring_position\" int4\n \"singles\" int4\n \"doubles\" int4\n \"triples\" int4\n \"home_runs\" int4\n \"grand_slams\" int4\n \"at_bats_per_rbi\" numeric\n \"plate_appearances_per_rbi\" numeric\n \"at_bats_per_home_run\" numeric\n \"plate_appearances_per_home_run\" numeric\n \"sac_flies\" int4\n \"sac_bunts\" int4\n \"grounded_into_double_play\" int4\n \"moved_up\" int4\n \"on_base_percentage\" numeric\n \"stolen_bases\" int4\n \"stolen_bases_caught\" int4\n \"stolen_bases_average\" numeric\n \"hit_by_pitch\" int4\n \"defensive_interferance_reaches\" int4\n \"on_base_plus_slugging\" numeric\n \"plate_appearances\" int4\n \"hits_extra_base\" int4\n}\n\nTable \"baseball_pitching_stats\" {\n \"id\" int4 [not null, increment]\n \"runs_allowed\" int4\n \"singles_allowed\" int4\n \"doubles_allowed\" int4\n \"triples_allowed\" int4\n \"home_runs_allowed\" int4\n \"innings_pitched\" varchar(20)\n \"hits\" int4\n \"earned_runs\" int4\n \"unearned_runs\" int4\n \"bases_on_balls\" int4\n \"bases_on_balls_intentional\" int4\n \"strikeouts\" int4\n \"strikeout_to_bb_ratio\" numeric\n \"number_of_pitches\" int4\n \"era\" numeric\n \"inherited_runners_scored\" int4\n \"pick_offs\" int4\n \"errors_hit_with_pitch\" int4\n \"errors_wild_pitch\" 
int4\n \"balks\" int4\n \"wins\" int4\n \"losses\" int4\n \"saves\" int4\n \"shutouts\" int4\n \"games_complete\" int4\n \"games_finished\" int4\n \"winning_percentage\" numeric\n \"event_credit\" varchar(40)\n \"save_credit\" varchar(40)\n}\n\nTable \"basketball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"steals_total\" varchar(100)\n \"steals_per_game\" varchar(100)\n \"blocks_total\" varchar(100)\n \"blocks_per_game\" varchar(100)\n}\n\nTable \"basketball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"basketball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"field_goals_made\" int4\n \"field_goals_attempted\" int4\n \"field_goals_percentage\" varchar(100)\n \"field_goals_per_game\" varchar(100)\n \"field_goals_attempted_per_game\" varchar(100)\n \"field_goals_percentage_adjusted\" varchar(100)\n \"three_pointers_made\" int4\n \"three_pointers_attempted\" int4\n \"three_pointers_percentage\" varchar(100)\n \"three_pointers_per_game\" varchar(100)\n \"three_pointers_attempted_per_game\" varchar(100)\n \"free_throws_made\" varchar(100)\n \"free_throws_attempted\" varchar(100)\n \"free_throws_percentage\" varchar(100)\n \"free_throws_per_game\" varchar(100)\n \"free_throws_attempted_per_game\" varchar(100)\n \"points_scored_total\" varchar(100)\n \"points_scored_per_game\" varchar(100)\n \"assists_total\" varchar(100)\n \"assists_per_game\" varchar(100)\n \"turnovers_total\" varchar(100)\n \"turnovers_per_game\" varchar(100)\n \"points_scored_off_turnovers\" varchar(100)\n \"points_scored_in_paint\" varchar(100)\n \"points_scored_on_second_chance\" varchar(100)\n \"points_scored_on_fast_break\" varchar(100)\n}\n\nTable \"basketball_rebounding_stats\" {\n \"id\" int4 [not null, increment]\n \"rebounds_total\" varchar(100)\n \"rebounds_per_game\" varchar(100)\n \"rebounds_defensive\" varchar(100)\n \"rebounds_offensive\" varchar(100)\n \"team_rebounds_total\" varchar(100)\n \"team_rebounds_per_game\" varchar(100)\n \"team_rebounds_defensive\" varchar(100)\n \"team_rebounds_offensive\" varchar(100)\n}\n\nTable \"basketball_team_stats\" {\n \"id\" int4 [not null, increment]\n \"timeouts_left\" varchar(100)\n \"largest_lead\" varchar(100)\n \"fouls_total\" varchar(100)\n \"turnover_margin\" varchar(100)\n}\n\nTable \"bookmakers\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"core_person_stats\" {\n \"id\" int4 [not null, increment]\n \"time_played_event\" varchar(40)\n \"time_played_total\" varchar(40)\n \"time_played_event_average\" varchar(40)\n \"events_played\" int4\n \"events_started\" int4\n \"position_id\" int4\n}\n\nTable \"core_stats\" {\n \"id\" int4 [not null, increment]\n \"score\" varchar(100)\n \"score_opposing\" varchar(100)\n \"score_attempts\" varchar(100)\n \"score_attempts_opposing\" varchar(100)\n \"score_percentage\" varchar(100)\n \"score_percentage_opposing\" varchar(100)\n}\n\nTable \"db_info\" {\n \"version\" varchar(100) [not null, default: 16]\n}\n\nTable \"display_names\" {\n \"id\" int4 [not null, increment]\n \"language\" varchar(100) [not null]\n \"entity_type\" varchar(100) [not null]\n \"entity_id\" int4 [not null]\n \"full_name\" varchar(100)\n \"first_name\" varchar(100)\n \"middle_name\" 
varchar(100)\n \"last_name\" varchar(100)\n \"alias\" varchar(100)\n \"abbreviation\" varchar(100)\n \"short_name\" varchar(100)\n \"prefix\" varchar(20)\n \"suffix\" varchar(20)\n}\n\nTable \"document_classes\" {\n \"id\" int4 [not null, increment]\n \"name\" varchar(100)\n}\n\nTable \"document_contents\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"sportsml\" varchar(200)\n \"abstract\" text\n}\n\nTable \"document_fixtures\" {\n \"id\" int4 [not null, increment]\n \"fixture_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"name\" varchar(100)\n \"document_class_id\" int4 [not null]\n}\n\nTable \"document_fixtures_events\" {\n \"id\" int4 [not null, increment]\n \"document_fixture_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"latest_document_id\" int4 [not null]\n \"last_update\" timestamp\n}\n\nTable \"document_package_entry\" {\n \"id\" int4 [not null, increment]\n \"document_package_id\" int4 [not null]\n \"rank\" varchar(100)\n \"document_id\" int4 [not null]\n \"headline\" varchar(100)\n \"short_headline\" varchar(100)\n}\n\nTable \"document_packages\" {\n \"id\" int4 [not null, increment]\n \"package_key\" varchar(100)\n \"package_name\" varchar(100)\n \"date_time\" date\n}\n\nTable \"documents\" {\n \"id\" int4 [not null, increment]\n \"doc_id\" varchar(75) [not null]\n \"publisher_id\" int4 [not null]\n \"date_time\" timestamp\n \"title\" varchar(255)\n \"language\" varchar(100)\n \"priority\" varchar(100)\n \"revision_id\" varchar(75)\n \"stats_coverage\" varchar(100)\n \"document_fixture_id\" int4 [not null]\n \"source_id\" int4\n \"db_loading_date_time\" timestamp\n}\n\nTable \"documents_media\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"media_id\" int4 [not null]\n \"media_caption_id\" int4 [not null]\n}\n\nTable \"events\" {\n \"id\" int4 [not null, increment]\n \"event_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"site_id\" int4\n \"site_alignment\" varchar(100)\n \"event_status\" varchar(100)\n \"duration\" varchar(100)\n \"attendance\" varchar(100)\n \"last_update\" timestamp\n}\n\nTable \"events_documents\" {\n \"event_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"events_media\" {\n \"event_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"events_sub_seasons\" {\n \"event_id\" int4 [not null]\n \"sub_season_id\" int4 [not null]\n}\n\nTable \"ice_hockey_action_participants\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"point_credit\" int4\n}\n\nTable \"ice_hockey_action_plays\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"play_result\" varchar(100)\n \"comment\" varchar(255)\n}\n\nTable \"ice_hockey_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_power_play_allowed\" varchar(100)\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_power_play_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"penalty_killing_amount\" varchar(100)\n \"penalty_killing_percentage\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"takeaways\" varchar(100)\n \"shutouts\" varchar(100)\n \"minutes_penalty_killing\" varchar(100)\n \"hits\" 
varchar(100)\n \"goals_empty_net_allowed\" varchar(100)\n \"goals_short_handed_allowed\" varchar(100)\n \"goals_shootout_allowed\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n}\n\nTable \"ice_hockey_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"ice_hockey_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_power_play\" varchar(100)\n \"goals_short_handed\" varchar(100)\n \"goals_even_strength\" varchar(100)\n \"goals_empty_net\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_penalty_shot\" varchar(100)\n \"assists\" varchar(100)\n \"points\" varchar(100)\n \"power_play_amount\" varchar(100)\n \"power_play_percentage\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(100)\n \"shots_penalty_shot_percentage\" varchar(100)\n \"giveaways\" varchar(100)\n \"minutes_power_play\" varchar(100)\n \"faceoff_wins\" varchar(100)\n \"faceoff_losses\" varchar(100)\n \"faceoff_win_percentage\" varchar(100)\n \"scoring_chances\" varchar(100)\n}\n\nTable \"ice_hockey_player_stats\" {\n \"id\" int4 [not null, increment]\n \"plus_minus\" varchar(100)\n}\n\nTable \"injury_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"injury_status\" varchar(100)\n \"injury_type\" varchar(100)\n \"injury_comment\" varchar(100)\n \"disabled_list\" varchar(100)\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n \"season_id\" int4\n \"phase_type\" varchar(100)\n \"injury_side\" varchar(100)\n}\n\nTable \"key_aliases\" {\n \"id\" int4 [not null, increment]\n \"key_id\" int4 [not null]\n \"key_root_id\" int4 [not null]\n}\n\nTable \"key_roots\" {\n \"id\" int4 [not null, increment]\n \"key_type\" varchar(100)\n}\n\nTable \"latest_revisions\" {\n \"id\" int4 [not null, increment]\n \"revision_id\" varchar(75) [not null]\n \"latest_document_id\" int4 [not null]\n}\n\nTable \"locations\" {\n \"id\" int4 [not null, increment]\n \"timezone\" varchar(100)\n \"latitude\" varchar(100)\n \"longitude\" varchar(100)\n \"country_code\" varchar(100)\n}\n\nTable \"media\" {\n \"id\" int4 [not null, increment]\n \"object_id\" int4\n \"source_id\" int4\n \"revision_id\" int4\n \"media_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"date_time\" varchar(100)\n \"credit_id\" int4 [not null]\n \"db_loading_date_time\" timestamp\n \"creation_location_id\" int4 [not null]\n}\n\nTable \"media_captions\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"caption_type\" varchar(100)\n \"caption\" varchar(100)\n \"caption_author_id\" int4 [not null]\n \"language\" varchar(100)\n \"caption_size\" varchar(100)\n}\n\nTable \"media_contents\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"object\" varchar(100)\n \"format\" varchar(100)\n \"mime_type\" varchar(100)\n \"height\" varchar(100)\n \"width\" varchar(100)\n \"duration\" varchar(100)\n \"file_size\" varchar(100)\n \"resolution\" varchar(100)\n}\n\nTable \"media_keywords\" {\n \"id\" int4 [not null, increment]\n \"keyword\" varchar(100)\n \"media_id\" int4 [not null]\n}\n\nTable \"motor_racing_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not 
null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"lap\" varchar(100)\n \"laps_remaining\" varchar(100)\n \"time_elapsed\" varchar(100)\n \"flag_state\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"motor_racing_qualifying_stats\" {\n \"id\" int4 [not null, increment]\n \"grid\" varchar(100)\n \"pole_position\" varchar(100)\n \"pole_wins\" varchar(100)\n \"qualifying_speed\" varchar(100)\n \"qualifying_speed_units\" varchar(100)\n \"qualifying_time\" varchar(100)\n \"qualifying_position\" varchar(100)\n}\n\nTable \"motor_racing_race_stats\" {\n \"id\" int4 [not null, increment]\n \"time_behind_leader\" varchar(100)\n \"laps_behind_leader\" varchar(100)\n \"time_ahead_follower\" varchar(100)\n \"laps_ahead_follower\" varchar(100)\n \"time\" varchar(100)\n \"points\" varchar(100)\n \"points_rookie\" varchar(100)\n \"bonus\" varchar(100)\n \"laps_completed\" varchar(100)\n \"laps_leading_total\" varchar(100)\n \"distance_leading\" varchar(100)\n \"distance_completed\" varchar(100)\n \"distance_units\" varchar(40)\n \"speed_average\" varchar(40)\n \"speed_units\" varchar(40)\n \"status\" varchar(40)\n \"finishes_top_5\" varchar(40)\n \"finishes_top_10\" varchar(40)\n \"starts\" varchar(40)\n \"finishes\" varchar(40)\n \"non_finishes\" varchar(40)\n \"wins\" varchar(40)\n \"races_leading\" varchar(40)\n \"money\" varchar(40)\n \"money_units\" varchar(40)\n \"leads_total\" varchar(40)\n}\n\nTable \"outcome_totals\" {\n \"id\" int4 [not null, increment]\n \"standing_subgroup_id\" int4 [not null]\n \"outcome_holder_type\" varchar(100)\n \"outcome_holder_id\" int4\n \"rank\" varchar(100)\n \"wins\" varchar(100)\n \"losses\" varchar(100)\n \"ties\" varchar(100)\n \"undecideds\" varchar(100)\n \"winning_percentage\" varchar(100)\n \"points_scored_for\" varchar(100)\n \"points_scored_against\" varchar(100)\n \"points_difference\" varchar(100)\n \"standing_points\" varchar(100)\n \"streak_type\" varchar(100)\n \"streak_duration\" varchar(100)\n \"streak_total\" varchar(100)\n \"streak_start\" date\n \"streak_end\" date\n}\n\nTable \"participants_events\" {\n \"id\" int4 [not null, increment]\n \"participant_type\" varchar(100) [not null]\n \"participant_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"alignment\" varchar(100)\n \"score\" varchar(100)\n \"event_outcome\" varchar(100)\n \"rank\" int4\n}\n\nTable \"periods\" {\n \"id\" int4 [not null, increment]\n \"participant_event_id\" int4 [not null]\n \"period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"person_event_metadata\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"status\" varchar(100)\n \"health\" varchar(100)\n \"weight\" varchar(100)\n \"role_id\" int4\n \"position_id\" int4\n \"team_id\" int4\n \"lineup_slot\" int4\n \"lineup_slot_sequence\" int4\n}\n\nTable \"person_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"membership_type\" varchar(40) [not null]\n \"membership_id\" int4 [not null]\n \"role_id\" int4\n \"role_status\" varchar(40)\n \"phase_status\" varchar(40)\n \"uniform_number\" varchar(20)\n \"regular_position_id\" int4\n \"regular_position_depth\" varchar(40)\n \"height\" varchar(100)\n \"weight\" varchar(100)\n \"start_date_time\" timestamp\n \"start_season_id\" int4\n \"end_date_time\" timestamp\n \"end_season_id\" int4\n \"entry_reason\" varchar(40)\n \"exit_reason\" varchar(40)\n \"selection_level\" int4\n \"selection_sublevel\" int4\n \"selection_overall\" int4\n}\n\nTable \"persons\" {\n 
\"id\" int4 [not null, increment]\n \"person_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"gender\" varchar(20)\n \"birth_date\" varchar(30)\n \"death_date\" varchar(30)\n \"birth_location_id\" int4\n \"hometown_location_id\" int4\n \"residence_location_id\" int4\n \"death_location_id\" int4\n}\n\nTable \"persons_documents\" {\n \"person_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"persons_media\" {\n \"person_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"positions\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"abbreviation\" varchar(100) [not null]\n}\n\nTable \"publishers\" {\n \"id\" int4 [not null, increment]\n \"publisher_key\" varchar(100) [not null]\n \"publisher_name\" varchar(100)\n}\n\nTable \"roles\" {\n \"id\" int4 [not null, increment]\n \"role_key\" varchar(100) [not null]\n \"role_name\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"seasons\" {\n \"id\" int4 [not null, increment]\n \"season_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"league_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"sites\" {\n \"id\" int4 [not null, increment]\n \"site_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"soccer_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"goals_against_total\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"catches_punches\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_shootout_total\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"shutouts\" varchar(100)\n}\n\nTable \"soccer_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"minutes_elapsed\" varchar(100)\n \"period_minute_elapsed\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"soccer_foul_stats\" {\n \"id\" int4 [not null, increment]\n \"fouls_suffered\" varchar(100)\n \"fouls_commited\" varchar(100)\n \"cautions_total\" varchar(100)\n \"cautions_pending\" varchar(100)\n \"caution_points_total\" varchar(100)\n \"caution_points_pending\" varchar(100)\n \"ejections_total\" varchar(100)\n}\n\nTable \"soccer_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_total\" varchar(100)\n \"assists_game_winning\" varchar(100)\n \"assists_game_tying\" varchar(100)\n \"assists_overtime\" varchar(100)\n \"assists_total\" varchar(100)\n \"points\" varchar(100)\n \"shots_total\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_hit_frame\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_scored\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(40)\n \"shots_penalty_shot_percentage\" varchar(40)\n \"shots_shootout_taken\" varchar(40)\n \"shots_shootout_scored\" varchar(40)\n \"shots_shootout_missed\" varchar(40)\n \"shots_shootout_percentage\" varchar(40)\n \"giveaways\" varchar(40)\n \"offsides\" varchar(40)\n \"corner_kicks\" varchar(40)\n \"hat_tricks\" varchar(40)\n}\n\nTable 
\"standing_subgroups\" {\n \"id\" int4 [not null, increment]\n \"standing_id\" int4 [not null]\n \"affiliation_id\" int4 [not null]\n}\n\nTable \"standings\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"standing_type\" varchar(100)\n \"sub_season_id\" int4 [not null]\n \"last_updated\" varchar(100)\n \"duration_scope\" varchar(100)\n \"competition_scope\" varchar(100)\n \"competition_scope_id\" varchar(100)\n \"alignment_scope\" varchar(100)\n \"site_scope\" varchar(100)\n \"scoping_label\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"source\" varchar(100)\n}\n\nTable \"stats\" {\n \"id\" int4 [not null, increment]\n \"stat_repository_type\" varchar(100)\n \"stat_repository_id\" int4 [not null]\n \"stat_holder_type\" varchar(100)\n \"stat_holder_id\" int4\n \"stat_coverage_type\" varchar(100)\n \"stat_coverage_id\" int4\n \"context\" varchar(40) [not null]\n}\n\nTable \"sub_periods\" {\n \"id\" int4 [not null, increment]\n \"period_id\" int4 [not null]\n \"sub_period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"sub_seasons\" {\n \"id\" int4 [not null, increment]\n \"sub_season_key\" varchar(100) [not null]\n \"season_id\" int4 [not null]\n \"sub_season_type\" varchar(100) [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"team_american_football_stats\" {\n \"id\" int4 [not null, increment]\n \"yards_per_attempt\" varchar(100)\n \"average_starting_position\" varchar(100)\n \"timeouts\" varchar(100)\n \"time_of_possession\" varchar(100)\n \"turnover_ratio\" varchar(100)\n}\n\nTable \"team_phases\" {\n \"id\" int4 [not null, increment]\n \"team_id\" int4 [not null]\n \"start_season_id\" int4\n \"end_season_id\" int4\n \"affiliation_id\" int4 [not null]\n \"start_date_time\" varchar(100)\n \"end_date_time\" varchar(100)\n \"phase_status\" varchar(40)\n \"role_id\" int4\n}\n\nTable \"teams\" {\n \"id\" int4 [not null, increment]\n \"team_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"home_site_id\" int4\n}\n\nTable \"teams_documents\" {\n \"team_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"teams_media\" {\n \"team_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"tennis_action_points\" {\n \"id\" int4 [not null, increment]\n \"sub_period_id\" varchar(100)\n \"sequence_number\" varchar(100)\n \"win_type\" varchar(100)\n}\n\nTable \"tennis_action_volleys\" {\n \"id\" int4 [not null, increment]\n \"sequence_number\" varchar(100)\n \"tennis_action_points_id\" int4\n \"landing_location\" varchar(100)\n \"swing_type\" varchar(100)\n \"result\" varchar(100)\n \"spin_type\" varchar(100)\n \"trajectory_details\" varchar(100)\n}\n\nTable \"tennis_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"tennis_set\" varchar(100)\n \"game\" varchar(100)\n \"server_person_id\" int4\n \"server_score\" varchar(100)\n \"receiver_person_id\" int4\n \"receiver_score\" varchar(100)\n \"service_number\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"tennis_return_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"first_service_return_points_won\" varchar(100)\n \"first_service_return_points_won_pct\" varchar(100)\n \"second_service_return_points_won\" varchar(100)\n \"second_service_return_points_won_pct\" varchar(100)\n \"return_games_played\" varchar(100)\n \"return_games_won\" varchar(100)\n 
\"return_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_converted\" varchar(100)\n \"break_points_converted_pct\" varchar(100)\n}\n\nTable \"tennis_service_stats\" {\n \"id\" int4 [not null, increment]\n \"services_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"aces\" varchar(100)\n \"first_services_good\" varchar(100)\n \"first_services_good_pct\" varchar(100)\n \"first_service_points_won\" varchar(100)\n \"first_service_points_won_pct\" varchar(100)\n \"second_service_points_won\" varchar(100)\n \"second_service_points_won_pct\" varchar(100)\n \"service_games_played\" varchar(100)\n \"service_games_won\" varchar(100)\n \"service_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_saved\" varchar(100)\n \"break_points_saved_pct\" varchar(100)\n}\n\nTable \"wagering_moneylines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_odds_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"numerator\" varchar(100)\n \"denominator\" varchar(100)\n \"prediction\" varchar(100)\n \"payout_calculation\" varchar(100)\n \"payout_amount\" varchar(100)\n}\n\nTable \"wagering_runlines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"line_value\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_straight_spread_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_value\" varchar(100)\n \"line_value_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_total_score_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_over\" varchar(100)\n \"line_under\" varchar(100)\n \"total\" varchar(100)\n \"total_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"weather_conditions\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"temperature\" varchar(100)\n \"temperature_units\" varchar(40)\n \"humidity\" varchar(100)\n \"clouds\" varchar(100)\n \"wind_direction\" varchar(100)\n \"wind_velocity\" varchar(100)\n \"weather_code\" varchar(100)\n}\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/yugabyte/yugabyte-db/blob/master/sample/sportsdb_tables.sql" } } ================================================ FILE: tasks/postgres/standard/sports/participant_report_optimization/verify.py 
================================================
"""
Verification script for PostgreSQL Sports Task 3: Query Performance Optimization
"""

import os
import sys
import psycopg2
from decimal import Decimal


def rows_match(actual_row, expected_row):
    """
    Compare two rows with appropriate tolerance.
    For Decimal types: allows 0.001 tolerance
    For other types: requires exact match
    """
    if len(actual_row) != len(expected_row):
        return False
    for actual, expected in zip(actual_row, expected_row):
        if isinstance(actual, Decimal) and isinstance(expected, Decimal):
            if abs(float(actual) - float(expected)) > 0.001:
                return False
        elif isinstance(actual, float) and isinstance(expected, float):
            if abs(actual - expected) > 0.001:
                return False
        elif actual != expected:
            return False
    return True


def get_connection_params() -> dict:
    """Get database connection parameters."""
    return {
        "host": os.getenv("POSTGRES_HOST", "localhost"),
        "port": int(os.getenv("POSTGRES_PORT", 5432)),
        "database": os.getenv("POSTGRES_DATABASE", "sports"),
        "user": os.getenv("POSTGRES_USERNAME", "postgres"),
        "password": os.getenv("POSTGRES_PASSWORD", "postgres")
    }


def verify_report_data(conn) -> bool:
    """Verify the report table contains the expected data."""
    with conn.cursor() as cur:
        # Get actual results from the report table
        cur.execute("""
            SELECT participant_id, event_count, stat_count, stat_type_count, last_event_date
            FROM participant_performance_report
            ORDER BY participant_id
        """)
        actual_results = cur.fetchall()

        if len(actual_results) == 0:
            print("❌ Report table is empty")
            return False

        # Execute ground truth query
        cur.execute("""
            SELECT
                pe.participant_id,
                COUNT(pe.event_id) as event_count,
                (SELECT COUNT(*) FROM stats s
                 WHERE s.stat_holder_id = pe.participant_id
                   AND s.stat_holder_type = 'persons') as stat_count,
                (SELECT COUNT(DISTINCT s.stat_repository_type) FROM stats s
                 WHERE s.stat_holder_id = pe.participant_id
                   AND s.stat_holder_type = 'persons') as stat_type_count,
                (SELECT MAX(e.start_date_time) FROM events e
                 JOIN participants_events pe2 ON e.id = pe2.event_id
                 WHERE pe2.participant_id = pe.participant_id) as last_event_date
            FROM participants_events pe
            WHERE pe.participant_id <= 50
            GROUP BY pe.participant_id
            ORDER BY pe.participant_id
        """)
        expected_results = cur.fetchall()

        if len(actual_results) != len(expected_results):
            print(f"❌ Expected {len(expected_results)} report records, got {len(actual_results)}")
            return False

        mismatches = 0
        for actual, expected in zip(actual_results, expected_results):
            if not rows_match(actual, expected):
                if mismatches < 5:
                    print(f"❌ Row mismatch: expected {expected}, got {actual}")
                mismatches += 1

        if mismatches > 0:
            print(f"❌ Total mismatches in report data: {mismatches}")
            return False

        print(f"✅ Report data is correct ({len(actual_results)} records)")
        return True
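

# The optimization check below does not inspect query plans; it only confirms that
# the two indexes this query shape depends on exist in pg_indexes: one covering
# participants_events.participant_id (the outer filter and the correlated
# MAX(start_date_time) subquery) and one covering stats(stat_holder_type,
# stat_holder_id) (both correlated COUNT subqueries).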
def verify_performance_optimization(conn) -> bool:
    """Verify that key performance optimization indexes have been implemented."""
    with conn.cursor() as cur:
        print("\n🔍 Checking for critical performance indexes...")

        # Check 1: participants_events.participant_id index (critical for subqueries)
        cur.execute("""
            SELECT indexname, indexdef
            FROM pg_indexes
            WHERE schemaname = 'public'
              AND tablename = 'participants_events'
              AND indexdef LIKE '%participant_id%'
        """)
        participant_indexes = cur.fetchall()
        has_participant_index = len(participant_indexes) > 0

        # Check 2: stats table optimization (critical for subquery filtering)
        cur.execute("""
            SELECT indexname, indexdef
            FROM pg_indexes
            WHERE schemaname = 'public'
              AND tablename = 'stats'
              AND indexdef LIKE '%stat_holder_type%'
              AND indexdef LIKE '%stat_holder_id%'
        """)
        stats_indexes = cur.fetchall()
        has_stats_index = len(stats_indexes) > 0

        # Report findings
        critical_indexes_found = 0
        if has_participant_index:
            print("✅ Found participant filtering index on participants_events.participant_id")
            critical_indexes_found += 1
        else:
            print("❌ Missing critical index on participants_events.participant_id")

        if has_stats_index:
            print("✅ Found subquery optimization index on stats table")
            critical_indexes_found += 1
        else:
            print("❌ Missing critical index on stats table")

        # Must have both critical indexes for this subquery-heavy query
        if critical_indexes_found >= 2:
            print(f"\n✅ Performance optimization: PASS ({critical_indexes_found}/2 critical indexes found)")
            return True
        else:
            print(f"\n❌ Performance optimization: FAIL ({critical_indexes_found}/2 critical indexes found)")
            print(" Create these critical indexes:")
            print(" - CREATE INDEX ON participants_events(participant_id);")
            print(" - CREATE INDEX ON stats(stat_holder_type, stat_holder_id);")
            return False


def main():
    """Main verification function."""
    print("=" * 50)
    print("Verifying Sports Task 3: Query Performance Optimization")
    print("=" * 50)

    # Get connection parameters
    conn_params = get_connection_params()
    if not conn_params["database"]:
        print("❌ No database specified")
        sys.exit(1)

    try:
        # Connect to database
        conn = psycopg2.connect(**conn_params)

        # Verify all components
        success = (
            verify_report_data(conn)
            and verify_performance_optimization(conn)
        )

        conn.close()

        if success:
            print("\n🎉 Task verification: PASS")
            sys.exit(0)
        else:
            print("\n❌ Task verification: FAIL")
            sys.exit(1)

    except psycopg2.Error as e:
        print(f"❌ Database error: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"❌ Verification error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

================================================
FILE: tasks/postgres/standard/sports/team_roster_management/description.md
================================================
# Team Roster Management Operations

## Background

You need to manage team rosters for the upcoming season, including player transfers, injury tracking, and performance evaluations.

## Requirements

Complete the following 5 operations in order:

### 1. Set Up Player Performance Tracking

Create a table called `player_evaluation` with the following structure:

- performance_id (serial primary key)
- person_id (integer not null, references persons(id))
- batting_avg (decimal)
- home_runs (integer)
- rbis (integer)
- games_played (integer)
- performance_score (decimal)
- evaluation_date (date)

Add constraint: CHECK (batting_avg BETWEEN 0 AND 1)

### 2. Load Historical Player Statistics

Insert player performance data into `player_evaluation`:

- Select all players who have offensive statistics
- Calculate batting_avg as hits/at_bats (handle division by zero)
- Sum up home_runs, rbi from baseball_offensive_stats
- Count games_played from person_event_metadata
- Calculate performance_score as: (batting_avg * 1000) + (home_runs * 5) + (rbi * 2)
- Only include players with at least 10 games played
- Set evaluation_date to '2024-01-01'

### 3.
Track Player Health Status Create a table called `player_injury_status`: - status_id (serial primary key) - person_id (integer unique not null) - injury_count (integer default 0) - last_injury_date (date) - current_status (varchar check in ('healthy', 'injured', 'recovering')) Insert data by: - Including all players from player_evaluation - Count injuries from injury_phases for each player - Get the most recent injury start_date as last_injury_date - Set current_status: 'injured' if injury has no end_date, otherwise 'healthy' ### 4. Adjust Scores Based on Health Update `player_evaluation` to reduce performance scores for injured players: - Reduce performance_score by 20% for players with current_status = 'injured' - Reduce performance_score by 10% for players with injury_count > 2 - Set minimum performance_score to 0 (no negative scores) ### 5. Generate Performance Summary Report Create a summary table called `team_performance_summary`: - summary_id (serial primary key) - metric_name (varchar unique) - metric_value (decimal) Insert the following metrics: - 'total_players' - count of players in player_evaluation - 'avg_batting_average' - average batting_avg - 'total_home_runs' - sum of all home_runs - 'avg_performance_score' - average performance_score - 'injured_player_count' - count of injured players - 'healthy_player_count' - count of healthy players ## Important Notes - Handle NULL values appropriately (treat as 0 where needed) - Ensure foreign key constraints are properly set - Do NOT use ROUND functions in calculations - Use COALESCE to handle NULL values in calculations ================================================ FILE: tasks/postgres/standard/sports/team_roster_management/meta.json ================================================ { "task_id": "team_roster_management", "task_name": "Team Roster Management", "category_id": "sports", "category_name": "Sports", "description": "Manage team rosters with player transfers, injury tracking, performance evaluations, and health status adjustments.", "author": "Lingxiao Du", "created_at": "2025-08-18", "difficulty": "L3", "tags": [ "schema design", "data migration", "statistical aggregation" ], "mcp": [ "postgres" ], "meta_data": { "stateType": "text", "stateContent": "Table \"addresses\" {\n \"id\" int4 [not null, increment]\n \"location_id\" int4 [not null]\n \"language\" varchar(100)\n \"suite\" varchar(100)\n \"floor\" varchar(100)\n \"building\" varchar(100)\n \"street_number\" varchar(100)\n \"street_prefix\" varchar(100)\n \"street\" varchar(100)\n \"street_suffix\" varchar(100)\n \"neighborhood\" varchar(100)\n \"district\" varchar(100)\n \"locality\" varchar(100)\n \"county\" varchar(100)\n \"region\" varchar(100)\n \"postal_code\" varchar(100)\n \"country\" varchar(100)\n}\n\nTable \"affiliation_phases\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"ancestor_affiliation_id\" int4\n \"start_season_id\" int4\n \"start_date_time\" timestamp\n \"end_season_id\" int4\n \"end_date_time\" timestamp\n}\n\nTable \"affiliations\" {\n \"id\" int4 [not null, increment]\n \"affiliation_key\" varchar(100) [not null]\n \"affiliation_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n}\n\nTable \"affiliations_documents\" {\n \"affiliation_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"affiliations_events\" {\n \"affiliation_id\" int4 [not null]\n \"event_id\" int4 [not null]\n}\n\nTable \"affiliations_media\" {\n \"affiliation_id\" int4 [not null]\n \"media_id\" int4 [not 
null]\n}\n\nTable \"american_football_action_participants\" {\n \"id\" int4 [not null, increment]\n \"american_football_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"score_type\" varchar(100)\n \"field_line\" int4\n \"yardage\" int4\n \"score_credit\" int4\n \"yards_gained\" int4\n}\n\nTable \"american_football_action_plays\" {\n \"id\" int4 [not null, increment]\n \"american_football_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"drive_result\" varchar(100)\n \"points\" int4\n \"comment\" varchar(255)\n}\n\nTable \"american_football_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"tackles_total\" varchar(100)\n \"tackles_solo\" varchar(100)\n \"tackles_assists\" varchar(100)\n \"interceptions_total\" varchar(100)\n \"interceptions_yards\" varchar(100)\n \"interceptions_average\" varchar(100)\n \"interceptions_longest\" varchar(100)\n \"interceptions_touchdown\" varchar(100)\n \"quarterback_hurries\" varchar(100)\n \"sacks_total\" varchar(100)\n \"sacks_yards\" varchar(100)\n \"passes_defensed\" varchar(100)\n}\n\nTable \"american_football_down_progress_stats\" {\n \"id\" int4 [not null, increment]\n \"first_downs_total\" varchar(100)\n \"first_downs_pass\" varchar(100)\n \"first_downs_run\" varchar(100)\n \"first_downs_penalty\" varchar(100)\n \"conversions_third_down\" varchar(100)\n \"conversions_third_down_attempts\" varchar(100)\n \"conversions_third_down_percentage\" varchar(100)\n \"conversions_fourth_down\" varchar(100)\n \"conversions_fourth_down_attempts\" varchar(100)\n \"conversions_fourth_down_percentage\" varchar(100)\n}\n\nTable \"american_football_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"period_value\" int4\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"clock_state\" varchar(100)\n \"down\" int4\n \"team_in_possession_id\" int4\n \"distance_for_1st_down\" int4\n \"field_side\" varchar(100)\n \"field_line\" int4\n \"context\" varchar(40)\n}\n\nTable \"american_football_fumbles_stats\" {\n \"id\" int4 [not null, increment]\n \"fumbles_committed\" varchar(100)\n \"fumbles_forced\" varchar(100)\n \"fumbles_recovered\" varchar(100)\n \"fumbles_lost\" varchar(100)\n \"fumbles_yards_gained\" varchar(100)\n \"fumbles_own_committed\" varchar(100)\n \"fumbles_own_recovered\" varchar(100)\n \"fumbles_own_lost\" varchar(100)\n \"fumbles_own_yards_gained\" varchar(100)\n \"fumbles_opposing_committed\" varchar(100)\n \"fumbles_opposing_recovered\" varchar(100)\n \"fumbles_opposing_lost\" varchar(100)\n \"fumbles_opposing_yards_gained\" varchar(100)\n}\n\nTable \"american_football_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"offensive_plays_yards\" varchar(100)\n \"offensive_plays_number\" varchar(100)\n \"offensive_plays_average_yards_per\" varchar(100)\n \"possession_duration\" varchar(100)\n \"turnovers_giveaway\" varchar(100)\n}\n\nTable \"american_football_passing_stats\" {\n \"id\" int4 [not null, increment]\n \"passes_attempts\" varchar(100)\n \"passes_completions\" varchar(100)\n \"passes_percentage\" varchar(100)\n \"passes_yards_gross\" varchar(100)\n \"passes_yards_net\" varchar(100)\n \"passes_yards_lost\" varchar(100)\n \"passes_touchdowns\" varchar(100)\n \"passes_touchdowns_percentage\" varchar(100)\n \"passes_interceptions\" varchar(100)\n \"passes_interceptions_percentage\" varchar(100)\n 
\"passes_longest\" varchar(100)\n \"passes_average_yards_per\" varchar(100)\n \"passer_rating\" varchar(100)\n \"receptions_total\" varchar(100)\n \"receptions_yards\" varchar(100)\n \"receptions_touchdowns\" varchar(100)\n \"receptions_first_down\" varchar(100)\n \"receptions_longest\" varchar(100)\n \"receptions_average_yards_per\" varchar(100)\n}\n\nTable \"american_football_penalties_stats\" {\n \"id\" int4 [not null, increment]\n \"penalties_total\" varchar(100)\n \"penalty_yards\" varchar(100)\n \"penalty_first_downs\" varchar(100)\n}\n\nTable \"american_football_rushing_stats\" {\n \"id\" int4 [not null, increment]\n \"rushes_attempts\" varchar(100)\n \"rushes_yards\" varchar(100)\n \"rushes_touchdowns\" varchar(100)\n \"rushing_average_yards_per\" varchar(100)\n \"rushes_first_down\" varchar(100)\n \"rushes_longest\" varchar(100)\n}\n\nTable \"american_football_sacks_against_stats\" {\n \"id\" int4 [not null, increment]\n \"sacks_against_yards\" varchar(100)\n \"sacks_against_total\" varchar(100)\n}\n\nTable \"american_football_scoring_stats\" {\n \"id\" int4 [not null, increment]\n \"touchdowns_total\" varchar(100)\n \"touchdowns_passing\" varchar(100)\n \"touchdowns_rushing\" varchar(100)\n \"touchdowns_special_teams\" varchar(100)\n \"touchdowns_defensive\" varchar(100)\n \"extra_points_attempts\" varchar(100)\n \"extra_points_made\" varchar(100)\n \"extra_points_missed\" varchar(100)\n \"extra_points_blocked\" varchar(100)\n \"field_goal_attempts\" varchar(100)\n \"field_goals_made\" varchar(100)\n \"field_goals_missed\" varchar(100)\n \"field_goals_blocked\" varchar(100)\n \"safeties_against\" varchar(100)\n \"two_point_conversions_attempts\" varchar(100)\n \"two_point_conversions_made\" varchar(100)\n \"touchbacks_total\" varchar(100)\n}\n\nTable \"american_football_special_teams_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_punt_total\" varchar(100)\n \"returns_punt_yards\" varchar(100)\n \"returns_punt_average\" varchar(100)\n \"returns_punt_longest\" varchar(100)\n \"returns_punt_touchdown\" varchar(100)\n \"returns_kickoff_total\" varchar(100)\n \"returns_kickoff_yards\" varchar(100)\n \"returns_kickoff_average\" varchar(100)\n \"returns_kickoff_longest\" varchar(100)\n \"returns_kickoff_touchdown\" varchar(100)\n \"returns_total\" varchar(100)\n \"returns_yards\" varchar(100)\n \"punts_total\" varchar(100)\n \"punts_yards_gross\" varchar(100)\n \"punts_yards_net\" varchar(100)\n \"punts_longest\" varchar(100)\n \"punts_inside_20\" varchar(100)\n \"punts_inside_20_percentage\" varchar(100)\n \"punts_average\" varchar(100)\n \"punts_blocked\" varchar(100)\n \"touchbacks_total\" varchar(100)\n \"touchbacks_total_percentage\" varchar(100)\n \"touchbacks_kickoffs\" varchar(100)\n \"touchbacks_kickoffs_percentage\" varchar(100)\n \"touchbacks_punts\" varchar(100)\n \"touchbacks_punts_percentage\" varchar(100)\n \"touchbacks_interceptions\" varchar(100)\n \"touchbacks_interceptions_percentage\" varchar(100)\n \"fair_catches\" varchar(100)\n}\n\nTable \"baseball_action_contact_details\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_pitch_id\" int4 [not null]\n \"location\" varchar(100)\n \"strength\" varchar(100)\n \"velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n}\n\nTable \"baseball_action_pitches\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_play_id\" int4 [not null]\n \"sequence_number\" int4\n \"baseball_defensive_group_id\" int4\n \"umpire_call\" varchar(100)\n 
\"pitch_location\" varchar(100)\n \"pitch_type\" varchar(100)\n \"pitch_velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n \"ball_type\" varchar(40)\n \"strike_type\" varchar(40)\n}\n\nTable \"baseball_action_plays\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"notation\" varchar(100)\n \"notation_yaml\" text\n \"baseball_defensive_group_id\" int4\n \"comment\" varchar(255)\n \"runner_on_first_advance\" int4\n \"runner_on_second_advance\" int4\n \"runner_on_third_advance\" int4\n \"outs_recorded\" int4\n \"rbi\" int4\n \"runs_scored\" int4\n \"earned_runs_scored\" varchar(100)\n}\n\nTable \"baseball_action_substitutions\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"sequence_number\" int4\n \"person_type\" varchar(100)\n \"person_original_id\" int4\n \"person_original_position_id\" int4\n \"person_original_lineup_slot\" int4\n \"person_replacing_id\" int4\n \"person_replacing_position_id\" int4\n \"person_replacing_lineup_slot\" int4\n \"substitution_reason\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"baseball_defensive_group\" {\n \"id\" int4 [not null, increment]\n}\n\nTable \"baseball_defensive_players\" {\n \"id\" int4 [not null, increment]\n \"baseball_defensive_group_id\" int4 [not null]\n \"player_id\" int4 [not null]\n \"position_id\" int4 [not null]\n}\n\nTable \"baseball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"double_plays\" int4\n \"triple_plays\" int4\n \"putouts\" int4\n \"assists\" int4\n \"errors\" int4\n \"fielding_percentage\" numeric\n \"defensive_average\" numeric\n \"errors_passed_ball\" int4\n \"errors_catchers_interference\" int4\n}\n\nTable \"baseball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"at_bat_number\" int4\n \"inning_value\" int4\n \"inning_half\" varchar(100)\n \"outs\" int4\n \"balls\" int4\n \"strikes\" int4\n \"runner_on_first_id\" int4\n \"runner_on_second_id\" int4\n \"runner_on_third_id\" int4\n \"runner_on_first\" int2\n \"runner_on_second\" int2\n \"runner_on_third\" int2\n \"runs_this_inning_half\" int4\n \"pitcher_id\" int4\n \"batter_id\" int4\n \"batter_side\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"baseball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"average\" numeric\n \"runs_scored\" int4\n \"at_bats\" int4\n \"hits\" int4\n \"rbi\" int4\n \"total_bases\" int4\n \"slugging_percentage\" numeric\n \"bases_on_balls\" int4\n \"strikeouts\" int4\n \"left_on_base\" int4\n \"left_in_scoring_position\" int4\n \"singles\" int4\n \"doubles\" int4\n \"triples\" int4\n \"home_runs\" int4\n \"grand_slams\" int4\n \"at_bats_per_rbi\" numeric\n \"plate_appearances_per_rbi\" numeric\n \"at_bats_per_home_run\" numeric\n \"plate_appearances_per_home_run\" numeric\n \"sac_flies\" int4\n \"sac_bunts\" int4\n \"grounded_into_double_play\" int4\n \"moved_up\" int4\n \"on_base_percentage\" numeric\n \"stolen_bases\" int4\n \"stolen_bases_caught\" int4\n \"stolen_bases_average\" numeric\n \"hit_by_pitch\" int4\n \"defensive_interferance_reaches\" int4\n \"on_base_plus_slugging\" numeric\n \"plate_appearances\" int4\n \"hits_extra_base\" int4\n}\n\nTable \"baseball_pitching_stats\" {\n \"id\" int4 [not null, increment]\n \"runs_allowed\" int4\n \"singles_allowed\" int4\n \"doubles_allowed\" int4\n \"triples_allowed\" int4\n 
\"home_runs_allowed\" int4\n \"innings_pitched\" varchar(20)\n \"hits\" int4\n \"earned_runs\" int4\n \"unearned_runs\" int4\n \"bases_on_balls\" int4\n \"bases_on_balls_intentional\" int4\n \"strikeouts\" int4\n \"strikeout_to_bb_ratio\" numeric\n \"number_of_pitches\" int4\n \"era\" numeric\n \"inherited_runners_scored\" int4\n \"pick_offs\" int4\n \"errors_hit_with_pitch\" int4\n \"errors_wild_pitch\" int4\n \"balks\" int4\n \"wins\" int4\n \"losses\" int4\n \"saves\" int4\n \"shutouts\" int4\n \"games_complete\" int4\n \"games_finished\" int4\n \"winning_percentage\" numeric\n \"event_credit\" varchar(40)\n \"save_credit\" varchar(40)\n}\n\nTable \"basketball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"steals_total\" varchar(100)\n \"steals_per_game\" varchar(100)\n \"blocks_total\" varchar(100)\n \"blocks_per_game\" varchar(100)\n}\n\nTable \"basketball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"basketball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"field_goals_made\" int4\n \"field_goals_attempted\" int4\n \"field_goals_percentage\" varchar(100)\n \"field_goals_per_game\" varchar(100)\n \"field_goals_attempted_per_game\" varchar(100)\n \"field_goals_percentage_adjusted\" varchar(100)\n \"three_pointers_made\" int4\n \"three_pointers_attempted\" int4\n \"three_pointers_percentage\" varchar(100)\n \"three_pointers_per_game\" varchar(100)\n \"three_pointers_attempted_per_game\" varchar(100)\n \"free_throws_made\" varchar(100)\n \"free_throws_attempted\" varchar(100)\n \"free_throws_percentage\" varchar(100)\n \"free_throws_per_game\" varchar(100)\n \"free_throws_attempted_per_game\" varchar(100)\n \"points_scored_total\" varchar(100)\n \"points_scored_per_game\" varchar(100)\n \"assists_total\" varchar(100)\n \"assists_per_game\" varchar(100)\n \"turnovers_total\" varchar(100)\n \"turnovers_per_game\" varchar(100)\n \"points_scored_off_turnovers\" varchar(100)\n \"points_scored_in_paint\" varchar(100)\n \"points_scored_on_second_chance\" varchar(100)\n \"points_scored_on_fast_break\" varchar(100)\n}\n\nTable \"basketball_rebounding_stats\" {\n \"id\" int4 [not null, increment]\n \"rebounds_total\" varchar(100)\n \"rebounds_per_game\" varchar(100)\n \"rebounds_defensive\" varchar(100)\n \"rebounds_offensive\" varchar(100)\n \"team_rebounds_total\" varchar(100)\n \"team_rebounds_per_game\" varchar(100)\n \"team_rebounds_defensive\" varchar(100)\n \"team_rebounds_offensive\" varchar(100)\n}\n\nTable \"basketball_team_stats\" {\n \"id\" int4 [not null, increment]\n \"timeouts_left\" varchar(100)\n \"largest_lead\" varchar(100)\n \"fouls_total\" varchar(100)\n \"turnover_margin\" varchar(100)\n}\n\nTable \"bookmakers\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"core_person_stats\" {\n \"id\" int4 [not null, increment]\n \"time_played_event\" varchar(40)\n \"time_played_total\" varchar(40)\n \"time_played_event_average\" varchar(40)\n \"events_played\" int4\n \"events_started\" int4\n \"position_id\" int4\n}\n\nTable \"core_stats\" {\n \"id\" int4 [not null, increment]\n \"score\" varchar(100)\n \"score_opposing\" varchar(100)\n \"score_attempts\" varchar(100)\n \"score_attempts_opposing\" varchar(100)\n 
\"score_percentage\" varchar(100)\n \"score_percentage_opposing\" varchar(100)\n}\n\nTable \"db_info\" {\n \"version\" varchar(100) [not null, default: 16]\n}\n\nTable \"display_names\" {\n \"id\" int4 [not null, increment]\n \"language\" varchar(100) [not null]\n \"entity_type\" varchar(100) [not null]\n \"entity_id\" int4 [not null]\n \"full_name\" varchar(100)\n \"first_name\" varchar(100)\n \"middle_name\" varchar(100)\n \"last_name\" varchar(100)\n \"alias\" varchar(100)\n \"abbreviation\" varchar(100)\n \"short_name\" varchar(100)\n \"prefix\" varchar(20)\n \"suffix\" varchar(20)\n}\n\nTable \"document_classes\" {\n \"id\" int4 [not null, increment]\n \"name\" varchar(100)\n}\n\nTable \"document_contents\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"sportsml\" varchar(200)\n \"abstract\" text\n}\n\nTable \"document_fixtures\" {\n \"id\" int4 [not null, increment]\n \"fixture_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"name\" varchar(100)\n \"document_class_id\" int4 [not null]\n}\n\nTable \"document_fixtures_events\" {\n \"id\" int4 [not null, increment]\n \"document_fixture_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"latest_document_id\" int4 [not null]\n \"last_update\" timestamp\n}\n\nTable \"document_package_entry\" {\n \"id\" int4 [not null, increment]\n \"document_package_id\" int4 [not null]\n \"rank\" varchar(100)\n \"document_id\" int4 [not null]\n \"headline\" varchar(100)\n \"short_headline\" varchar(100)\n}\n\nTable \"document_packages\" {\n \"id\" int4 [not null, increment]\n \"package_key\" varchar(100)\n \"package_name\" varchar(100)\n \"date_time\" date\n}\n\nTable \"documents\" {\n \"id\" int4 [not null, increment]\n \"doc_id\" varchar(75) [not null]\n \"publisher_id\" int4 [not null]\n \"date_time\" timestamp\n \"title\" varchar(255)\n \"language\" varchar(100)\n \"priority\" varchar(100)\n \"revision_id\" varchar(75)\n \"stats_coverage\" varchar(100)\n \"document_fixture_id\" int4 [not null]\n \"source_id\" int4\n \"db_loading_date_time\" timestamp\n}\n\nTable \"documents_media\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"media_id\" int4 [not null]\n \"media_caption_id\" int4 [not null]\n}\n\nTable \"events\" {\n \"id\" int4 [not null, increment]\n \"event_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"site_id\" int4\n \"site_alignment\" varchar(100)\n \"event_status\" varchar(100)\n \"duration\" varchar(100)\n \"attendance\" varchar(100)\n \"last_update\" timestamp\n}\n\nTable \"events_documents\" {\n \"event_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"events_media\" {\n \"event_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"events_sub_seasons\" {\n \"event_id\" int4 [not null]\n \"sub_season_id\" int4 [not null]\n}\n\nTable \"ice_hockey_action_participants\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"point_credit\" int4\n}\n\nTable \"ice_hockey_action_plays\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"play_result\" varchar(100)\n \"comment\" varchar(255)\n}\n\nTable \"ice_hockey_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_power_play_allowed\" varchar(100)\n \"shots_penalty_shot_allowed\" varchar(100)\n 
\"goals_power_play_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"penalty_killing_amount\" varchar(100)\n \"penalty_killing_percentage\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"takeaways\" varchar(100)\n \"shutouts\" varchar(100)\n \"minutes_penalty_killing\" varchar(100)\n \"hits\" varchar(100)\n \"goals_empty_net_allowed\" varchar(100)\n \"goals_short_handed_allowed\" varchar(100)\n \"goals_shootout_allowed\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n}\n\nTable \"ice_hockey_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"ice_hockey_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_power_play\" varchar(100)\n \"goals_short_handed\" varchar(100)\n \"goals_even_strength\" varchar(100)\n \"goals_empty_net\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_penalty_shot\" varchar(100)\n \"assists\" varchar(100)\n \"points\" varchar(100)\n \"power_play_amount\" varchar(100)\n \"power_play_percentage\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(100)\n \"shots_penalty_shot_percentage\" varchar(100)\n \"giveaways\" varchar(100)\n \"minutes_power_play\" varchar(100)\n \"faceoff_wins\" varchar(100)\n \"faceoff_losses\" varchar(100)\n \"faceoff_win_percentage\" varchar(100)\n \"scoring_chances\" varchar(100)\n}\n\nTable \"ice_hockey_player_stats\" {\n \"id\" int4 [not null, increment]\n \"plus_minus\" varchar(100)\n}\n\nTable \"injury_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"injury_status\" varchar(100)\n \"injury_type\" varchar(100)\n \"injury_comment\" varchar(100)\n \"disabled_list\" varchar(100)\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n \"season_id\" int4\n \"phase_type\" varchar(100)\n \"injury_side\" varchar(100)\n}\n\nTable \"key_aliases\" {\n \"id\" int4 [not null, increment]\n \"key_id\" int4 [not null]\n \"key_root_id\" int4 [not null]\n}\n\nTable \"key_roots\" {\n \"id\" int4 [not null, increment]\n \"key_type\" varchar(100)\n}\n\nTable \"latest_revisions\" {\n \"id\" int4 [not null, increment]\n \"revision_id\" varchar(75) [not null]\n \"latest_document_id\" int4 [not null]\n}\n\nTable \"locations\" {\n \"id\" int4 [not null, increment]\n \"timezone\" varchar(100)\n \"latitude\" varchar(100)\n \"longitude\" varchar(100)\n \"country_code\" varchar(100)\n}\n\nTable \"media\" {\n \"id\" int4 [not null, increment]\n \"object_id\" int4\n \"source_id\" int4\n \"revision_id\" int4\n \"media_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"date_time\" varchar(100)\n \"credit_id\" int4 [not null]\n \"db_loading_date_time\" timestamp\n \"creation_location_id\" int4 [not null]\n}\n\nTable \"media_captions\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"caption_type\" varchar(100)\n \"caption\" varchar(100)\n \"caption_author_id\" int4 [not null]\n \"language\" varchar(100)\n \"caption_size\" varchar(100)\n}\n\nTable \"media_contents\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"object\" varchar(100)\n 
\"format\" varchar(100)\n \"mime_type\" varchar(100)\n \"height\" varchar(100)\n \"width\" varchar(100)\n \"duration\" varchar(100)\n \"file_size\" varchar(100)\n \"resolution\" varchar(100)\n}\n\nTable \"media_keywords\" {\n \"id\" int4 [not null, increment]\n \"keyword\" varchar(100)\n \"media_id\" int4 [not null]\n}\n\nTable \"motor_racing_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"lap\" varchar(100)\n \"laps_remaining\" varchar(100)\n \"time_elapsed\" varchar(100)\n \"flag_state\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"motor_racing_qualifying_stats\" {\n \"id\" int4 [not null, increment]\n \"grid\" varchar(100)\n \"pole_position\" varchar(100)\n \"pole_wins\" varchar(100)\n \"qualifying_speed\" varchar(100)\n \"qualifying_speed_units\" varchar(100)\n \"qualifying_time\" varchar(100)\n \"qualifying_position\" varchar(100)\n}\n\nTable \"motor_racing_race_stats\" {\n \"id\" int4 [not null, increment]\n \"time_behind_leader\" varchar(100)\n \"laps_behind_leader\" varchar(100)\n \"time_ahead_follower\" varchar(100)\n \"laps_ahead_follower\" varchar(100)\n \"time\" varchar(100)\n \"points\" varchar(100)\n \"points_rookie\" varchar(100)\n \"bonus\" varchar(100)\n \"laps_completed\" varchar(100)\n \"laps_leading_total\" varchar(100)\n \"distance_leading\" varchar(100)\n \"distance_completed\" varchar(100)\n \"distance_units\" varchar(40)\n \"speed_average\" varchar(40)\n \"speed_units\" varchar(40)\n \"status\" varchar(40)\n \"finishes_top_5\" varchar(40)\n \"finishes_top_10\" varchar(40)\n \"starts\" varchar(40)\n \"finishes\" varchar(40)\n \"non_finishes\" varchar(40)\n \"wins\" varchar(40)\n \"races_leading\" varchar(40)\n \"money\" varchar(40)\n \"money_units\" varchar(40)\n \"leads_total\" varchar(40)\n}\n\nTable \"outcome_totals\" {\n \"id\" int4 [not null, increment]\n \"standing_subgroup_id\" int4 [not null]\n \"outcome_holder_type\" varchar(100)\n \"outcome_holder_id\" int4\n \"rank\" varchar(100)\n \"wins\" varchar(100)\n \"losses\" varchar(100)\n \"ties\" varchar(100)\n \"undecideds\" varchar(100)\n \"winning_percentage\" varchar(100)\n \"points_scored_for\" varchar(100)\n \"points_scored_against\" varchar(100)\n \"points_difference\" varchar(100)\n \"standing_points\" varchar(100)\n \"streak_type\" varchar(100)\n \"streak_duration\" varchar(100)\n \"streak_total\" varchar(100)\n \"streak_start\" date\n \"streak_end\" date\n}\n\nTable \"participants_events\" {\n \"id\" int4 [not null, increment]\n \"participant_type\" varchar(100) [not null]\n \"participant_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"alignment\" varchar(100)\n \"score\" varchar(100)\n \"event_outcome\" varchar(100)\n \"rank\" int4\n}\n\nTable \"periods\" {\n \"id\" int4 [not null, increment]\n \"participant_event_id\" int4 [not null]\n \"period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"person_event_metadata\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"status\" varchar(100)\n \"health\" varchar(100)\n \"weight\" varchar(100)\n \"role_id\" int4\n \"position_id\" int4\n \"team_id\" int4\n \"lineup_slot\" int4\n \"lineup_slot_sequence\" int4\n}\n\nTable \"person_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"membership_type\" varchar(40) [not null]\n \"membership_id\" int4 [not null]\n \"role_id\" int4\n \"role_status\" varchar(40)\n \"phase_status\" varchar(40)\n \"uniform_number\" 
varchar(20)\n \"regular_position_id\" int4\n \"regular_position_depth\" varchar(40)\n \"height\" varchar(100)\n \"weight\" varchar(100)\n \"start_date_time\" timestamp\n \"start_season_id\" int4\n \"end_date_time\" timestamp\n \"end_season_id\" int4\n \"entry_reason\" varchar(40)\n \"exit_reason\" varchar(40)\n \"selection_level\" int4\n \"selection_sublevel\" int4\n \"selection_overall\" int4\n}\n\nTable \"persons\" {\n \"id\" int4 [not null, increment]\n \"person_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"gender\" varchar(20)\n \"birth_date\" varchar(30)\n \"death_date\" varchar(30)\n \"birth_location_id\" int4\n \"hometown_location_id\" int4\n \"residence_location_id\" int4\n \"death_location_id\" int4\n}\n\nTable \"persons_documents\" {\n \"person_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"persons_media\" {\n \"person_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"positions\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"abbreviation\" varchar(100) [not null]\n}\n\nTable \"publishers\" {\n \"id\" int4 [not null, increment]\n \"publisher_key\" varchar(100) [not null]\n \"publisher_name\" varchar(100)\n}\n\nTable \"roles\" {\n \"id\" int4 [not null, increment]\n \"role_key\" varchar(100) [not null]\n \"role_name\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"seasons\" {\n \"id\" int4 [not null, increment]\n \"season_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"league_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"sites\" {\n \"id\" int4 [not null, increment]\n \"site_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"soccer_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"goals_against_total\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"catches_punches\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_shootout_total\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"shutouts\" varchar(100)\n}\n\nTable \"soccer_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"minutes_elapsed\" varchar(100)\n \"period_minute_elapsed\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"soccer_foul_stats\" {\n \"id\" int4 [not null, increment]\n \"fouls_suffered\" varchar(100)\n \"fouls_commited\" varchar(100)\n \"cautions_total\" varchar(100)\n \"cautions_pending\" varchar(100)\n \"caution_points_total\" varchar(100)\n \"caution_points_pending\" varchar(100)\n \"ejections_total\" varchar(100)\n}\n\nTable \"soccer_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_total\" varchar(100)\n \"assists_game_winning\" varchar(100)\n \"assists_game_tying\" varchar(100)\n \"assists_overtime\" varchar(100)\n \"assists_total\" varchar(100)\n \"points\" varchar(100)\n \"shots_total\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_hit_frame\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n 
\"shots_penalty_shot_scored\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(40)\n \"shots_penalty_shot_percentage\" varchar(40)\n \"shots_shootout_taken\" varchar(40)\n \"shots_shootout_scored\" varchar(40)\n \"shots_shootout_missed\" varchar(40)\n \"shots_shootout_percentage\" varchar(40)\n \"giveaways\" varchar(40)\n \"offsides\" varchar(40)\n \"corner_kicks\" varchar(40)\n \"hat_tricks\" varchar(40)\n}\n\nTable \"standing_subgroups\" {\n \"id\" int4 [not null, increment]\n \"standing_id\" int4 [not null]\n \"affiliation_id\" int4 [not null]\n}\n\nTable \"standings\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"standing_type\" varchar(100)\n \"sub_season_id\" int4 [not null]\n \"last_updated\" varchar(100)\n \"duration_scope\" varchar(100)\n \"competition_scope\" varchar(100)\n \"competition_scope_id\" varchar(100)\n \"alignment_scope\" varchar(100)\n \"site_scope\" varchar(100)\n \"scoping_label\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"source\" varchar(100)\n}\n\nTable \"stats\" {\n \"id\" int4 [not null, increment]\n \"stat_repository_type\" varchar(100)\n \"stat_repository_id\" int4 [not null]\n \"stat_holder_type\" varchar(100)\n \"stat_holder_id\" int4\n \"stat_coverage_type\" varchar(100)\n \"stat_coverage_id\" int4\n \"context\" varchar(40) [not null]\n}\n\nTable \"sub_periods\" {\n \"id\" int4 [not null, increment]\n \"period_id\" int4 [not null]\n \"sub_period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"sub_seasons\" {\n \"id\" int4 [not null, increment]\n \"sub_season_key\" varchar(100) [not null]\n \"season_id\" int4 [not null]\n \"sub_season_type\" varchar(100) [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"team_american_football_stats\" {\n \"id\" int4 [not null, increment]\n \"yards_per_attempt\" varchar(100)\n \"average_starting_position\" varchar(100)\n \"timeouts\" varchar(100)\n \"time_of_possession\" varchar(100)\n \"turnover_ratio\" varchar(100)\n}\n\nTable \"team_phases\" {\n \"id\" int4 [not null, increment]\n \"team_id\" int4 [not null]\n \"start_season_id\" int4\n \"end_season_id\" int4\n \"affiliation_id\" int4 [not null]\n \"start_date_time\" varchar(100)\n \"end_date_time\" varchar(100)\n \"phase_status\" varchar(40)\n \"role_id\" int4\n}\n\nTable \"teams\" {\n \"id\" int4 [not null, increment]\n \"team_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"home_site_id\" int4\n}\n\nTable \"teams_documents\" {\n \"team_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"teams_media\" {\n \"team_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"tennis_action_points\" {\n \"id\" int4 [not null, increment]\n \"sub_period_id\" varchar(100)\n \"sequence_number\" varchar(100)\n \"win_type\" varchar(100)\n}\n\nTable \"tennis_action_volleys\" {\n \"id\" int4 [not null, increment]\n \"sequence_number\" varchar(100)\n \"tennis_action_points_id\" int4\n \"landing_location\" varchar(100)\n \"swing_type\" varchar(100)\n \"result\" varchar(100)\n \"spin_type\" varchar(100)\n \"trajectory_details\" varchar(100)\n}\n\nTable \"tennis_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"tennis_set\" varchar(100)\n \"game\" varchar(100)\n \"server_person_id\" int4\n \"server_score\" varchar(100)\n \"receiver_person_id\" int4\n \"receiver_score\" varchar(100)\n \"service_number\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable 
\"tennis_return_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"first_service_return_points_won\" varchar(100)\n \"first_service_return_points_won_pct\" varchar(100)\n \"second_service_return_points_won\" varchar(100)\n \"second_service_return_points_won_pct\" varchar(100)\n \"return_games_played\" varchar(100)\n \"return_games_won\" varchar(100)\n \"return_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_converted\" varchar(100)\n \"break_points_converted_pct\" varchar(100)\n}\n\nTable \"tennis_service_stats\" {\n \"id\" int4 [not null, increment]\n \"services_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"aces\" varchar(100)\n \"first_services_good\" varchar(100)\n \"first_services_good_pct\" varchar(100)\n \"first_service_points_won\" varchar(100)\n \"first_service_points_won_pct\" varchar(100)\n \"second_service_points_won\" varchar(100)\n \"second_service_points_won_pct\" varchar(100)\n \"service_games_played\" varchar(100)\n \"service_games_won\" varchar(100)\n \"service_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_saved\" varchar(100)\n \"break_points_saved_pct\" varchar(100)\n}\n\nTable \"wagering_moneylines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_odds_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"numerator\" varchar(100)\n \"denominator\" varchar(100)\n \"prediction\" varchar(100)\n \"payout_calculation\" varchar(100)\n \"payout_amount\" varchar(100)\n}\n\nTable \"wagering_runlines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"line_value\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_straight_spread_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_value\" varchar(100)\n \"line_value_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_total_score_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_over\" varchar(100)\n \"line_under\" varchar(100)\n \"total\" varchar(100)\n \"total_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"weather_conditions\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"temperature\" varchar(100)\n \"temperature_units\" varchar(40)\n 
\"humidity\" varchar(100)\n \"clouds\" varchar(100)\n \"wind_direction\" varchar(100)\n \"wind_velocity\" varchar(100)\n \"weather_code\" varchar(100)\n}\n", "stateUrl": null, "stateOriginalUrl": "https://github.com/yugabyte/yugabyte-db/blob/master/sample/sportsdb_tables.sql" } } ================================================ FILE: tasks/postgres/standard/sports/team_roster_management/verify.py ================================================ """ Verification script for PostgreSQL Sports Task 2: Team Roster Management Operations """ import os import sys import psycopg2 from decimal import Decimal def rows_match(actual_row, expected_row): """ Compare two rows with appropriate tolerance. For Decimal types: allows 0.001 tolerance For other types: requires exact match """ if len(actual_row) != len(expected_row): return False for actual, expected in zip(actual_row, expected_row): if isinstance(actual, Decimal) and isinstance(expected, Decimal): if abs(float(actual) - float(expected)) > 0.001: return False elif isinstance(actual, float) and isinstance(expected, float): if abs(actual - expected) > 0.001: return False elif actual != expected: return False return True def get_connection_params() -> dict: """Get database connection parameters.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE", "sports"), "user": os.getenv("POSTGRES_USERNAME", "postgres"), "password": os.getenv("POSTGRES_PASSWORD", "postgres") } def verify_player_evaluation_table(conn) -> bool: """Verify the final state of player_evaluation table after all operations.""" with conn.cursor() as cur: # Get actual results from the created table cur.execute(""" SELECT person_id, batting_avg, home_runs, rbis, games_played, performance_score FROM player_evaluation ORDER BY person_id """) actual_results = cur.fetchall() # Execute ground truth query that simulates all steps: # 1. Initial insert (step 2) # 2. 
Update based on injuries (step 4) cur.execute(""" WITH initial_players AS ( SELECT s.stat_holder_id AS person_id, SUM(bos.hits) AS total_hits, SUM(bos.at_bats) AS total_at_bats, CASE WHEN SUM(bos.at_bats) > 0 THEN 1.0 * SUM(bos.hits) / SUM(bos.at_bats) ELSE 0 END AS batting_avg, SUM(bos.home_runs) AS home_runs, SUM(bos.rbi) AS rbis FROM stats s JOIN baseball_offensive_stats bos ON s.stat_repository_id = bos.id WHERE s.stat_holder_type = 'persons' AND s.stat_repository_type = 'baseball_offensive_stats' GROUP BY s.stat_holder_id ), game_counts AS ( SELECT person_id, COUNT(DISTINCT event_id) AS games_played FROM person_event_metadata GROUP BY person_id ), players_with_games AS ( SELECT ip.person_id, ip.batting_avg, ip.home_runs, ip.rbis, COALESCE(gc.games_played, 0) AS games_played, (ip.batting_avg * 1000) + (COALESCE(ip.home_runs, 0) * 5) + (COALESCE(ip.rbis, 0) * 2) AS initial_score FROM initial_players ip LEFT JOIN game_counts gc ON ip.person_id = gc.person_id WHERE COALESCE(gc.games_played, 0) >= 10 ), injury_info AS ( SELECT person_id, COUNT(*) AS injury_count, MAX(CASE WHEN end_date_time IS NULL THEN 1 ELSE 0 END) AS has_active_injury FROM injury_phases GROUP BY person_id ), adjusted_scores AS ( SELECT pwg.person_id, pwg.batting_avg, pwg.home_runs, pwg.rbis, pwg.games_played, GREATEST( CASE WHEN COALESCE(ii.has_active_injury, 0) = 1 AND COALESCE(ii.injury_count, 0) > 2 THEN pwg.initial_score * 0.8 * 0.9 WHEN COALESCE(ii.has_active_injury, 0) = 1 THEN pwg.initial_score * 0.8 WHEN COALESCE(ii.injury_count, 0) > 2 THEN pwg.initial_score * 0.9 ELSE pwg.initial_score END, 0 ) AS performance_score FROM players_with_games pwg LEFT JOIN injury_info ii ON ii.person_id = pwg.person_id ) SELECT person_id, batting_avg, home_runs, rbis, games_played, performance_score FROM adjusted_scores ORDER BY person_id; """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} player evaluation records, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: # Only show first 5 mismatches print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches in player_evaluation: {mismatches}") return False print(f"✅ Player evaluation table is correct ({len(actual_results)} records)") return True def verify_injury_status_table(conn) -> bool: """Verify the player_injury_status table and data.""" with conn.cursor() as cur: # Get actual results cur.execute(""" SELECT person_id, injury_count, last_injury_date, current_status FROM player_injury_status ORDER BY person_id """) actual_results = cur.fetchall() # Execute ground truth query - get players from player_evaluation cur.execute(""" WITH player_list AS ( SELECT DISTINCT person_id FROM player_evaluation ), injury_counts AS ( SELECT person_id, COUNT(*) as injury_count, MAX(start_date_time::date) as last_injury_date, MAX(CASE WHEN end_date_time IS NULL THEN 1 ELSE 0 END) as has_active_injury FROM injury_phases GROUP BY person_id ) SELECT pl.person_id, COALESCE(ic.injury_count, 0) as injury_count, ic.last_injury_date, CASE WHEN COALESCE(ic.has_active_injury, 0) = 1 THEN 'injured' ELSE 'healthy' END as current_status FROM player_list pl LEFT JOIN injury_counts ic ON pl.person_id = ic.person_id ORDER BY pl.person_id """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): 
print(f"❌ Expected {len(expected_results)} injury status records, got {len(actual_results)}") return False mismatches = 0 for i, (actual, expected) in enumerate(zip(actual_results, expected_results)): if not rows_match(actual, expected): if mismatches < 5: print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches in player_injury_status: {mismatches}") return False print(f"✅ Player injury status table is correct ({len(actual_results)} records)") return True def verify_summary_table(conn) -> bool: """Verify the team_performance_summary table.""" with conn.cursor() as cur: # Get actual results cur.execute(""" SELECT metric_name, metric_value FROM team_performance_summary ORDER BY metric_name """) actual_results = cur.fetchall() # Execute ground truth query cur.execute(""" WITH player_data AS ( SELECT COUNT(*) as total_players, AVG(batting_avg) as avg_batting_average, SUM(home_runs) as total_home_runs, AVG(performance_score) as avg_performance_score FROM player_evaluation ), health_data AS ( SELECT SUM(CASE WHEN current_status = 'injured' THEN 1 ELSE 0 END) as injured_count, SUM(CASE WHEN current_status = 'healthy' THEN 1 ELSE 0 END) as healthy_count FROM player_injury_status WHERE person_id IN (SELECT person_id FROM player_evaluation) ) SELECT metric_name, metric_value::DECIMAL FROM ( SELECT 'avg_batting_average' as metric_name, avg_batting_average as metric_value FROM player_data UNION ALL SELECT 'avg_performance_score', avg_performance_score FROM player_data UNION ALL SELECT 'healthy_player_count', healthy_count FROM health_data UNION ALL SELECT 'injured_player_count', injured_count FROM health_data UNION ALL SELECT 'total_home_runs', total_home_runs FROM player_data UNION ALL SELECT 'total_players', total_players FROM player_data ) metrics ORDER BY metric_name """) expected_results = cur.fetchall() if len(actual_results) != len(expected_results): print(f"❌ Expected {len(expected_results)} metrics, got {len(actual_results)}") return False mismatches = 0 for actual, expected in zip(actual_results, expected_results): if not rows_match(actual, expected): if mismatches < 5: print(f"❌ Metric mismatch: expected {expected}, got {actual}") mismatches += 1 if mismatches > 0: print(f"❌ Total mismatches in summary table: {mismatches}") return False print(f"✅ Team performance summary table is correct ({len(actual_results)} metrics)") return True def main(): """Main verification function.""" print("=" * 50) print("Verifying Sports Task 2: Team Roster Management Operations") print("=" * 50) # Get connection parameters conn_params = get_connection_params() if not conn_params["database"]: print("❌ No database specified") sys.exit(1) try: # Connect to database conn = psycopg2.connect(**conn_params) # Verify all steps success = ( verify_player_evaluation_table(conn) and verify_injury_status_table(conn) and verify_summary_table(conn) ) conn.close() if success: print("\n🎉 Task verification: PASS") sys.exit(0) else: print("\n❌ Task verification: FAIL") sys.exit(1) except psycopg2.Error as e: print(f"❌ Database error: {e}") sys.exit(1) except Exception as e: print(f"❌ Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/vectors/dba_vector_analysis/description.md ================================================ # PostgreSQL Vector Database Analysis > Analyze and optimize a pgvector-powered database to understand storage patterns, performance 
characteristics, and data quality for embeddings in production workloads.

## What's this about?

You've got a PostgreSQL database running with the vector extension that stores embeddings for RAG, document similarity search, image recognition, or other ML workloads. Your job is to dive deep into this vector database and figure out what's going on under the hood.

You need to understand:

- how vectors are stored
- how much space they're taking up
- whether indexes are working properly
- if there are any data quality issues lurking around

## What you need to investigate

First, get familiar with what you're working with:

- Check vector extension status: ensure it's installed properly, check the version, and identify any configuration issues
- Identify all vector columns across the entire database: report the columns, their data types, and their vector dimensions
- Map the vector landscape: understand relationships between vector tables and regular tables, foreign keys, and dependencies

Vectors can eat up a lot of storage, so let's see where the bytes are going:

- Calculate vector storage overhead: measure how much space vectors take compared to regular columns in the same tables
- Analyze table sizes: identify which vector tables are the biggest storage consumers, broken down by table
- Understand growth patterns: examine record counts and project future storage needs based on current data

Vectors without proper indexes are painfully slow, so investigate:

- Catalog vector indexes: find all HNSW and IVFFlat indexes, document their configurations and parameters
- Measure index effectiveness: determine if indexes are actually being used and helping query performance
- Identify optimization opportunities: spot missing indexes, suboptimal configurations, and unused indexes

Bad vector data makes everything worse:

- Hunt for data issues: locate NULL vectors, dimension mismatches, and corrupted embeddings that could break queries
- Validate consistency: ensure vectors in each column have consistent dimensions across all rows
- Check for outliers: find vectors that might be skewing similarity calculations or causing performance issues

## Your deliverables

Create these analysis tables and populate them with your findings:

### `vector_analysis_columns`

Complete catalog of every vector column you find:

```sql
CREATE TABLE vector_analysis_columns (
    schema VARCHAR(50),
    table_name VARCHAR(100),
    column_name VARCHAR(100),
    dimensions INTEGER,
    data_type VARCHAR(50),
    has_constraints BOOLEAN,
    rows BIGINT
);
```

### `vector_analysis_storage_consumption`

Show exactly where storage is being consumed:

```sql
CREATE TABLE vector_analysis_storage_consumption (
    schema VARCHAR(50),
    table_name VARCHAR(100),
    total_size_bytes BIGINT,
    vector_data_bytes BIGINT,
    regular_data_bytes BIGINT,
    vector_storage_pct NUMERIC(5,2),
    row_count BIGINT
);
```

### `vector_analysis_indices`

Document all vector indexes and their characteristics:

```sql
CREATE TABLE vector_analysis_indices (
    schema VARCHAR(50),
    table_name VARCHAR(100),
    column_name VARCHAR(100),
    index_name VARCHAR(100),
    index_type VARCHAR(50), -- 'hnsw', 'ivfflat', etc.
    index_size_bytes BIGINT
);
```

Use PostgreSQL system catalogs, pgvector-specific views, and storage analysis functions to gather comprehensive metrics about the vector database implementation.
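As a rough starting point (a sketch, not a required query), vector columns and their declared dimensions can usually be enumerated straight from the system catalogs. The query below assumes the `pgvector` extension is installed; note that `atttypmod` carrying the declared dimension is a pgvector implementation detail, so double-check it against your pgvector version.

```sql
-- Minimal sketch: list vector columns with their declared dimensions.
-- Assumes pgvector; atttypmod = -1 means the column was declared as plain
-- "vector" without a fixed dimension.
SELECT c.table_schema                  AS schema,
       c.table_name,
       c.column_name,
       a.atttypmod                     AS dimensions,
       c.data_type
FROM information_schema.columns c
JOIN pg_catalog.pg_class cl
  ON cl.relname = c.table_name
JOIN pg_catalog.pg_namespace n
  ON n.oid = cl.relnamespace AND n.nspname = c.table_schema
JOIN pg_catalog.pg_attribute a
  ON a.attrelid = cl.oid AND a.attname = c.column_name
WHERE c.data_type = 'USER-DEFINED'
  AND c.udt_name = 'vector'
ORDER BY c.table_name, c.column_name;
```

For the storage and index tables, `pg_total_relation_size()` and `pg_relation_size()` can supply the byte counts, in line with how the rest of this analysis measures sizes.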
================================================ FILE: tasks/postgres/standard/vectors/dba_vector_analysis/ground_truth.sql ================================================ -- Ground Truth Data for Vector Database Analysis Task -- This defines the exact expected results that candidates should discover and report /* ================================================================================ EXPECTED VECTOR DATABASE STRUCTURE (created by vectors_setup.py) ================================================================================ Tables with Vector Columns: 1. documents.embedding (vector(1536)) 2. document_chunks.embedding (vector(1536)) 3. user_queries.embedding (vector(1536)) Vector Indexes: 1. documents_embedding_idx (HNSW on documents.embedding) 2. chunks_embedding_idx (HNSW on document_chunks.embedding) 3. queries_embedding_idx (HNSW on user_queries.embedding) Expected Data Counts: - documents: 10 records - document_chunks: ~40-70 records (3-7 chunks per document) - user_queries: 10 records - embedding_models: 5 records (metadata) - knowledge_base: 5 records (metadata) - search_cache: 5 records (metadata) ================================================================================ DEFINITIVE GROUND TRUTH VERIFICATION DATA ================================================================================ */ BEGIN; -- Create expected analysis result structure CREATE TABLE IF NOT EXISTS expected_vector_column_inventory ( table_schema VARCHAR(50) DEFAULT 'public', table_name VARCHAR(100), column_name VARCHAR(100), vector_dimensions INTEGER, data_type VARCHAR(50) DEFAULT 'USER-DEFINED', has_constraints BOOLEAN DEFAULT false, min_estimated_rows BIGINT ); -- Insert expected vector column inventory INSERT INTO expected_vector_column_inventory (table_name, column_name, vector_dimensions, min_estimated_rows) VALUES ('documents', 'embedding', 1536, 10), ('document_chunks', 'embedding', 1536, 30), ('user_queries', 'embedding', 1536, 10); -- Create expected storage analysis structure CREATE TABLE IF NOT EXISTS expected_vector_storage_analysis ( table_name VARCHAR(100), has_vector_data BOOLEAN, min_row_count BIGINT, vector_column_exists BOOLEAN, should_have_storage_metrics BOOLEAN DEFAULT true ); -- Insert expected storage analysis INSERT INTO expected_vector_storage_analysis (table_name, has_vector_data, min_row_count, vector_column_exists) VALUES ('documents', true, 10, true), ('document_chunks', true, 30, true), ('user_queries', true, 10, true), ('embedding_models', false, 5, false), ('knowledge_base', false, 5, false), ('search_cache', false, 5, false); -- Create expected index analysis structure CREATE TABLE IF NOT EXISTS expected_vector_index_analysis ( index_name_pattern VARCHAR(100), table_name VARCHAR(100), column_name VARCHAR(100), expected_index_type VARCHAR(50), should_exist BOOLEAN DEFAULT true ); -- Insert expected vector index analysis INSERT INTO expected_vector_index_analysis (index_name_pattern, table_name, column_name, expected_index_type) VALUES ('%documents%embedding%', 'documents', 'embedding', 'hnsw'), ('%chunks%embedding%', 'document_chunks', 'embedding', 'hnsw'), ('%queries%embedding%', 'user_queries', 'embedding', 'hnsw'); -- Create storage analysis table CREATE TABLE vector_storage_analysis ( table_name VARCHAR(100), total_size_bytes BIGINT, vector_data_bytes BIGINT, regular_data_bytes BIGINT, vector_storage_pct NUMERIC(5,2), row_count BIGINT, avg_vector_size_bytes INTEGER ); -- Populate storage analysis with actual storage metrics DO $$ DECLARE rec RECORD; 
total_size BIGINT; row_cnt BIGINT; vector_size INTEGER := 1536 * 4; -- 1536 dimensions * 4 bytes per float BEGIN FOR rec IN SELECT tablename FROM pg_tables WHERE tablename IN ('documents', 'document_chunks', 'user_queries') LOOP EXECUTE format('SELECT COUNT(*) FROM %I', rec.tablename) INTO row_cnt; SELECT pg_total_relation_size(format('public.%I', rec.tablename)) INTO total_size; INSERT INTO vector_storage_analysis ( table_name, total_size_bytes, row_count, avg_vector_size_bytes, vector_data_bytes, regular_data_bytes, vector_storage_pct ) VALUES ( rec.tablename, total_size, row_cnt, vector_size, row_cnt * vector_size, GREATEST(total_size - (row_cnt * vector_size), 0), ROUND((row_cnt * vector_size * 100.0) / NULLIF(total_size, 0), 2) ); END LOOP; END $$; -- Create index analysis table CREATE TABLE vector_index_analysis ( index_name VARCHAR(100), table_name VARCHAR(100), column_name VARCHAR(100), index_type VARCHAR(50), index_size_bytes BIGINT, index_parameters TEXT, is_valid BOOLEAN ); -- Populate index analysis with actual vector indexes INSERT INTO vector_index_analysis (index_name, table_name, column_name, index_type, index_size_bytes, is_valid) SELECT i.indexname as index_name, i.tablename as table_name, 'embedding' as column_name, -- Known from our setup CASE WHEN i.indexdef ILIKE '%hnsw%' THEN 'hnsw' WHEN i.indexdef ILIKE '%ivfflat%' THEN 'ivfflat' ELSE 'unknown' END as index_type, pg_relation_size(format('public.%I', i.indexname)) as index_size_bytes, true as is_valid FROM pg_indexes i WHERE (i.indexdef ILIKE '%vector%' OR i.indexdef ILIKE '%hnsw%' OR i.indexdef ILIKE '%ivfflat%') AND i.tablename IN ('documents', 'document_chunks', 'user_queries') ORDER BY i.tablename, i.indexname; -- Create data quality analysis table CREATE TABLE vector_data_quality ( table_name VARCHAR(100), column_name VARCHAR(100), quality_check_type VARCHAR(50), total_records BIGINT, issue_count BIGINT, quality_status VARCHAR(20), details TEXT ); -- Populate data quality analysis with actual checks DO $$ DECLARE rec RECORD; total_cnt BIGINT; null_cnt BIGINT; BEGIN FOR rec IN SELECT tablename FROM pg_tables WHERE tablename IN ('documents', 'document_chunks', 'user_queries') LOOP -- Count total records EXECUTE format('SELECT COUNT(*) FROM %I', rec.tablename) INTO total_cnt; -- Count NULL vectors EXECUTE format('SELECT COUNT(*) FROM %I WHERE embedding IS NULL', rec.tablename) INTO null_cnt; -- Insert NULL_CHECK result INSERT INTO vector_data_quality ( table_name, column_name, quality_check_type, total_records, issue_count, quality_status ) VALUES ( rec.tablename, 'embedding', 'NULL_CHECK', total_cnt, null_cnt, CASE WHEN null_cnt = 0 THEN 'GOOD' ELSE 'WARNING' END ); -- Insert DIMENSION_CHECK result (all vectors in our setup are 1536-dimensional) INSERT INTO vector_data_quality ( table_name, column_name, quality_check_type, total_records, issue_count, quality_status ) VALUES ( rec.tablename, 'embedding', 'DIMENSION_CHECK', total_cnt - null_cnt, 0, 'GOOD' ); END LOOP; END $$; -- ============================================================================ -- GROUND TRUTH IMPLEMENTATION -- ============================================================================ -- This is the correct analysis implementation that candidates should produce -- Create vector_analysis_columns table and populate it CREATE TABLE vector_analysis_columns ( schema VARCHAR(50), table_name VARCHAR(100), column_name VARCHAR(100), dimensions INTEGER, data_type VARCHAR(50), has_constraints BOOLEAN, rows BIGINT ); -- Discover and insert vector 
columns INSERT INTO vector_analysis_columns (schema, table_name, column_name, dimensions, data_type, has_constraints, rows) SELECT 'public' as schema, c.table_name, c.column_name, 1536 as dimensions, -- pgvector embedding dimension 'USER-DEFINED' as data_type, false as has_constraints, -- Get actual row count using dynamic query CASE c.table_name WHEN 'documents' THEN (SELECT COUNT(*) FROM documents) WHEN 'document_chunks' THEN (SELECT COUNT(*) FROM document_chunks) WHEN 'user_queries' THEN (SELECT COUNT(*) FROM user_queries) ELSE 0 END as rows FROM information_schema.columns c WHERE c.data_type = 'USER-DEFINED' AND c.udt_name = 'vector' ORDER BY c.table_name, c.column_name; -- Create vector_analysis_storage_consumption table CREATE TABLE vector_analysis_storage_consumption ( schema VARCHAR(50), table_name VARCHAR(100), total_size_bytes BIGINT, vector_data_bytes BIGINT, regular_data_bytes BIGINT, vector_storage_pct NUMERIC(5,2), row_count BIGINT ); -- Populate storage analysis for vector tables DO $$ DECLARE rec RECORD; total_size BIGINT; row_cnt BIGINT; vector_size INTEGER := 1536 * 4; -- 1536 dimensions * 4 bytes per float BEGIN FOR rec IN SELECT DISTINCT c.table_name FROM information_schema.columns c WHERE c.data_type = 'USER-DEFINED' AND c.udt_name = 'vector' LOOP -- Get actual row count EXECUTE format('SELECT COUNT(*) FROM %I', rec.table_name) INTO row_cnt; -- Get actual table size SELECT pg_total_relation_size(format('public.%I', rec.table_name)) INTO total_size; -- Insert analysis results INSERT INTO vector_analysis_storage_consumption ( schema, table_name, total_size_bytes, vector_data_bytes, regular_data_bytes, vector_storage_pct, row_count ) VALUES ( 'public', rec.table_name, total_size, row_cnt * vector_size, GREATEST(total_size - (row_cnt * vector_size), 0), ROUND((row_cnt * vector_size * 100.0) / NULLIF(total_size, 0), 2), row_cnt ); END LOOP; END $$; -- Create vector_analysis_indices table CREATE TABLE vector_analysis_indices ( schema VARCHAR(50), table_name VARCHAR(100), column_name VARCHAR(100), index_name VARCHAR(100), index_type VARCHAR(50), index_size_bytes BIGINT ); -- Populate index analysis for vector indexes INSERT INTO vector_analysis_indices (schema, table_name, column_name, index_name, index_type, index_size_bytes) SELECT i.schemaname as schema, i.tablename as table_name, 'embedding' as column_name, -- known from our setup i.indexname as index_name, CASE WHEN i.indexdef ILIKE '%hnsw%' THEN 'hnsw' WHEN i.indexdef ILIKE '%ivfflat%' THEN 'ivfflat' ELSE 'unknown' END as index_type, pg_relation_size(format('public.%I', i.indexname)) as index_size_bytes FROM pg_indexes i WHERE (i.indexdef ILIKE '%hnsw%' OR i.indexdef ILIKE '%ivfflat%') AND i.tablename IN ( SELECT DISTINCT table_name FROM information_schema.columns WHERE data_type = 'USER-DEFINED' AND udt_name = 'vector' ) ORDER BY i.tablename, i.indexname; COMMIT; -- ============================================================================ -- VERIFICATION HELPER QUERIES -- ============================================================================ -- Query to check actual vector columns in the database /* SELECT table_schema, table_name, column_name, data_type, udt_name FROM information_schema.columns WHERE data_type = 'USER-DEFINED' AND udt_name = 'vector' ORDER BY table_name, column_name; */ -- Query to check actual vector indexes /* SELECT schemaname, tablename, indexname, indexdef FROM pg_indexes WHERE indexdef ILIKE '%vector%' OR indexdef ILIKE '%hnsw%' OR indexdef ILIKE '%ivfflat%' ORDER BY tablename, 
-- ============================================================================
-- VERIFICATION HELPER QUERIES
-- ============================================================================

-- Query to check actual vector columns in the database
/*
SELECT table_schema, table_name, column_name, data_type, udt_name
FROM information_schema.columns
WHERE data_type = 'USER-DEFINED' AND udt_name = 'vector'
ORDER BY table_name, column_name;
*/

-- Query to check actual vector indexes
/*
SELECT schemaname, tablename, indexname, indexdef
FROM pg_indexes
WHERE indexdef ILIKE '%vector%' OR indexdef ILIKE '%hnsw%' OR indexdef ILIKE '%ivfflat%'
ORDER BY tablename, indexname;
*/

-- Query to check table row counts
/*
SELECT 'documents' as table_name, COUNT(*) as row_count FROM documents
UNION ALL
SELECT 'document_chunks' as table_name, COUNT(*) as row_count FROM document_chunks
UNION ALL
SELECT 'user_queries' as table_name, COUNT(*) as row_count FROM user_queries
ORDER BY table_name;
*/

-- Query to check pgvector extension
/*
SELECT extname, extversion FROM pg_extension WHERE extname = 'vector';
*/


================================================
FILE: tasks/postgres/standard/vectors/dba_vector_analysis/meta.json
================================================
{
  "task_id": "dba_vector_analysis",
  "task_name": "DBA Vector Analysis",
  "category_id": "vectors",
  "category_name": "Vectors",
  "description": "Analyze pgvector database storage, identify vector columns, assess space utilization and performance for RAG applications.",
  "author": "Fanshi Zhang",
  "created_at": "2025-08-18",
  "difficulty": "L3",
  "tags": [
    "performance optimization",
    "audit and compliance",
    "statistical aggregation"
  ],
  "mcp": [
    "postgres"
  ],
  "meta_data": {
    "stateType": "text",
    "stateContent": "Table \"documents\" {\n \"id\" int4 [pk, not null, increment]\n \"title\" text [not null]\n \"content\" text [not null]\n \"source_url\" text\n \"document_type\" varchar(50) [default: 'article']\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"word_count\" int4\n \"embedding\" public.vector\n\n Indexes {\n created_at [type: btree, name: \"documents_created_idx\"]\n embedding [type: hnsw, name: \"documents_embedding_idx\"]\n title [type: btree, name: \"documents_title_idx\"]\n document_type [type: btree, name: \"documents_type_idx\"]\n }\n}\n\nTable \"document_chunks\" {\n \"id\" int4 [pk, not null, increment]\n \"document_id\" int4\n \"chunk_index\" int4 [not null]\n \"chunk_text\" text [not null]\n \"chunk_size\" int4\n \"overlap_size\" int4 [default: 0]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"embedding\" public.vector\n\n Indexes {\n document_id [type: btree, name: \"chunks_doc_id_idx\"]\n embedding [type: hnsw, name: \"chunks_embedding_idx\"]\n chunk_index [type: btree, name: \"chunks_index_idx\"]\n }\n}\n\nTable \"user_queries\" {\n \"id\" int4 [pk, not null, increment]\n \"query_text\" text [not null]\n \"user_id\" varchar(100)\n \"session_id\" varchar(100)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"response_time_ms\" int4\n \"embedding\" public.vector\n\n Indexes {\n created_at [type: btree, name: \"queries_created_idx\"]\n embedding [type: hnsw, name: \"queries_embedding_idx\"]\n user_id [type: btree, name: \"queries_user_idx\"]\n }\n}\n\nTable \"embedding_models\" {\n \"id\" int4 [pk, not null, increment]\n \"model_name\" varchar(100) [unique, not null]\n \"provider\" varchar(50) [not null]\n \"dimensions\" int4 [not null]\n \"max_tokens\" int4\n \"cost_per_token\" numeric(10,8)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"is_active\" bool [default: true]\n}\n\nTable \"knowledge_base\" {\n \"id\" int4 [pk, not null, increment]\n \"kb_name\" varchar(100) [not null]\n \"description\" text\n \"domain\" varchar(50)\n \"language\" varchar(10) [default: 'en']\n \"total_documents\" int4 [default: 0]\n \"total_chunks\" int4 [default: 0]\n \"total_storage_mb\" numeric(10,2)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n}\n\nTable \"search_cache\" {\n \"id\" int4 [pk, not null, increment]\n \"query_hash\" varchar(64) [not null]\n \"query_text\" text [not null]\n \"results_json\" jsonb\n \"result_count\" int4\n \"search_time_ms\" int4\n \"similarity_threshold\" numeric(4,3)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"expires_at\" timestamp\n\n Indexes {\n expires_at [type: btree, name: \"cache_expires_idx\"]\n query_hash [type: btree, name: \"cache_hash_idx\"]\n }\n}\n\nRef \"document_chunks_document_id_fkey\":\"documents\".\"id\" < \"document_chunks\".\"document_id\" [delete: cascade]\n",
    "stateUrl": null,
    "stateOriginalUrl": null
  }
}
\"query_hash\" varchar(64) [not null]\n \"query_text\" text [not null]\n \"results_json\" jsonb\n \"result_count\" int4\n \"search_time_ms\" int4\n \"similarity_threshold\" numeric(4,3)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"expires_at\" timestamp\n\n Indexes {\n expires_at [type: btree, name: \"cache_expires_idx\"]\n query_hash [type: btree, name: \"cache_hash_idx\"]\n }\n}\n\nRef \"document_chunks_document_id_fkey\":\"documents\".\"id\" < \"document_chunks\".\"document_id\" [delete: cascade]\n", "stateUrl": null, "stateOriginalUrl": null } } ================================================ FILE: tasks/postgres/standard/vectors/dba_vector_analysis/prepare_environment.py ================================================ """ Environment preparation script for Vector Database DBA Analysis task. This script imports and uses the shared vector database setup utilities. """ import sys import logging from pathlib import Path # Add the vectors directory to import the shared utilities sys.path.append(str(Path(__file__).resolve().parents[1])) from vectors_setup import prepare_vector_environment logger = logging.getLogger(__name__) def prepare_environment(): """Main function to prepare the vector database environment.""" prepare_vector_environment() if __name__ == "__main__": logging.basicConfig(level=logging.INFO) prepare_environment() ================================================ FILE: tasks/postgres/standard/vectors/dba_vector_analysis/verify.py ================================================ """ Verification script for Vector Database DBA Analysis task. This script verifies that the candidate has properly analyzed the vector database and stored their findings in appropriate result tables. """ import logging import psycopg2 import os import sys from typing import Dict, Any logger = logging.getLogger(__name__) def get_connection_params(): """Get database connection parameters from environment variables.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD"), } def verify_vector_analysis_columns(conn) -> Dict[str, Any]: """Verify the vector_analysis_columns table exists, has correct columns, and contains actual vector columns from the database.""" results = {'passed': False, 'issues': []} expected_columns = [ 'schema', 'table_name', 'column_name', 'dimensions', 'data_type', 'has_constraints', 'rows' ] try: with conn.cursor() as cur: # Check if table exists cur.execute(""" SELECT EXISTS ( SELECT FROM information_schema.tables WHERE table_name = 'vector_analysis_columns' ); """) if not cur.fetchone()[0]: results['issues'].append("vector_analysis_columns table not found") return results # Check columns cur.execute(""" SELECT column_name FROM information_schema.columns WHERE table_name = 'vector_analysis_columns' ORDER BY column_name; """) actual_columns = {row[0] for row in cur.fetchall()} missing = set(expected_columns) - actual_columns extra = actual_columns - set(expected_columns) if missing: results['issues'].append(f"Missing columns: {missing}") if extra: results['issues'].append(f"Unexpected columns: {extra}") # Check for data cur.execute("SELECT COUNT(*) FROM vector_analysis_columns;") count = cur.fetchone()[0] if count == 0: results['issues'].append("No rows found in vector_analysis_columns") return results # Get actual vector columns from the database cur.execute(""" SELECT table_name, 

def verify_vector_analysis_storage_consumption(conn) -> Dict[str, Any]:
    """Verify the vector_analysis_storage_consumption table exists, has correct columns,
    and analyzes actual vector tables."""
    results = {'passed': False, 'issues': []}
    expected_columns = [
        'schema', 'table_name', 'total_size_bytes', 'vector_data_bytes',
        'regular_data_bytes', 'vector_storage_pct', 'row_count'
    ]

    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT EXISTS (
                    SELECT FROM information_schema.tables
                    WHERE table_name = 'vector_analysis_storage_consumption'
                );
            """)
            if not cur.fetchone()[0]:
                results['issues'].append("vector_analysis_storage_consumption table not found")
                return results

            cur.execute("""
                SELECT column_name FROM information_schema.columns
                WHERE table_name = 'vector_analysis_storage_consumption'
                ORDER BY column_name;
            """)
            actual_columns = {row[0] for row in cur.fetchall()}
            missing = set(expected_columns) - actual_columns
            extra = actual_columns - set(expected_columns)
            if missing:
                results['issues'].append(f"Missing columns: {missing}")
            if extra:
                results['issues'].append(f"Unexpected columns: {extra}")

            cur.execute("SELECT COUNT(*) FROM vector_analysis_storage_consumption;")
            count = cur.fetchone()[0]
            if count == 0:
                results['issues'].append("No rows found in vector_analysis_storage_consumption")
                return results

            # Get actual tables with vector columns
            cur.execute("""
                SELECT DISTINCT table_name FROM information_schema.columns
                WHERE data_type = 'USER-DEFINED' AND udt_name = 'vector'
                ORDER BY table_name;
            """)
            actual_vector_tables = {row[0] for row in cur.fetchall()}

            # Get what the agent analyzed
            cur.execute("""
                SELECT DISTINCT table_name FROM vector_analysis_storage_consumption
                ORDER BY table_name;
            """)
            analyzed_tables = {row[0] for row in cur.fetchall()}

            # Check if agent analyzed the actual vector tables
            missing_tables = actual_vector_tables - analyzed_tables
            if missing_tables:
                results['issues'].append(f"Agent missed analyzing vector tables: {missing_tables}")

            # Check that analyzed tables actually have vector columns
            extra_tables = analyzed_tables - actual_vector_tables
            if extra_tables:
                results['issues'].append(f"Agent analyzed non-vector tables: {extra_tables}")

            if not missing and not extra and count > 0 and not missing_tables and not extra_tables:
                results['passed'] = True

    except psycopg2.Error as e:
        results['issues'].append(f"Database error: {e}")
    except Exception as e:
        results['issues'].append(f"Verification error: {e}")

    return results


def verify_vector_analysis_indices(conn) -> Dict[str, Any]:
    """Verify the vector_analysis_indices table exists, has correct columns,
    and identifies actual vector indexes."""
    results = {'passed': False, 'issues': []}
    expected_columns = [
        'schema', 'table_name', 'column_name', 'index_name', 'index_type', 'index_size_bytes'
    ]

    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT EXISTS (
                    SELECT FROM information_schema.tables
                    WHERE table_name = 'vector_analysis_indices'
                );
            """)
            if not cur.fetchone()[0]:
                results['issues'].append("vector_analysis_indices table not found")
                return results

            cur.execute("""
                SELECT column_name FROM information_schema.columns
                WHERE table_name = 'vector_analysis_indices'
                ORDER BY column_name;
            """)
            actual_columns = {row[0] for row in cur.fetchall()}
            missing = set(expected_columns) - actual_columns
            extra = actual_columns - set(expected_columns)
            if missing:
                results['issues'].append(f"Missing columns: {missing}")
            if extra:
                results['issues'].append(f"Unexpected columns: {extra}")

            cur.execute("SELECT COUNT(*) FROM vector_analysis_indices;")
            count = cur.fetchone()[0]
            if count == 0:
                results['issues'].append("No rows found in vector_analysis_indices")
                return results

            # Get actual vector indexes from the database (exclude ground truth table indexes)
            cur.execute("""
                SELECT schemaname, tablename, indexname FROM pg_indexes
                WHERE (indexdef ILIKE '%hnsw%' OR indexdef ILIKE '%ivfflat%')
                  AND tablename NOT LIKE '%analysis%'
                ORDER BY tablename, indexname;
            """)
            actual_vector_indexes = set(cur.fetchall())

            # Get what the agent found
            cur.execute("""
                SELECT schema, table_name, index_name FROM vector_analysis_indices
                ORDER BY table_name, index_name;
            """)
            found_indexes = set(cur.fetchall())

            # Check if agent found the actual vector indexes
            missing_indexes = actual_vector_indexes - found_indexes
            if missing_indexes:
                results['issues'].append(f"Agent missed vector indexes: {missing_indexes}")

            # Allow agent to find more indexes than just vector ones (they might include related indexes)
            # but at least they should find the vector-specific ones
            if not missing and not extra and count > 0 and not missing_indexes:
                results['passed'] = True

    except psycopg2.Error as e:
        results['issues'].append(f"Database error: {e}")
    except Exception as e:
        results['issues'].append(f"Verification error: {e}")

    return results
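
# The three verifiers above share one comparison pattern: query the live catalogs
# for ground truth, query the candidate's result table, and diff the two as sets.
# A minimal, hypothetical illustration of that logic (values are made up):
#
#     expected = {('documents', 'embedding'), ('document_chunks', 'embedding')}
#     reported = {('documents', 'embedding')}
#     missing, extra = expected - reported, reported - expected
#     # -> missing == {('document_chunks', 'embedding')}, extra == set()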

def verify_no_extra_analysis_tables(conn) -> Dict[str, Any]:
    """Check that only the required analysis tables exist (no legacy/extra analysis tables)."""
    results = {'passed': True, 'issues': []}  # Start with passed=True, more lenient
    required = {
        'vector_analysis_columns',
        'vector_analysis_storage_consumption',
        'vector_analysis_indices',
    }

    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT table_name FROM information_schema.tables
                WHERE table_schema = 'public' AND table_name LIKE 'vector_analysis_%';
            """)
            analysis_tables = {row[0] for row in cur.fetchall()}

            # Only flag as issue if there are analysis tables that don't match our required set
            # Exclude ground truth tables from this check
            analysis_tables_filtered = {
                t for t in analysis_tables
                if not t.startswith('expected_') and not t.startswith('vector_analysis_results')
            }
            extra = analysis_tables_filtered - required
            if extra:
                results['issues'].append(f"Found unexpected analysis tables: {extra}")
                results['passed'] = False

    except Exception as e:
        results['issues'].append(f"Verification error: {e}")
        results['passed'] = False

    return results


def main():
    """Main verification function for vector analysis deliverables."""
    conn_params = get_connection_params()
    if not conn_params["database"]:
        print("No database specified")
        sys.exit(1)

    try:
        conn = psycopg2.connect(**conn_params)
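        # The checks list that follows pairs each deliverable with its verifier; per the
        # threshold at the end of main(), the run is accepted when at least 75% of the
        # checks pass (all four passing also exits 0).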
checks = [ ("vector_analysis_columns", verify_vector_analysis_columns), ("vector_analysis_storage_consumption", verify_vector_analysis_storage_consumption), ("vector_analysis_indices", verify_vector_analysis_indices), ("no_extra_analysis_tables", verify_no_extra_analysis_tables), ] passed_checks = 0 all_issues = [] for i, (desc, check_func) in enumerate(checks, 1): result = check_func(conn) if result['passed']: print(f" PASSED") passed_checks += 1 else: print(f" FAILED") for issue in result['issues']: print(f" - {issue}") all_issues.extend(result['issues']) print() conn.close() total_checks = len(checks) print(f"Results: {passed_checks}/{total_checks} checks passed") if passed_checks == total_checks: sys.exit(0) elif passed_checks >= total_checks * 0.75: sys.exit(0) else: sys.exit(1) except psycopg2.Error as e: print(f"Database connection error: {e}") sys.exit(1) except Exception as e: print(f"Verification error: {e}") sys.exit(1) if __name__ == "__main__": main() ================================================ FILE: tasks/postgres/standard/vectors/vectors_setup.py ================================================ """ Shared Vector Database Setup Utilities This module provides utilities for setting up a complete PostgreSQL database with pgvector extension and sample RAG-related tables with vector data. Used by all vector database tasks. """ import os import logging import psycopg2 import json import random import numpy as np from typing import List logger = logging.getLogger(__name__) def get_connection_params(): """Get database connection parameters from environment variables.""" return { 'host': os.getenv('POSTGRES_HOST', 'localhost'), 'port': os.getenv('POSTGRES_PORT', '5432'), 'user': os.getenv('POSTGRES_USERNAME', 'postgres'), 'password': os.getenv('POSTGRES_PASSWORD', 'password'), 'database': os.getenv('POSTGRES_DATABASE', 'postgres') } def generate_mock_embedding(dimensions: int = 1536) -> List[float]: """Generate a mock embedding vector with specified dimensions.""" # Generate random values between -1 and 1, then normalize vector = np.random.uniform(-1, 1, dimensions) # Normalize to unit vector (common practice for embeddings) norm = np.linalg.norm(vector) if norm > 0: vector = vector / norm return vector.tolist() def create_vector_extension(): """Create the pgvector extension.""" conn_params = get_connection_params() try: conn = psycopg2.connect(**conn_params) conn.autocommit = True with conn.cursor() as cur: logger.info("Creating pgvector extension...") cur.execute("CREATE EXTENSION IF NOT EXISTS vector;") logger.info("pgvector extension created successfully") conn.close() except psycopg2.Error as e: logger.error(f"Failed to create pgvector extension: {e}") raise def create_vector_tables(): """Create sample tables with vector columns for RAG applications.""" conn_params = get_connection_params() try: conn = psycopg2.connect(**conn_params) conn.autocommit = True with conn.cursor() as cur: logger.info("Creating vector database tables...") # Create documents table for document embeddings cur.execute(""" CREATE TABLE IF NOT EXISTS documents ( id SERIAL PRIMARY KEY, title TEXT NOT NULL, content TEXT NOT NULL, source_url TEXT, document_type VARCHAR(50) DEFAULT 'article', created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, word_count INTEGER, embedding vector(1536) ); """) # Create chunks table for document chunks (common in RAG) cur.execute(""" CREATE TABLE IF NOT EXISTS document_chunks ( id SERIAL PRIMARY KEY, document_id INTEGER REFERENCES 
documents(id) ON DELETE CASCADE, chunk_index INTEGER NOT NULL, chunk_text TEXT NOT NULL, chunk_size INTEGER, overlap_size INTEGER DEFAULT 0, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, embedding vector(1536) ); """) # Create queries table for storing user queries and their embeddings cur.execute(""" CREATE TABLE IF NOT EXISTS user_queries ( id SERIAL PRIMARY KEY, query_text TEXT NOT NULL, user_id VARCHAR(100), session_id VARCHAR(100), created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, response_time_ms INTEGER, embedding vector(1536) ); """) # Create embeddings metadata table cur.execute(""" CREATE TABLE IF NOT EXISTS embedding_models ( id SERIAL PRIMARY KEY, model_name VARCHAR(100) NOT NULL UNIQUE, provider VARCHAR(50) NOT NULL, dimensions INTEGER NOT NULL, max_tokens INTEGER, cost_per_token DECIMAL(10, 8), created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, is_active BOOLEAN DEFAULT TRUE ); """) # Create knowledge base table cur.execute(""" CREATE TABLE IF NOT EXISTS knowledge_base ( id SERIAL PRIMARY KEY, kb_name VARCHAR(100) NOT NULL, description TEXT, domain VARCHAR(50), language VARCHAR(10) DEFAULT 'en', total_documents INTEGER DEFAULT 0, total_chunks INTEGER DEFAULT 0, total_storage_mb DECIMAL(10, 2), created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); """) # Create similarity search results cache cur.execute(""" CREATE TABLE IF NOT EXISTS search_cache ( id SERIAL PRIMARY KEY, query_hash VARCHAR(64) NOT NULL, query_text TEXT NOT NULL, results_json JSONB, result_count INTEGER, search_time_ms INTEGER, similarity_threshold DECIMAL(4, 3), created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, expires_at TIMESTAMP ); """) logger.info("Vector database tables created successfully") conn.close() except psycopg2.Error as e: logger.error(f"Failed to create vector tables: {e}") raise def create_vector_indexes(): """Create indexes for vector columns and other frequently queried fields.""" conn_params = get_connection_params() try: conn = psycopg2.connect(**conn_params) conn.autocommit = True with conn.cursor() as cur: logger.info("Creating vector indexes...") # Vector indexes using HNSW (Hierarchical Navigable Small World) indexes = [ ("documents_embedding_idx", "documents", "embedding", "hnsw"), ("chunks_embedding_idx", "document_chunks", "embedding", "hnsw"), ("queries_embedding_idx", "user_queries", "embedding", "hnsw"), ] for idx_name, table_name, column_name, method in indexes: try: if method == "hnsw": cur.execute(f""" CREATE INDEX IF NOT EXISTS {idx_name} ON {table_name} USING hnsw ({column_name} vector_cosine_ops); """) else: cur.execute(f""" CREATE INDEX IF NOT EXISTS {idx_name} ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100); """) logger.info(f"Created index {idx_name} on {table_name}") except psycopg2.Error as e: logger.warning(f"Could not create {method} index {idx_name}: {e}") # Try with IVFFlat as fallback if method == "hnsw": try: cur.execute(f""" CREATE INDEX IF NOT EXISTS {idx_name}_ivf ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100); """) logger.info(f"Created fallback IVFFlat index {idx_name}_ivf on {table_name}") except psycopg2.Error as e2: logger.warning(f"Could not create fallback index: {e2}") # Regular indexes for performance regular_indexes = [ ("documents_title_idx", "documents", "title"), ("documents_type_idx", "documents", "document_type"), ("documents_created_idx", "documents", "created_at"), ("chunks_doc_id_idx", "document_chunks", "document_id"), 
("chunks_index_idx", "document_chunks", "chunk_index"), ("queries_user_idx", "user_queries", "user_id"), ("queries_created_idx", "user_queries", "created_at"), ("cache_hash_idx", "search_cache", "query_hash"), ("cache_expires_idx", "search_cache", "expires_at"), ] for idx_name, table_name, column_name in regular_indexes: try: cur.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table_name} ({column_name});") logger.debug(f"Created regular index {idx_name}") except psycopg2.Error as e: logger.warning(f"Could not create regular index {idx_name}: {e}") logger.info("Vector indexes created successfully") conn.close() except psycopg2.Error as e: logger.error(f"Failed to create vector indexes: {e}") raise def insert_sample_data(): """Insert sample data into vector tables.""" conn_params = get_connection_params() try: conn = psycopg2.connect(**conn_params) conn.autocommit = True with conn.cursor() as cur: logger.info("Inserting sample data...") # Insert embedding models embedding_models = [ ('text-embedding-3-small', 'OpenAI', 1536, 8192, 0.00000002, True), ('text-embedding-3-large', 'OpenAI', 3072, 8192, 0.00000013, True), ('text-embedding-ada-002', 'OpenAI', 1536, 8192, 0.00000010, False), ('all-MiniLM-L6-v2', 'Sentence-Transformers', 384, 512, 0.0, True), ('all-mpnet-base-v2', 'Sentence-Transformers', 768, 514, 0.0, True), ] for model_data in embedding_models: cur.execute(""" INSERT INTO embedding_models (model_name, provider, dimensions, max_tokens, cost_per_token, is_active) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (model_name) DO NOTHING; """, model_data) # Insert knowledge bases knowledge_bases = [ ('Technical Documentation', 'Software engineering and API documentation', 'technology'), ('Research Papers', 'Academic papers and research publications', 'research'), ('Customer Support', 'FAQ and troubleshooting guides', 'support'), ('Product Catalog', 'Product descriptions and specifications', 'commerce'), ('Legal Documents', 'Contracts, policies, and legal texts', 'legal'), ] kb_ids = [] for kb_data in knowledge_bases: cur.execute(""" INSERT INTO knowledge_base (kb_name, description, domain, total_documents, total_chunks, total_storage_mb) VALUES (%s, %s, %s, %s, %s, %s) RETURNING id; """, kb_data + (random.randint(50, 500), random.randint(200, 2000), round(random.uniform(10.5, 250.8), 2))) kb_ids.append(cur.fetchone()[0]) # Insert sample documents sample_documents = [ ("PostgreSQL Performance Tuning", "Comprehensive guide to optimizing PostgreSQL database performance including indexing strategies, query optimization, and configuration tuning.", "https://example.com/pg-performance", "technical_guide"), ("Vector Similarity Search", "Understanding vector embeddings and similarity search algorithms for AI applications and recommendation systems.", "https://example.com/vector-search", "technical_guide"), ("RAG Implementation Best Practices", "Best practices for implementing Retrieval-Augmented Generation systems using vector databases and large language models.", "https://example.com/rag-practices", "best_practices"), ("Database Security Guidelines", "Security considerations and implementation guidelines for PostgreSQL databases in production environments.", "https://example.com/db-security", "security_guide"), ("Machine Learning with SQL", "Integrating machine learning workflows with SQL databases and leveraging database extensions for AI applications.", "https://example.com/ml-sql", "tutorial"), ("API Documentation Standards", "Standards and best practices for creating comprehensive and 
user-friendly API documentation.", "https://example.com/api-docs", "documentation"), ("Microservices Architecture", "Design patterns and implementation strategies for microservices architecture in modern applications.", "https://example.com/microservices", "architecture_guide"), ("Data Pipeline Optimization", "Optimizing data processing pipelines for scalability, reliability, and performance in enterprise environments.", "https://example.com/data-pipelines", "optimization_guide"), ("Cloud Database Migration", "Step-by-step guide for migrating on-premises databases to cloud infrastructure with minimal downtime.", "https://example.com/cloud-migration", "migration_guide"), ("NoSQL vs SQL Comparison", "Detailed comparison of NoSQL and SQL databases, including use cases, performance characteristics, and selection criteria.", "https://example.com/nosql-sql", "comparison_guide"), ] doc_ids = [] for title, content, url, doc_type in sample_documents: embedding = generate_mock_embedding(1536) word_count = len(content.split()) cur.execute(""" INSERT INTO documents (title, content, source_url, document_type, word_count, embedding) VALUES (%s, %s, %s, %s, %s, %s) RETURNING id; """, (title, content, url, doc_type, word_count, embedding)) doc_ids.append(cur.fetchone()[0]) # Insert document chunks chunk_count = 0 for doc_id in doc_ids: # Generate 3-7 chunks per document num_chunks = random.randint(3, 7) for chunk_idx in range(num_chunks): chunk_text = f"This is chunk {chunk_idx + 1} of document {doc_id}. " + \ "It contains relevant information that would be useful for similarity search and RAG applications. " + \ "The content includes technical details, examples, and best practices." chunk_size = len(chunk_text) overlap_size = random.randint(20, 50) if chunk_idx > 0 else 0 embedding = generate_mock_embedding(1536) cur.execute(""" INSERT INTO document_chunks (document_id, chunk_index, chunk_text, chunk_size, overlap_size, embedding) VALUES (%s, %s, %s, %s, %s, %s); """, (doc_id, chunk_idx, chunk_text, chunk_size, overlap_size, embedding)) chunk_count += 1 # Insert sample user queries sample_queries = [ ("How to optimize PostgreSQL performance?", "user123", "session_abc1"), ("What are vector embeddings?", "user456", "session_def2"), ("Best practices for RAG implementation", "user789", "session_ghi3"), ("Database security checklist", "user123", "session_abc2"), ("Machine learning with databases", "user456", "session_def3"), ("API documentation examples", "user321", "session_jkl1"), ("Microservices design patterns", "user654", "session_mno2"), ("Data pipeline best practices", "user987", "session_pqr3"), ("Cloud migration strategies", "user111", "session_stu4"), ("NoSQL vs SQL databases", "user222", "session_vwx5"), ] for query_text, user_id, session_id in sample_queries: embedding = generate_mock_embedding(1536) response_time = random.randint(50, 500) cur.execute(""" INSERT INTO user_queries (query_text, user_id, session_id, response_time_ms, embedding) VALUES (%s, %s, %s, %s, %s); """, (query_text, user_id, session_id, response_time, embedding)) # Insert some search cache entries for i in range(5): query_hash = f"hash_{random.randint(100000, 999999)}" query_text = f"Sample cached query {i + 1}" results = [{"doc_id": random.randint(1, len(doc_ids)), "similarity": round(random.uniform(0.7, 0.95), 3)} for _ in range(3)] result_count = len(results) search_time = random.randint(10, 100) threshold = round(random.uniform(0.6, 0.8), 3) cur.execute(""" INSERT INTO search_cache (query_hash, query_text, results_json, 
result_count, search_time_ms, similarity_threshold) VALUES (%s, %s, %s, %s, %s, %s); """, (query_hash, query_text, json.dumps(results), result_count, search_time, threshold)) logger.info(f"Sample data inserted successfully:") logger.info(f" {len(sample_documents)} documents") logger.info(f" {chunk_count} document chunks") logger.info(f" {len(sample_queries)} user queries") logger.info(f" {len(embedding_models)} embedding models") logger.info(f" {len(knowledge_bases)} knowledge bases") conn.close() except psycopg2.Error as e: logger.error(f"Failed to insert sample data: {e}") raise def verify_vector_setup(): """Verify that the vector database was set up correctly.""" conn_params = get_connection_params() try: conn = psycopg2.connect(**conn_params) with conn.cursor() as cur: logger.info("Verifying vector database setup...") # Check extension cur.execute("SELECT extname FROM pg_extension WHERE extname = 'vector';") if cur.fetchone(): logger.info("pgvector extension is installed") else: logger.error("pgvector extension not found") return False # Check tables and record counts tables_to_check = [ 'documents', 'document_chunks', 'user_queries', 'embedding_models', 'knowledge_base', 'search_cache' ] table_counts = {} for table in tables_to_check: cur.execute(f'SELECT COUNT(*) FROM {table}') count = cur.fetchone()[0] table_counts[table] = count logger.info(f"Table {table}: {count} records") # Check vector columns cur.execute(""" SELECT table_name, column_name, data_type FROM information_schema.columns WHERE data_type = 'USER-DEFINED' AND udt_name = 'vector' ORDER BY table_name, column_name; """) vector_columns = cur.fetchall() logger.info(f"Found {len(vector_columns)} vector columns:") for table, column, dtype in vector_columns: logger.info(f" {table}.{column} ({dtype})") # Check indexes cur.execute(""" SELECT schemaname, tablename, indexname, indexdef FROM pg_indexes WHERE indexdef LIKE '%vector%' OR indexdef LIKE '%hnsw%' OR indexdef LIKE '%ivfflat%' ORDER BY tablename, indexname; """) vector_indexes = cur.fetchall() logger.info(f"Found {len(vector_indexes)} vector indexes:") for schema, table, index, definition in vector_indexes: logger.info(f" {index} on {table}") # Test a simple vector similarity query mock_embedding = generate_mock_embedding(1536) cur.execute(""" SELECT id, title, embedding <-> %s::vector as distance FROM documents ORDER BY embedding <-> %s::vector LIMIT 3; """, (mock_embedding, mock_embedding)) results = cur.fetchall() logger.info(f"Vector similarity query returned {len(results)} results") conn.close() logger.info("Vector database verification completed successfully") return table_counts, vector_columns, vector_indexes except psycopg2.Error as e: logger.error(f"Verification failed: {e}") raise def prepare_vector_environment(): """Main function to prepare the vector database environment.""" logger.info("Preparing vector database environment...") try: # Create pgvector extension create_vector_extension() # Create vector tables create_vector_tables() # Insert sample data first insert_sample_data() # Create indexes after data insertion for better performance create_vector_indexes() # Verify the setup table_counts, vector_columns, vector_indexes = verify_vector_setup() logger.info("Vector database environment prepared successfully!") logger.info(f"Total tables created: {len(table_counts)}") logger.info(f"Total vector columns: {len(vector_columns)}") logger.info(f"Total vector indexes: {len(vector_indexes)}") return { 'table_counts': table_counts, 'vector_columns': 
vector_columns, 'vector_indexes': vector_indexes } except Exception as e: logger.error(f"Failed to prepare vector environment: {e}") raise if __name__ == "__main__": # Allow running this module directly for testing logging.basicConfig(level=logging.INFO) prepare_vector_environment() ================================================ FILE: tasks/utils/__init__.py ================================================ ================================================ FILE: tasks/utils/notion_utils.py ================================================ import os from notion_client import Client import sys from dotenv import load_dotenv def get_notion_client(): # Construct the absolute path to the .env file in the project root load_dotenv(dotenv_path=".mcp_env") api_key = os.getenv("EVAL_NOTION_API_KEY") if not api_key: print( "Error: EVAL_NOTION_API_KEY not found in environment variables.", file=sys.stderr, ) sys.exit(1) return Client(auth=api_key) def _find_object(notion: Client, title: str, object_type: str): """Generic helper to find a Notion page or database by title. Args: notion: Authenticated Notion Client. title: Title (or partial title) to search for. object_type: Either "page" or "database". Returns: The ID string if found, otherwise None. """ search_results = ( notion.search( query=title, filter={"property": "object", "value": object_type} ).get("results") or [] ) if not search_results: return None # Shortcut when there is only one match if len(search_results) == 1: return search_results[0]["id"] # Attempt to find a case-insensitive match on the title field for result in search_results: if object_type == "page": # Pages store their title inside the "properties.title.title" rich text list title_rich_texts = ( result.get("properties", {}).get("title", {}).get("title", []) ) else: # database title_rich_texts = result.get("title", []) for text_obj in title_rich_texts: if title.lower() in text_obj.get("plain_text", "").lower(): return result["id"] # Fallback: return the first result return search_results[0]["id"] def find_page(notion: Client, page_title: str): """Finds a page by title. Wrapper around _find_object with object_type='page'.""" return _find_object(notion, page_title, "page") def get_page_by_id(notion: Client, page_id: str): """Gets a page by its ID. Returns the page object if found, None otherwise.""" try: return notion.pages.retrieve(page_id=page_id) except Exception: return None def find_page_by_id(notion: Client, page_id: str): """Finds a page by its ID and returns the ID if it exists, None otherwise.""" try: notion.pages.retrieve(page_id=page_id) return page_id except Exception: return None def find_database_by_id(notion: Client, database_id: str): """Finds a database by its ID and returns the ID if it exists, None otherwise.""" try: notion.databases.retrieve(database_id=database_id) return database_id except Exception: return None def find_page_or_database_by_id(notion: Client, object_id: str): """ Finds either a page or database by ID. Returns a tuple (object_id, object_type) where object_type is either 'page' or 'database', or (None, None) if not found. """ # Try as page first try: notion.pages.retrieve(page_id=object_id) return (object_id, "page") except Exception: pass # Try as database try: notion.databases.retrieve(database_id=object_id) return (object_id, "database") except Exception: pass return (None, None) def find_database(notion: Client, db_title: str): """Finds a database by title. 
Wrapper around _find_object with object_type='database'.""" return _find_object(notion, db_title, "database") def find_database_in_block(notion: Client, block_id: str, db_title: str): """ Recursively find a database by title within a block. """ blocks = notion.blocks.children.list(block_id=block_id).get("results") for block in blocks: if ( block.get("type") == "child_database" and block.get("child_database", {}).get("title") == db_title ): return block["id"] if block.get("has_children"): db_id = find_database_in_block(notion, block["id"], db_title) if db_id: return db_id return None def get_all_blocks_recursively(notion: Client, block_id: str): """ Recursively fetches all blocks from a starting block ID and its children, returning a single flat list of block objects. """ all_blocks = [] try: direct_children = notion.blocks.children.list(block_id=block_id).get( "results", [] ) except Exception: return [] for block in direct_children: all_blocks.append(block) if block.get("has_children"): all_blocks.extend(get_all_blocks_recursively(notion, block["id"])) return all_blocks def get_block_plain_text(block): """ Safely extract plain_text from a block (paragraph, heading, etc.). """ block_type = block.get("type") if not block_type: return "" block_content = block.get(block_type) if not block_content: return "" rich_text_list = block_content.get("rich_text", []) plain_text = "".join([rt.get("plain_text", "") for rt in rich_text_list]) return plain_text ================================================ FILE: tasks/utils/postgres_utils.py ================================================ """ PostgreSQL Data Loading Utilities for MCPMark Tasks =================================================== Common utilities for loading data into PostgreSQL databases from CSV files and setting up schemas in prepare_environment.py scripts. """ import csv import os import psycopg2 from pathlib import Path from typing import Dict, List, Any, Optional import logging logger = logging.getLogger(__name__) def get_connection_params() -> dict: """Get database connection parameters from environment variables.""" return { "host": os.getenv("POSTGRES_HOST", "localhost"), "port": int(os.getenv("POSTGRES_PORT", 5432)), "database": os.getenv("POSTGRES_DATABASE"), "user": os.getenv("POSTGRES_USERNAME"), "password": os.getenv("POSTGRES_PASSWORD"), } def execute_schema_sql(conn, schema_sql: str): """Execute schema SQL with proper error handling.""" with conn.cursor() as cur: cur.execute(schema_sql) conn.commit() logger.info("✅ Database schema created successfully") def load_csv_to_table( conn, csv_file_path: Path, table_name: str, columns: Optional[List[str]] = None, skip_header: bool = True ): """ Load CSV data into a PostgreSQL table. 
Args: conn: Database connection csv_file_path: Path to CSV file table_name: Target table name columns: List of column names (if None, uses all columns) skip_header: Whether to skip the first row """ if not csv_file_path.exists(): raise FileNotFoundError(f"CSV file not found: {csv_file_path}") with conn.cursor() as cur: with open(csv_file_path, 'r', encoding='utf-8') as f: csv_reader = csv.reader(f) # Skip header if needed if skip_header: next(csv_reader) # Build COPY command if columns: copy_sql = f"COPY {table_name} ({', '.join(columns)}) FROM STDIN WITH CSV" else: copy_sql = f"COPY {table_name} FROM STDIN WITH CSV" # Reset file pointer and copy data f.seek(0) if skip_header: next(csv.reader(f)) # Skip header again cur.copy_expert(copy_sql, f) conn.commit() logger.info(f"✅ Loaded data from {csv_file_path.name} into {table_name}") def insert_data_from_dict(conn, table_name: str, data: List[Dict[str, Any]]): """ Insert data from a list of dictionaries into a table. Args: conn: Database connection table_name: Target table name data: List of dictionaries with column_name: value pairs """ if not data: return # Get column names from first record columns = list(data[0].keys()) placeholders = ', '.join(['%s'] * len(columns)) columns_str = ', '.join(columns) insert_sql = f"INSERT INTO {table_name} ({columns_str}) VALUES ({placeholders}) ON CONFLICT DO NOTHING" with conn.cursor() as cur: for row in data: values = [row[col] for col in columns] cur.execute(insert_sql, values) conn.commit() logger.info(f"✅ Inserted {len(data)} rows into {table_name}") def create_table_with_data( conn, table_name: str, schema_sql: str, data: Optional[List[Dict[str, Any]]] = None, data_from_csv: Optional[Path] = None ): """ Create a table and optionally load data. Args: conn: Database connection table_name: Table name schema_sql: CREATE TABLE SQL statement data: Optional list of dictionaries to insert data_from_csv: Optional CSV file to load """ with conn.cursor() as cur: # Create table cur.execute(schema_sql) logger.info(f"✅ Created table {table_name}") # Load data if provided if data: insert_data_from_dict(conn, table_name, data) elif data_from_csv: load_csv_to_table(conn, data_from_csv, table_name) def setup_database_with_config(setup_config: Dict[str, Any]): """ Set up database using a configuration dictionary. Args: setup_config: Dictionary with 'tables' key containing table configurations Example config: { "tables": { "artists": { "schema": "CREATE TABLE artists (id SERIAL PRIMARY KEY, name VARCHAR(120))", "data": [{"id": 1, "name": "Iron Maiden"}], "data_from_csv": "data/artists.csv" # alternative to data } } } """ conn_params = get_connection_params() if not conn_params["database"]: raise ValueError("❌ No database specified in POSTGRES_DATABASE environment variable") try: conn = psycopg2.connect(**conn_params) for table_name, config in setup_config["tables"].items(): schema_sql = config["schema"] data = config.get("data") csv_file_path = None # Handle CSV file path if "data_from_csv" in config: csv_file_path = Path(config["data_from_csv"]) if not csv_file_path.is_absolute(): # Assume relative to current working directory (task directory) csv_file_path = Path.cwd() / csv_file_path create_table_with_data( conn, table_name, schema_sql, data=data, data_from_csv=csv_file_path ) conn.close() logger.info("🎉 Database setup completed successfully") except psycopg2.Error as e: logger.error(f"❌ Database error during setup: {e}") raise except Exception as e: logger.error(f"❌ Setup error: {e}") raise
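
# Example usage (a sketch, not executed by the harness): the helpers above are meant
# to be called from a task's prepare_environment.py. The config shape mirrors the
# setup_database_with_config docstring; the import path shown is illustrative and may
# differ depending on how sys.path is arranged in a given task.
#
#     from tasks.utils.postgres_utils import setup_database_with_config
#
#     setup_database_with_config({
#         "tables": {
#             "artists": {
#                 "schema": "CREATE TABLE artists (id SERIAL PRIMARY KEY, name VARCHAR(120))",
#                 "data": [{"id": 1, "name": "Iron Maiden"}],
#             }
#         }
#     })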