Repository: Usagi-org/ai-goofish-monitor Branch: master Commit: 88aa5f7071ad Files: 283 Total size: 844.8 KB Directory structure: gitextract_pcspcjt1/ ├── .dockerignore ├── .github/ │ └── workflows/ │ ├── claude.yml │ └── docker-image.yml ├── .gitignore ├── AGENTS.md ├── CLAUDE.md ├── DISCLAIMER.md ├── Dockerfile ├── Dockerfile.base ├── Dockerfile.release ├── LICENSE ├── README.md ├── README_EN.md ├── chrome-extension/ │ ├── README.md │ ├── background.js │ ├── manifest.json │ ├── popup.html │ └── popup.js ├── config.json.example ├── desktop_launcher.py ├── docker-compose.dev.yaml ├── docker-compose.yaml ├── pyproject.toml ├── requirements-runtime.txt ├── requirements.txt ├── run_live_smoke.sh ├── spider_v2.py ├── src/ │ ├── __init__.py │ ├── ai_handler.py │ ├── ai_message_builder.py │ ├── api/ │ │ ├── __init__.py │ │ ├── dependencies.py │ │ └── routes/ │ │ ├── __init__.py │ │ ├── accounts.py │ │ ├── dashboard.py │ │ ├── login_state.py │ │ ├── logs.py │ │ ├── prompts.py │ │ ├── results.py │ │ ├── settings.py │ │ ├── tasks.py │ │ └── websocket.py │ ├── app.py │ ├── config.py │ ├── core/ │ │ └── cron_utils.py │ ├── domain/ │ │ ├── __init__.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── task.py │ │ │ └── task_generation.py │ │ └── repositories/ │ │ ├── __init__.py │ │ └── task_repository.py │ ├── failure_guard.py │ ├── infrastructure/ │ │ ├── __init__.py │ │ ├── config/ │ │ │ ├── __init__.py │ │ │ ├── env_manager.py │ │ │ └── settings.py │ │ ├── external/ │ │ │ ├── __init__.py │ │ │ ├── ai_client.py │ │ │ └── notification_clients/ │ │ │ ├── __init__.py │ │ │ ├── bark_client.py │ │ │ ├── base.py │ │ │ ├── factory.py │ │ │ ├── gotify_client.py │ │ │ ├── ntfy_client.py │ │ │ ├── telegram_client.py │ │ │ ├── webhook_client.py │ │ │ └── wecom_bot_client.py │ │ └── persistence/ │ │ ├── __init__.py │ │ ├── json_task_repository.py │ │ ├── sqlite_bootstrap.py │ │ ├── sqlite_connection.py │ │ ├── sqlite_task_repository.py │ │ └── storage_names.py │ ├── keyword_rule_engine.py 
│ ├── parsers.py │ ├── prompt_utils.py │ ├── rotation.py │ ├── scraper.py │ ├── services/ │ │ ├── __init__.py │ │ ├── account_strategy_service.py │ │ ├── ai_request_compat.py │ │ ├── ai_response_parser.py │ │ ├── ai_service.py │ │ ├── dashboard_payloads.py │ │ ├── dashboard_service.py │ │ ├── item_analysis_dispatcher.py │ │ ├── notification_config_service.py │ │ ├── notification_service.py │ │ ├── price_history_service.py │ │ ├── process_service.py │ │ ├── result_export_service.py │ │ ├── result_file_service.py │ │ ├── result_storage_service.py │ │ ├── scheduler_service.py │ │ ├── search_pagination.py │ │ ├── seller_profile_cache.py │ │ ├── task_generation_runner.py │ │ ├── task_generation_service.py │ │ ├── task_log_cleanup_service.py │ │ ├── task_payloads.py │ │ └── task_service.py │ └── utils.py ├── start.sh ├── tests/ │ ├── README.md │ ├── __init__.py │ ├── conftest.py │ ├── fixtures/ │ │ ├── config.sample.json │ │ ├── ratings.json │ │ ├── search_results.json │ │ ├── state.sample.json │ │ ├── user_head.json │ │ └── user_items.json │ ├── integration/ │ │ ├── test_api_dashboard.py │ │ ├── test_api_results.py │ │ ├── test_api_settings.py │ │ ├── test_api_tasks.py │ │ ├── test_cli_spider.py │ │ └── test_pipeline_parse.py │ ├── live/ │ │ ├── _support.py │ │ ├── conftest.py │ │ └── test_live_smoke.py │ ├── test_failure_guard.py │ ├── test_frontend_build_paths.py │ └── unit/ │ ├── test_ai_client.py │ ├── test_ai_handler_analysis.py │ ├── test_ai_handler_downloads.py │ ├── test_ai_request_compat.py │ ├── test_ai_response_parser.py │ ├── test_app_lifespan.py │ ├── test_cron_utils.py │ ├── test_domain_task.py │ ├── test_item_analysis_dispatcher.py │ ├── test_keyword_rule_engine.py │ ├── test_notification_service.py │ ├── test_price_history_service.py │ ├── test_process_service.py │ ├── test_prompt_utils.py │ ├── test_scraper_browser_channel.py │ ├── test_search_pagination.py │ ├── test_seller_profile_cache.py │ ├── test_task_log_cleanup_service.py │ └── test_utils.py ├── 
web-ui/ │ ├── .gitignore │ ├── .vscode/ │ │ └── extensions.json │ ├── Dockerfile │ ├── README.md │ ├── components.json │ ├── index.html │ ├── nginx.conf │ ├── package.json │ ├── postcss.config.cjs │ ├── src/ │ │ ├── App.vue │ │ ├── api/ │ │ │ ├── accounts.ts │ │ │ ├── dashboard.ts │ │ │ ├── logs.ts │ │ │ ├── prompts.ts │ │ │ ├── results.ts │ │ │ ├── settings.ts │ │ │ └── tasks.ts │ │ ├── assets/ │ │ │ └── main.css │ │ ├── components/ │ │ │ ├── HelloWorld.vue │ │ │ ├── layout/ │ │ │ │ ├── DashboardTaskSearch.vue │ │ │ │ ├── LocaleToggle.vue │ │ │ │ ├── TheHeader.vue │ │ │ │ └── TheSidebar.vue │ │ │ ├── results/ │ │ │ │ ├── PriceTrendChart.vue │ │ │ │ ├── ResultCard.vue │ │ │ │ ├── ResultsFilterBar.vue │ │ │ │ ├── ResultsGrid.vue │ │ │ │ └── ResultsInsightsPanel.vue │ │ │ ├── settings/ │ │ │ │ ├── NotificationSettingsPanel.vue │ │ │ │ └── RotationSettingsPanel.vue │ │ │ ├── tasks/ │ │ │ │ ├── TaskCreateDialog.vue │ │ │ │ ├── TaskForm.vue │ │ │ │ ├── TaskGenerationDialog.vue │ │ │ │ ├── TaskGenerationProgress.vue │ │ │ │ ├── TaskRegionSelector.vue │ │ │ │ └── TasksTable.vue │ │ │ └── ui/ │ │ │ ├── badge/ │ │ │ │ ├── Badge.vue │ │ │ │ └── index.ts │ │ │ ├── button/ │ │ │ │ ├── Button.vue │ │ │ │ └── index.ts │ │ │ ├── card/ │ │ │ │ ├── Card.vue │ │ │ │ ├── CardContent.vue │ │ │ │ ├── CardDescription.vue │ │ │ │ ├── CardFooter.vue │ │ │ │ ├── CardHeader.vue │ │ │ │ ├── CardTitle.vue │ │ │ │ └── index.ts │ │ │ ├── checkbox/ │ │ │ │ ├── Checkbox.vue │ │ │ │ └── index.ts │ │ │ ├── dialog/ │ │ │ │ ├── Dialog.vue │ │ │ │ ├── DialogClose.vue │ │ │ │ ├── DialogContent.vue │ │ │ │ ├── DialogDescription.vue │ │ │ │ ├── DialogFooter.vue │ │ │ │ ├── DialogHeader.vue │ │ │ │ ├── DialogScrollContent.vue │ │ │ │ ├── DialogTitle.vue │ │ │ │ ├── DialogTrigger.vue │ │ │ │ └── index.ts │ │ │ ├── input/ │ │ │ │ ├── Input.vue │ │ │ │ └── index.ts │ │ │ ├── label/ │ │ │ │ ├── Label.vue │ │ │ │ └── index.ts │ │ │ ├── select/ │ │ │ │ ├── Select.vue │ │ │ │ ├── SelectContent.vue │ │ │ │ ├── 
SelectGroup.vue │ │ │ │ ├── SelectItem.vue │ │ │ │ ├── SelectItemText.vue │ │ │ │ ├── SelectLabel.vue │ │ │ │ ├── SelectScrollDownButton.vue │ │ │ │ ├── SelectScrollUpButton.vue │ │ │ │ ├── SelectSeparator.vue │ │ │ │ ├── SelectTrigger.vue │ │ │ │ ├── SelectValue.vue │ │ │ │ └── index.ts │ │ │ ├── switch/ │ │ │ │ ├── Switch.vue │ │ │ │ └── index.ts │ │ │ ├── table/ │ │ │ │ ├── Table.vue │ │ │ │ ├── TableBody.vue │ │ │ │ ├── TableCaption.vue │ │ │ │ ├── TableCell.vue │ │ │ │ ├── TableHead.vue │ │ │ │ ├── TableHeader.vue │ │ │ │ ├── TableRow.vue │ │ │ │ └── index.ts │ │ │ ├── tabs/ │ │ │ │ ├── Tabs.vue │ │ │ │ ├── TabsContent.vue │ │ │ │ ├── TabsList.vue │ │ │ │ ├── TabsTrigger.vue │ │ │ │ └── index.ts │ │ │ ├── textarea/ │ │ │ │ ├── Textarea.vue │ │ │ │ └── index.ts │ │ │ └── toast/ │ │ │ ├── Toast.vue │ │ │ ├── ToastAction.vue │ │ │ ├── ToastClose.vue │ │ │ ├── ToastDescription.vue │ │ │ ├── ToastProvider.vue │ │ │ ├── ToastTitle.vue │ │ │ ├── ToastViewport.vue │ │ │ ├── Toaster.vue │ │ │ ├── index.ts │ │ │ └── use-toast.ts │ │ ├── composables/ │ │ │ ├── useAuth.ts │ │ │ ├── useDashboard.ts │ │ │ ├── useLogs.ts │ │ │ ├── useMobileNav.ts │ │ │ ├── useResults.ts │ │ │ ├── useSettings.ts │ │ │ ├── useTaskGenerationJob.ts │ │ │ ├── useTasks.ts │ │ │ └── useWebSocket.ts │ │ ├── i18n/ │ │ │ ├── index.ts │ │ │ └── messages/ │ │ │ ├── en-US-extra.ts │ │ │ ├── en-US.ts │ │ │ ├── zh-CN-extra.ts │ │ │ └── zh-CN.ts │ │ ├── layouts/ │ │ │ └── MainLayout.vue │ │ ├── lib/ │ │ │ ├── http.ts │ │ │ ├── taskFormQuery.ts │ │ │ ├── taskSchedule.ts │ │ │ └── utils.ts │ │ ├── main.ts │ │ ├── router/ │ │ │ └── index.ts │ │ ├── services/ │ │ │ └── websocket.ts │ │ ├── style.css │ │ ├── types/ │ │ │ ├── dashboard.d.ts │ │ │ ├── result.d.ts │ │ │ └── task.d.ts │ │ └── views/ │ │ ├── AccountsView.vue │ │ ├── DashboardView.vue │ │ ├── LoginView.vue │ │ ├── LogsView.vue │ │ ├── ResultsView.vue │ │ ├── SettingsView.vue │ │ └── TasksView.vue │ ├── tailwind.config.cjs │ ├── tsconfig.app.json │ ├── 
tsconfig.json │ ├── tsconfig.node.json │ └── vite.config.ts └── xianyu-login-state-privacy.html ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ __pycache__ *.pyc .env .venv venv .idea .claude .serena .pytest_cache logs/ jsonl/ web-ui/node_modules web-ui/dist dist/ .git .DS_Store task_images/ images/ archive/ tests/ data/ price_history/ *.md !README.md ================================================ FILE: .github/workflows/claude.yml ================================================ name: Claude Code on: issue_comment: types: [created] pull_request_review_comment: types: [created] issues: types: [opened, assigned] pull_request_review: types: [submitted] jobs: claude: if: | (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) || (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) || (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) || (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) runs-on: ubuntu-latest permissions: contents: read pull-requests: read issues: read id-token: write steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 1 - name: Prepare Environment run: | curl -fsSL https://bun.sh/install | bash mkdir -p $HOME/.claude-code-router cat << 'EOF' > $HOME/.claude-code-router/config.json { "log": true, "NON_INTERACTIVE_MODE": true, "OPENAI_API_KEY": "${{ secrets.OPENAI_API_KEY }}", "OPENAI_BASE_URL": "https://api-inference.modelscope.cn/v1", "OPENAI_MODEL": "MiniMax/MiniMax-M2.5" } EOF shell: bash - name: Start Claude Code Router run: | nohup ~/.bun/bin/bunx @musistudio/claude-code-router@1.0.8 start & shell: bash - name: Run Claude Code id: 
claude uses: anthropics/claude-code-action@beta env: ANTHROPIC_BASE_URL: http://localhost:3456 with: anthropic_api_key: "Any-string-is-ok" ================================================ FILE: .github/workflows/docker-image.yml ================================================ name: Docker Image CI on merge to master on: workflow_dispatch: pull_request: types: [closed] branches: ["master"] env: IMAGE_NAME: ai-goofish BASE_IMAGE_NAME: ai-goofish-base jobs: build-base: if: | github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && github.event.pull_request.merged == true) runs-on: ubuntu-latest permissions: contents: read packages: write outputs: base_image: ${{ steps.prepare.outputs.base_image }} steps: - name: Checkout code uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up QEMU uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Prepare image tags id: prepare env: REPO_OWNER: ${{ github.repository_owner }} BASE_IMAGE_NAME: ${{ env.BASE_IMAGE_NAME }} run: | set -euo pipefail owner_lower=$(echo "$REPO_OWNER" | tr '[:upper:]' '[:lower:]') base_image="ghcr.io/${owner_lower}/${BASE_IMAGE_NAME}:latest" base_tags=( "${base_image}" "ghcr.io/${owner_lower}/${BASE_IMAGE_NAME}:${GITHUB_SHA}" ) { echo "base_image=${base_image}" echo 'base_tags<<EOF' printf '%s\n' "${base_tags[@]}" echo 'EOF' } >> "$GITHUB_OUTPUT" - name: Log in to GitHub Container Registry uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Build and push base Docker image uses: docker/build-push-action@v5 with: context: . 
file: ./Dockerfile.base platforms: linux/amd64,linux/arm64 pull: true push: true tags: ${{ steps.prepare.outputs.base_tags }} cache-from: type=gha,scope=ai-goofish-base-docker cache-to: type=gha,scope=ai-goofish-base-docker,mode=max build-and-push: needs: build-base if: ${{ needs.build-base.result == 'success' }} runs-on: ubuntu-latest permissions: contents: read packages: write steps: - name: Checkout code uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up QEMU uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Prepare release tags id: prepare env: REPO_OWNER: ${{ github.repository_owner }} IMAGE_NAME: ${{ env.IMAGE_NAME }} run: | set -euo pipefail owner_lower=$(echo "$REPO_OWNER" | tr '[:upper:]' '[:lower:]') app_tags=( "ghcr.io/${owner_lower}/${IMAGE_NAME}:latest" "ghcr.io/${owner_lower}/${IMAGE_NAME}:${GITHUB_SHA}" ) { echo 'app_tags<<EOF' printf '%s\n' "${app_tags[@]}" echo 'EOF' } >> "$GITHUB_OUTPUT" - name: Log in to GitHub Container Registry uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Build and push multi-arch Docker images uses: docker/build-push-action@v5 with: context: . 
file: ./Dockerfile.release platforms: linux/amd64,linux/arm64 pull: true push: true build-args: | BASE_IMAGE=${{ needs.build-base.outputs.base_image }} tags: ${{ steps.prepare.outputs.app_tags }} cache-from: type=gha,scope=ai-goofish-release-docker cache-to: type=gha,scope=ai-goofish-release-docker,mode=max ================================================ FILE: .gitignore ================================================ .idea *.iml xianyu_state.json .env .aider* images/ logs/ jsonl/ __pycache__/ src/__pycache__/ dist/ state/ config.json prompts/ .serena/ price_history/ data/ ================================================ FILE: AGENTS.md ================================================ # Repository Guidelines ## 项目结构与模块组织 - 后端位于 `src/`,入口 `src/app.py`,API 路由在 `src/api/routes/`,服务层在 `src/services/`,领域模型在 `src/domain/`,基础设施在 `src/infrastructure/`。 - 前端在 `web-ui/`(Vue 3 + Vite),视图放于 `web-ui/src/views/`,组件在 `web-ui/src/components/`,构建产物会复制到根目录 `dist/`。 - 测试位于 `tests/`,命名遵循 `test_*.py` 或 `tests/*/test_*.py`。 - 运行数据与资源:`prompts/`、`jsonl/`、`logs/`、`images/`、`static/`、`state/`,配置文件 `config.json` 与 `.env` 位于仓库根目录。 ## 构建、测试与本地开发 - 后端开发:`python -m src.app` 或 `uvicorn src.app:app --host 0.0.0.0 --port 8000 --reload`。 - 爬虫任务:`python spider_v2.py --task-name "MacBook Air M1" --debug-limit 3`(可用 `--config` 指定自定义配置)。 - 前端开发:`cd web-ui && npm install && npm run dev`;构建:`cd web-ui && npm run build`(产物复制到根目录 `dist/`)。 - 一键本地启动:`bash start.sh`(自动安装依赖、前端构建并启动后端)。 - Docker:`docker compose up --build -d`,查看日志 `docker compose logs -f app`,停止 `docker compose down`。 ## 编码风格与命名约定 - 保持分层:API → services → domain → infrastructure,避免跨层耦合,模块保持精简。 - Python 测试函数命名为 `test_*`,文件与路径遵循上述测试目录规范。 - 使用描述性、任务导向的命名(如爬虫任务名、配置键),与业务含义对应。 ## 架构与运行时 - 后端使用 FastAPI 提供 API 与静态资源,爬虫与 AI 推理在独立任务进程中协作,前后端通过 HTTP/Web UI 交互。 - 任务运行会在 `jsonl/` 写入结果、在 `logs/` 留存运行日志、在 `images/` 下载图片,前端监控页面依赖这些数据。 - 默认监听 8000 端口,前端构建后静态文件可由后端或 Docker 镜像直接提供。 ## 测试指南 - 测试框架:`pytest`(默认同步测试,无需 `pytest-asyncio`)。 - 
运行全部测试:`pytest`;覆盖率:`pytest --cov=src` 或 `coverage run -m pytest`;定向测试:`pytest tests/unit/test_utils.py::test_safe_get`。 - 优先覆盖核心服务、爬虫管道的异常分支与重试逻辑,避免回归。 - PR 前请运行相关测试,新增逻辑补充针对性用例。 ## 提交与 PR 规范 - Commit 采用类 Conventional Commits:`feat(...)`、`fix(...)`、`refactor(...)`、`chore(...)`、`docs(...)` 等。 - PR 需说明变更范围与影响模块;UI 变更在 `web-ui/` 提供截图;关联相关 Issue;提及配置或迁移步骤。 ## 安全与配置提示 - 复制 `.env.example` 为 `.env`,设置必填项 `OPENAI_API_KEY`、`OPENAI_BASE_URL`、`OPENAI_MODEL_NAME` 等。 - 不要提交真实凭据或 cookies(如 `state.json`);Playwright 需本地浏览器,Docker 镜像已预装 Chromium。 - Web 认证默认 `admin/admin123`,生产环境务必修改,推荐启用 HTTPS 并限制访问来源。 ================================================ FILE: CLAUDE.md ================================================ # CLAUDE.md This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. ## 项目概述 基于 Playwright + AI 的闲鱼智能监控机器人。FastAPI 后端 + Vue 3 前端,支持多任务并发监控、多模态 AI 商品分析、多渠道通知推送。 ## 核心架构 ``` API层 (src/api/routes/) ↓ 服务层 (src/services/) ↓ 领域层 (src/domain/) ↓ 基础设施层 (src/infrastructure/) ``` 关键入口: - `src/app.py` - FastAPI 应用主入口 - `spider_v2.py` - 爬虫 CLI 入口 - `src/scraper.py` - Playwright 爬虫核心逻辑 服务层: - `TaskService` - 任务 CRUD - `ProcessService` - 爬虫子进程管理 - `SchedulerService` - APScheduler 定时调度 - `AIAnalysisService` - 多模态 AI 分析 - `NotificationService` - 多渠道通知(ntfy/Bark/企业微信/Telegram/Webhook) 前端 (`web-ui/`):Vue 3 + Vite + shadcn-vue + Tailwind CSS ## 开发命令 ```bash # 后端开发 python -m src.app # 或 uvicorn src.app:app --host 0.0.0.0 --port 8000 --reload # 前端开发 cd web-ui && npm install && npm run dev # 前端构建 cd web-ui && npm run build # 一键本地启动(构建前端 + 启动后端) bash start.sh # Docker 部署 docker compose up --build -d ``` ## 爬虫命令 ```bash python spider_v2.py # 运行所有启用任务 python spider_v2.py --task-name "MacBook" # 运行指定任务 python spider_v2.py --debug-limit 3 # 调试模式,限制商品数 python spider_v2.py --config custom.json # 自定义配置文件 ``` ## 测试 ```bash pytest # 运行所有测试 pytest --cov=src # 覆盖率报告 pytest tests/unit/test_utils.py # 运行单个测试文件 pytest tests/unit/test_utils.py::test_safe_get # 
运行单个测试函数 ``` 测试规范:文件 `tests/**/test_*.py`,函数 `test_*` ## 配置 环境变量 (`.env`): - AI 模型:`OPENAI_API_KEY`, `OPENAI_BASE_URL`, `OPENAI_MODEL_NAME` - 通知:`NTFY_TOPIC_URL`, `BARK_URL`, `WX_BOT_URL`, `TELEGRAM_BOT_TOKEN`, `TELEGRAM_CHAT_ID` - 爬虫:`RUN_HEADLESS`, `LOGIN_IS_EDGE` - Web 认证:`WEB_USERNAME`, `WEB_PASSWORD` - 端口:`SERVER_PORT` 任务配置 (`config.json`):定义监控任务(关键词、价格范围、cron 表达式、AI prompt 文件等) ## 数据流 1. Web UI / config.json 创建任务 2. SchedulerService 按 cron 触发或手动启动 3. ProcessService 启动 spider_v2.py 子进程 4. scraper.py 使用 Playwright 抓取商品 5. AIAnalysisService 调用多模态模型分析 6. NotificationService 推送符合条件的商品 7. 结果存储:`jsonl/`(数据)、`images/`(图片)、`logs/`(日志) ## 注意事项 - AI 模型必须支持图片上传(多模态) - Docker 部署需通过 Web UI 手动更新登录状态(`state.json`) - 遇到滑动验证码时设置 `RUN_HEADLESS=false` 手动处理 - 生产环境务必修改默认 Web 认证密码 ================================================ FILE: DISCLAIMER.md ================================================ # 免责声明 / Disclaimer ## 中文版本 本项目是一个开源软件,仅供学习和研究目的使用。使用者在使用本软件时,必须遵守所在国家/地区的所有相关法律法规。 项目作者及贡献者明确声明: 1. 本项目仅用于技术学习和研究目的,不得用于任何违法或不道德的活动。 2. 使用者对本软件的使用行为承担全部责任,包括但不限于任何修改、分发或商业应用。 3. 项目作者及贡献者不对因使用本软件而导致的任何直接、间接、附带或特殊的损害或损失承担责任,即使已被告知可能发生此类损害。 4. 如果您的使用行为违反了所在司法管辖区的法律,请立即停止使用并删除本软件。 5. 本项目按"现状"提供,不提供任何形式的担保,包括但不限于适销性、特定用途适用性和非侵权性担保。 本项目采用 MIT 许可证发布。根据该许可证,您可以自由使用、复制、修改、分发本软件,但必须保留原始版权声明和本免责声明。 项目作者保留随时更改本免责声明的权利,恕不另行通知。使用本软件即表示您同意受本免责声明条款的约束。 ## English Version This project is open source software provided for learning and research purposes only. Users must comply with all relevant laws and regulations in their jurisdiction when using this software. The project owner and contributors explicitly state: 1. This project is for technical learning and research purposes only and must not be used for any illegal or unethical activities. 2. Users assume full responsibility for their use of the software, including but not limited to any modifications, distributions, or commercial applications. 3. 
The project owner and contributors are not liable for any direct, indirect, incidental, or special damages or losses resulting from the use of this software, even if advised of the possibility of such damages. 4. If your use violates the laws of your jurisdiction, please stop using and delete this software immediately. 5. This project is provided "as is" without warranty of any kind, either express or implied, including but not limited to the warranties of merchantability, fitness for a particular purpose, or non-infringement. This project is released under the MIT License. Under this license, you are free to use, copy, modify, and distribute this software, but you must retain the original copyright notice and this disclaimer. The project owner reserves the right to change this disclaimer at any time without notice. Your use of the software indicates your acceptance of the terms of this disclaimer. ================================================ FILE: Dockerfile ================================================ # Stage 1: Build the Vue application FROM node:22-alpine AS frontend-builder WORKDIR /web-ui COPY web-ui/package*.json ./ RUN npm ci COPY web-ui/ . RUN npm run build # Stage 2: Build the python environment with dependencies FROM python:3.11-slim-bookworm AS builder # 设置环境变量以防止交互式提示 ENV DEBIAN_FRONTEND=noninteractive \ VIRTUAL_ENV=/opt/venv \ PATH="/opt/venv/bin:$PATH" # 创建虚拟环境并安装 Python 运行时依赖 RUN python3 -m venv $VIRTUAL_ENV COPY requirements-runtime.txt . 
RUN pip install --no-cache-dir -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements-runtime.txt # Stage 3: Create the final, lean image FROM python:3.11-slim-bookworm WORKDIR /app ENV DEBIAN_FRONTEND=noninteractive \ VIRTUAL_ENV=/opt/venv \ PATH="/opt/venv/bin:$PATH" \ PYTHONUNBUFFERED=1 \ RUNNING_IN_DOCKER=true \ PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \ TZ=Asia/Shanghai COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} RUN apt-get update \ && apt-get install -y --no-install-recommends \ tzdata \ tini \ libzbar0 \ && playwright install --with-deps --no-shell chromium \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* COPY --from=frontend-builder /dist /app/dist COPY src /app/src COPY spider_v2.py /app/spider_v2.py COPY prompts /app/prompts COPY static /app/static COPY config.json.example /app/config.json.example RUN mkdir -p /app/data /app/state /app/logs /app/images /app/jsonl /app/price_history EXPOSE 8000 USER root ENTRYPOINT ["tini", "--"] CMD ["python", "-m", "src.app"] ================================================ FILE: Dockerfile.base ================================================ # syntax=docker/dockerfile:1.7 FROM python:3.11-slim-bookworm WORKDIR /app ENV DEBIAN_FRONTEND=noninteractive \ VIRTUAL_ENV=/opt/venv \ PATH="/opt/venv/bin:$PATH" \ PYTHONUNBUFFERED=1 \ RUNNING_IN_DOCKER=true \ PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \ TZ=Asia/Shanghai COPY requirements-runtime.txt /tmp/requirements-runtime.txt RUN python3 -m venv "$VIRTUAL_ENV" RUN --mount=type=cache,target=/root/.cache/pip \ pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -r /tmp/requirements-runtime.txt RUN apt-get update \ && apt-get install -y --no-install-recommends \ tzdata \ tini \ libzbar0 \ && playwright install --with-deps --no-shell chromium \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* /tmp/requirements-runtime.txt RUN mkdir -p /app/data /app/state /app/logs /app/images /app/jsonl /app/price_history ENTRYPOINT ["tini", "--"] 
================================================ FILE: Dockerfile.release ================================================ # syntax=docker/dockerfile:1.7 ARG BASE_IMAGE=ghcr.io/usagi-org/ai-goofish-base:latest FROM node:22-alpine AS frontend-builder WORKDIR /web-ui COPY web-ui/package*.json ./ RUN --mount=type=cache,target=/root/.npm npm ci COPY web-ui/ . RUN npm run build FROM ${BASE_IMAGE} WORKDIR /app COPY --from=frontend-builder /dist /app/dist COPY src /app/src COPY spider_v2.py /app/spider_v2.py COPY prompts /app/prompts COPY static /app/static COPY config.json.example /app/config.json.example EXPOSE 8000 CMD ["python", "-m", "src.app"] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2025 dingyufei615 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ # 闲鱼智能监控机器人 [English README](README_EN.md) 基于 Playwright 和 AI 的闲鱼多任务实时监控工具,提供完整的 Web 管理界面。 ## 核心特性 - **Web 可视化管理**: 任务管理、账号管理、AI 标准编辑、运行日志、结果浏览 - **AI 驱动**: 自然语言创建任务,多模态模型深度分析商品 - **多任务并发**: 独立配置关键词、价格、筛选条件和 AI Prompt - **高级筛选**: 包邮、新发布时间范围、省/市/区三级区域筛选 - **即时通知**: 支持 ntfy.sh、企业微信、Bark、Telegram、Webhook - **定时调度**: Cron 表达式配置周期性任务 - **账号与代理轮换**: 多账号管理、任务绑定账号、代理池轮换与失败重试 - **Docker 部署**: 一键容器化部署 ## 截图 ![监控概览](static/img.png) ![任务管理](static/img_1.png) ![结果查看](static/img_2.png) ![通知推送](static/img_3.png) ## 🐳 Docker 部署(推荐) ```bash git clone https://github.com/Usagi-org/ai-goofish-monitor && cd ai-goofish-monitor cp .env.example .env vim .env # 填写相关配置项 docker compose up -d docker compose logs -f app docker compose down ``` 如果镜像无法访问或下载速度慢,可尝试使用加速: ```bash docker pull ghcr.nju.edu.cn/usagi-org/ai-goofish:latest docker tag ghcr.nju.edu.cn/usagi-org/ai-goofish:latest ghcr.io/usagi-org/ai-goofish:latest docker compose up -d ``` - 默认 Web UI 地址:`http://127.0.0.1:8000` - Docker 镜像已内置 Chromium,无需宿主机额外安装浏览器。 - 官方镜像地址:`ghcr.io/usagi-org/ai-goofish:latest` - 更新镜像:`docker compose pull && docker compose up -d` - 如果你修改了 `.env` 中的 `SERVER_PORT`,请同步更新 `docker-compose.yaml` 里的端口映射。 - `docker-compose.yaml` 默认会把 SQLite 主库挂载到 `./data:/app/data`,数据库文件默认为 `data/app.sqlite3` - 目前默认持久化这些目录: - `data/` SQLite 主存储(任务、结果、价格历史) - `state/` 登录状态 cookie 文件 - `prompts/` 任务提示词 - `logs/` 运行日志 - `images/` 商品图片与任务临时图片目录 - `config.json`、`jsonl/`、`price_history/` 首次升级到 SQLite 时用于兼容导入的旧数据源 ### 数据存储与迁移 - 当前在线主存储为 SQLite,默认路径 `data/app.sqlite3` - 可通过环境变量 `APP_DATABASE_FILE` 自定义数据库路径;Docker 默认设置为 `/app/data/app.sqlite3` - 应用启动时会自动建库建表,并尝试从旧的 `config.json`、`jsonl/`、`price_history/` 导入一次历史数据 - `state/`、`prompts/`、`logs/`、`images/` 仍然是文件系统目录,不在 SQLite 中 - 商品图片会临时落到 `images/task_images_/`,任务结束后默认会清理 - 首次升级完成并确认 `data/app.sqlite3` 中数据正确后,可视部署方式决定是否继续保留旧的 `config.json`、`jsonl/`、`price_history/` 挂载 ### 最少配置 | 
变量 | 说明 | 必填 | |------|------|------| | `OPENAI_API_KEY` | AI 模型 API Key | 是 | | `OPENAI_BASE_URL` | OpenAI 兼容接口地址 | 是 | | `OPENAI_MODEL_NAME` | 支持图片输入的模型名称 | 是 | | `WEB_USERNAME` / `WEB_PASSWORD` | Web UI 登录账号密码,默认 `admin/admin123` | 否 | 其余配置见下方“配置说明”。 ### 第一次使用 1. 打开默认 Web UI `http://127.0.0.1:8000` 并登录。 2. 进入“闲鱼账号管理”,使用 [Chrome 扩展](https://chromewebstore.google.com/detail/xianyu-login-state-extrac/eidlpfjiodpigmfcahkmlenhppfklcoa) 导出并粘贴闲鱼登录态 JSON。 3. 登录态文件会保存到 `state/` 目录,例如 `state/acc_1.json`。 4. 回到“任务管理”,创建任务并绑定账号后即可运行。 ### 创建第一个任务 - `AI判断`:填写“详细需求”,提交后会弹出独立进度弹窗,后台异步生成分析标准。 - `关键词判断`:填写关键词规则,任务会直接创建,不经过 AI 生成流程。 - `区域筛选`:已改为省 / 市 / 区三级选择器,数据基于闲鱼页面抓取快照内置。 ## 用户使用说明
<details> <summary>点击展开 Web UI 功能说明</summary> ### 任务管理 - 支持 AI 创建、关键词规则、价格范围、新发布范围、区域筛选、账号绑定、定时规则。 - AI 任务创建是后台 job 流程,提交后会打开单独的进度弹窗。 - 区域筛选会显著缩小结果集,默认留空。 ### 账号管理 - 支持导入、更新、删除闲鱼账号登录态。 - 每个任务可指定账号,也可不绑定并交给系统自动选择。 ### 结果查看与运行日志 - 结果页和导出功能现在从 SQLite 查询,不再直接扫描 `jsonl` 文件。 - 日志页按任务展示运行过程,便于排查登录态失效、风控和 AI 调用问题。 ### 系统设置 - 可查看系统状态、编辑 Prompt、调整代理与轮换相关配置。 </details>
## 开发者开发 ### 环境要求 - Python 3.10+ - Node.js + npm(本地验证 `Node v20.18.3` 可完成前端构建) - Playwright CLI 与 Chromium,首次运行前建议执行 `python3 -m pip install playwright && python3 -m playwright install chromium` - Chrome / Edge 浏览器(Linux 环境也可使用 Chromium;`start.sh` 会先检查浏览器是否存在) ```bash git clone https://github.com/Usagi-org/ai-goofish-monitor cd ai-goofish-monitor cp .env.example .env ``` ### 一键启动 ```bash chmod +x start.sh ./start.sh ``` `start.sh` 会先检查 Playwright CLI 和浏览器前置条件;在前置条件满足后自动安装项目依赖、构建前端、复制构建产物并启动后端。 ### 手动启动 ```bash # 后端 python -m src.app # 或 uvicorn src.app:app --host 0.0.0.0 --port 8000 --reload # 前端 cd web-ui npm install npm run dev ``` - FastAPI 启动时会自动初始化 SQLite,并在首次启动时尝试导入旧的 `config.json/jsonl/price_history` - `spider_v2.py` 默认从 SQLite 读取任务;只有显式传入 `--config <path>` 时才会走 JSON 配置兼容模式 - 默认数据库路径为 `data/app.sqlite3` - Vite 开发服务器会将 `/api`、`/auth`、`/ws` 代理到 `http://127.0.0.1:8000`。 - `npm run build` 先生成 `web-ui/dist/`,`start.sh` 再复制到仓库根目录 `dist/`。 - FastAPI 负责提供根目录 `dist/index.html` 和 `dist/assets/`。 - `./start.sh` 默认输出访问地址 `http://localhost:8000` 和 API 文档 `http://localhost:8000/docs`。 ### 测试与校验 ```bash PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 pytest cd web-ui && npm run build ``` ### 任务创建 API
<details> <summary>点击展开 API 行为说明</summary> - `POST /api/tasks/generate` - `decision_mode=ai`:返回 `202` 和 `job`,需要继续轮询进度。 - `decision_mode=keyword`:直接返回已创建任务。 - `GET /api/tasks/generate-jobs/{job_id}`:查询 AI 任务生成进度。 - `POST /auth/status`:校验 Web UI 登录凭据。 </details>
## 配置说明
<details> <summary>点击展开常用配置项</summary> ### AI 与运行时 - `OPENAI_API_KEY` / `OPENAI_BASE_URL` / `OPENAI_MODEL_NAME`:AI 模型接入必填项。 - `PROXY_URL`:为 AI 请求单独指定 HTTP/SOCKS5 代理。 - `RUN_HEADLESS`:是否以无头模式运行爬虫;Docker 中应保持 `true`。 - `SERVER_PORT`:后端监听端口,默认 `8000`。 - `LOGIN_IS_EDGE`:本地环境可切换为 Edge 内核;Docker 镜像未内置 Edge,容器内会固定使用 Chromium。 - `PCURL_TO_MOBILE`:是否将 PC 商品链接转换为移动端链接。 ### 通知 - `NTFY_TOPIC_URL` - `GOTIFY_URL` / `GOTIFY_TOKEN` - `BARK_URL` - `WX_BOT_URL` - `TELEGRAM_BOT_TOKEN` / `TELEGRAM_CHAT_ID` / `TELEGRAM_API_BASE_URL` - `WEBHOOK_*` ### 代理轮换与失败保护 - `PROXY_ROTATION_ENABLED` - `PROXY_ROTATION_MODE` - `PROXY_POOL` - `PROXY_ROTATION_RETRY_LIMIT` - `PROXY_BLACKLIST_TTL` - `TASK_FAILURE_THRESHOLD` - `TASK_FAILURE_PAUSE_SECONDS` - `TASK_FAILURE_GUARD_PATH` 完整示例见 `.env.example`。 </details>
## Web 界面认证
<details> <summary>点击展开认证说明</summary> - Web UI 当前使用登录页收集账号密码,并通过 `POST /auth/status` 校验。 - 登录成功后,前端会在浏览器本地保存登录状态,用于路由守卫和 WebSocket 初始化。 - 默认账号密码为 `admin/admin123`,生产环境请务必修改。 </details>
## 🚀 工作流程 下图描述了单个监控任务从启动到完成的核心处理逻辑。主服务运行于 `src.app`,按用户操作或定时调度启动一个或多个任务进程。 ```mermaid graph TD A[启动监控任务] --> B[选择账号/代理配置]; B --> C[任务: 搜索商品]; C --> D{发现新商品?}; D -- 是 --> E[抓取商品详情 & 卖家信息]; E --> F[下载商品图片]; F --> G[调用AI进行分析]; G --> H{AI是否推荐?}; H -- 是 --> I[发送通知]; H -- 否 --> J[保存记录到 SQLite]; I --> J; D -- 否 --> K[翻页/等待]; K --> C; J --> C; C --> L{触发风控/异常?}; L -- 是 --> M[账号/代理轮换并重试]; M --> C; ``` ## 常见问题
<details> <summary>点击展开常见问题</summary> ### AI 任务创建为什么不是立即完成? AI 模式会先生成分析标准,再创建任务。现在该流程已改为后台 job,提交后会显示独立进度弹窗,避免表单长时间卡住。 ### 区域筛选为什么默认建议留空? 区域筛选会显著减少搜索结果,适合明确只看某个区域的场景。若你先验证整体市场,建议先不填。 ### 本地页面打开后提示前端构建产物不存在? 说明根目录 `dist/` 缺失。可直接执行 `./start.sh`,或先在 `web-ui/` 里执行 `npm run build`,再确认构建产物已复制到仓库根目录。 ### `./start.sh` 为什么提示缺少 Playwright 或浏览器? 这是脚本的前置检查。请先安装 Playwright CLI 与 Chromium,并确保系统中可用 Chrome / Edge(Linux 环境也可用 Chromium),然后重新执行 `./start.sh`。 </details>
## 致谢
<details> <summary>点击展开致谢内容</summary> 本项目在开发过程中参考了以下优秀项目,特此感谢: - [superboyyy/xianyu_spider](https://github.com/superboyyy/xianyu_spider) 以及感谢LinuxDo相关人员的脚本贡献 - [@jooooody](https://linux.do/u/jooooody/summary) 以及感谢 [LinuxDo](https://linux.do/) 社区。 以及感谢 ClaudeCode/Gemini/Codex 等模型工具,解放双手 体验Vibe Coding的快乐。 </details>
## 注意事项
<details> <summary>点击展开注意事项详情</summary> - 请遵守闲鱼的用户协议和robots.txt规则,不要进行过于频繁的请求,以免对服务器造成负担或导致账号被限制。 - 本项目仅供学习和技术研究使用,请勿用于非法用途。 - 本项目采用 [MIT 许可证](LICENSE) 发布,按"现状"提供,不提供任何形式的担保。 - 项目作者及贡献者不对因使用本软件而导致的任何直接、间接、附带或特殊的损害或损失承担责任。 - 如需了解更多详细信息,请查看 [免责声明](DISCLAIMER.md) 文件。 </details>
## Star History [![Star History Chart](https://api.star-history.com/svg?repos=Usagi-org/ai-goofish-monitor&type=Date)](https://www.star-history.com/#Usagi-org/ai-goofish-monitor&Date) ![Alt](https://repobeats.axiom.co/api/embed/b40d8a112271b4bddabadd8fe2635be3c1aa28a3.svg "Repobeats analytics image") ================================================ FILE: README_EN.md ================================================ # Xianyu Intelligent Monitor Bot [中文说明](README.md) A Playwright and AI-powered multi-task real-time monitoring tool for Xianyu (闲鱼), featuring a complete web management interface. ## Core Features - **Web Visual Management**: Task management, account management, AI criteria editing, run logs, results browsing - **AI-Driven**: Natural language task creation, multimodal model for in-depth product analysis - **Multi-Task Concurrency**: Independent configuration for keywords, prices, filters, and AI prompts - **SQLite as Primary Storage**: Tasks, results, and price history are persisted in one embedded database instead of repeatedly scanning `jsonl` - **Advanced Filtering**: Free shipping, new listing time range, province/city/district filtering - **Instant Notifications**: Supports ntfy.sh, WeChat Work (企业微信), Bark, Telegram, Webhook - **Scheduled Tasks**: Cron expression configuration for periodic tasks - **Account & Proxy Rotation**: Multi-account management, task-account binding, proxy pool rotation with failure retry - **Docker Deployment**: One-click containerized deployment ## Screenshots ![Monitoring Overview](static/img.png) ![Task Management](static/img_1.png) ![Result Viewer](static/img_2.png) ![Notification Settings](static/img_3.png) ## Quick Start ### Requirements - Python 3.10+ - Node.js + npm (`Node v20.18.3` has been verified to complete the frontend build) - Playwright CLI and Chromium. 
Before the first local run, install them with `python3 -m pip install playwright && python3 -m playwright install chromium` - Chrome or Edge on desktop systems. On Linux, Chromium also works. `start.sh` checks this prerequisite before continuing ```bash git clone https://github.com/Usagi-org/ai-goofish-monitor cd ai-goofish-monitor cp .env.example .env ``` ### Minimum Configuration | Variable | Description | Required | |----------|-------------|----------| | `OPENAI_API_KEY` | AI model API key | Yes | | `OPENAI_BASE_URL` | OpenAI-compatible API base URL | Yes | | `OPENAI_MODEL_NAME` | Model name with image input support | Yes | | `WEB_USERNAME` / `WEB_PASSWORD` | Web UI login credentials, default `admin/admin123` | No | See "Configuration" below for the rest. ### Start Locally ```bash chmod +x start.sh ./start.sh ``` `start.sh` first validates the Playwright CLI and browser prerequisites. Once they are available, it installs project dependencies, builds the frontend, copies the artifacts, and starts the backend. ### First-Time Setup 1. Open the default Web UI at `http://127.0.0.1:8000` and sign in. 2. Go to "Xianyu Account Management" and use the [Chrome Extension](https://chromewebstore.google.com/detail/xianyu-login-state-extrac/eidlpfjiodpigmfcahkmlenhppfklcoa) to export and paste the Xianyu login-state JSON. 3. Login-state files are stored in `state/`, for example `state/acc_1.json`. 4. Go back to "Task Management", create a task, bind an account if needed, and run it. ### Create Your First Task - `AI mode`: fill in the requirement description. Submission opens a separate progress dialog while the criteria are generated asynchronously. - `Keyword mode`: provide keyword rules and the task is created immediately. - `Region filter`: now uses a province / city / district selector backed by an embedded Xianyu page snapshot instead of manual text input. 
## 🐳 Docker Deployment (Recommended) ```bash git clone https://github.com/Usagi-org/ai-goofish-monitor && cd ai-goofish-monitor cp .env.example .env vim .env # fill in the required values docker compose up -d docker compose logs -f app docker compose down ``` - Default Web UI: `http://127.0.0.1:8000` - The published Docker image already includes Chromium, so no extra browser install is required on the host. - Update image: `docker compose pull && docker compose up -d` - If you change `SERVER_PORT` in `.env`, update the `ports` mapping in `docker-compose.yaml` as well. - `docker-compose.yaml` now mounts the primary SQLite database directory as `./data:/app/data`, with the default database file at `data/app.sqlite3` - These paths are persisted by default: - `data/` for the SQLite primary store (tasks, results, price history) - `state/` for login-state cookie files - `prompts/` for task prompt files - `logs/` for runtime logs - `images/` for downloaded product images and per-task temporary image folders - `config.json`, `jsonl/`, and `price_history/` as legacy sources for the first SQLite migration ### Storage and Migration - SQLite is now the online primary storage, with the default path `data/app.sqlite3` - You can override the database path with `APP_DATABASE_FILE`; Docker sets it to `/app/data/app.sqlite3` - On startup, the app initializes the schema and tries to import existing data once from legacy `config.json`, `jsonl/`, and `price_history/` - `state/`, `prompts/`, `logs/`, and `images/` remain filesystem-based and are not stored in SQLite - Product images are temporarily downloaded to `images/task_images_/` and are normally cleaned up when the task finishes - After the first upgrade and after verifying the database contents in `data/app.sqlite3`, you can decide whether to keep the legacy `config.json`, `jsonl/`, and `price_history/` mounts ## User Guide
Click to expand Web UI usage notes ### Task Management - Supports AI creation, keyword rules, price range, new listing filters, region filters, account binding, and cron scheduling. - AI task creation runs as a background job and shows a dedicated progress dialog after submission. - Region filtering can greatly reduce results, so leaving it empty is the safer default. ### Account Management - Import, update, and delete Xianyu login states. - Each task can bind a specific account or leave account selection to the system. ### Results and Logs - The results page and export endpoints now query SQLite instead of directly scanning `jsonl` files. - The logs page is the first place to inspect login-state expiry, anti-bot issues, or AI call failures. ### System Settings - View system status, edit prompts, and adjust proxy / rotation-related settings.
## Developer Guide ### Local Development ```bash # backend python -m src.app # or uvicorn src.app:app --host 0.0.0.0 --port 8000 --reload # frontend cd web-ui npm install npm run dev ``` - FastAPI initializes SQLite on startup and performs the one-time legacy import from `config.json/jsonl/price_history` when needed - `spider_v2.py` now loads tasks from SQLite by default; JSON config is only used when `--config <path>` is passed explicitly - The default local database path is `data/app.sqlite3` - The Vite dev server proxies `/api`, `/auth`, and `/ws` to `http://127.0.0.1:8000`. - `npm run build` writes `web-ui/dist/`, and `start.sh` copies it to the repository root `dist/`. - FastAPI serves `dist/index.html` and `dist/assets/` from the repository root. - `./start.sh` prints the default app URL `http://localhost:8000` and API docs URL `http://localhost:8000/docs`. ### Validation ```bash PYTEST_DISABLE_PLUGIN_AUTOLOAD=1 pytest cd web-ui && npm run build ``` ### Task Creation API
Click to expand API behavior - `POST /api/tasks/generate` - `decision_mode=ai`: returns `202` with a `job`; the client should poll for progress. - `decision_mode=keyword`: returns the created task directly. - `GET /api/tasks/generate-jobs/{job_id}`: fetch AI task-generation progress. - `POST /auth/status`: validate Web UI credentials.
## Configuration
Click to expand common configuration items ### AI and Runtime - `OPENAI_API_KEY` / `OPENAI_BASE_URL` / `OPENAI_MODEL_NAME`: required AI model settings. - `PROXY_URL`: dedicated HTTP/SOCKS5 proxy for AI requests. - `RUN_HEADLESS`: whether the scraper runs headless; keep it `true` in Docker. - `SERVER_PORT`: backend port, default `8000`. - `LOGIN_IS_EDGE`: use Edge instead of Chrome locally; Docker images do not bundle Edge and always run with Chromium. - `PCURL_TO_MOBILE`: convert desktop item URLs to mobile URLs. ### Notifications - `NTFY_TOPIC_URL` - `GOTIFY_URL` / `GOTIFY_TOKEN` - `BARK_URL` - `WX_BOT_URL` - `TELEGRAM_BOT_TOKEN` / `TELEGRAM_CHAT_ID` / `TELEGRAM_API_BASE_URL` - `WEBHOOK_*` ### Proxy Rotation and Failure Guard - `PROXY_ROTATION_ENABLED` - `PROXY_ROTATION_MODE` - `PROXY_POOL` - `PROXY_ROTATION_RETRY_LIMIT` - `PROXY_BLACKLIST_TTL` - `TASK_FAILURE_THRESHOLD` - `TASK_FAILURE_PAUSE_SECONDS` - `TASK_FAILURE_GUARD_PATH` See `.env.example` for the full list.
## Web Authentication
Click to expand authentication notes - The Web UI uses a login page and validates credentials through `POST /auth/status`. - After login, the frontend stores local auth state for route guards and WebSocket startup. - The default credentials are `admin/admin123`; change them in production.
## 🚀 Workflow The diagram below shows the core processing flow of a monitoring task. The main service runs in `src.app` and launches one or more task processes based on user actions or schedule triggers. ```mermaid graph TD A[Start Monitoring Task] --> B[Select Account/Proxy Configuration]; B --> C[Task: Search Products]; C --> D{Found New Products?}; D -- Yes --> E[Scrape Product Details & Seller Info]; E --> F[Download Product Images]; F --> G[Call AI for Analysis]; G --> H{AI Recommended?}; H -- Yes --> I[Send Notification]; H -- No --> J[Save Record to SQLite]; I --> J; D -- No --> K[Next Page/Wait]; K --> C; J --> C; C --> L{Risk Control/Exception?}; L -- Yes --> M[Account/Proxy Rotation and Retry]; M --> C; ``` ## FAQ
Click to expand FAQ ### Why does AI task creation take time? In AI mode, the system generates analysis criteria before the task itself is created. This now runs as a background job with a separate progress dialog instead of blocking the task form. ### Why is the region filter optional by default? Region filtering can sharply reduce result volume. Leave it empty if you want a broader market scan first. ### Why does the app say the frontend build artifacts are missing? It means the repository root `dist/` directory is missing. Run `./start.sh`, or build the frontend in `web-ui/` and make sure the artifacts are copied to the root `dist/`. ### Why does `./start.sh` complain about missing Playwright or a browser? The script performs a prerequisite check before installing project dependencies. Install the Playwright CLI and Chromium first, then make sure Chrome, Edge, or Chromium is available on the system and rerun `./start.sh`.
## Acknowledgments
Click to expand acknowledgments This project referenced the following excellent projects during development. Special thanks to: - [superboyyy/xianyu_spider](https://github.com/superboyyy/xianyu_spider) Also thanks to LinuxDo contributors for script contributions: - [@jooooody](https://linux.do/u/jooooody/summary) And thanks to the [LinuxDo](https://linux.do/) community. Also thanks to ClaudeCode/Gemini/Codex and other model tools for freeing our hands and letting us experience the joy of Vibe Coding.
## Notices
Click to expand notice details - Please comply with Xianyu's user agreement and robots.txt rules. Do not send requests too frequently, so that you avoid burdening the server or getting your account restricted. - This project is for learning and technical research purposes only. Do not use it for illegal purposes. - This project is released under the [MIT License](LICENSE), provided "as is", without any form of warranty. - The project authors and contributors are not responsible for any direct, indirect, incidental, or special damages or losses caused by the use of this software. - For more details, please refer to the [Disclaimer](DISCLAIMER.md) file.
## Star History [![Star History Chart](https://api.star-history.com/svg?repos=Usagi-org/ai-goofish-monitor&type=Date)](https://www.star-history.com/#Usagi-org/ai-goofish-monitor&Date) ================================================ FILE: chrome-extension/README.md ================================================ # Xianyu Login State Extractor Chrome Extension This Chrome extension helps extract complete login state information from Xianyu (Goofish) for use with the monitoring robot. It also records browser environment hints and request headers to better mimic a real session. ## Installation 1. Open Chrome and navigate to `chrome://extensions` 2. Enable "Developer mode" in the top right corner 3. Click "Load unpacked" and select the `chrome-extension` directory 4. The extension icon should now appear in your toolbar ## Usage 1. Navigate to [https://www.goofish.com](https://www.goofish.com) 2. Log in to your account 3. Click the extension icon in the toolbar 4. Click "Extract Login State" (collects cookies + environment + headers; unneeded or oversized fields are filtered out automatically) 5. The complete JSON will be displayed - click "Copy to Clipboard" 6. Save the JSON text to `xianyu_state.json` (or a file name of your choice) ## Features - Extracts all cookies including HttpOnly cookies - Captures browser environment (UA, locale, timezone, screen size, device memory, hardware concurrency) - Captures observed request headers for the current tab - Captures localStorage/sessionStorage snapshot for the current domain (oversized or unneeded fields are dropped automatically) - Outputs a single JSON payload, ready for the monitoring robot - Copy to clipboard with real-time status feedback ## How It Works The extension uses the `chrome.cookies` API to access all cookies for the `.goofish.com` domain, including those with the HttpOnly flag set. This bypasses the normal JavaScript security restrictions that prevent access to these cookies. 
================================================ FILE: chrome-extension/background.js ================================================ // Service worker for capturing browser environment, headers, storage, and cookies const GOOFISH_HOST_PATTERN = "*://*.goofish.com/*"; const MAX_STORAGE_ENTRY_LENGTH = 4096; // bytes limit to drop oversized values function mapSameSiteValue(chromeSameSite) { if (chromeSameSite === undefined || chromeSameSite === null) return "Lax"; const sameSiteMap = { no_restriction: "None", lax: "Lax", strict: "Strict", unspecified: "Lax", }; return sameSiteMap[chromeSameSite] || "Lax"; } function headersArrayToObject(headers = []) { const result = {}; headers.forEach((item) => { if (item && item.name) { result[item.name] = item.value || ""; } }); return result; } async function getActiveGoofishTab() { const tabs = await chrome.tabs.query({ active: true, currentWindow: true }); const tab = tabs?.[0]; if (!tab || !tab.id || !tab.url) { throw new Error("未找到活动标签页"); } if (!tab.url.includes("goofish.com")) { throw new Error("请先打开 goofish.com 页面"); } return tab; } async function capturePageData(tabId) { const [result] = await chrome.scripting.executeScript({ target: { tabId }, func: () => { const safeEntries = (storage) => { try { const obj = {}; for (let i = 0; i < storage.length; i += 1) { const key = storage.key(i); if (key !== null) { obj[key] = storage.getItem(key); } } return obj; } catch (e) { return {}; } }; const intl = (() => { try { return Intl.DateTimeFormat().resolvedOptions(); } catch (e) { return {}; } })(); const uaData = (() => { try { return navigator.userAgentData ? 
navigator.userAgentData.toJSON() : null; } catch (e) { return null; } })(); return { page: { pageUrl: location.href, referrer: document.referrer || null, visibilityState: document.visibilityState, }, env: { navigator: { userAgent: navigator.userAgent, platform: navigator.platform, vendor: navigator.vendor, language: navigator.language, languages: navigator.languages, hardwareConcurrency: navigator.hardwareConcurrency, deviceMemory: navigator.deviceMemory, webdriver: navigator.webdriver, doNotTrack: navigator.doNotTrack, maxTouchPoints: navigator.maxTouchPoints, userAgentData: uaData, }, screen: { width: screen.width, height: screen.height, availWidth: screen.availWidth, availHeight: screen.availHeight, colorDepth: screen.colorDepth, pixelDepth: screen.pixelDepth, devicePixelRatio: window.devicePixelRatio, }, intl, }, storage: { local: safeEntries(localStorage), session: safeEntries(sessionStorage), }, }; }, }); if (!result || !result.result) { throw new Error("无法获取页面信息"); } return result.result; } function filterEnvData(env = {}) { const nav = env.navigator || {}; const screen = env.screen || {}; const intl = env.intl || {}; return { navigator: { userAgent: nav.userAgent, platform: nav.platform, language: nav.language, languages: nav.languages, hardwareConcurrency: nav.hardwareConcurrency, deviceMemory: nav.deviceMemory, maxTouchPoints: nav.maxTouchPoints, webdriver: nav.webdriver, doNotTrack: nav.doNotTrack, userAgentData: nav.userAgentData || null, }, screen: { width: screen.width, height: screen.height, devicePixelRatio: screen.devicePixelRatio, colorDepth: screen.colorDepth, }, intl: { timeZone: intl.timeZone, locale: intl.locale, }, }; } function pruneStorageEntries(entries = {}) { const data = {}; const dropped = []; Object.entries(entries).forEach(([key, value]) => { const str = value == null ? 
"" : String(value); if (str.length <= MAX_STORAGE_ENTRY_LENGTH) { data[key] = value; } else { dropped.push(key); } }); return { data, dropped }; } function filterHeaders(rawHeaders = {}) { const allowList = [ "user-agent", "accept", "accept-language", "accept-encoding", "referer", "sec-ch-ua", "sec-ch-ua-mobile", "sec-ch-ua-platform", "sec-fetch-site", "sec-fetch-mode", "sec-fetch-dest", "sec-fetch-user", "origin", "cache-control", "pragma", "upgrade-insecure-requests", "content-type", ]; const normalized = {}; Object.entries(rawHeaders).forEach(([k, v]) => { const lower = k.toLowerCase(); if (allowList.includes(lower)) { normalized[k] = v; } }); return normalized; } async function captureHeaders(tabId) { return new Promise((resolve) => { let resolved = false; const cleanup = (headers) => { if (resolved) return; resolved = true; chrome.webRequest.onBeforeSendHeaders.removeListener(listener); clearTimeout(timer); resolve(headersArrayToObject(headers || [])); }; const listener = (details) => { if (details.tabId !== tabId) return; cleanup(details.requestHeaders || []); }; const extraInfo = ["requestHeaders"]; // extraHeaders 提供更完整的 header 视图,在新版本 Chrome 需要显式声明 extraInfo.push("extraHeaders"); chrome.webRequest.onBeforeSendHeaders.addListener( listener, { urls: [GOOFISH_HOST_PATTERN], tabId }, extraInfo, ); const timer = setTimeout(() => cleanup(null), 2000); // 触发一次轻量请求以获取真实请求头 chrome.scripting .executeScript({ target: { tabId }, func: () => { try { fetch(`${window.location.origin}/__codex_probe?ts=${Date.now()}`, { credentials: "include", cache: "no-store", redirect: "follow", }).catch(() => {}); } catch (e) { /* ignore */ } }, }) .catch(() => { // 如果注入失败,继续等待可能已有的请求 }); }); } async function captureCookies(url) { const cookies = await chrome.cookies.getAll({ url }); return cookies.map((cookie) => ({ name: cookie.name, value: cookie.value, domain: cookie.domain, path: cookie.path, expires: cookie.expirationDate, httpOnly: cookie.httpOnly, secure: cookie.secure, 
sameSite: mapSameSiteValue(cookie.sameSite), })); } async function buildSnapshot() { const tab = await getActiveGoofishTab(); const pageData = await capturePageData(tab.id); const headers = await captureHeaders(tab.id); const cookies = await captureCookies(new URL(tab.url).origin); const filteredEnv = filterEnvData(pageData.env); const localPruned = pruneStorageEntries(pageData.storage.local); const sessionPruned = pruneStorageEntries(pageData.storage.session); const filteredStorage = { local: localPruned.data, session: sessionPruned.data, }; return { capturedAt: new Date().toISOString(), pageUrl: tab.url, page: pageData.page, env: filteredEnv, storage: filteredStorage, meta: { droppedStorageKeys: { local: localPruned.dropped, session: sessionPruned.dropped, }, }, headers: filterHeaders(headers), cookies, }; } chrome.runtime.onMessage.addListener((message, sender, sendResponse) => { if (!message || message.type !== "captureSnapshot") { return false; } buildSnapshot() .then((data) => sendResponse({ ok: true, data })) .catch((error) => sendResponse({ ok: false, error: error.message })); return true; }); ================================================ FILE: chrome-extension/manifest.json ================================================ { "manifest_version": 3, "name": "Xianyu Login State Extractor", "version": "1.1", "description": "Extract login state and browser environment for Xianyu monitoring robot", "permissions": [ "activeTab", "cookies", "scripting", "storage", "tabs", "webRequest" ], "host_permissions": [ "*://*.goofish.com/*" ], "background": { "service_worker": "background.js" }, "action": { "default_popup": "popup.html", "default_title": "Extract Xianyu Login State" } } ================================================ FILE: chrome-extension/popup.html ================================================

Xianyu Login State Extractor

================================================ FILE: chrome-extension/popup.js ================================================ // Popup script for the Chrome extension document.addEventListener('DOMContentLoaded', function() { const extractBtn = document.getElementById('extractBtn'); const copyBtn = document.getElementById('copyBtn'); const stateOutput = document.getElementById('stateOutput'); const statusDiv = document.getElementById('status'); let latestSnapshot = null; function setLoading(isLoading) { extractBtn.disabled = isLoading; extractBtn.textContent = isLoading ? '采集中,请稍候...' : '1.点击获取环境+登录状态'; } function updateStatus(message, isSuccess = false) { statusDiv.textContent = message; statusDiv.className = 'status ' + (isSuccess ? 'success' : 'error'); setTimeout(() => { statusDiv.textContent = ''; statusDiv.className = 'status'; }, 4000); } function renderSnapshot(snapshot) { latestSnapshot = snapshot; stateOutput.value = JSON.stringify(snapshot, null, 2); } async function captureSnapshot() { setLoading(true); updateStatus('正在采集浏览器环境与登录状态...'); stateOutput.value = ''; chrome.runtime.sendMessage({ type: 'captureSnapshot' }, (response) => { setLoading(false); if (chrome.runtime.lastError) { updateStatus('通信失败: ' + chrome.runtime.lastError.message); return; } if (!response || !response.ok) { updateStatus('采集失败: ' + (response?.error || '未知错误')); return; } renderSnapshot(response.data); updateStatus('采集完成,已生成JSON', true); }); } function copySnapshot() { if (!stateOutput.value) { updateStatus('没有可复制的数据'); return; } navigator.clipboard.writeText(stateOutput.value) .then(() => updateStatus('已复制到剪贴板', true)) .catch(err => updateStatus('复制失败: ' + err)); } extractBtn.addEventListener('click', captureSnapshot); copyBtn.addEventListener('click', copySnapshot); }); ================================================ FILE: config.json.example ================================================ [ { "task_name": "苹果watch S10", "enabled": true, "keyword": "苹果watch S10", 
"description": "九成新,充电线包装盒齐全,无明显磕碰,卖家信用优秀", "max_pages": 10, "personal_only": true, "min_price": "8000", "max_price": "2000", "cron": null, "ai_prompt_base_file": "prompts/base_prompt.txt", "ai_prompt_criteria_file": "prompts/苹果watch_s10_criteria.txt", "account_state_file": "state/acc1.json", "free_shipping": true, "new_publish_option": "14天内", "region": "江苏/南京/全南京", "is_running": false } ] ================================================ FILE: desktop_launcher.py ================================================ """ 桌面启动入口 使用 PyInstaller 打包后作为单一可执行文件的入口,自动启动 FastAPI 服务并打开浏览器。 """ import os import sys import time import webbrowser from pathlib import Path import uvicorn # PyInstaller 运行时资源目录:_MEIPASS;未打包时则为当前文件所在目录 BASE_DIR = Path(getattr(sys, "_MEIPASS", Path(__file__).resolve().parent)) def _prepare_environment() -> None: """确保工作目录和模块路径正确""" os.chdir(BASE_DIR) if str(BASE_DIR) not in sys.path: sys.path.insert(0, str(BASE_DIR)) def run_app() -> None: """启动 FastAPI 应用并自动打开浏览器""" _prepare_environment() from src.app import app from src.infrastructure.config.settings import settings # 先尝试打开浏览器,稍等服务起来 url = f"http://127.0.0.1:{settings.server_port}" webbrowser.open(url) time.sleep(0.5) uvicorn.run( app, host="127.0.0.1", port=settings.server_port, log_level="info", reload=False, ) if __name__ == "__main__": run_app() ================================================ FILE: docker-compose.dev.yaml ================================================ services: app: build: . 
container_name: ai-goofish-monitor-app init: true ports: - "8000:8000" env_file: - .env volumes: - .:/app - /app/dist restart: unless-stopped ================================================ FILE: docker-compose.yaml ================================================ services: app: image: ${APP_IMAGE:-ghcr.io/usagi-org/ai-goofish:latest} container_name: ai-goofish-monitor-app pull_policy: always init: true ports: - "8000:8000" env_file: - .env environment: APP_DATABASE_FILE: /app/data/app.sqlite3 volumes: - ./.env:/app/.env - ./data:/app/data - ./state:/app/state - ./config.json:/app/config.json - ./prompts:/app/prompts - ./jsonl:/app/jsonl - ./logs:/app/logs - ./images:/app/images - ./price_history:/app/price_history restart: unless-stopped ================================================ FILE: pyproject.toml ================================================ [tool.pytest.ini_options] addopts = "-v --tb=short" testpaths = ["tests"] python_files = ["test_*.py"] python_classes = ["Test*"] python_functions = ["test_*"] markers = [ "live: real traffic smoke tests that require real credentials and external services", "live_slow: slower optional live smoke tests such as AI task generation", ] [tool.coverage.run] source = ["src"] [tool.coverage.report] exclude_lines = [ "pragma: no cover", "def __repr__", "raise AssertionError", "raise NotImplementedError", "if __name__ == .__main__.:", ] ================================================ FILE: requirements-runtime.txt ================================================ python-dotenv playwright requests openai fastapi uvicorn[standard] pydantic-settings jinja2 aiofiles python-socks apscheduler httpx[socks] Pillow pyzbar qrcode ================================================ FILE: requirements.txt ================================================ python-dotenv playwright requests openai fastapi uvicorn[standard] pydantic-settings jinja2 aiofiles python-socks apscheduler httpx[socks] Pillow pyzbar qrcode pytest pytest-asyncio 
coverage ================================================ FILE: run_live_smoke.sh ================================================ #!/bin/bash set -euo pipefail RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' NC='\033[0m' SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "$SCRIPT_DIR" PYTHON_CMD="${PYTHON_CMD:-python3}" MARK_EXPRESSION="" DRY_RUN=false WITH_GENERATION=true PYTEST_ARGS=() TASK_CREATE_TEST="tests/integration/test_api_tasks.py::test_create_list_update_delete_task" TEST_TARGETS=( "$TASK_CREATE_TEST" "tests/live" ) usage() { cat <<'EOF' 用法: ./run_live_smoke.sh [选项] [-- pytest额外参数] 选项: --keyword <关键词> 覆盖 LIVE_TEST_KEYWORD --account-file <路径> 覆盖 LIVE_TEST_ACCOUNT_STATE_FILE --task-name <名称> 覆盖 LIVE_TEST_TASK_NAME --timeout <秒> 覆盖 LIVE_TIMEOUT_SECONDS --min-items <数量> 覆盖 LIVE_EXPECT_MIN_ITEMS --debug-limit <数量> 覆盖 LIVE_TEST_DEBUG_LIMIT(默认 1,仅分析前 N 个新商品) --with-generation 显式开启 live_slow(默认已开启) --without-generation 关闭 live_slow,只执行主 smoke --dry-run 只打印配置和将执行的命令,不真正运行 --help 显示帮助 示例: ./run_live_smoke.sh ./run_live_smoke.sh --keyword "MacBook Air M1" --min-items 2 ./run_live_smoke.sh --without-generation ./run_live_smoke.sh -- -k live_real_traffic 说明: 0. 默认先执行任务创建 CRUD 集成测试,再执行 tests/live 真实流量 smoke 1. 脚本会自动设置 RUN_LIVE_TESTS=1 2. 若未设置 LIVE_TEST_ACCOUNT_STATE_FILE,会自动尝试使用 state/ 下第一个 *.json 3. 默认使用 PYTEST_DISABLE_PLUGIN_AUTOLOAD=1,避免本机第三方 pytest 插件干扰 4. 
默认设置 LIVE_TEST_DEBUG_LIMIT=1,使冒烟测试只抓取并分析 1 个新商品 EOF } require_value() { local option="$1" local value="${2:-}" if [[ -z "$value" ]]; then echo -e "${RED}错误:${NC} ${option} 需要一个值" exit 1 fi } resolve_default_account_file() { local first_match="" while IFS= read -r file; do first_match="$file" break done < <(find "$SCRIPT_DIR/state" -maxdepth 1 -type f -name '*.json' | sort) printf '%s' "$first_match" } while [[ $# -gt 0 ]]; do case "$1" in --keyword) require_value "$1" "${2:-}" export LIVE_TEST_KEYWORD="$2" shift 2 ;; --account-file) require_value "$1" "${2:-}" export LIVE_TEST_ACCOUNT_STATE_FILE="$2" shift 2 ;; --task-name) require_value "$1" "${2:-}" export LIVE_TEST_TASK_NAME="$2" shift 2 ;; --timeout) require_value "$1" "${2:-}" export LIVE_TIMEOUT_SECONDS="$2" shift 2 ;; --min-items) require_value "$1" "${2:-}" export LIVE_EXPECT_MIN_ITEMS="$2" shift 2 ;; --debug-limit) require_value "$1" "${2:-}" export LIVE_TEST_DEBUG_LIMIT="$2" shift 2 ;; --with-generation) WITH_GENERATION=true shift ;; --without-generation) WITH_GENERATION=false shift ;; --dry-run) DRY_RUN=true shift ;; --help|-h) usage exit 0 ;; --) shift PYTEST_ARGS+=("$@") break ;; *) PYTEST_ARGS+=("$1") shift ;; esac done if ! command -v "$PYTHON_CMD" >/dev/null 2>&1; then echo -e "${RED}错误:${NC} 未找到 Python 命令: $PYTHON_CMD" exit 1 fi if ! "$PYTHON_CMD" -m pytest --version >/dev/null 2>&1; then echo -e "${RED}错误:${NC} 当前 Python 环境缺少 pytest" exit 1 fi if ! 
"$PYTHON_CMD" -m playwright --version >/dev/null 2>&1; then echo -e "${RED}错误:${NC} 当前 Python 环境缺少 Playwright,请先安装浏览器依赖" exit 1 fi export RUN_LIVE_TESTS=1 export PYTEST_DISABLE_PLUGIN_AUTOLOAD="${PYTEST_DISABLE_PLUGIN_AUTOLOAD:-1}" export LIVE_TEST_KEYWORD="${LIVE_TEST_KEYWORD:-MacBook Pro M2}" export LIVE_TEST_TASK_NAME="${LIVE_TEST_TASK_NAME:-Live Smoke Task}" export LIVE_EXPECT_MIN_ITEMS="${LIVE_EXPECT_MIN_ITEMS:-1}" export LIVE_TEST_DEBUG_LIMIT="${LIVE_TEST_DEBUG_LIMIT:-1}" export LIVE_TIMEOUT_SECONDS="${LIVE_TIMEOUT_SECONDS:-180}" if [[ -z "${LIVE_TEST_ACCOUNT_STATE_FILE:-}" ]]; then DEFAULT_ACCOUNT_FILE="$(resolve_default_account_file)" if [[ -n "$DEFAULT_ACCOUNT_FILE" ]]; then export LIVE_TEST_ACCOUNT_STATE_FILE="$DEFAULT_ACCOUNT_FILE" fi fi if [[ -z "${LIVE_TEST_ACCOUNT_STATE_FILE:-}" ]]; then echo -e "${RED}错误:${NC} 未找到 live 登录态文件。请使用 --account-file 指定,或在 state/ 下放置 *.json" exit 1 fi if [[ ! -f "${LIVE_TEST_ACCOUNT_STATE_FILE}" ]]; then echo -e "${RED}错误:${NC} 登录态文件不存在: ${LIVE_TEST_ACCOUNT_STATE_FILE}" exit 1 fi if [[ "$WITH_GENERATION" == "true" ]]; then export LIVE_ENABLE_TASK_GENERATION=1 MARK_EXPRESSION="" else export LIVE_ENABLE_TASK_GENERATION=0 MARK_EXPRESSION="not live_slow" fi echo -e "${GREEN}========================================${NC}" echo -e "${GREEN}闲鱼真实流量 Live Smoke 一键测试${NC}" echo -e "${GREEN}========================================${NC}" echo -e "${YELLOW}Python:${NC} $PYTHON_CMD" echo -e "${YELLOW}关键词:${NC} ${LIVE_TEST_KEYWORD}" echo -e "${YELLOW}任务名:${NC} ${LIVE_TEST_TASK_NAME}" echo -e "${YELLOW}登录态:${NC} ${LIVE_TEST_ACCOUNT_STATE_FILE}" echo -e "${YELLOW}最少结果数:${NC} ${LIVE_EXPECT_MIN_ITEMS}" echo -e "${YELLOW}抓取/分析商品上限:${NC} ${LIVE_TEST_DEBUG_LIMIT}" echo -e "${YELLOW}超时(秒):${NC} ${LIVE_TIMEOUT_SECONDS}" echo -e "${YELLOW}任务生成慢用例:${NC} ${LIVE_ENABLE_TASK_GENERATION}" echo -e "${YELLOW}任务创建前置用例:${NC} ${TASK_CREATE_TEST}" if [[ -n "$MARK_EXPRESSION" ]]; then echo -e "${YELLOW}Pytest Marker:${NC} ${MARK_EXPRESSION}" else echo -e 
"${YELLOW}Pytest Marker:${NC} " fi echo -e "${YELLOW}禁用插件自动加载:${NC} ${PYTEST_DISABLE_PLUGIN_AUTOLOAD}" CMD=( "$PYTHON_CMD" -m pytest "${TEST_TARGETS[@]}" -v ) if [[ -n "$MARK_EXPRESSION" ]]; then CMD+=(-m "$MARK_EXPRESSION") fi if [[ ${#PYTEST_ARGS[@]} -gt 0 ]]; then CMD+=("${PYTEST_ARGS[@]}") fi echo -e "${YELLOW}执行命令:${NC} ${CMD[*]}" if [[ "$DRY_RUN" == "true" ]]; then echo -e "${GREEN}Dry run 完成,未实际执行测试。${NC}" exit 0 fi "${CMD[@]}" ================================================ FILE: spider_v2.py ================================================ import asyncio import sys import os import argparse import json import signal import contextlib import re from src.config import STATE_FILE from src.infrastructure.persistence.sqlite_task_repository import SqliteTaskRepository from src.scraper import scrape_xianyu async def main(): parser = argparse.ArgumentParser( description="闲鱼商品监控脚本,支持多任务配置和实时AI分析。", epilog=""" 使用示例: # 运行 config.json 中定义的所有任务 python spider_v2.py # 只运行名为 "Sony A7M4" 的任务 (通常由调度器调用) python spider_v2.py --task-name "Sony A7M4" # 调试模式: 运行所有任务,但每个任务只处理前3个新发现的商品 python spider_v2.py --debug-limit 3 """, formatter_class=argparse.RawDescriptionHelpFormatter ) parser.add_argument("--debug-limit", type=int, default=0, help="调试模式:每个任务仅处理前 N 个新商品(0 表示无限制)") parser.add_argument("--config", type=str, help="指定任务配置文件路径(传入时优先读取 JSON)") parser.add_argument("--task-name", type=str, help="只运行指定名称的单个任务 (用于定时任务调度)") args = parser.parse_args() if args.config: if not os.path.exists(args.config): sys.exit(f"错误: 配置文件 '{args.config}' 不存在。") try: with open(args.config, 'r', encoding='utf-8') as f: tasks_config = json.load(f) except (json.JSONDecodeError, IOError) as e: sys.exit(f"错误: 读取或解析配置文件 '{args.config}' 失败: {e}") else: repository = SqliteTaskRepository() tasks = await repository.find_all() tasks_config = [task.dict() for task in tasks] def normalize_keywords(value): if value is None: return [] if isinstance(value, str): raw_values = re.split(r"[\n,]+", value) elif 
isinstance(value, (list, tuple, set)): raw_values = list(value) else: raw_values = [value] normalized = [] seen = set() for item in raw_values: text = str(item).strip() if not text: continue key = text.lower() if key in seen: continue seen.add(key) normalized.append(text) return normalized def flatten_legacy_groups(groups): merged = [] for group in groups or []: if isinstance(group, dict): merged.extend(normalize_keywords(group.get("include_keywords"))) return normalize_keywords(merged) def has_bound_account(tasks: list) -> bool: for task in tasks: account = task.get("account_state_file") if isinstance(account, str) and account.strip(): return True return False def has_any_state_file() -> bool: state_dir = os.getenv("ACCOUNT_STATE_DIR", "state").strip().strip('"').strip("'") if os.path.isdir(state_dir): for name in os.listdir(state_dir): if name.endswith(".json"): return True return False if not os.path.exists(STATE_FILE) and not has_bound_account(tasks_config) and not has_any_state_file(): sys.exit( f"错误: 未找到登录状态文件。请在 state/ 中添加账号或配置 account_state_file。" ) # 读取所有prompt文件内容(关键词模式不需要加载prompt) for task in tasks_config: decision_mode = str(task.get("decision_mode", "ai")).strip().lower() if decision_mode not in {"ai", "keyword"}: decision_mode = "ai" task["decision_mode"] = decision_mode keyword_rules = task.get("keyword_rules") if keyword_rules is None and task.get("keyword_rule_groups") is not None: task["keyword_rules"] = flatten_legacy_groups(task.get("keyword_rule_groups") or []) else: task["keyword_rules"] = normalize_keywords(keyword_rules) if decision_mode == "keyword": task["ai_prompt_text"] = "" continue if task.get("enabled", False) and task.get("ai_prompt_base_file") and task.get("ai_prompt_criteria_file"): try: with open(task["ai_prompt_base_file"], 'r', encoding='utf-8') as f_base: base_prompt = f_base.read() with open(task["ai_prompt_criteria_file"], 'r', encoding='utf-8') as f_criteria: criteria_text = f_criteria.read() # 动态组合成最终的Prompt 
task['ai_prompt_text'] = base_prompt.replace("{{CRITERIA_SECTION}}", criteria_text) # 验证生成的prompt是否有效 if len(task['ai_prompt_text']) < 100: print(f"警告: 任务 '{task['task_name']}' 生成的prompt过短 ({len(task['ai_prompt_text'])} 字符),可能存在问题。") elif "{{CRITERIA_SECTION}}" in task['ai_prompt_text']: print(f"警告: 任务 '{task['task_name']}' 的prompt中仍包含占位符,替换可能失败。") else: print(f"✅ 任务 '{task['task_name']}' 的prompt生成成功,长度: {len(task['ai_prompt_text'])} 字符") except FileNotFoundError as e: print(f"警告: 任务 '{task['task_name']}' 的prompt文件缺失: {e},该任务的AI分析将被跳过。") task['ai_prompt_text'] = "" except Exception as e: print(f"错误: 任务 '{task['task_name']}' 处理prompt文件时发生异常: {e},该任务的AI分析将被跳过。") task['ai_prompt_text'] = "" elif task.get("enabled", False) and task.get("ai_prompt_file"): try: with open(task["ai_prompt_file"], 'r', encoding='utf-8') as f: task['ai_prompt_text'] = f.read() print(f"✅ 任务 '{task['task_name']}' 的prompt文件读取成功,长度: {len(task['ai_prompt_text'])} 字符") except FileNotFoundError: print(f"警告: 任务 '{task['task_name']}' 的prompt文件 '{task['ai_prompt_file']}' 未找到,该任务的AI分析将被跳过。") task['ai_prompt_text'] = "" except Exception as e: print(f"错误: 任务 '{task['task_name']}' 读取prompt文件时发生异常: {e},该任务的AI分析将被跳过。") task['ai_prompt_text'] = "" print("\n--- 开始执行监控任务 ---") if args.debug_limit > 0: print(f"** 调试模式已激活,每个任务最多处理 {args.debug_limit} 个新商品 **") if args.task_name: print(f"** 定时任务模式:只执行任务 '{args.task_name}' **") print("--------------------") active_task_configs = [] if args.task_name: # 如果指定了任务名称,只查找该任务 task_found = next((task for task in tasks_config if task.get('task_name') == args.task_name), None) if task_found: if task_found.get("enabled", False): active_task_configs.append(task_found) else: print(f"任务 '{args.task_name}' 已被禁用,跳过执行。") else: print(f"错误:在配置文件中未找到名为 '{args.task_name}' 的任务。") return else: # 否则,按原计划加载所有启用的任务 active_task_configs = [task for task in tasks_config if task.get("enabled", False)] if not active_task_configs: print("没有需要执行的任务,程序退出。") return # 为每个启用的任务创建一个异步执行协程 stop_event = 
asyncio.Event() loop = asyncio.get_running_loop() for sig in (signal.SIGTERM, signal.SIGINT): try: loop.add_signal_handler(sig, stop_event.set) except NotImplementedError: pass tasks = [] for task_conf in active_task_configs: print(f"-> 任务 '{task_conf['task_name']}' 已加入执行队列。") tasks.append(asyncio.create_task(scrape_xianyu(task_config=task_conf, debug_limit=args.debug_limit))) async def _shutdown_watcher(): await stop_event.wait() print("\n收到终止信号,正在优雅退出,取消所有爬虫任务...") for t in tasks: if not t.done(): t.cancel() shutdown_task = asyncio.create_task(_shutdown_watcher()) try: # 并发执行所有任务 results = await asyncio.gather(*tasks, return_exceptions=True) finally: shutdown_task.cancel() with contextlib.suppress(asyncio.CancelledError): await shutdown_task print("\n--- 所有任务执行完毕 ---") for i, result in enumerate(results): task_name = active_task_configs[i]['task_name'] if isinstance(result, Exception): print(f"任务 '{task_name}' 因异常而终止: {result}") else: print(f"任务 '{task_name}' 正常结束,本次运行共处理了 {result} 个新商品。") if __name__ == "__main__": asyncio.run(main()) ================================================ FILE: src/__init__.py ================================================ # This file makes src a Python package ================================================ FILE: src/ai_handler.py ================================================ import asyncio import base64 import json import os import re import sys import shutil import traceback from datetime import datetime, timedelta from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl import requests # 设置标准输出编码为UTF-8,解决Windows控制台编码问题 if sys.platform.startswith('win'): import codecs sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach()) sys.stderr = codecs.getwriter('utf-8')(sys.stderr.detach()) from src.config import ( AI_DEBUG_MODE, IMAGE_DOWNLOAD_HEADERS, IMAGE_SAVE_DIR, TASK_IMAGE_DIR_PREFIX, MODEL_NAME, ENABLE_RESPONSE_FORMAT, client, ) from src.ai_message_builder import ( build_analysis_text_prompt, 
def safe_print(text):
    """Print *text*, degrading gracefully when stdout cannot encode it.

    On ``UnicodeEncodeError`` the text is printed again with every non-ASCII
    character dropped. ``str()`` is applied first so that non-string objects
    (for example exceptions) get the same ASCII fallback instead of failing on
    a missing ``.encode`` attribute.

    Args:
        text: The object to print.
    """
    try:
        print(text)
    except UnicodeEncodeError:
        # The console encoding cannot represent some characters: keep the
        # ASCII part only.
        try:
            print(str(text).encode('ascii', errors='ignore').decode('ascii'))
        except Exception:
            # Last resort; catching Exception (not a bare except) lets
            # KeyboardInterrupt/SystemExit propagate.
            print("[输出包含无法显示的字符]")
@retry_on_failure(retries=2, delay=3)
async def _download_single_image(url, save_path):
    """Download one image to *save_path* without blocking the event loop.

    The whole blocking workflow (HTTP request, streamed body read and file
    write) runs in the default executor. The body is written to a temporary
    ``.part`` file and renamed only on success, so an interrupted download
    never leaves a truncated file at *save_path*.

    Args:
        url: Image URL to fetch.
        save_path: Destination path of the image file.

    Returns:
        *save_path* once the file has been written completely.

    Raises:
        requests.RequestException: On network or HTTP status errors.
        OSError: On file-system errors.
    """

    def _fetch_to_disk():
        tmp_path = f"{save_path}.part"
        try:
            # The context manager releases the connection even when the
            # streamed read fails half-way.
            with requests.get(
                url, headers=IMAGE_DOWNLOAD_HEADERS, timeout=20, stream=True
            ) as response:
                response.raise_for_status()
                with open(tmp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
            os.replace(tmp_path, save_path)
        except Exception:
            # Remove the partial file, then let the retry decorator see the error.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
            raise
        return save_path

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _fetch_to_disk)
def cleanup_ai_logs(logs_dir: str, keep_days: int = 1) -> None:
    """Delete timestamped ``.log`` files in *logs_dir* older than *keep_days*.

    Only files whose name ends with ``.log`` and whose first 15 characters
    parse as ``%Y%m%d_%H%M%S`` are considered. Any error is reported through
    ``safe_print`` instead of being raised.

    Args:
        logs_dir: Directory that holds the AI request logs.
        keep_days: Age limit in days; older files are removed.
    """

    def _stamp_of(name):
        # Timestamp encoded in the file-name prefix, or None when it does not parse.
        try:
            return datetime.strptime(name[:15], "%Y%m%d_%H%M%S")
        except ValueError:
            return None

    try:
        oldest_allowed = datetime.now() - timedelta(days=keep_days)
        for entry in os.listdir(logs_dir):
            if not entry.endswith(".log"):
                continue
            created_at = _stamp_of(entry)
            if created_at is not None and created_at < oldest_allowed:
                os.remove(os.path.join(logs_dir, entry))
    except Exception as e:
        safe_print(f" [日志] 清理AI日志时出错: {e}")
"reason", "risk_tags", "criteria_analysis" ] # 检查顶层字段 for field in required_fields: if field not in parsed_response: safe_print(f" [AI分析] 警告:响应缺少必需字段 '{field}'") return False # 检查criteria_analysis是否为字典且不为空 criteria_analysis = parsed_response.get("criteria_analysis", {}) if not isinstance(criteria_analysis, dict) or not criteria_analysis: safe_print(" [AI分析] 警告:criteria_analysis必须是非空字典") return False # 检查seller_type字段(所有商品都需要) if "seller_type" not in criteria_analysis: safe_print(" [AI分析] 警告:criteria_analysis缺少必需字段 'seller_type'") return False # 检查数据类型 if not isinstance(parsed_response.get("is_recommended"), bool): safe_print(" [AI分析] 警告:is_recommended字段不是布尔类型") return False if not isinstance(parsed_response.get("risk_tags"), list): safe_print(" [AI分析] 警告:risk_tags字段不是列表类型") return False return True @retry_on_failure(retries=3, delay=5) async def send_ntfy_notification(product_data, reason): """兼容旧调用名,内部统一走 NotificationService。""" service = build_notification_service() if not service.clients: safe_print( "警告:未在 .env 文件中配置任何通知服务,跳过通知。" ) return {} results = await service.send_notification(product_data, reason) for channel, result in results.items(): if result["success"]: safe_print(f" -> {channel} 通知发送成功。") continue safe_print(f" -> {channel} 通知发送失败: {result['message']}") return results async def get_ai_analysis(product_data, image_paths=None, prompt_text=""): """将完整的商品JSON数据和所有图片发送给 AI 进行分析(异步)。""" if not client: safe_print(" [AI分析] 错误:AI客户端未初始化,跳过分析。") return None item_info = product_data.get('商品信息', {}) product_id = item_info.get('商品ID', 'N/A') safe_print(f"\n [AI分析] 开始分析商品 #{product_id} (含 {len(image_paths or [])} 张图片)...") safe_print(f" [AI分析] 标题: {item_info.get('商品标题', '无')}") if not prompt_text: safe_print(" [AI分析] 错误:未提供AI分析所需的prompt文本。") return None product_details_json = json.dumps(product_data, ensure_ascii=False, indent=2) system_prompt = prompt_text if AI_DEBUG_MODE: safe_print("\n--- [AI DEBUG] ---") safe_print("--- PRODUCT DATA (JSON) ---") 
safe_print(product_details_json) safe_print("--- PROMPT TEXT (完整内容) ---") safe_print(prompt_text) safe_print("-------------------\n") image_data_urls = [] if image_paths: for path in image_paths: base64_image = encode_image_to_base64(path) if base64_image: image_data_urls.append(f"data:image/jpeg;base64,{base64_image}") combined_text_prompt = build_analysis_text_prompt( product_details_json, system_prompt, include_images=bool(image_data_urls), ) user_content = build_user_message_content(combined_text_prompt, image_data_urls) messages = [{"role": "user", "content": user_content}] # 保存最终传输内容到日志文件 try: # 创建logs文件夹 logs_dir = os.path.join("logs", "ai") os.makedirs(logs_dir, exist_ok=True) cleanup_ai_logs(logs_dir, keep_days=1) # 生成日志文件名(当前时间) current_time = datetime.now().strftime("%Y%m%d_%H%M%S") log_filename = f"{current_time}.log" log_filepath = os.path.join(logs_dir, log_filename) task_name = product_data.get("任务名称") or product_data.get("任务名") or "unknown" log_payload = { "timestamp": current_time, "task_name": task_name, "product_id": product_id, "title": item_info.get("商品标题", "无"), "image_count": len(image_data_urls), } log_content = json.dumps(log_payload, ensure_ascii=False) # 写入日志文件 with open(log_filepath, 'w', encoding='utf-8') as f: f.write(log_content) safe_print(f" [日志] AI分析请求已保存到: {log_filepath}") except Exception as e: safe_print(f" [日志] 保存AI分析日志时出错: {e}") # 增强的AI调用,包含更严格的结构化输出控制和重试机制 max_retries = 4 api_mode = CHAT_COMPLETIONS_API_MODE use_response_format = ENABLE_RESPONSE_FORMAT use_temperature = True for attempt in range(max_retries): try: # 根据重试次数调整参数 current_temperature = 0.1 if attempt == 0 else 0.05 # 重试时使用更低的温度 from src.config import get_ai_request_params request_params = build_ai_request_params( api_mode, model=MODEL_NAME, messages=messages, temperature=current_temperature, max_output_tokens=4000, enable_json_output=use_response_format, ) if not use_temperature: request_params = remove_temperature_param(request_params) request_params = 
get_ai_request_params(**request_params) if AI_DEBUG_MODE: safe_print(f"\n--- [AI DEBUG] 第{attempt + 1}次尝试 REQUEST ---") safe_print( json.dumps( _build_debug_request_summary(api_mode, request_params), ensure_ascii=False, indent=2, ) ) safe_print("-----------------------------------\n") response = await create_ai_response_async( client, api_mode, request_params, ) ai_response_content = extract_ai_response_content(response) if AI_DEBUG_MODE: safe_print(f"\n--- [AI DEBUG] 第{attempt + 1}次尝试 ---") safe_print("--- RAW AI RESPONSE ---") safe_print(ai_response_content) safe_print("---------------------\n") try: parsed_response = parse_ai_response_json(ai_response_content) # 验证响应格式 if validate_ai_response_format(parsed_response): safe_print(f" [AI分析] 第{attempt + 1}次尝试成功,响应格式验证通过") return parsed_response safe_print(f" [AI分析] 第{attempt + 1}次尝试格式验证失败") if attempt < max_retries - 1: safe_print(f" [AI分析] 准备第{attempt + 2}次重试...") continue raise ValueError("AI响应格式缺少必需字段或字段类型不正确。") except json.JSONDecodeError as e: safe_print(f" [AI分析] 第{attempt + 1}次尝试JSON解析失败: {e}") if attempt < max_retries - 1: safe_print(f" [AI分析] 准备第{attempt + 2}次重试...") continue raise e except EmptyAIResponseError as e: safe_print(f" [AI分析] 第{attempt + 1}次尝试返回空响应: {e}") if attempt < max_retries - 1: safe_print(f" [AI分析] 准备第{attempt + 2}次重试...") continue raise e except Exception as e: if ( api_mode == CHAT_COMPLETIONS_API_MODE and is_chat_completions_api_unsupported_error(e) ): api_mode = RESPONSES_API_MODE safe_print( " [AI分析] 当前服务未实现 Chat Completions API,后续重试将自动回退到 Responses API。" ) elif api_mode == RESPONSES_API_MODE and is_responses_api_unsupported_error(e): api_mode = CHAT_COMPLETIONS_API_MODE safe_print( " [AI分析] 当前服务未实现 Responses API,后续重试将自动回退到 Chat Completions API。" ) if use_response_format and is_json_output_unsupported_error(e): use_response_format = False safe_print( " [AI分析] 当前模型不支持结构化 JSON 输出,后续重试将自动禁用该参数。" ) if use_temperature and is_temperature_unsupported_error(e): use_temperature = False 
def build_user_message_content(
    text_prompt: str,
    image_data_urls: List[str],
) -> Union[str, List[Dict[str, object]]]:
    """Build the ``content`` value of the user message.

    Args:
        text_prompt: The textual prompt.
        image_data_urls: Image URLs (data URLs) to attach; may be empty.

    Returns:
        *text_prompt* itself when there are no images; otherwise a list with
        one ``image_url`` part per image followed by a single ``text`` part.
    """
    if image_data_urls:
        parts: List[Dict[str, object]] = []
        for data_url in image_data_urls:
            parts.append({"type": "image_url", "image_url": {"url": data_url}})
        # The text part always comes after the images.
        parts.append({"type": "text", "text": text_prompt})
        return parts
    return text_prompt
def get_process_service() -> ProcessService:
    """Return the process-wide ``ProcessService`` instance.

    Raises:
        RuntimeError: If no instance has been registered yet.
    """
    service = _process_service_instance
    if service is not None:
        return service
    raise RuntimeError("ProcessService 未初始化")
src/api/routes/__init__.py ================================================ ================================================ FILE: src/api/routes/accounts.py ================================================ """ 闲鱼账号管理路由 """ import json import os import re import aiofiles from fastapi import APIRouter, HTTPException from pydantic import BaseModel from typing import List from src.infrastructure.config.env_manager import env_manager router = APIRouter(prefix="/api/accounts", tags=["accounts"]) ACCOUNT_NAME_RE = re.compile(r"^[a-zA-Z0-9_-]{1,50}$") class AccountCreate(BaseModel): name: str content: str class AccountUpdate(BaseModel): content: str def _strip_quotes(value: str) -> str: if not value: return value if value.startswith(("\"", "'")) and value.endswith(("\"", "'")): return value[1:-1] return value def _state_dir() -> str: raw = env_manager.get_value("ACCOUNT_STATE_DIR", "state") or "state" return _strip_quotes(raw.strip()) def _ensure_state_dir(path: str) -> None: os.makedirs(path, exist_ok=True) def _validate_name(name: str) -> str: trimmed = name.strip() if not trimmed or not ACCOUNT_NAME_RE.match(trimmed): raise HTTPException(status_code=400, detail="账号名称只能包含字母、数字、下划线或短横线。") return trimmed def _account_path(name: str) -> str: filename = f"{name}.json" return os.path.join(_state_dir(), filename) def _validate_json(content: str) -> None: try: json.loads(content) except json.JSONDecodeError: raise HTTPException(status_code=400, detail="提供的内容不是有效的JSON格式。") @router.get("", response_model=List[dict]) async def list_accounts(): state_dir = _state_dir() if not os.path.isdir(state_dir): return [] files = [f for f in os.listdir(state_dir) if f.endswith(".json")] accounts = [] for filename in sorted(files): name = filename[:-5] accounts.append({ "name": name, "path": os.path.join(state_dir, filename), }) return accounts @router.get("/{name}", response_model=dict) async def get_account(name: str): account_name = _validate_name(name) path = _account_path(account_name) 
@router.post("", response_model=dict)
async def create_account(data: AccountCreate):
    """Create a new account state file from the submitted JSON content.

    The file is opened in exclusive-create mode (``"x"``), so the existence
    check and the creation are a single atomic step: two concurrent requests
    for the same name cannot overwrite each other.

    Raises:
        HTTPException: 400 for an invalid name or invalid JSON (raised by the
            validators), 409 when the account file already exists.
    """
    account_name = _validate_name(data.name)
    _validate_json(data.content)
    state_dir = _state_dir()
    _ensure_state_dir(state_dir)
    path = _account_path(account_name)
    try:
        async with aiofiles.open(path, "x", encoding="utf-8") as f:
            await f.write(data.content)
    except FileExistsError:
        raise HTTPException(status_code=409, detail="账号已存在") from None
    return {"message": "账号已添加", "name": account_name, "path": path}
@router.delete("", response_model=dict)
async def delete_login_state():
    """Delete the ``xianyu_state.json`` login state file.

    The file is removed directly and a missing file is handled afterwards, so
    a file that disappears between a check and the removal is reported as
    "nothing to delete" instead of a 500 error.

    Raises:
        HTTPException: 500 when the file exists but cannot be removed.
    """
    state_file = "xianyu_state.json"
    try:
        os.remove(state_file)
    except FileNotFoundError:
        return {"message": "登录状态文件不存在,无需删除。"}
    except OSError as e:
        raise HTTPException(status_code=500, detail=f"删除登录状态文件时出错: {e}")
    return {"message": "登录状态文件已成功删除。"}
async def _read_tail_lines(
    log_file_path: str,
    offset_lines: int,
    limit_lines: int,
    chunk_size: int = 8192
) -> Tuple[List[str], bool, int]:
    """Read a page of lines from the end of a log file.

    The file is scanned backwards in chunks of *chunk_size* bytes until more
    lines than requested are buffered or the start of the file is reached.

    Args:
        log_file_path: Path of the log file.
        offset_lines: Number of lines to skip, counted from the end.
        limit_lines: Maximum number of lines to return.
        chunk_size: Bytes read per backward step (values below 1 become 1).

    Returns:
        ``(lines, has_more, file_size)``: the decoded lines in file order,
        whether older content exists before them, and the file size in bytes.
    """
    async with aiofiles.open(log_file_path, 'rb') as f:
        await f.seek(0, os.SEEK_END)
        file_size = await f.tell()
        if file_size == 0 or limit_lines <= 0:
            return [], False, file_size

        offset_lines = max(0, int(offset_lines))
        limit_lines = max(0, int(limit_lines))
        # A non-positive chunk size would never move the read position.
        chunk_size = max(1, int(chunk_size))
        lines_needed = offset_lines + limit_lines

        pos = file_size
        buffer = b""
        lines: List[bytes] = []
        # Keep reading until one line MORE than needed is buffered (or the
        # file start is reached). While pos > 0 the first buffered line may be
        # cut by the chunk boundary; the extra line keeps that fragment out of
        # the selected slice.
        while pos > 0 and len(lines) <= lines_needed:
            read_size = min(chunk_size, pos)
            pos -= read_size
            await f.seek(pos)
            chunk = await f.read(read_size)
            buffer = chunk + buffer
            lines = buffer.splitlines()

        start = max(0, len(lines) - lines_needed)
        end = max(0, len(lines) - offset_lines)
        selected = lines[start:end] if end > start else []
        has_more = pos > 0 or len(lines) > lines_needed
        # Decode only after splitting on bytes so a multi-byte character is
        # never split by a chunk boundary inside a returned line.
        decoded = [line.decode('utf-8', errors='replace') for line in selected]
        return decoded, has_more, file_size
@router.delete("", response_model=dict)
async def clear_logs(
    task_id: Optional[int] = Query(default=None, ge=0),
    task_service: TaskService = Depends(get_task_service),
):
    """Truncate the log file of the given task.

    Returns a message dict describing the outcome. A missing ``task_id``, an
    unknown task and a missing log file are reported as plain messages; a
    failure while truncating yields a 500 ``JSONResponse``.
    """
    if task_id is None:
        return {"message": "未指定任务,无法清空日志。"}

    task = await task_service.get_task(task_id)
    if not task:
        return {"message": "任务不存在或已删除。"}

    log_file_path = resolve_task_log_path(task_id, task.task_name)
    if not os.path.exists(log_file_path):
        return {"message": "日志文件不存在,无需清空。"}

    try:
        # Opening in 'w' mode truncates the file.
        async with aiofiles.open(log_file_path, 'w', encoding='utf-8') as f:
            await f.write("")
        return {"message": "日志已成功清空。"}
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"message": f"清空日志文件时出错: {e}"}
        )
@router.get("/files/{filename:path}")
async def download_result_file(filename: str):
    """Download the given result file as NDJSON.

    Invalid paths and missing files are reported with the existing
    ``{"error": ...}`` payloads, so current clients keep working.

    The ``Content-Disposition`` header carries an ASCII-only ``filename``
    fallback plus a percent-encoded ``filename*`` parameter, so result files
    with non-ASCII names can be downloaded (a raw non-ASCII name cannot be
    sent as an HTTP header value).
    """
    if ".." in filename or filename.startswith("/"):
        return {"error": "非法的文件路径"}
    if not filename.endswith(".jsonl") or not await result_file_exists(filename):
        return {"error": "文件不存在"}

    ascii_name = filename.encode("ascii", "ignore").decode("ascii")
    # Fall back to a generic name when characters were dropped or when the
    # name would break the quoted-string syntax of the header.
    if ascii_name != filename or not ascii_name or '"' in ascii_name or "\\" in ascii_name:
        ascii_name = "results.jsonl"
    encoded_name = quote(filename, safe="")
    return Response(
        content=await build_result_ndjson(filename),
        media_type="application/x-ndjson",
        headers={
            "Content-Disposition": (
                f'attachment; filename="{ascii_name}"; '
                f"filename*=UTF-8''{encoded_name}"
            )
        },
    )
@router.get("/{filename}/insights")
async def get_result_file_insights(filename: str):
    """Return price-history insights for the keyword behind a result file.

    Args:
        filename: Result file name; validated with
            ``validate_result_filename`` before use.

    Returns:
        Whatever ``build_price_history_insights`` produces for the keyword.

    Raises:
        HTTPException: 400 when validation or insight building raises
            ``ValueError``.
    """
    try:
        validate_result_filename(filename)
        # Strip only the trailing marker; str.replace would also remove the
        # same text if it appeared inside the keyword itself.
        keyword = filename.removesuffix("_full_data.jsonl")
        return build_price_history_insights(keyword)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc))
from fastapi import APIRouter, Depends, HTTPException from pydantic import BaseModel, Field from src.api.dependencies import get_process_service from src.infrastructure.config.env_manager import env_manager from src.infrastructure.config.settings import ( AISettings, reload_settings, scraper_settings, ) from src.services.ai_request_compat import ( CHAT_COMPLETIONS_API_MODE, RESPONSES_API_MODE, build_ai_request_params, create_ai_response_sync, is_chat_completions_api_unsupported_error, is_responses_api_unsupported_error, ) from src.services.ai_response_parser import extract_ai_response_content from src.services.notification_config_service import ( NotificationSettingsValidationError, build_configured_channels, build_notification_settings_response, build_notification_status_flags, load_notification_settings, model_dump, prepare_notification_settings_update, ) from src.services.notification_service import build_notification_service from src.services.process_service import ProcessService router = APIRouter(prefix="/api/settings", tags=["settings"]) AI_TEST_PROMPT = "Reply with OK only." 
AI_TEST_MAX_OUTPUT_TOKENS = 32 def _reload_env() -> None: load_dotenv(dotenv_path=env_manager.env_file, override=True) reload_settings() def _env_bool(key: str, default: bool = False) -> bool: value = env_manager.get_value(key) if value is None: return default return str(value).strip().lower() in {"1", "true", "yes", "y", "on"} def _env_int(key: str, default: int) -> int: value = env_manager.get_value(key) if value is None: return default try: return int(value) except ValueError: return default def _normalize_bool_value(value: bool) -> str: return "true" if value else "false" class NotificationSettingsModel(BaseModel): """通知设置模型""" NTFY_TOPIC_URL: Optional[str] = None GOTIFY_URL: Optional[str] = None GOTIFY_TOKEN: Optional[str] = None BARK_URL: Optional[str] = None WX_BOT_URL: Optional[str] = None TELEGRAM_BOT_TOKEN: Optional[str] = None TELEGRAM_CHAT_ID: Optional[str] = None TELEGRAM_API_BASE_URL: Optional[str] = None WEBHOOK_URL: Optional[str] = None WEBHOOK_METHOD: Optional[str] = None WEBHOOK_HEADERS: Optional[str] = None WEBHOOK_CONTENT_TYPE: Optional[str] = None WEBHOOK_QUERY_PARAMETERS: Optional[str] = None WEBHOOK_BODY: Optional[str] = None PCURL_TO_MOBILE: Optional[bool] = None class NotificationTestRequest(BaseModel): """通知测试请求""" channel: Optional[str] = None settings: NotificationSettingsModel = Field(default_factory=NotificationSettingsModel) class AISettingsModel(BaseModel): """AI设置模型""" OPENAI_API_KEY: Optional[str] = None OPENAI_BASE_URL: Optional[str] = None OPENAI_MODEL_NAME: Optional[str] = None SKIP_AI_ANALYSIS: Optional[bool] = None PROXY_URL: Optional[str] = None class RotationSettingsModel(BaseModel): ACCOUNT_ROTATION_ENABLED: Optional[bool] = None ACCOUNT_ROTATION_MODE: Optional[str] = None ACCOUNT_ROTATION_RETRY_LIMIT: Optional[int] = None ACCOUNT_BLACKLIST_TTL: Optional[int] = None ACCOUNT_STATE_DIR: Optional[str] = None PROXY_ROTATION_ENABLED: Optional[bool] = None PROXY_ROTATION_MODE: Optional[str] = None PROXY_POOL: Optional[str] = None 
@router.put("/notifications")
async def update_notification_settings(settings: NotificationSettingsModel):
    """Persist the submitted notification settings and reload the environment.

    Only fields the client actually sent are considered
    (``exclude_unset=True``).

    Raises:
        HTTPException: 422 when the submitted settings fail validation,
            500 when ``env_manager`` reports that the changes were not applied.
    """
    try:
        submitted = model_dump(settings, exclude_unset=True)
        current = load_notification_settings()
        prepared = prepare_notification_settings_update(submitted, current)
    except NotificationSettingsValidationError as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc

    updates, deletions, merged_settings = prepared
    applied = env_manager.apply_changes(updates=updates, deletions=deletions)
    if applied:
        _reload_env()
        channels = build_configured_channels(merged_settings)
        return {
            "message": "通知设置已成功更新",
            "configured_channels": channels,
        }
    raise HTTPException(status_code=500, detail="更新通知设置失败")
@router.put("/rotation")
async def update_rotation_settings(settings: RotationSettingsModel):
    """Persist the submitted account/proxy rotation settings.

    Only fields the client actually sent are written. Booleans are stored as
    ``"true"``/``"false"``; every other value is stored via ``str()``.

    Raises:
        HTTPException: 500 when ``env_manager`` reports that the update failed.
    """
    updates = {}
    payload = model_dump(settings, exclude_unset=True)
    for key, value in payload.items():
        # Every field is Optional, so a client may send an explicit null.
        # Skip it rather than persisting the literal string "None".
        if value is None:
            continue
        if isinstance(value, bool):
            updates[key] = _normalize_bool_value(value)
        else:
            updates[key] = str(value)

    success = env_manager.update_values(updates)
    if not success:
        raise HTTPException(status_code=500, detail="更新轮换设置失败")
    _reload_env()
    return {"message": "轮换设置已成功更新"}
@router.post("/ai/test")
async def test_ai_settings(settings: dict):
    """Check whether the submitted AI model settings can reach the model.

    A short prompt is sent through the chat-completions API first; if that API
    is reported as unsupported, the request is retried through the responses
    API.

    Args:
        settings: Submitted settings. ``OPENAI_API_KEY`` falls back to the
            stored key when it is empty or missing.

    Returns:
        A dict with ``success`` and ``message``; on success it also carries
        the extracted model ``response``. Failures are reported in the
        payload instead of being raised.
    """
    # Tracked outside the try block so the finally clause can release them
    # no matter where the test fails.
    http_client = None
    client = None
    try:
        from openai import OpenAI
        import httpx

        stored_api_key = env_manager.get_value("OPENAI_API_KEY", "")
        submitted_api_key = settings.get("OPENAI_API_KEY", "")
        api_key = submitted_api_key or stored_api_key

        client_params = {
            "api_key": api_key,
            "base_url": settings.get("OPENAI_BASE_URL", ""),
            "timeout": httpx.Timeout(30.0),
        }

        proxy_url = settings.get("PROXY_URL", "")
        if proxy_url:
            http_client = httpx.Client(proxy=proxy_url)
            client_params["http_client"] = http_client

        model_name = settings.get("OPENAI_MODEL_NAME", "")
        client = OpenAI(**client_params)
        messages = [{"role": "user", "content": AI_TEST_PROMPT}]
        api_mode = CHAT_COMPLETIONS_API_MODE
        try:
            response = create_ai_response_sync(
                client,
                api_mode,
                build_ai_request_params(
                    api_mode,
                    model=model_name,
                    messages=messages,
                    max_output_tokens=AI_TEST_MAX_OUTPUT_TOKENS,
                ),
            )
        except Exception as exc:
            if not is_chat_completions_api_unsupported_error(exc):
                raise
            # The endpoint rejected chat-completions; retry via responses API.
            api_mode = RESPONSES_API_MODE
            response = create_ai_response_sync(
                client,
                api_mode,
                build_ai_request_params(
                    api_mode,
                    model=model_name,
                    messages=messages,
                    max_output_tokens=AI_TEST_MAX_OUTPUT_TOKENS,
                ),
            )

        return {
            "success": True,
            "message": "AI模型连接测试成功!",
            "response": extract_ai_response_content(response),
        }
    except Exception as exc:
        return {
            "success": False,
            "message": f"AI模型连接测试失败: {exc}",
        }
    finally:
        # Each call creates throw-away clients; close them explicitly so the
        # connection pools are not left open. The proxy client is closed
        # separately because it leaks if OpenAI(...) itself fails.
        for resource in (client, http_client):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception as close_error:
                # Cleanup is best effort and must not replace the test result.
                print(f"关闭AI测试客户端时出错: {close_error}")
from src.services.result_storage_service import delete_result_file_records router = APIRouter(prefix="/api/tasks", tags=["tasks"]) async def _reload_scheduler_if_needed( task_service: TaskService, scheduler_service: SchedulerService, ): tasks = await task_service.get_all_tasks() await scheduler_service.reload_jobs(tasks) def _has_keyword_rules(rules) -> bool: return bool(rules and len(rules) > 0) def _validate_final_account_strategy(existing_task, task_update: TaskUpdate) -> None: account_state_file = ( task_update.account_state_file if task_update.account_state_file is not None else existing_task.account_state_file ) account_strategy = normalize_account_strategy( task_update.account_strategy, account_state_file, ) task_update.account_strategy = account_strategy if account_strategy == "fixed" and not account_state_file: raise HTTPException(status_code=400, detail="固定账号模式下必须选择账号。") @router.get("", response_model=List[dict]) async def get_tasks( service: TaskService = Depends(get_task_service), scheduler_service: SchedulerService = Depends(get_scheduler_service), ): """获取所有任务""" tasks = await service.get_all_tasks() return serialize_tasks(tasks, scheduler_service) @router.get("/{task_id}", response_model=dict) async def get_task( task_id: int, service: TaskService = Depends(get_task_service), scheduler_service: SchedulerService = Depends(get_scheduler_service), ): """获取单个任务""" task = await service.get_task(task_id) if not task: raise HTTPException(status_code=404, detail="任务未找到") return serialize_task(task, scheduler_service) @router.post("/", response_model=dict) async def create_task( task_create: TaskCreate, service: TaskService = Depends(get_task_service), scheduler_service: SchedulerService = Depends(get_scheduler_service), ): """创建新任务""" task = await service.create_task(task_create) await _reload_scheduler_if_needed(service, scheduler_service) return {"message": "任务创建成功", "task": serialize_task(task, scheduler_service)} @router.post("/generate", 
@router.get("/generate-jobs/{job_id}", response_model=dict)
async def get_task_generation_job(
    job_id: str,
    generation_service: TaskGenerationService = Depends(get_task_generation_service),
):
    """Return the state of a task-generation job.

    Raises:
        HTTPException: 404 when the generation service has no job for
            ``job_id``.
    """
    found = await generation_service.get_job(job_id)
    if found:
        return {"job": found.model_dump(mode="json")}
    raise HTTPException(status_code=404, detail="任务生成作业未找到")
current_mode = getattr(existing_task, "decision_mode", "ai") or "ai" target_mode = task_update.decision_mode or current_mode description_changed = ( task_update.description is not None and task_update.description != existing_task.description ) switched_to_ai = current_mode != "ai" and target_mode == "ai" if target_mode == "keyword": final_rules = ( task_update.keyword_rules if task_update.keyword_rules is not None else getattr(existing_task, "keyword_rules", []) ) if not _has_keyword_rules(final_rules): raise HTTPException(status_code=400, detail="关键词模式下至少需要一个关键词。") if target_mode == "ai" and (description_changed or switched_to_ai): print(f"检测到任务 {task_id} 需要刷新 AI 标准文件,开始重新生成...") try: description_for_ai = ( task_update.description if task_update.description is not None else existing_task.description ) if not str(description_for_ai or "").strip(): raise HTTPException(status_code=400, detail="AI 模式下详细需求不能为空。") safe_keyword = "".join( c for c in existing_task.keyword.lower().replace(' ', '_') if c.isalnum() or c in "_-" ).rstrip() output_filename = f"prompts/{safe_keyword}_criteria.txt" print(f"目标文件路径: {output_filename}") print("开始调用 AI 生成新的分析标准...") generated_criteria = await generate_criteria( user_description=description_for_ai, reference_file_path="prompts/macbook_criteria.txt" ) if not generated_criteria or len(generated_criteria.strip()) == 0: print("AI 返回的内容为空") raise HTTPException(status_code=500, detail="AI 未能生成分析标准,返回内容为空。") print(f"保存新的分析标准到: {output_filename}") os.makedirs("prompts", exist_ok=True) async with aiofiles.open(output_filename, 'w', encoding='utf-8') as f: await f.write(generated_criteria) print(f"新的分析标准已保存") task_update.ai_prompt_criteria_file = output_filename print(f"已更新 ai_prompt_criteria_file 字段为: {output_filename}") except HTTPException: raise except Exception as e: error_msg = f"重新生成 criteria 文件时出错: {str(e)}" print(error_msg) import traceback print(traceback.format_exc()) raise HTTPException(status_code=500, detail=error_msg) task = 
@router.delete("/{task_id}", response_model=dict)
async def delete_task(
    task_id: int,
    service: TaskService = Depends(get_task_service),
    process_service: ProcessService = Depends(get_process_service),
    scheduler_service: SchedulerService = Depends(get_scheduler_service),
):
    """Delete a task, then clean up its result records and log file.

    The task's process is stopped first, the task is deleted and the
    scheduler reloaded. Result records and price snapshots are removed only
    when no remaining task uses the same keyword. Both cleanup steps are best
    effort: failures are printed and do not fail the request.

    Raises:
        HTTPException: 404 when the task does not exist or cannot be deleted.
    """
    task = await service.get_task(task_id)
    if not task:
        raise HTTPException(status_code=404, detail="任务未找到")

    await process_service.stop_task(task_id)
    if not await service.delete_task(task_id):
        raise HTTPException(status_code=404, detail="任务未找到")
    await _reload_scheduler_if_needed(service, scheduler_service)

    # Result data is shared per keyword, so keep it while another task still
    # monitors the same keyword.
    try:
        keyword = (task.keyword or "").strip()
        if keyword:
            for other_task in await service.get_all_tasks():
                if (other_task.keyword or "").strip() == keyword:
                    break
            else:
                await delete_result_file_records(build_result_filename(keyword))
                delete_price_snapshots(keyword)
    except Exception as cleanup_error:
        print(f"删除任务结果文件时出错: {cleanup_error}")

    try:
        log_path = resolve_task_log_path(task_id, task.task_name)
        if os.path.exists(log_path):
            os.remove(log_path)
    except Exception as cleanup_error:
        print(f"删除任务日志文件时出错: {cleanup_error}")

    return {"message": "任务删除成功"}
async def broadcast_message(message_type: str, data: dict):
    """Send a JSON message to every currently connected WebSocket client.

    Args:
        message_type: Value placed under the ``type`` key of the message.
        data: Payload placed under the ``data`` key of the message.

    Connections whose send fails are dropped from ``active_connections``.
    """
    message = {
        "type": message_type,
        "data": data
    }

    # Iterate over a snapshot: each send awaits, and while it is suspended
    # other coroutines may add or remove connections. Iterating the live set
    # would then raise "Set changed size during iteration".
    for connection in list(active_connections):
        try:
            await connection.send_json(message)
        except Exception:
            # A failed send means the peer is gone; forget the connection.
            active_connections.discard(connection)
async def _sync_task_runtime_status(task_id: int, is_running: bool) -> None:
    """Store a task's running flag and announce the change over WebSocket.

    Nothing happens when the task cannot be found or when its stored
    ``is_running`` flag already equals ``is_running``.
    """
    service = TaskService(SqliteTaskRepository())
    current = await service.get_task(task_id)
    if not current:
        return
    if current.is_running == is_running:
        return

    await service.update_task_status(task_id, is_running)
    change = {"id": task_id, "is_running": is_running}
    await websocket.broadcast_message("task_status_changed", change)
@app.post("/auth/status")
async def auth_status(payload: LoginRequest):
    """Check submitted credentials against the configured web login.

    Args:
        payload: Submitted ``username`` and ``password``.

    Returns:
        A dict with ``authenticated`` set to ``True`` and the ``username``.

    Raises:
        HTTPException: 401 when either credential does not match.
    """
    import hmac

    def _constant_time_equal(supplied: str, expected) -> bool:
        # compare_digest only accepts ASCII str or bytes, so compare the
        # UTF-8 bytes. A non-string configured value keeps the plain equality
        # semantics of the previous implementation.
        if not isinstance(expected, str):
            return supplied == expected
        return hmac.compare_digest(
            supplied.encode("utf-8", "surrogatepass"),
            expected.encode("utf-8", "surrogatepass"),
        )

    # Evaluate both comparisons unconditionally so the response time does not
    # reveal which of the two credentials was wrong.
    username_ok = _constant_time_equal(payload.username, app_settings.web_username)
    password_ok = _constant_time_equal(payload.password, app_settings.web_password)
    if username_ok and password_ok:
        return {"authenticated": True, "username": payload.username}
    raise HTTPException(status_code=401, detail="认证失败")
@app.get("/{full_path:path}")
async def serve_spa(request: Request, full_path: str):
    """Catch-all route that serves the SPA entry page for front-end routes.

    Paths that end in a static-asset extension get a 404. Every other path
    gets ``dist/index.html`` so the front-end router can handle it, or a 500
    when the front-end build output is missing.
    """
    asset_suffixes = ('.ico', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.css', '.js', '.json')
    looks_like_asset = any(full_path.endswith(suffix) for suffix in asset_suffixes)
    if looks_like_asset:
        return JSONResponse(status_code=404, content={"error": "资源未找到"})

    index_page = "dist/index.html"
    if not os.path.exists(index_page):
        return JSONResponse(
            status_code=500,
            content={"error": "前端构建产物不存在,请先运行 cd web-ui && npm run build"}
        )
    return FileResponse(index_page)
os.getenv("WEBHOOK_URL") WEBHOOK_METHOD = os.getenv("WEBHOOK_METHOD", "POST").upper() WEBHOOK_HEADERS = os.getenv("WEBHOOK_HEADERS") WEBHOOK_CONTENT_TYPE = os.getenv("WEBHOOK_CONTENT_TYPE", "JSON").upper() WEBHOOK_QUERY_PARAMETERS = os.getenv("WEBHOOK_QUERY_PARAMETERS") WEBHOOK_BODY = os.getenv("WEBHOOK_BODY") PCURL_TO_MOBILE = os.getenv("PCURL_TO_MOBILE", "false").lower() == "true" RUN_HEADLESS = os.getenv("RUN_HEADLESS", "true").lower() != "false" LOGIN_IS_EDGE = os.getenv("LOGIN_IS_EDGE", "false").lower() == "true" RUNNING_IN_DOCKER = os.getenv("RUNNING_IN_DOCKER", "false").lower() == "true" AI_DEBUG_MODE = os.getenv("AI_DEBUG_MODE", "false").lower() == "true" SKIP_AI_ANALYSIS = os.getenv("SKIP_AI_ANALYSIS", "false").lower() == "true" ENABLE_THINKING = os.getenv("ENABLE_THINKING", "false").lower() == "true" ENABLE_RESPONSE_FORMAT = os.getenv("ENABLE_RESPONSE_FORMAT", "true").lower() == "true" # --- Headers --- IMAGE_DOWNLOAD_HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:139.0) Gecko/20100101 Firefox/139.0', 'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1', } # --- Client Initialization --- # 检查配置是否齐全 if not all([BASE_URL, MODEL_NAME]): print("警告:未在 .env 文件中完整设置 OPENAI_BASE_URL 和 OPENAI_MODEL_NAME。AI相关功能可能无法使用。") client = None else: try: if PROXY_URL: print(f"正在为AI请求使用HTTP/S代理: {PROXY_URL}") # httpx 会自动从环境变量中读取代理设置 os.environ['HTTP_PROXY'] = PROXY_URL os.environ['HTTPS_PROXY'] = PROXY_URL # openai 客户端内部的 httpx 会自动从环境变量中获取代理配置 client = AsyncOpenAI(api_key=API_KEY, base_url=BASE_URL) except Exception as e: print(f"初始化 OpenAI 客户端时出错: {e}") client = None # 检查AI客户端是否成功初始化 if not client: # 在 prompt_generator.py 中,如果 client 为 None,会直接报错退出 # 在 spider_v2.py 中,AI分析会跳过 # 为了保持一致性,这里只打印警告,具体逻辑由调用方处理 pass # 检查关键配置 if not all([BASE_URL, MODEL_NAME]) and 'prompt_generator.py' in sys.argv[0]: sys.exit("错误:请确保在 .env 
def normalize_cron_expression(value: Optional[str]) -> Optional[str]:
    """Collapse whitespace in a cron expression and expand known aliases.

    Returns None when *value* is None or contains only whitespace. Otherwise
    the fields are joined with single spaces; if the lower-cased result is a
    key of ``CRON_ALIASES`` the mapped expression is returned, else the
    collapsed expression itself.
    """
    if value is None:
        return None
    # str.split() with no argument both trims and collapses whitespace runs.
    fields = str(value).split()
    if not fields:
        return None
    collapsed = " ".join(fields)
    return CRON_ALIASES.get(collapsed.lower(), collapsed)
def _normalize_keyword_values(value) -> List[str]:
    """Turn a keyword payload into a trimmed, de-duplicated list of strings.

    Accepts None (-> empty list), a list/tuple/set, a string split on commas
    and newlines, or any other single value. Items are stringified and
    stripped; blanks are dropped. Duplicates are detected case-insensitively
    and the first spelling encountered is the one kept.
    """
    if value is None:
        return []

    if isinstance(value, (list, tuple, set)):
        candidates = list(value)
    elif isinstance(value, str):
        candidates = re.split(r"[\n,]+", value)
    else:
        candidates = [value]

    # Insertion-ordered dict: key is the case-folded form, value the first
    # original spelling seen for it.
    first_seen = {}
    for candidate in candidates:
        cleaned = str(candidate).strip()
        if cleaned:
            first_seen.setdefault(cleaned.lower(), cleaned)
    return list(first_seen.values())
def _normalize_payload_keywords(payload: Any) -> Any:
    """Normalize account and keyword fields of a raw model payload.

    Used as the ``mode="before"`` model validator of the task models.
    Non-dict payloads (including None) are returned untouched; dict payloads
    are shallow-copied so the caller's dict is never mutated.

    Account fields are normalized only when the payload actually carries
    ``account_state_file`` or ``account_strategy``. Injecting them
    unconditionally made them count as explicitly set on partial-update
    models, so ``model_dump(exclude_unset=True)`` would carry a reset
    account configuration into unrelated updates.

    ``keyword_rules`` is normalized when present; otherwise legacy
    ``keyword_rule_groups`` are flattened into ``keyword_rules``.
    """
    if not isinstance(payload, dict):
        return payload

    values = dict(payload)

    if "account_state_file" in values or "account_strategy" in values:
        values["account_state_file"] = clean_account_state_file(
            values.get("account_state_file")
        )
        # The strategy is derived from the already-cleaned state file.
        values["account_strategy"] = normalize_account_strategy(
            values.get("account_strategy"),
            values.get("account_state_file"),
        )
    # NOTE(review): when neither key is present the model defaults now apply;
    # this assumes the helpers map (None, None) to those same defaults — confirm.

    if "keyword_rules" in values:
        values["keyword_rules"] = _normalize_keyword_values(values.get("keyword_rules"))
    elif "keyword_rule_groups" in values:
        values["keyword_rules"] = _extract_keywords_from_legacy_groups(
            values.get("keyword_rule_groups")
        )
    return values
def apply_update(self, update: "TaskUpdate") -> "Task":
    """Return a copy of this task with the fields set on *update* applied.

    Only fields explicitly set on *update* are carried over
    (``exclude_unset=True``); the current instance is left unchanged.
    """
    return self.model_copy(update=update.model_dump(exclude_unset=True))
@model_validator(mode="after")
def validate_decision_mode_payload(self):
    """Cross-field checks for a task creation payload.

    Raises:
        ValueError: in "ai" mode when the description is blank, in "keyword"
            mode when no keyword rule is given, or when the "fixed" account
            strategy is selected without an account state file.
    """
    trimmed_description = str(self.description or "").strip()
    mode = self.decision_mode

    if mode == "ai":
        if not trimmed_description:
            raise ValueError("AI 判断模式下,详细需求(description)不能为空。")
    elif mode == "keyword":
        if not _has_keyword_rules(self.keyword_rules):
            raise ValueError("关键词判断模式下,至少需要一个关键词。")

    fixed_without_account = (
        self.account_strategy == "fixed" and not self.account_state_file
    )
    if fixed_without_account:
        raise ValueError("固定账号模式下必须选择账号。")
    return self
@model_validator(mode="after")
def validate_partial_keyword_payload(self):
    """Cross-field checks for a partial task update.

    A field is only checked when the update supplies both the decision mode
    and the field it governs; omitted (None) fields are not validated here.

    Raises:
        ValueError: when "keyword" mode comes with an empty rule list, or
            "ai" mode comes with a blank description.
    """
    mode = self.decision_mode
    if mode == "keyword":
        rules = self.keyword_rules
        if rules is not None and not _has_keyword_rules(rules):
            raise ValueError("关键词判断模式下,至少需要一个关键词。")
    elif mode == "ai":
        text = self.description
        if text is not None and not str(text).strip():
            raise ValueError("AI 判断模式下,详细需求(description)不能为空。")
    return self
class TaskGenerationJob(BaseModel):
    """Task generation job: overall status plus the list of its steps."""

    # Identifier of this job.
    job_id: str
    # Name of the task being generated.
    task_name: str
    # One of "queued", "running", "completed", "failed"; starts as "queued".
    status: TaskGenerationStatus = "queued"
    # Status text; the default says the job is queued and waiting to start.
    message: str = "任务已排队,等待开始。"
    # Presumably the key of the step currently executing — TODO confirm.
    current_step: Optional[str] = None
    # Steps of the job; a fresh list per instance.
    steps: List[TaskGenerationStep] = Field(default_factory=list)
    # Optional Task attached to the job; None by default.
    task: Optional[Task] = None
    # Optional error text; None by default.
    error: Optional[str] = None
def _str_to_dt(value: Optional[str]) -> Optional[datetime]:
    """Parse an ISO-8601 string into a datetime.

    Returns None for empty input, for malformed strings, and for non-string
    values. The guard state is read back from a JSON file that may have been
    edited or damaged, so an unexpected type must not crash the caller.
    """
    if not value:
        return None
    try:
        return datetime.fromisoformat(value)
    except (TypeError, ValueError):
        # TypeError: fromisoformat() rejects non-str arguments.
        return None
def _read_json_file(path: str) -> dict:
    """Load a JSON object from *path*, tolerating missing or damaged files.

    Returns the parsed dict, or an empty dict when the file is missing, is
    empty, does not contain a JSON object, or cannot be read or parsed.

    A file that fails to read or parse is renamed to
    ``<path>.corrupt.<unix-ts>`` so the evidence is kept and the next call
    does not fail on it again. An empty file is NOT treated as corrupt: it
    is the normal state of a file that was just created and not yet written.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            raw = f.read()
        if not raw.strip():
            return {}
        data = json.loads(raw)
        return data if isinstance(data, dict) else {}
    except FileNotFoundError:
        return {}
    except Exception:
        # Keep the damaged file for inspection instead of failing forever.
        try:
            ts = str(int(time.time()))
            os.replace(path, f"{path}.corrupt.{ts}")
        except Exception:
            # Best effort: quarantining must never mask the read failure path.
            pass
        return {}
def _update_task(self, task_key: str, updater) -> dict:
    """Apply *updater* to one task entry under an exclusive file lock.

    Args:
        task_key: key of the entry inside the ``tasks`` mapping.
        updater: callable receiving the current entry dict; its return value
            replaces the entry (a falsy return keeps the passed-in entry).

    Returns:
        The entry as stored.

    The lock is taken on a sidecar ``<path>.lock`` file rather than on the
    data file. ``_save`` replaces the data file with ``os.replace``, so a
    lock held on the data file stays with the old inode and does not exclude
    processes that open the path afterwards. Using a sidecar also avoids
    creating an empty data file before the first load.
    """
    _ensure_parent_dir(self.path)
    with open(f"{self.path}.lock", "a+", encoding="utf-8") as lock_fh:
        with _FileLock(lock_fh):
            data = self._load()
            tasks = data.setdefault("tasks", {})
            entry = tasks.get(task_key) or {}
            if not isinstance(entry, dict):
                entry = {}
            entry = updater(entry) or entry
            tasks[task_key] = entry
            self._save(data)
            return entry
self.record_success(task_key, now=current) return SkipDecision( skip=False, should_notify=False, reason="cookie_updated", paused_until=None, consecutive_failures=0, ) if paused_until and current < paused_until: should_notify = last_notified_date != today if should_notify: def _touch(e: dict) -> dict: e = dict(e or {}) e["last_notified_date"] = today return e self._update_task(task_key, _touch) return SkipDecision( skip=True, should_notify=should_notify, reason=last_reason, paused_until=paused_until, consecutive_failures=consecutive, ) return SkipDecision( skip=False, should_notify=False, reason="not_paused", paused_until=None, consecutive_failures=consecutive, ) def record_failure( self, task_key: str, reason: str, *, cookie_path: Optional[str] = None, min_failures_to_pause: Optional[int] = None, now: Optional[datetime] = None, ) -> dict: current = _now(self.tz_name, now=now) today = _today_str(self.tz_name, now=current) cookie_mtime = _get_mtime(cookie_path) effective_threshold = max(1, int(min_failures_to_pause or self.threshold)) result = { "should_notify": False, "opened_circuit": False, "paused_until": None, "consecutive_failures": 0, } def _apply(entry: dict) -> dict: entry = dict(entry or {}) previous_paused_until = _str_to_dt(entry.get("paused_until")) was_paused = bool(previous_paused_until and current < previous_paused_until) prev_mtime = entry.get("cookie_mtime") try: prev_mtime = float(prev_mtime) if prev_mtime is not None else None except (TypeError, ValueError): prev_mtime = None if cookie_path and _cookie_changed(cookie_path, prev_mtime): entry["consecutive_failures"] = 0 entry["paused_until"] = None entry["last_notified_date"] = None consecutive = _as_int(entry.get("consecutive_failures"), 0) + 1 entry["consecutive_failures"] = consecutive entry["last_failure_reason"] = (reason or "未知错误")[:1000] entry["last_failure_at"] = _dt_to_str(current) if cookie_path: entry["cookie_path"] = cookie_path if cookie_mtime is not None: entry["cookie_mtime"] = 
def read_env(self) -> Dict[str, str]:
    """Read the key/value pairs stored in the env file.

    Returns an empty dict when the file does not exist. Entries with an
    empty key or a None value are left out.
    """
    if not self.env_file.exists():
        return {}

    parsed = dotenv_values(self.env_file, encoding="utf-8")
    result: Dict[str, str] = {}
    for name, raw in parsed.items():
        if name and raw is not None:
            result[name] = raw
    return result
def _serialize_value(self, value: str) -> str:
    """Render *value* for the right-hand side of a ``KEY=value`` line.

    Empty values and values matching ``_PLAIN_ENV_VALUE_PATTERN`` are written
    as-is. Anything else is wrapped in double quotes with backslashes,
    double quotes and newlines escaped.
    """
    text = str(value)
    if text == "" or _PLAIN_ENV_VALUE_PATTERN.fullmatch(text):
        return text
    # Single pass over the text; each special character maps to its escape.
    escapes = str.maketrans({"\\": "\\\\", '"': '\\"', "\n": "\\n"})
    return '"' + text.translate(escapes) + '"'
class AISettings(_EnvSettings):
    """AI model configuration; each field is bound to the named environment variable."""

    # OPENAI_API_KEY; optional, defaults to None.
    api_key: Optional[str] = _env_field(None, "OPENAI_API_KEY")
    # OPENAI_BASE_URL; empty string when unset.
    base_url: str = _env_field("", "OPENAI_BASE_URL")
    # OPENAI_MODEL_NAME; empty string when unset.
    model_name: str = _env_field("", "OPENAI_MODEL_NAME")
    # PROXY_URL; optional, defaults to None.
    proxy_url: Optional[str] = _env_field(None, "PROXY_URL")
    # AI_DEBUG_MODE; off by default.
    debug_mode: bool = _env_field(False, "AI_DEBUG_MODE")
    # ENABLE_RESPONSE_FORMAT; on by default.
    enable_response_format: bool = _env_field(True, "ENABLE_RESPONSE_FORMAT")
    # ENABLE_THINKING; off by default.
    enable_thinking: bool = _env_field(False, "ENABLE_THINKING")
    # SKIP_AI_ANALYSIS; off by default.
    skip_analysis: bool = _env_field(False, "SKIP_AI_ANALYSIS")

    def is_configured(self) -> bool:
        """Return True when both ``base_url`` and ``model_name`` are non-empty."""
        return bool(self.base_url and self.model_name)
def has_any_notification_enabled(self) -> bool:
    """Return True when at least one notification channel is configured.

    ntfy, WeCom bot, Bark and webhook each need only their URL; Gotify needs
    URL and token; Telegram needs bot token and chat id.
    """
    gotify_ready = self.gotify_url and self.gotify_token
    telegram_ready = self.telegram_bot_token and self.telegram_chat_id
    channels = (
        self.ntfy_topic_url,
        self.wx_bot_url,
        gotify_ready,
        self.bark_url,
        telegram_ready,
        self.webhook_url,
    )
    return any(channels)
def refresh(self) -> None:
    """Reload the AI settings and rebuild the client from them.

    Settings must be loaded first because ``_initialize_client`` reads
    ``self.settings``. ``self.client`` becomes None when the configuration is
    incomplete or client construction fails.
    """
    self._load_settings()
    # NOTE(review): the previously held client is replaced here without being closed.
    self.client = self._initialize_client()
base_url=self.settings.base_url ) except Exception as e: print(f"初始化 AI 客户端失败: {e}") return None def is_available(self) -> bool: """检查 AI 客户端是否可用""" return self.client is not None async def close(self) -> None: """关闭底层异步客户端,避免事件循环结束后再触发清理。""" client = self.client self.client = None if client is None: return close = getattr(client, "close", None) if close is None: return await close() @staticmethod def encode_image(image_path: str) -> Optional[str]: """将图片编码为 Base64""" if not image_path or not os.path.exists(image_path): return None try: with open(image_path, "rb") as f: return base64.b64encode(f.read()).decode('utf-8') except Exception as e: print(f"编码图片失败: {e}") return None async def analyze( self, product_data: Dict, image_paths: List[str], prompt_text: str ) -> Optional[Dict]: """ 分析商品数据 Args: product_data: 商品数据 image_paths: 图片路径列表 prompt_text: 分析提示词 Returns: 分析结果 """ if not self.is_available(): print("AI 客户端不可用") return None try: messages = self._build_messages(product_data, image_paths, prompt_text) response = await self._call_ai(messages) return self._parse_response(response) except Exception as e: print(f"AI 分析失败: {e}") return None def _build_messages(self, product_data: Dict, image_paths: List[str], prompt_text: str) -> List[Dict]: """构建 AI 消息""" product_json = json.dumps(product_data, ensure_ascii=False, indent=2) image_data_urls: List[str] = [] for path in image_paths: base64_img = self.encode_image(path) if base64_img: image_data_urls.append(f"data:image/jpeg;base64,{base64_img}") text_prompt = build_analysis_text_prompt( product_json, prompt_text, include_images=bool(image_data_urls), ) user_content = build_user_message_content(text_prompt, image_data_urls) return [{"role": "user", "content": user_content}] async def _call_ai( self, messages: List[Dict], *, temperature: float = 0.1, max_output_tokens: int = 4000, enable_json_output: Optional[bool] = None, ) -> str: """调用 AI API""" api_mode = CHAT_COMPLETIONS_API_MODE use_response_format = ( 
self.settings.enable_response_format if enable_json_output is None else enable_json_output ) use_temperature = True max_attempts = 4 for attempt in range(max_attempts): request_params = build_ai_request_params( api_mode, model=self.settings.model_name, messages=messages, temperature=temperature, max_output_tokens=max_output_tokens, enable_json_output=use_response_format, ) if not use_temperature: request_params = remove_temperature_param(request_params) if self.settings.enable_thinking: request_params["extra_body"] = {"enable_thinking": False} try: response = await create_ai_response_async( self.client, api_mode, request_params, ) return extract_ai_response_content(response) except EmptyAIResponseError as exc: if attempt < max_attempts - 1: print( f"AI响应为空,正在自动重试 ({attempt + 2}/{max_attempts})" ) continue raise exc except Exception as exc: changed = False if ( api_mode == CHAT_COMPLETIONS_API_MODE and is_chat_completions_api_unsupported_error(exc) ): api_mode = RESPONSES_API_MODE changed = True print("当前服务未实现 Chat Completions API,正在自动回退到 Responses API") elif ( api_mode == RESPONSES_API_MODE and is_responses_api_unsupported_error(exc) ): api_mode = CHAT_COMPLETIONS_API_MODE changed = True print("当前服务未实现 Responses API,正在自动回退到 Chat Completions API") if use_response_format and is_json_output_unsupported_error(exc): use_response_format = False changed = True print("当前模型不支持结构化 JSON 输出,正在自动重试并移除该参数") if use_temperature and is_temperature_unsupported_error(exc): use_temperature = False changed = True print("当前模型不支持 temperature 参数,正在自动重试并移除该参数") if changed and attempt < max_attempts - 1: continue raise raise RuntimeError("AI 调用在兼容性重试后仍未返回结果") def _parse_response(self, response_text: str) -> Optional[Dict]: """解析 AI 响应""" try: return parse_ai_response_json(response_text) except json.JSONDecodeError: print(f"无法解析 AI 响应: {response_text[:100]}") return None ================================================ FILE: src/infrastructure/external/notification_clients/__init__.py 
class BarkClient(NotificationClient):
    """Notification channel that posts a JSON payload to a Bark URL."""

    channel_key = "bark"
    display_name = "Bark"

    def __init__(self, bark_url: str = None, pcurl_to_mobile: bool = True):
        # The channel counts as enabled only when a URL is configured.
        super().__init__(enabled=bool(bark_url), pcurl_to_mobile=pcurl_to_mobile)
        self.bark_url = bark_url

    async def send(self, product_data: Dict, reason: str) -> None:
        """Send one Bark notification.

        Raises:
            RuntimeError: The channel is not enabled.
            requests.HTTPError: The server answered with an error status.
        """
        if not self.is_enabled():
            raise RuntimeError("Bark 未启用")

        msg = self._build_message(product_data, reason)

        body = {
            "title": msg.notification_title,
            "body": msg.content,
            # Prefer the mobile link when one was produced.
            "url": msg.mobile_link or msg.desktop_link,
            "level": "timeSensitive",
            "group": "闲鱼监控",
        }
        if msg.image_url:
            body["icon"] = msg.image_url

        request_headers = {"Content-Type": "application/json; charset=utf-8"}

        def _post():
            # Blocking HTTP call; executed on the loop's default executor.
            return requests.post(
                self.bark_url,
                json=body,
                headers=request_headers,
                timeout=10,
            )

        reply = await asyncio.get_running_loop().run_in_executor(None, _post)
        reply.raise_for_status()
from typing import Dict from src.utils import convert_goofish_link @dataclass(frozen=True) class NotificationMessage: title: str price: str reason: str desktop_link: str mobile_link: str | None notification_title: str content: str image_url: str | None class NotificationClient(ABC): """通知客户端抽象基类""" channel_key = "unknown" display_name = "未知渠道" def __init__(self, enabled: bool = False, pcurl_to_mobile: bool = True): self._enabled = enabled self._pcurl_to_mobile = pcurl_to_mobile def is_enabled(self) -> bool: """检查客户端是否启用""" return self._enabled @abstractmethod async def send(self, product_data: Dict, reason: str) -> bool: """ 发送通知 Args: product_data: 商品数据 reason: 推荐原因 Returns: 是否发送成功 """ raise NotImplementedError def _build_message(self, product_data: Dict, reason: str) -> NotificationMessage: """格式化消息内容""" title = product_data.get('商品标题', 'N/A') price = product_data.get('当前售价', 'N/A') desktop_link = product_data.get('商品链接', '#') mobile_link = None if self._pcurl_to_mobile and desktop_link and desktop_link != "#": mobile_link = convert_goofish_link(desktop_link) content_lines = [ f"价格: {price}", f"原因: {reason}", ] if mobile_link: content_lines.append(f"手机端链接: {mobile_link}") content_lines.append(f"电脑端链接: {desktop_link}") else: content_lines.append(f"链接: {desktop_link}") short_title = title[:30] suffix = "..." if len(title) > 30 else "" notification_title = f"🚨 新推荐! 
{short_title}{suffix}" main_image = product_data.get('商品主图链接') if not main_image: image_list = product_data.get('商品图片列表', []) if image_list: main_image = image_list[0] return NotificationMessage( title=title, price=price, reason=reason, desktop_link=desktop_link, mobile_link=mobile_link, notification_title=notification_title, content="\n".join(content_lines), image_url=main_image, ) ================================================ FILE: src/infrastructure/external/notification_clients/factory.py ================================================ """ 通知客户端工厂 """ from src.infrastructure.config.settings import NotificationSettings from .bark_client import BarkClient from .gotify_client import GotifyClient from .ntfy_client import NtfyClient from .telegram_client import TelegramClient from .wecom_bot_client import WeComBotClient from .webhook_client import WebhookClient def build_notification_clients(settings: NotificationSettings): pcurl_to_mobile = settings.pcurl_to_mobile return [ NtfyClient(settings.ntfy_topic_url, pcurl_to_mobile=pcurl_to_mobile), BarkClient(settings.bark_url, pcurl_to_mobile=pcurl_to_mobile), GotifyClient( settings.gotify_url, settings.gotify_token, pcurl_to_mobile=pcurl_to_mobile, ), WeComBotClient(settings.wx_bot_url, pcurl_to_mobile=pcurl_to_mobile), TelegramClient( settings.telegram_bot_token, settings.telegram_chat_id, settings.telegram_api_base_url, pcurl_to_mobile=pcurl_to_mobile, ), WebhookClient( settings.webhook_url, webhook_method=settings.webhook_method, webhook_headers=settings.webhook_headers, webhook_content_type=settings.webhook_content_type, webhook_query_parameters=settings.webhook_query_parameters, webhook_body=settings.webhook_body, pcurl_to_mobile=pcurl_to_mobile, ), ] ================================================ FILE: src/infrastructure/external/notification_clients/gotify_client.py ================================================ """ Gotify 通知客户端 """ import asyncio from typing import Dict import requests from .base import 
class GotifyClient(NotificationClient):
    """Notification channel that posts messages to a Gotify server."""

    channel_key = "gotify"
    display_name = "Gotify"

    def __init__(
        self,
        gotify_url: str | None = None,
        gotify_token: str | None = None,
        pcurl_to_mobile: bool = True,
    ):
        # Both the server URL and the application token are required.
        super().__init__(
            enabled=bool(gotify_url and gotify_token),
            pcurl_to_mobile=pcurl_to_mobile,
        )
        # Strip trailing slashes so the endpoint path can simply be appended.
        self.gotify_url = (gotify_url or "").rstrip("/")
        self.gotify_token = gotify_token

    async def send(self, product_data: Dict, reason: str) -> None:
        """Send one Gotify notification.

        The application token travels in the ``X-Gotify-Key`` header rather
        than in the query string, so it does not end up in the URL that
        ``requests`` embeds in ``HTTPError`` messages or that servers and
        proxies write to access logs.

        Raises:
            RuntimeError: The channel is not enabled.
            requests.HTTPError: The server answered with an error status.
        """
        if not self.is_enabled():
            raise RuntimeError("Gotify 未启用")

        message = self._build_message(product_data, reason)
        # (None, value) tuples make requests encode the fields as
        # multipart/form-data without file names.
        payload = {
            "title": (None, message.notification_title),
            "message": (None, message.content),
            "priority": (None, "5"),
        }
        endpoint = f"{self.gotify_url}/message"
        headers = {"X-Gotify-Key": self.gotify_token}

        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: requests.post(
                endpoint,
                files=payload,
                headers=headers,
                timeout=10,
            ),
        )
        response.raise_for_status()
class TelegramClient(NotificationClient):
    """Notification channel that sends messages through the Telegram Bot API."""

    channel_key = "telegram"
    display_name = "Telegram"

    def __init__(
        self,
        bot_token: str = None,
        chat_id: str = None,
        api_base_url: str = DEFAULT_TELEGRAM_API_BASE_URL,
        pcurl_to_mobile: bool = True,
    ):
        # Both the bot token and the chat id are required.
        super().__init__(enabled=bool(bot_token and chat_id), pcurl_to_mobile=pcurl_to_mobile)
        self.bot_token = bot_token
        self.chat_id = chat_id
        # An empty/None base URL falls back to the default; trailing slashes
        # are removed so the path can be appended directly.
        self.api_base_url = (
            (api_base_url or DEFAULT_TELEGRAM_API_BASE_URL).rstrip("/")
        )

    async def send(self, product_data: Dict, reason: str) -> None:
        """Send one Telegram notification.

        The message is sent with ``parse_mode="HTML"``, so every piece of
        product text is HTML-escaped first; otherwise a title or reason
        containing ``<`` or ``&`` makes the API reject the message. Links are
        emitted as anchors so the URLs are actually part of the message.

        Raises:
            RuntimeError: The channel is not enabled, or the API reported
                ``ok: false``.
            requests.HTTPError: The server answered with an error status.
        """
        # Function-scope import: the module header does not import html.
        import html

        if not self.is_enabled():
            raise RuntimeError("Telegram 未启用")

        message = self._build_message(product_data, reason)

        def _link_line(icon: str, label: str, url: str) -> str:
            # Only http(s) URLs become anchors; anything else (for example the
            # "#" placeholder) is shown as escaped plain text.
            if url.startswith(("http://", "https://")):
                return f'{icon} <a href="{html.escape(url, quote=True)}">{label}</a>'
            return f"{icon} {label}: {html.escape(url, quote=False)}"

        title = str(message.title)
        shown_title = title[:50] + ("..." if len(title) > 50 else "")

        telegram_message = [
            "🚨 新推荐!",
            "",
            html.escape(shown_title, quote=False),
            "",
            f"💰 价格: {html.escape(str(message.price), quote=False)}",
            f"📝 原因: {html.escape(str(message.reason), quote=False)}",
        ]
        if message.mobile_link:
            telegram_message.append(
                _link_line("📱", "手机端链接", str(message.mobile_link))
            )
        telegram_message.append(
            _link_line("💻", "电脑端链接", str(message.desktop_link))
        )

        telegram_api_url = f"{self.api_base_url}/bot{self.bot_token}/sendMessage"
        telegram_payload = {
            "chat_id": self.chat_id,
            "text": "\n".join(telegram_message),
            "parse_mode": "HTML",
            "disable_web_page_preview": False
        }
        headers = {"Content-Type": "application/json"}

        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: requests.post(
                telegram_api_url,
                json=telegram_payload,
                headers=headers,
                timeout=10
            )
        )
        response.raise_for_status()
        # A 2xx answer can still carry an application-level failure.
        result = response.json()
        if not result.get("ok"):
            raise RuntimeError(result.get("description", "Telegram 返回未知错误"))
raise RuntimeError("Webhook 未启用") message = self._build_message(product_data, reason) headers = self._parse_json(self.webhook_headers, "WEBHOOK_HEADERS", expect_dict=True) or {} final_url = self._build_url(message) loop = asyncio.get_running_loop() if self.webhook_method == "GET": response = await loop.run_in_executor( None, lambda: requests.get(final_url, headers=headers, timeout=15), ) response.raise_for_status() return json_payload, form_payload = self._build_body(message, headers) response = await loop.run_in_executor( None, lambda: requests.post( final_url, headers=headers, json=json_payload, data=form_payload, timeout=15, ), ) response.raise_for_status() def _build_url(self, message: NotificationMessage) -> str: params = self._parse_json( self.webhook_query_parameters, "WEBHOOK_QUERY_PARAMETERS", expect_dict=True, ) or {} rendered = self._render_template(params, message) parsed_url = list(urlparse(self.webhook_url)) query = dict(parse_qsl(parsed_url[4])) query.update(rendered) parsed_url[4] = urlencode(query) return urlunparse(parsed_url) def _build_body( self, message: NotificationMessage, headers: Dict[str, str], ) -> tuple[Any | None, Any | None]: if not self.webhook_body: return None, None body_template = self._parse_json(self.webhook_body, "WEBHOOK_BODY") rendered_body = self._render_template(body_template, message) if self.webhook_content_type == "JSON": if "Content-Type" not in headers and "content-type" not in headers: headers["Content-Type"] = "application/json; charset=utf-8" return rendered_body, None if self.webhook_content_type == "FORM": if not isinstance(rendered_body, dict): raise ValueError("WEBHOOK_BODY 在 FORM 模式下必须是 JSON 对象") if "Content-Type" not in headers and "content-type" not in headers: headers["Content-Type"] = "application/x-www-form-urlencoded" return None, rendered_body raise ValueError(f"不支持的 WEBHOOK_CONTENT_TYPE: {self.webhook_content_type}") def _parse_json( self, raw_value: str | None, field_name: str, expect_dict: bool = 
def _replace_placeholders(self, value: str, message: "NotificationMessage") -> str:
    """Substitute ``${key}`` and ``{{key}}`` placeholders in ``value``.

    Supported keys: ``title``, ``content``, ``price``, ``reason``,
    ``desktop_link`` and ``mobile_link`` (which falls back to the desktop
    link). Unknown placeholders are left untouched.

    The substitution is a single pass over the template, so placeholder-like
    text inside a substituted value (for example a product title containing
    ``${reason}``) is not expanded again. Values are converted with ``str``
    so a non-string field cannot raise ``TypeError``.
    """
    # Function-scope import: the module header does not import re.
    import re

    replacements = {
        "title": message.notification_title,
        "content": message.content,
        "price": message.price,
        "reason": message.reason,
        "desktop_link": message.desktop_link,
        "mobile_link": message.mobile_link or message.desktop_link,
    }
    keys = "|".join(re.escape(key) for key in replacements)
    # Group 1 captures the key of a ${key} form, group 2 of a {{key}} form.
    pattern = re.compile(r"\$\{(" + keys + r")\}|\{\{(" + keys + r")\}\}")

    def _substitute(match):
        return str(replacements[match.group(1) or match.group(2)])

    return pattern.sub(_substitute, value)
class JsonTaskRepository(TaskRepository):
    """Task repository stored as a JSON array in a single file.

    A task's id is its position in the array; ids are not written to the file.
    """

    def __init__(self, config_file: str = "config.json"):
        self.config_file = config_file

    async def find_all(self) -> List[Task]:
        """Return all tasks, with ids assigned from their position in the file.

        A missing or blank file yields an empty list. Content that is not
        valid JSON, or that is valid JSON but not an array, is reported and
        also yields an empty list instead of raising while iterating.
        """
        try:
            async with aiofiles.open(self.config_file, 'r', encoding='utf-8') as f:
                content = await f.read()
            if not content.strip():
                return []
            tasks_data = json.loads(content)
        except FileNotFoundError:
            return []
        except json.JSONDecodeError:
            print(f"配置文件 {self.config_file} 格式错误")
            return []

        if not isinstance(tasks_data, list):
            print(f"配置文件 {self.config_file} 格式错误")
            return []

        # Build the constructor arguments from a merged copy so the parsed
        # entries themselves are not modified.
        return [
            Task(**{**task_data, 'id': index})
            for index, task_data in enumerate(tasks_data)
        ]

    async def find_by_id(self, task_id: int) -> Optional[Task]:
        """Return the task at position ``task_id``, or ``None`` if out of range."""
        tasks = await self.find_all()
        if 0 <= task_id < len(tasks):
            return tasks[task_id]
        return None

    async def save(self, task: Task) -> Task:
        """Save a task (create or update).

        A task whose id is an existing position replaces that entry. Any
        other task (no id, or an id outside the current range) is appended
        and has its ``id`` set to the new position.
        """
        tasks = await self.find_all()

        if task.id is not None and 0 <= task.id < len(tasks):
            # Update the existing entry in place.
            tasks[task.id] = task
        else:
            # Create a new entry at the end of the list.
            task.id = len(tasks)
            tasks.append(task)

        await self._write_tasks(tasks)
        return task

    async def delete(self, task_id: int) -> bool:
        """Delete the task at position ``task_id``.

        Returns:
            ``True`` if a task was removed, ``False`` if the id is out of
            range. Because ids are positions, later tasks shift down by one.
        """
        tasks = await self.find_all()

        if 0 <= task_id < len(tasks):
            tasks.pop(task_id)
            await self._write_tasks(tasks)
            return True

        return False

    async def _write_tasks(self, tasks: List[Task]):
        """Overwrite the file with the given tasks, omitting their ids."""
        tasks_data = [task.model_dump(exclude={'id'}) for task in tasks]

        async with aiofiles.open(self.config_file, 'w', encoding='utf-8') as f:
            await f.write(json.dumps(tasks_data, ensure_ascii=False, indent=2))
def _import_tasks_if_needed(conn, legacy_config_file: str | None) -> None:
    """Import tasks from the legacy JSON config into the ``tasks`` table, once.

    The import is skipped, and the step marked as completed, when the
    ``tasks`` table already has rows, when no legacy file is configured, or
    when the file does not contain a JSON array. Entries that are not JSON
    objects are skipped; ids follow each entry's position in the array.

    One malformed entry does not abort the whole bootstrap: an unparsable
    ``max_pages`` falls back to 1, and an explicit ``null`` in a column the
    schema declares NOT NULL falls back to the same default as a missing key.
    """
    if _bootstrap_completed(conn, TASKS_BOOTSTRAP_KEY):
        return
    if not _table_is_empty(conn, "tasks"):
        _mark_bootstrap_completed(conn, TASKS_BOOTSTRAP_KEY)
        conn.commit()
        return
    if legacy_config_file is None:
        _mark_bootstrap_completed(conn, TASKS_BOOTSTRAP_KEY)
        conn.commit()
        return

    path = Path(legacy_config_file)
    tasks = _load_json_file(path)
    if not isinstance(tasks, list):
        _mark_bootstrap_completed(conn, TASKS_BOOTSTRAP_KEY)
        conn.commit()
        return

    def _required(raw_task: dict, key: str, default):
        # Missing key and explicit null are treated alike.
        value = raw_task.get(key)
        return default if value is None else value

    def _max_pages(raw_task: dict) -> int:
        try:
            return int(raw_task.get("max_pages", 1) or 1)
        except (TypeError, ValueError):
            return 1

    for index, raw_task in enumerate(tasks):
        if not isinstance(raw_task, dict):
            continue
        conn.execute(
            """
            INSERT INTO tasks (
                id, task_name, enabled, keyword, description, analyze_images,
                max_pages, personal_only, min_price, max_price, cron,
                ai_prompt_base_file, ai_prompt_criteria_file, account_state_file,
                account_strategy, free_shipping, new_publish_option, region,
                decision_mode, keyword_rules_json, is_running
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                index,
                _required(raw_task, "task_name", ""),
                _as_int(raw_task.get("enabled", True)),
                _required(raw_task, "keyword", ""),
                raw_task.get("description", ""),
                _as_int(raw_task.get("analyze_images", True)),
                _max_pages(raw_task),
                _as_int(raw_task.get("personal_only", False)),
                raw_task.get("min_price"),
                raw_task.get("max_price"),
                raw_task.get("cron"),
                _required(raw_task, "ai_prompt_base_file", "prompts/base_prompt.txt"),
                _required(raw_task, "ai_prompt_criteria_file", ""),
                raw_task.get("account_state_file"),
                _required(raw_task, "account_strategy", "auto"),
                _as_int(raw_task.get("free_shipping", True)),
                raw_task.get("new_publish_option"),
                raw_task.get("region"),
                _required(raw_task, "decision_mode", "ai"),
                json.dumps(raw_task.get("keyword_rules") or [], ensure_ascii=False),
                _as_int(raw_task.get("is_running", False)),
            ),
        )

    # The completion marker is committed together with the imported rows.
    _mark_bootstrap_completed(conn, TASKS_BOOTSTRAP_KEY)
    conn.commit()
def _insert_result_record(conn, record: dict, *, keyword: str, filename: str) -> None:
    """Insert one legacy result record into ``result_items`` (duplicates ignored).

    The de-duplication key is, in order of preference: the product link up to
    its first ``&``, ``item:<id>``, or a SHA-1 of the whole record.

    Args:
        conn: Open SQLite connection; the caller commits.
        record: Parsed JSON object of one result line.
        keyword: Keyword to use when the record carries none.
        filename: Result filename; when empty, one is built from the keyword.
    """
    item = record.get("商品信息", {}) or {}
    analysis = record.get("ai_analysis", {}) or {}
    link = str(item.get("商品链接") or "")
    if link:
        link_unique_key = link.split("&", 1)[0]
    else:
        item_id = str(item.get("商品ID") or "").strip()
        if item_id:
            link_unique_key = f"item:{item_id}"
        else:
            # sort_keys makes the hash independent of key order.
            link_unique_key = "hash:" + hashlib.sha1(
                json.dumps(record, ensure_ascii=False, sort_keys=True).encode("utf-8")
            ).hexdigest()

    final_keyword = str(record.get("搜索关键字") or keyword)
    result_filename = filename or build_result_filename(final_keyword)
    keyword_hit_count = analysis.get("keyword_hit_count", 0)
    try:
        keyword_hit_count = int(keyword_hit_count)
    except (TypeError, ValueError):
        keyword_hit_count = 0

    conn.execute(
        """
        INSERT OR IGNORE INTO result_items (
            result_filename, keyword, task_name, crawl_time, publish_time, price,
            price_display, item_id, title, link, link_unique_key, seller_nickname,
            is_recommended, analysis_source, keyword_hit_count, raw_json
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        (
            result_filename,
            final_keyword,
            record.get("任务名称", ""),
            record.get("爬取时间", ""),
            item.get("发布时间"),
            _parse_price(item.get("当前售价")),
            item.get("当前售价"),
            item.get("商品ID"),
            item.get("商品标题"),
            link,
            link_unique_key,
            (record.get("卖家信息", {}) or {}).get("卖家昵称") or item.get("卖家昵称"),
            _as_int(analysis.get("is_recommended", False)),
            analysis.get("analysis_source"),
            keyword_hit_count,
            json.dumps(record, ensure_ascii=False),
        ),
    )


def _insert_price_snapshot(conn, record: dict) -> None:
    """Insert one legacy price-history record into ``price_snapshots`` (duplicates ignored).

    The slug is taken from the record, or derived from its keyword when absent.
    The caller commits.
    """
    keyword = str(record.get("keyword") or "")
    slug = str(record.get("keyword_slug") or normalize_keyword_slug(keyword))
    conn.execute(
        """
        INSERT OR IGNORE INTO price_snapshots (
            keyword_slug, keyword, task_name, snapshot_time, snapshot_day, run_id,
            item_id, title, price, price_display, tags_json, region, seller,
            publish_time, link
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        (
            slug,
            keyword,
            record.get("task_name", ""),
            record.get("snapshot_time", ""),
            record.get("snapshot_day", ""),
            record.get("run_id", ""),
            record.get("item_id", ""),
            record.get("title", ""),
            _parse_price(record.get("price")),
            record.get("price_display"),
            json.dumps(record.get("tags") or [], ensure_ascii=False),
            record.get("region"),
            record.get("seller"),
            record.get("publish_time"),
            record.get("link"),
        ),
    )


def _as_int(value) -> int:
    """Map a loosely typed flag to 0/1.

    Booleans map directly and ``None`` maps to 0. Any other value is 1 only
    if its text form, trimmed and lower-cased, is one of
    ``1``, ``true``, ``yes``, ``on``.
    """
    if isinstance(value, bool):
        return 1 if value else 0
    if value is None:
        return 0
    return 1 if str(value).strip().lower() in {"1", "true", "yes", "on"} else 0


def _parse_price(value):
    """Parse a price into a float rounded to two decimals, or ``None``.

    Numbers are rounded directly. Text has currency signs and thousands
    separators removed; a trailing "万" multiplies the number by 10000.
    Known "no price" markers and any text that is not a number yield
    ``None``. The "万" conversion happens inside the guarded block, so input
    such as "万" or "abc万" yields ``None`` instead of raising ``ValueError``.
    """
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return round(float(value), 2)

    text = str(value).strip().replace("¥", "").replace("￥", "").replace(",", "")
    if not text or text in {"价格异常", "暂无", "-", "N/A"}:
        return None

    multiplier = 1
    if text.endswith("万"):
        text = text[:-1]
        multiplier = 10000
    try:
        return round(float(text) * multiplier, 2)
    except (TypeError, ValueError):
        return None
def _mark_bootstrap_completed(conn, key: str) -> None:
    """Record ``key`` as done in ``app_metadata``; the caller commits."""
    statement = """
        INSERT OR REPLACE INTO app_metadata(key, value)
        VALUES (?, 'done')
        """
    parameters = (key,)
    conn.execute(statement, parameters)
@contextmanager
def sqlite_connection(
    db_path: str | None = None,
) -> Iterator[sqlite3.Connection]:
    """Yield a configured SQLite connection and close it on exit.

    A falsy ``db_path`` selects the path from ``get_database_path()``. The
    parent directory is created first; rows are returned as ``sqlite3.Row``
    and the module's pragmas are applied before the connection is handed out.
    Nothing is committed here; that is left to the caller.
    """
    target = db_path if db_path else get_database_path()
    _prepare_database_file(target)

    connection = sqlite3.connect(target)
    try:
        connection.row_factory = sqlite3.Row
        _apply_pragmas(connection)
        yield connection
    finally:
        connection.close()
def _row_to_task(row) -> Task:
    """Convert one ``tasks`` row into a ``Task``.

    Integer flag columns become booleans, and the ``keyword_rules_json``
    column is decoded into ``keyword_rules`` (an empty value decodes to an
    empty list).
    """
    fields = dict(row)
    for flag in (
        "enabled",
        "analyze_images",
        "personal_only",
        "free_shipping",
        "is_running",
    ):
        fields[flag] = bool(fields[flag])

    rules_text = fields.pop("keyword_rules_json")
    fields["keyword_rules"] = json.loads(rules_text or "[]")
    return Task(**fields)
class SqliteTaskRepository(TaskRepository):
    """Task repository backed by the SQLite ``tasks`` table.

    The public coroutine methods delegate to blocking ``*_sync`` helpers via
    ``asyncio.to_thread``. Every helper first calls
    ``bootstrap_sqlite_storage`` and then opens its own connection.
    """

    # Task fields written to SQLite as integers (see ``_task_values``).
    _FLAG_FIELDS = (
        "enabled",
        "analyze_images",
        "personal_only",
        "free_shipping",
        "is_running",
    )

    _UPSERT_SQL = """
        INSERT OR REPLACE INTO tasks (
            id, task_name, enabled, keyword, description, analyze_images,
            max_pages, personal_only, min_price, max_price, cron,
            ai_prompt_base_file, ai_prompt_criteria_file, account_state_file,
            account_strategy, free_shipping, new_publish_option, region,
            decision_mode, keyword_rules_json, is_running
        ) VALUES (
            :id, :task_name, :enabled, :keyword, :description, :analyze_images,
            :max_pages, :personal_only, :min_price, :max_price, :cron,
            :ai_prompt_base_file, :ai_prompt_criteria_file, :account_state_file,
            :account_strategy, :free_shipping, :new_publish_option, :region,
            :decision_mode, :keyword_rules_json, :is_running
        )
    """

    def __init__(
        self,
        db_path: str | None = None,
        legacy_config_file: str | None = "config.json",
    ):
        """Store the database path and the legacy config file passed to bootstrap."""
        self.db_path = db_path
        self.legacy_config_file = legacy_config_file

    async def find_all(self) -> List[Task]:
        """Return every stored task, ordered by ascending id."""
        return await asyncio.to_thread(self._find_all_sync)

    async def find_by_id(self, task_id: int) -> Optional[Task]:
        """Return the task with ``task_id``, or ``None`` when no row matches."""
        return await asyncio.to_thread(self._find_by_id_sync, task_id)

    async def save(self, task: Task) -> Task:
        """Insert or replace ``task`` and return a copy carrying its id."""
        return await asyncio.to_thread(self._save_sync, task)

    async def delete(self, task_id: int) -> bool:
        """Delete the task with ``task_id``; ``True`` when a row was removed."""
        return await asyncio.to_thread(self._delete_sync, task_id)

    def _open(self):
        """Bootstrap the storage, then return a connection context manager."""
        bootstrap_sqlite_storage(
            self.db_path,
            legacy_config_file=self.legacy_config_file,
        )
        return sqlite_connection(self.db_path)

    def _find_all_sync(self) -> List[Task]:
        with self._open() as conn:
            cursor = conn.execute("SELECT * FROM tasks ORDER BY id ASC")
            records = cursor.fetchall()
        return [_row_to_task(record) for record in records]

    def _find_by_id_sync(self, task_id: int) -> Optional[Task]:
        with self._open() as conn:
            record = conn.execute(
                "SELECT * FROM tasks WHERE id = ?", (task_id,)
            ).fetchone()
        if not record:
            return None
        return _row_to_task(record)

    def _save_sync(self, task: Task) -> Task:
        with self._open() as conn:
            # A task without an id is new: allocate the next free id.
            resolved_id = (
                task.id if task.id is not None else self._next_task_id(conn)
            )
            params = self._task_values(task.model_copy(update={"id": resolved_id}))
            conn.execute(self._UPSERT_SQL, params)
            conn.commit()
        return task.model_copy(update={"id": resolved_id})

    def _delete_sync(self, task_id: int) -> bool:
        with self._open() as conn:
            cursor = conn.execute("DELETE FROM tasks WHERE id = ?", (task_id,))
            conn.commit()
            removed = cursor.rowcount
        return removed > 0

    def _next_task_id(self, conn) -> int:
        """Return ``MAX(id) + 1``, which is 0 for an empty table."""
        record = conn.execute(
            "SELECT COALESCE(MAX(id), -1) AS max_id FROM tasks"
        ).fetchone()
        return int(record["max_id"]) + 1

    def _task_values(self, task: Task) -> dict:
        """Build the named-parameter dict for the upsert statement."""
        params = task.model_dump()
        for flag in self._FLAG_FIELDS:
            params[flag] = int(getattr(task, flag))
        params["keyword_rules_json"] = json.dumps(
            task.keyword_rules or [], ensure_ascii=False
        )
        # The table stores the rules only in their JSON-encoded column.
        params.pop("keyword_rules", None)
        return params
or "unknown" ================================================ FILE: src/keyword_rule_engine.py ================================================ """ 关键词判断引擎:单组 OR 逻辑,命中任意关键词即推荐。 纯英数字关键词按完整词匹配,避免 Q1 误命中 Q1R5。 """ import re from typing import Any, Dict, Iterable, List _ASCII_TOKEN_KEYWORD_PATTERN = re.compile(r"^[a-z0-9 ]+$") _ASCII_TOKEN_BOUNDARY = r"[a-z0-9]" def normalize_text(value: str) -> str: return " ".join((value or "").lower().split()) def _collect_text_fragments(value: Any, bucket: List[str]) -> None: if value is None: return if isinstance(value, str): text = value.strip() if text: bucket.append(text) return if isinstance(value, (int, float, bool)): bucket.append(str(value)) return if isinstance(value, dict): for item in value.values(): _collect_text_fragments(item, bucket) return if isinstance(value, list): for item in value: _collect_text_fragments(item, bucket) def build_search_text(record: Dict[str, Any]) -> str: fragments: List[str] = [] product_info = record.get("商品信息", {}) seller_info = record.get("卖家信息", {}) _collect_text_fragments(product_info.get("商品标题"), fragments) _collect_text_fragments(product_info, fragments) _collect_text_fragments(seller_info, fragments) return normalize_text(" ".join(fragments)) def _normalize_keywords(values: Iterable[str]) -> List[str]: normalized: List[str] = [] seen = set() for raw in values or []: text = normalize_text(str(raw).strip()) if not text or text in seen: continue seen.add(text) normalized.append(text) return normalized def _uses_ascii_token_match(keyword: str) -> bool: return bool(keyword) and _ASCII_TOKEN_KEYWORD_PATTERN.fullmatch(keyword) is not None def _keyword_matches(keyword: str, normalized_text: str) -> bool: if not _uses_ascii_token_match(keyword): return keyword in normalized_text pattern = rf"(? 
async def _parse_search_results_json(json_data: dict, source: str) -> list:
    """Parse a search-API JSON payload into a list of basic item records.

    Args:
        json_data: Decoded JSON body of the search response.
        source: Label included in log lines to identify where the payload
            came from.

    Returns:
        One dict per item under ``data.resultList``. An empty list is
        returned when the payload has no items or when parsing fails
        unexpectedly (the error is logged, not raised).
    """
    page_data = []
    try:
        items = await safe_get(json_data, "data", "resultList", default=[])
        if not items:
            print(f"LOG: ({source}) API响应中未找到商品列表 (resultList)。")
            if AI_DEBUG_MODE:
                print(f"--- [SEARCH DEBUG] RAW JSON RESPONSE from {source} ---")
                print(json.dumps(json_data, ensure_ascii=False, indent=2))
                print("----------------------------------------------------")
            return []

        for item in items:
            main_data = await safe_get(item, "data", "item", "main", "exContent", default={})
            click_params = await safe_get(item, "data", "item", "main", "clickParam", "args", default={})

            title = await safe_get(main_data, "title", default="未知标题")

            price_parts = await safe_get(main_data, "price", default=[])
            if isinstance(price_parts, list):
                price = "".join(
                    str(part.get("text", ""))
                    for part in price_parts
                    if isinstance(part, dict)
                ).replace("当前价", "").strip()
            else:
                price = "价格异常"
            if "万" in price:
                # Expand "x万" to a plain number. If the remaining text is not
                # numeric, keep the raw price instead of failing the whole page.
                try:
                    price = f"¥{float(price.replace('¥', '').replace('万', '')) * 10000:.0f}"
                except ValueError:
                    pass

            area = await safe_get(main_data, "area", default="地区未知")
            seller = await safe_get(main_data, "userNickName", default="匿名卖家")
            raw_link = await safe_get(item, "data", "item", "main", "targetUrl", default="")
            item_id = await safe_get(main_data, "itemId", default="未知ID")
            original_price = await safe_get(main_data, "oriPrice", default="暂无")
            wants_count = await safe_get(click_params, "wantNum", default='NaN')

            # publishTime may arrive as a JSON number or a string; normalise to
            # text so the digit check cannot raise AttributeError on an int.
            publish_ts = str(click_params.get("publishTime", "")).strip()
            publish_time = "未知时间"
            if publish_ts.isdigit():
                try:
                    publish_time = datetime.fromtimestamp(
                        int(publish_ts) / 1000
                    ).strftime("%Y-%m-%d %H:%M")
                except (OverflowError, OSError, ValueError):
                    # Out-of-range or non-ASCII-digit timestamp: keep placeholder.
                    pass

            tags = []
            if await safe_get(click_params, "tag") == "freeship":
                tags.append("包邮")
            r1_tags = await safe_get(main_data, "fishTags", "r1", "tagList", default=[])
            for tag_item in r1_tags or []:
                content = await safe_get(tag_item, "data", "content", default="")
                if isinstance(content, str) and "验货宝" in content:
                    tags.append("验货宝")

            page_data.append({
                "商品标题": title,
                "当前售价": price,
                "商品原价": original_price,
                "“想要”人数": wants_count,
                "商品标签": tags,
                "发货地区": area,
                "卖家昵称": seller,
                "商品链接": str(raw_link or "").replace("fleamarket://", "https://www.goofish.com/"),
                "发布时间": publish_time,
                "商品ID": item_id,
            })

        print(f"LOG: ({source}) 成功解析到 {len(page_data)} 条商品基础信息。")
        return page_data
    except Exception as e:
        # Deliberate best-effort boundary: log and report "no items".
        print(f"LOG: ({source}) JSON数据处理异常: {str(e)}")
        return []
buyer_total += 1 if rate_type == 1: buyer_positive += 1 # 计算比率,并处理除以零的情况 seller_rate = f"{(seller_positive / seller_total * 100):.2f}%" if seller_total > 0 else "N/A" buyer_rate = f"{(buyer_positive / buyer_total * 100):.2f}%" if buyer_total > 0 else "N/A" return { "作为卖家的好评数": f"{seller_positive}/{seller_total}", "作为卖家的好评率": seller_rate, "作为买家的好评数": f"{buyer_positive}/{buyer_total}", "作为买家的好评率": buyer_rate } async def _parse_user_items_data(items_json: list) -> list: """解析用户主页的商品列表API的JSON数据。""" parsed_list = [] for card in items_json: data = card.get('cardData', {}) status_code = data.get('itemStatus') if status_code == 0: status_text = "在售" elif status_code == 1: status_text = "已售" else: status_text = f"未知状态 ({status_code})" parsed_list.append({ "商品ID": data.get('id'), "商品标题": data.get('title'), "商品价格": data.get('priceInfo', {}).get('price'), "商品主图": data.get('picInfo', {}).get('picUrl'), "商品状态": status_text }) return parsed_list async def parse_user_head_data(head_json: dict) -> dict: """解析用户头部API的JSON数据。""" data = head_json.get('data', {}) ylz_tags = await safe_get(data, 'module', 'base', 'ylzTags', default=[]) seller_credit, buyer_credit = {}, {} for tag in ylz_tags: if await safe_get(tag, 'attributes', 'role') == 'seller': seller_credit = {'level': await safe_get(tag, 'attributes', 'level'), 'text': tag.get('text')} elif await safe_get(tag, 'attributes', 'role') == 'buyer': buyer_credit = {'level': await safe_get(tag, 'attributes', 'level'), 'text': tag.get('text')} return { "卖家昵称": await safe_get(data, 'module', 'base', 'displayName'), "卖家头像链接": await safe_get(data, 'module', 'base', 'avatar', 'avatar'), "卖家个性签名": await safe_get(data, 'module', 'base', 'introduction', default=''), "卖家在售/已售商品数": await safe_get(data, 'module', 'tabs', 'item', 'number'), "卖家收到的评价总数": await safe_get(data, 'module', 'tabs', 'rate', 'number'), "卖家信用等级": seller_credit.get('text', '暂无'), "买家信用等级": buyer_credit.get('text', '暂无') } async def parse_ratings_data(ratings_json: list) -> 
list: """解析评价列表API的JSON数据。""" parsed_list = [] for card in ratings_json: data = await safe_get(card, 'cardData', default={}) rate_tag = await safe_get(data, 'rateTagList', 0, 'text', default='未知角色') rate_type = await safe_get(data, 'rate') if rate_type == 1: rate_text = "好评" elif rate_type == 0: rate_text = "中评" elif rate_type == -1: rate_text = "差评" else: rate_text = "未知" parsed_list.append({ "评价ID": data.get('rateId'), "评价内容": data.get('feedback'), "评价类型": rate_text, "评价来源角色": rate_tag, "评价者昵称": data.get('raterUserNick'), "评价时间": data.get('gmtCreate'), "评价图片": await safe_get(data, 'pictCdnUrlList', default=[]) }) return parsed_list ================================================ FILE: src/prompt_utils.py ================================================ import json import os import sys from typing import Awaitable, Callable, Optional import aiofiles from src.infrastructure.external.ai_client import AIClient # The meta-prompt to instruct the AI META_PROMPT_TEMPLATE = """ 你是一位世界级的AI提示词工程大师。你的任务是根据用户提供的【购买需求】,模仿一个【参考范例】,为闲鱼监控机器人的AI分析模块(代号 EagleEye)生成一份全新的【分析标准】文本。 你的输出必须严格遵循【参考范例】的结构、语气和核心原则,但内容要完全针对用户的【购买需求】进行定制。最终生成的文本将作为AI分析模块的思考指南。 --- 这是【参考范例】(`macbook_criteria.txt`): ```text {reference_text} ``` --- 这是用户的【购买需求】: ```text {user_description} ``` --- 请现在开始生成全新的【分析标准】文本。请注意: 1. **只输出新生成的文本内容**,不要包含任何额外的解释、标题或代码块标记。 2. 保留范例中的 `[V6.3 核心升级]`、`[V6.4 逻辑修正]` 等版本标记,这有助于保持格式一致性。 3. 将范例中所有与 "MacBook" 相关的内容,替换为与用户需求商品相关的内容。 4. 
思考并生成针对新商品类型的“一票否决硬性原则”和“危险信号清单”。 """ ProgressCallback = Callable[[str, str], Awaitable[None]] async def _report_progress( progress_callback: Optional[ProgressCallback], step_key: str, message: str, ) -> None: if progress_callback: await progress_callback(step_key, message) def _read_reference_text(reference_file_path: str) -> str: try: with open(reference_file_path, "r", encoding="utf-8") as file: return file.read() except FileNotFoundError: raise FileNotFoundError(f"参考文件未找到: {reference_file_path}") except IOError as exc: raise IOError(f"读取参考文件失败: {exc}") async def _request_generated_text(ai_client: AIClient, prompt: str) -> str: print("正在调用AI生成新的分析标准,请稍候...") try: generated_text = await ai_client._call_ai( [{"role": "user", "content": prompt}], temperature=0.5, max_output_tokens=800, enable_json_output=False, ) except Exception as exc: print(f"调用 OpenAI API 时出错: {exc}") raise print("AI已成功生成内容。") return generated_text.strip() async def _close_ai_client( ai_client: AIClient, active_error: BaseException | None, ) -> None: try: await ai_client.close() except Exception as close_error: print(f"关闭 AI 客户端时出错: {close_error}") if active_error is None: raise async def generate_criteria( user_description: str, reference_file_path: str, progress_callback: Optional[ProgressCallback] = None, ) -> str: """ Generates a new criteria file content using AI. 
""" ai_client = AIClient() active_error: BaseException | None = None try: if not ai_client.is_available(): ai_client.refresh() if not ai_client.is_available(): raise RuntimeError("AI客户端未初始化,无法生成分析标准。请检查.env配置。") await _report_progress(progress_callback, "reference", "正在读取参考文件。") print(f"正在读取参考文件: {reference_file_path}") reference_text = _read_reference_text(reference_file_path) await _report_progress(progress_callback, "prompt", "正在构建发送给 AI 的指令。") print("正在构建发送给AI的指令...") prompt = META_PROMPT_TEMPLATE.format( reference_text=reference_text, user_description=user_description, ) await _report_progress(progress_callback, "llm", "正在调用 AI 生成分析标准。") return await _request_generated_text(ai_client, prompt) except Exception as exc: active_error = exc raise finally: await _close_ai_client(ai_client, active_error) async def update_config_with_new_task(new_task: dict, config_file: str = "config.json"): """ 将一个新任务添加到指定的JSON配置文件中。 """ print(f"正在更新配置文件: {config_file}") try: # 读取现有配置 config_data = [] if os.path.exists(config_file): async with aiofiles.open(config_file, 'r', encoding='utf-8') as f: content = await f.read() # 处理空文件的情况 if content.strip(): try: config_data = json.loads(content) print(f"成功读取现有配置,当前任务数量: {len(config_data)}") except json.JSONDecodeError as e: print(f"解析配置文件失败,将创建新配置: {e}") config_data = [] else: print(f"配置文件不存在,将创建新文件: {config_file}") # 追加新任务 config_data.append(new_task) # 写回配置文件 async with aiofiles.open(config_file, 'w', encoding='utf-8') as f: await f.write(json.dumps(config_data, ensure_ascii=False, indent=2)) print(f"配置文件写入完成") print(f"成功!新任务 '{new_task.get('task_name')}' 已添加到 {config_file} 并已启用。") return True except json.JSONDecodeError as e: error_msg = f"错误: 配置文件 {config_file} 格式错误,无法解析: {e}" sys.stderr.write(error_msg + "\n") print(error_msg) return False except IOError as e: error_msg = f"错误: 读写配置文件失败: {e}" sys.stderr.write(error_msg + "\n") print(error_msg) return False except Exception as e: error_msg = f"错误: 更新配置文件时发生未知错误: {e}" 
def parse_proxy_pool(value: Optional[str]) -> List[str]:
    """Normalise a proxy-pool setting into a list of proxy addresses.

    Args:
        value: Either a comma-separated string or, when the setting comes
            from a task config, a list/tuple of entries. ``None`` and empty
            values mean "no proxies".

    Returns:
        The entries with surrounding whitespace removed. Blank entries and
        ``None`` items are dropped, so a ``null`` in a configured list never
        becomes the literal proxy ``"None"``.
    """
    if not value:
        return []
    if isinstance(value, (list, tuple)):
        raw_entries = [str(item) for item in value if item is not None]
    else:
        raw_entries = str(value).split(",")
    stripped_entries = (entry.strip() for entry in raw_entries)
    return [entry for entry in stripped_entries if entry]
================================================ import asyncio import json import os import random from datetime import datetime from typing import Optional from urllib.parse import urlencode from playwright.async_api import ( Response, TimeoutError as PlaywrightTimeoutError, async_playwright, ) from src.ai_handler import ( download_all_images, get_ai_analysis, send_ntfy_notification, cleanup_task_images, ) from src.config import ( AI_DEBUG_MODE, DETAIL_API_URL_PATTERN, LOGIN_IS_EDGE, RUN_HEADLESS, RUNNING_IN_DOCKER, SKIP_AI_ANALYSIS, STATE_FILE, ) from src.parsers import ( _parse_search_results_json, _parse_user_items_data, calculate_reputation_from_ratings, parse_ratings_data, parse_user_head_data, ) from src.utils import ( format_registration_days, get_link_unique_key, log_time, random_sleep, safe_get, save_to_jsonl, ) from src.rotation import RotationPool, load_state_files, parse_proxy_pool, RotationItem from src.failure_guard import FailureGuard from src.services.account_strategy_service import resolve_account_runtime_plan from src.infrastructure.persistence.storage_names import build_result_filename from src.services.item_analysis_dispatcher import ( ItemAnalysisDispatcher, ItemAnalysisJob, ) from src.services.price_history_service import ( build_market_reference, load_price_snapshots, record_market_snapshots, ) from src.services.result_storage_service import load_processed_link_keys from src.services.seller_profile_cache import SellerProfileCache from src.services.search_pagination import ( advance_search_page, is_search_results_response, ) class RiskControlError(Exception): pass class LoginRequiredError(Exception): """Raised when Goofish redirects to the passport/mini_login flow.""" FAILURE_GUARD = FailureGuard() EDGE_DOCKER_WARNING_PRINTED = False def _is_login_url(url: str) -> bool: if not url: return False lowered = url.lower() return "passport.goofish.com" in lowered or "mini_login" in lowered def _resolve_browser_channel() -> str: global 
EDGE_DOCKER_WARNING_PRINTED if RUNNING_IN_DOCKER: if LOGIN_IS_EDGE and not EDGE_DOCKER_WARNING_PRINTED: print( "检测到 LOGIN_IS_EDGE=true,但 Docker 镜像未内置 Edge," "任务运行时将改用 Chromium。" ) EDGE_DOCKER_WARNING_PRINTED = True return "chromium" return "msedge" if LOGIN_IS_EDGE else "chrome" def _should_analyze_images(task_config: dict) -> bool: raw_value = task_config.get("analyze_images", True) if isinstance(raw_value, bool): return raw_value return str(raw_value).strip().lower() not in {"false", "0", "no", "off"} def _format_failure_reason(reason: str, limit: int = 500) -> str: if not reason: return "未知错误" cleaned = " ".join(str(reason).split()) if len(cleaned) <= limit: return cleaned return cleaned[: limit - 3] + "..." async def _notify_task_failure( task_config: dict, reason: str, *, cookie_path: Optional[str] ) -> None: task_name = task_config.get("task_name", "未命名任务") keyword = task_config.get("keyword", "") formatted_reason = _format_failure_reason(reason) # Some failures are deterministic misconfiguration and should pause/notify immediately. 
pause_immediately = any( marker in formatted_reason for marker in ( "未找到可用的代理地址", "未找到可用的登录状态文件", ) ) guard_result = FAILURE_GUARD.record_failure( task_name, formatted_reason, cookie_path=cookie_path, min_failures_to_pause=1 if pause_immediately else None, ) if not guard_result.get("should_notify"): print( f"[FailureGuard] 任务 '{task_name}' 失败计数 {guard_result.get('consecutive_failures')}/{FAILURE_GUARD.threshold},暂不通知。" ) return paused_until = guard_result.get("paused_until") paused_until_str = ( paused_until.strftime("%Y-%m-%d %H:%M:%S") if paused_until else "N/A" ) product_data = { "商品标题": f"[任务异常] {task_name}", "当前售价": "N/A", "商品链接": "#", } notify_reason = ( f"任务运行失败(已连续 {guard_result.get('consecutive_failures')}/{FAILURE_GUARD.threshold} 次): {formatted_reason}" f"\n任务: {task_name}" f"\n关键词: {keyword or 'N/A'}" f"\n已自动暂停重试,暂停到: {paused_until_str}" f"\n修复后(更新登录态/cookies文件)将自动恢复。" ) try: await send_ntfy_notification(product_data, notify_reason) except Exception as e: print(f"发送任务异常通知失败: {e}") def _as_bool(value, default: bool = False) -> bool: if value is None: return default if isinstance(value, bool): return value return str(value).strip().lower() in {"1", "true", "yes", "y", "on"} def _as_int(value, default: int) -> int: if value is None: return default try: return int(value) except (TypeError, ValueError): return default def _get_rotation_settings(task_config: dict) -> dict: account_cfg = task_config.get("account_rotation") or {} proxy_cfg = task_config.get("proxy_rotation") or {} account_enabled = _as_bool( account_cfg.get("enabled"), _as_bool(os.getenv("ACCOUNT_ROTATION_ENABLED"), False), ) account_mode = ( account_cfg.get("mode") or os.getenv("ACCOUNT_ROTATION_MODE", "per_task") ).lower() account_state_dir = account_cfg.get("state_dir") or os.getenv( "ACCOUNT_STATE_DIR", "state" ) account_retry_limit = _as_int( account_cfg.get("retry_limit"), _as_int(os.getenv("ACCOUNT_ROTATION_RETRY_LIMIT"), 2), ) account_blacklist_ttl = _as_int( 
account_cfg.get("blacklist_ttl_sec"), _as_int(os.getenv("ACCOUNT_BLACKLIST_TTL"), 300), ) proxy_enabled = _as_bool( proxy_cfg.get("enabled"), _as_bool(os.getenv("PROXY_ROTATION_ENABLED"), False) ) proxy_mode = ( proxy_cfg.get("mode") or os.getenv("PROXY_ROTATION_MODE", "per_task") ).lower() proxy_pool = proxy_cfg.get("proxy_pool") or os.getenv("PROXY_POOL", "") proxy_retry_limit = _as_int( proxy_cfg.get("retry_limit"), _as_int(os.getenv("PROXY_ROTATION_RETRY_LIMIT"), 2), ) proxy_blacklist_ttl = _as_int( proxy_cfg.get("blacklist_ttl_sec"), _as_int(os.getenv("PROXY_BLACKLIST_TTL"), 300), ) return { "account_enabled": account_enabled, "account_mode": account_mode, "account_state_dir": account_state_dir, "account_retry_limit": max(1, account_retry_limit), "account_blacklist_ttl": max(0, account_blacklist_ttl), "proxy_enabled": proxy_enabled, "proxy_mode": proxy_mode, "proxy_pool": proxy_pool, "proxy_retry_limit": max(1, proxy_retry_limit), "proxy_blacklist_ttl": max(0, proxy_blacklist_ttl), } def _get_ai_analysis_concurrency(task_config: dict) -> int: configured = task_config.get("ai_analysis_concurrency") default = _as_int(os.getenv("AI_ANALYSIS_CONCURRENCY"), 2) return max(1, _as_int(configured, default)) def _get_seller_profile_cache_ttl(task_config: dict) -> int: configured = task_config.get("seller_profile_cache_ttl") default = _as_int(os.getenv("SELLER_PROFILE_CACHE_TTL"), 1800) return max(0, _as_int(configured, default)) def _default_context_options() -> dict: return { "user_agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36", "viewport": {"width": 412, "height": 915}, "device_scale_factor": 2.625, "is_mobile": True, "has_touch": True, "locale": "zh-CN", "timezone_id": "Asia/Shanghai", "permissions": ["geolocation"], "geolocation": {"longitude": 121.4737, "latitude": 31.2304}, "color_scheme": "light", } def _clean_kwargs(options: dict) -> dict: return {k: v for k, v in 
options.items() if v is not None} def _looks_like_mobile(ua: str) -> Optional[bool]: if not ua: return None ua_lower = ua.lower() if "mobile" in ua_lower or "android" in ua_lower or "iphone" in ua_lower: return True if "windows" in ua_lower or "macintosh" in ua_lower: return False return None def _build_context_overrides(snapshot: dict) -> dict: env = snapshot.get("env") or {} headers = snapshot.get("headers") or {} navigator = env.get("navigator") or {} screen = env.get("screen") or {} intl = env.get("intl") or {} overrides = {} ua = ( headers.get("User-Agent") or headers.get("user-agent") or navigator.get("userAgent") ) if ua: overrides["user_agent"] = ua accept_language = headers.get("Accept-Language") or headers.get("accept-language") locale = None if accept_language: locale = accept_language.split(",")[0].strip() elif navigator.get("language"): locale = navigator["language"] if locale: overrides["locale"] = locale tz = intl.get("timeZone") if tz: overrides["timezone_id"] = tz width = screen.get("width") height = screen.get("height") if isinstance(width, (int, float)) and isinstance(height, (int, float)): overrides["viewport"] = {"width": int(width), "height": int(height)} dpr = screen.get("devicePixelRatio") if isinstance(dpr, (int, float)): overrides["device_scale_factor"] = float(dpr) touch_points = navigator.get("maxTouchPoints") if isinstance(touch_points, (int, float)): overrides["has_touch"] = touch_points > 0 mobile_flag = _looks_like_mobile(ua or "") if mobile_flag is not None: overrides["is_mobile"] = mobile_flag return _clean_kwargs(overrides) def _build_extra_headers(raw_headers: Optional[dict]) -> dict: if not raw_headers: return {} excluded = {"cookie", "content-length"} headers = {} for key, value in raw_headers.items(): if not key or key.lower() in excluded or value is None: continue headers[key] = value return headers async def scrape_user_profile(context, user_id: str) -> dict: """ 【新版】访问指定用户的个人主页,按顺序采集其摘要信息、完整的商品列表和完整的评价列表。 """ print(f" -> 
开始采集用户ID: {user_id} 的完整信息...") profile_data = {} page = await context.new_page() # 为各项异步任务准备Future和数据容器 head_api_future = asyncio.get_event_loop().create_future() all_items, all_ratings = [], [] stop_item_scrolling, stop_rating_scrolling = asyncio.Event(), asyncio.Event() async def handle_response(response: Response): # 捕获头部摘要API if ( "mtop.idle.web.user.page.head" in response.url and not head_api_future.done() ): try: head_api_future.set_result(await response.json()) print(f" [API捕获] 用户头部信息... 成功") except Exception as e: if not head_api_future.done(): head_api_future.set_exception(e) # 捕获商品列表API elif "mtop.idle.web.xyh.item.list" in response.url: try: data = await response.json() all_items.extend(data.get("data", {}).get("cardList", [])) print(f" [API捕获] 商品列表... 当前已捕获 {len(all_items)} 件") if not data.get("data", {}).get("nextPage", True): stop_item_scrolling.set() except Exception as e: stop_item_scrolling.set() # 捕获评价列表API elif "mtop.idle.web.trade.rate.list" in response.url: try: data = await response.json() all_ratings.extend(data.get("data", {}).get("cardList", [])) print(f" [API捕获] 评价列表... 
当前已捕获 {len(all_ratings)} 条") if not data.get("data", {}).get("nextPage", True): stop_rating_scrolling.set() except Exception as e: stop_rating_scrolling.set() page.on("response", handle_response) try: # --- 任务1: 导航并采集头部信息 --- await page.goto( f"https://www.goofish.com/personal?userId={user_id}", wait_until="domcontentloaded", timeout=20000, ) head_data = await asyncio.wait_for(head_api_future, timeout=15) profile_data = await parse_user_head_data(head_data) # --- 任务2: 滚动加载所有商品 (默认页面) --- print(" [采集阶段] 开始采集该用户的商品列表...") await random_sleep(2, 4) # 等待第一页商品API完成 while not stop_item_scrolling.is_set(): await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") try: await asyncio.wait_for(stop_item_scrolling.wait(), timeout=8) except asyncio.TimeoutError: print(" [滚动超时] 商品列表可能已加载完毕。") break profile_data["卖家发布的商品列表"] = await _parse_user_items_data(all_items) # --- 任务3: 点击并采集所有评价 --- print(" [采集阶段] 开始采集该用户的评价列表...") rating_tab_locator = page.locator("//div[text()='信用及评价']/ancestor::li") if await rating_tab_locator.count() > 0: await rating_tab_locator.click() await random_sleep(3, 5) # 等待第一页评价API完成 while not stop_rating_scrolling.is_set(): await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") try: await asyncio.wait_for(stop_rating_scrolling.wait(), timeout=8) except asyncio.TimeoutError: print(" [滚动超时] 评价列表可能已加载完毕。") break profile_data["卖家收到的评价列表"] = await parse_ratings_data(all_ratings) reputation_stats = await calculate_reputation_from_ratings(all_ratings) profile_data.update(reputation_stats) else: print(" [警告] 未找到评价选项卡,跳过评价采集。") except Exception as e: print(f" [错误] 采集用户 {user_id} 信息时发生错误: {e}") finally: page.remove_listener("response", handle_response) await page.close() print(f" -> 用户 {user_id} 信息采集完成。") return profile_data async def scrape_xianyu(task_config: dict, debug_limit: int = 0): """ 【核心执行器】 根据单个任务配置,异步爬取闲鱼商品数据,并对每个新发现的商品进行实时的、独立的AI分析和通知。 """ keyword = task_config["keyword"] max_pages = task_config.get("max_pages", 1) 
personal_only = task_config.get("personal_only", False) min_price = task_config.get("min_price") max_price = task_config.get("max_price") ai_prompt_text = task_config.get("ai_prompt_text", "") analyze_images = _should_analyze_images(task_config) decision_mode = str(task_config.get("decision_mode", "ai")).strip().lower() if decision_mode not in {"ai", "keyword"}: decision_mode = "ai" keyword_rules = task_config.get("keyword_rules") or [] free_shipping = task_config.get("free_shipping", False) raw_new_publish = task_config.get("new_publish_option") or "" new_publish_option = raw_new_publish.strip() if new_publish_option == "__none__": new_publish_option = "" region_filter = (task_config.get("region") or "").strip() processed_links = set() history_run_id = datetime.now().strftime("%Y%m%d%H%M%S") history_seen_item_ids: set[str] = set() historical_snapshots = load_price_snapshots(keyword) result_filename = build_result_filename(keyword) processed_links = load_processed_link_keys(keyword) if processed_links: print(f"LOG: 发现已存在结果集 {result_filename},已加载 {len(processed_links)} 个历史商品用于去重。") else: print(f"LOG: 结果集 {result_filename} 当前为空,将写入新记录。") rotation_settings = _get_rotation_settings(task_config) account_items = load_state_files(rotation_settings["account_state_dir"]) runtime_plan = resolve_account_runtime_plan( strategy=task_config.get("account_strategy"), account_state_file=task_config.get("account_state_file"), has_root_state_file=os.path.exists(STATE_FILE), available_account_files=account_items, ) forced_account = runtime_plan["forced_account"] if runtime_plan["prefer_root_state"]: account_items = [STATE_FILE] rotation_settings["account_enabled"] = False elif runtime_plan["use_account_pool"]: rotation_settings["account_enabled"] = True else: rotation_settings["account_enabled"] = False account_pool = RotationPool( account_items, rotation_settings["account_blacklist_ttl"], "account" ) proxy_pool = RotationPool( parse_proxy_pool(rotation_settings["proxy_pool"]), 
rotation_settings["proxy_blacklist_ttl"], "proxy", ) selected_account: Optional[RotationItem] = None selected_proxy: Optional[RotationItem] = None def _select_account(force_new: bool = False) -> Optional[RotationItem]: nonlocal selected_account if forced_account: return RotationItem(value=forced_account) if not rotation_settings["account_enabled"]: if os.path.exists(STATE_FILE): return RotationItem(value=STATE_FILE) return None if ( rotation_settings["account_mode"] == "per_task" and selected_account and not force_new ): return selected_account picked = account_pool.pick_random() return picked or selected_account def _select_proxy(force_new: bool = False) -> Optional[RotationItem]: nonlocal selected_proxy if not rotation_settings["proxy_enabled"]: return None if ( rotation_settings["proxy_mode"] == "per_task" and selected_proxy and not force_new ): return selected_proxy picked = proxy_pool.pick_random() return picked or selected_proxy async def _run_scrape_attempt(state_file: str, proxy_server: Optional[str]) -> int: processed_item_count = 0 stop_scraping = False if not os.path.exists(state_file): raise FileNotFoundError(f"登录状态文件不存在: {state_file}") snapshot_data = None try: with open(state_file, "r", encoding="utf-8") as f: snapshot_data = json.load(f) except Exception as e: print(f"警告:读取登录状态文件失败,将直接按路径使用: {e}") async with async_playwright() as p: # 反检测启动参数 launch_args = [ "--disable-blink-features=AutomationControlled", "--disable-dev-shm-usage", "--no-sandbox", "--disable-setuid-sandbox", "--disable-web-security", "--disable-features=IsolateOrigins,site-per-process", ] launch_kwargs = {"headless": RUN_HEADLESS, "args": launch_args} if proxy_server: launch_kwargs["proxy"] = {"server": proxy_server} launch_kwargs["channel"] = _resolve_browser_channel() browser = await p.chromium.launch(**launch_kwargs) context_kwargs = _default_context_options() storage_state_arg = state_file analysis_dispatcher: Optional[ItemAnalysisDispatcher] = None if isinstance(snapshot_data, 
dict): # 新版扩展导出的增强快照,包含环境和Header if any( key in snapshot_data for key in ("env", "headers", "page", "storage") ): print(f"检测到增强浏览器快照,应用环境参数: {state_file}") storage_state_arg = {"cookies": snapshot_data.get("cookies", [])} context_kwargs.update(_build_context_overrides(snapshot_data)) extra_headers = _build_extra_headers(snapshot_data.get("headers")) if extra_headers: context_kwargs["extra_http_headers"] = extra_headers else: storage_state_arg = snapshot_data context_kwargs = _clean_kwargs(context_kwargs) context = await browser.new_context( storage_state=storage_state_arg, **context_kwargs ) seller_profile_cache = SellerProfileCache( ttl_seconds=_get_seller_profile_cache_ttl(task_config) ) analysis_dispatcher = ItemAnalysisDispatcher( concurrency=_get_ai_analysis_concurrency(task_config), skip_ai_analysis=SKIP_AI_ANALYSIS, seller_loader=lambda user_id: seller_profile_cache.get_or_load( str(user_id), lambda seller_key: scrape_user_profile(context, seller_key), ), image_downloader=download_all_images, ai_analyzer=get_ai_analysis, notifier=send_ntfy_notification, saver=save_to_jsonl, ) # 增强反检测脚本(模拟真实移动设备) await context.add_init_script(""" // 移除webdriver标识 Object.defineProperty(navigator, 'webdriver', {get: () => undefined}); // 模拟真实移动设备的navigator属性 Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]}); Object.defineProperty(navigator, 'languages', {get: () => ['zh-CN', 'zh', 'en-US', 'en']}); // 添加chrome对象 window.chrome = {runtime: {}, loadTimes: function() {}, csi: function() {}}; // 模拟触摸支持 Object.defineProperty(navigator, 'maxTouchPoints', {get: () => 5}); // 覆盖permissions查询(避免暴露自动化) const originalQuery = window.navigator.permissions.query; window.navigator.permissions.query = (parameters) => ( parameters.name === 'notifications' ? 
Promise.resolve({state: Notification.permission}) : originalQuery(parameters) ); """) page = await context.new_page() try: # 步骤 0 - 模拟真实用户:先访问首页(重要的反检测措施) log_time("步骤 0 - 模拟真实用户访问首页...") await page.goto( "https://www.goofish.com/", wait_until="domcontentloaded", timeout=30000, ) log_time("[反爬] 在首页停留,模拟浏览...") await random_sleep(1, 2) # 模拟随机滚动(移动设备的触摸滚动) await page.evaluate("window.scrollBy(0, Math.random() * 500 + 200)") await random_sleep(1, 2) log_time("步骤 1 - 导航到搜索结果页...") # 使用 'q' 参数构建正确的搜索URL,并进行URL编码 params = {"q": keyword} search_url = f"https://www.goofish.com/search?{urlencode(params)}" log_time(f"目标URL: {search_url}") # 先监听搜索接口响应,再执行导航,避免错过首次请求 async with page.expect_response( is_search_results_response, timeout=30000 ) as initial_response_info: await page.goto( search_url, wait_until="domcontentloaded", timeout=60000 ) if _is_login_url(page.url): raise LoginRequiredError( f"Login required: redirected to {page.url} (cookies/state likely expired)" ) # 捕获初始搜索的API数据 initial_response = await initial_response_info.value # 等待页面加载出关键筛选元素,以确认已成功进入搜索结果页 try: await page.wait_for_selector("text=新发布", timeout=15000) except PlaywrightTimeoutError as e: if _is_login_url(page.url): raise LoginRequiredError( f"Login required: redirected to {page.url} (cookies/state likely expired)" ) from e raise # 模拟真实用户行为:页面加载后的初始停留和浏览 log_time("[反爬] 模拟用户查看页面...") await random_sleep(1, 3) # --- 新增:检查是否存在验证弹窗 --- baxia_dialog = page.locator("div.baxia-dialog-mask") middleware_widget = page.locator("div.J_MIDDLEWARE_FRAME_WIDGET") try: # 等待弹窗在2秒内出现。如果出现,则执行块内代码。 await baxia_dialog.wait_for(state="visible", timeout=2000) print( "\n==================== CRITICAL BLOCK DETECTED ====================" ) print("检测到闲鱼反爬虫验证弹窗 (baxia-dialog),无法继续操作。") print("这通常是因为操作过于频繁或被识别为机器人。") print("建议:") print("1. 停止脚本一段时间再试。") print( "2. 
(推荐) 在 .env 文件中设置 RUN_HEADLESS=false,以非无头模式运行,这有助于绕过检测。" ) print(f"任务 '{keyword}' 将在此处中止。") print( "===================================================================" ) raise RiskControlError("baxia-dialog") except PlaywrightTimeoutError: # 2秒内弹窗未出现,这是正常情况,继续执行 pass # 检查是否有J_MIDDLEWARE_FRAME_WIDGET覆盖层 try: await middleware_widget.wait_for(state="visible", timeout=2000) print( "\n==================== CRITICAL BLOCK DETECTED ====================" ) print( "检测到闲鱼反爬虫验证弹窗 (J_MIDDLEWARE_FRAME_WIDGET),无法继续操作。" ) print("这通常是因为操作过于频繁或被识别为机器人。") print("建议:") print("1. 停止脚本一段时间再试。") print("2. (推荐) 更新登录状态文件,确保登录状态有效。") print("3. 降低任务执行频率,避免被识别为机器人。") print(f"任务 '{keyword}' 将在此处中止。") print( "===================================================================" ) raise RiskControlError("J_MIDDLEWARE_FRAME_WIDGET") except PlaywrightTimeoutError: # 2秒内弹窗未出现,这是正常情况,继续执行 pass # --- 结束新增 --- try: await page.click("div[class*='closeIconBg']", timeout=3000) print("LOG: 已关闭广告弹窗。") except PlaywrightTimeoutError: print("LOG: 未检测到广告弹窗。") final_response = None log_time("步骤 2 - 应用筛选条件...") if new_publish_option: try: await page.click("text=新发布") await random_sleep(1, 2) # 原来是 (1.5, 2.5) async with page.expect_response( is_search_results_response, timeout=20000 ) as response_info: await page.click(f"text={new_publish_option}") # --- 修改: 增加排序后的等待时间 --- await random_sleep(2, 4) # 原来是 (3, 5) final_response = await response_info.value except PlaywrightTimeoutError: log_time( f"新发布筛选 '{new_publish_option}' 请求超时,继续执行。" ) except Exception as e: print(f"LOG: 应用新发布筛选失败: {e}") if personal_only: async with page.expect_response( is_search_results_response, timeout=20000 ) as response_info: await page.click("text=个人闲置") # --- 修改: 将固定等待改为随机等待,并加长 --- await random_sleep(2, 4) # 原来是 asyncio.sleep(5) final_response = await response_info.value if free_shipping: try: async with page.expect_response( is_search_results_response, timeout=20000 ) as response_info: await page.click("text=包邮") await random_sleep(2, 
4) final_response = await response_info.value except PlaywrightTimeoutError: log_time("包邮筛选请求超时,继续执行。") except Exception as e: print(f"LOG: 应用包邮筛选失败: {e}") if region_filter: try: area_trigger = page.get_by_text("区域", exact=True) if await area_trigger.count(): await area_trigger.first.click() await random_sleep(1.5, 2) popover_candidates = page.locator("div.ant-popover") popover = popover_candidates.filter( has=page.locator( ".areaWrap--FaZHsn8E, [class*='areaWrap']" ) ).last if not await popover.count(): popover = popover_candidates.filter( has=page.get_by_text("重新定位") ).last if not await popover.count(): popover = popover_candidates.filter( has=page.get_by_text("查看") ).last if not await popover.count(): print("LOG: 未找到区域弹窗,跳过区域筛选。") raise PlaywrightTimeoutError("region-popover-not-found") await popover.wait_for(state="visible", timeout=5000) # 列表容器:第一层 children 即省/市/区三列,不再强依赖具体类名,提升鲁棒性 area_wrap = popover.locator( ".areaWrap--FaZHsn8E, [class*='areaWrap']" ).first await area_wrap.wait_for(state="visible", timeout=3000) columns = area_wrap.locator(":scope > div") col_prov = columns.nth(0) col_city = columns.nth(1) col_dist = columns.nth(2) region_parts = [ p.strip() for p in region_filter.split("/") if p.strip() ] async def _click_in_column( column_locator, text_value: str, desc: str ) -> None: option = column_locator.locator( ".provItem--QAdOx8nD", has_text=text_value ).first if await option.count(): await option.click() await random_sleep(1.5, 2) try: await option.wait_for( state="attached", timeout=1500 ) await option.wait_for( state="visible", timeout=1500 ) except PlaywrightTimeoutError: pass else: print(f"LOG: 未找到{desc} '{text_value}',跳过。") if len(region_parts) >= 1: await _click_in_column( col_prov, region_parts[0], "省份" ) await random_sleep(1, 2) if len(region_parts) >= 2: await _click_in_column( col_city, region_parts[1], "城市" ) await random_sleep(1, 2) if len(region_parts) >= 3: await _click_in_column( col_dist, region_parts[2], "区/县" ) await 
random_sleep(1, 2) search_btn = popover.locator( "div.searchBtn--Ic6RKcAb" ).first if await search_btn.count(): try: async with page.expect_response( is_search_results_response, timeout=20000, ) as response_info: await search_btn.click() await random_sleep(2, 3) final_response = await response_info.value except PlaywrightTimeoutError: log_time("区域筛选提交超时,继续执行。") else: print( "LOG: 未找到区域弹窗的“查看XX件宝贝”按钮,跳过提交。" ) else: print("LOG: 未找到区域筛选触发器。") except PlaywrightTimeoutError: log_time(f"区域筛选 '{region_filter}' 请求超时,继续执行。") except Exception as e: print(f"LOG: 应用区域筛选 '{region_filter}' 失败: {e}") if min_price or max_price: price_container = page.locator( 'div[class*="search-price-input-container"]' ).first if await price_container.is_visible(): if min_price: await price_container.get_by_placeholder("¥").first.fill( min_price ) # --- 修改: 将固定等待改为随机等待 --- await random_sleep(1, 2.5) # 原来是 asyncio.sleep(5) if max_price: await ( price_container.get_by_placeholder("¥") .nth(1) .fill(max_price) ) # --- 修改: 将固定等待改为随机等待 --- await random_sleep(1, 2.5) # 原来是 asyncio.sleep(5) async with page.expect_response( is_search_results_response, timeout=20000 ) as response_info: await page.keyboard.press("Tab") # --- 修改: 增加确认价格后的等待时间 --- await random_sleep(2, 4) # 原来是 asyncio.sleep(5) final_response = await response_info.value else: print("LOG: 警告 - 未找到价格输入容器。") log_time("所有筛选已完成,开始处理商品列表...") current_response = ( final_response if final_response and final_response.ok else initial_response ) for page_num in range(1, max_pages + 1): if stop_scraping: break log_time(f"开始处理第 {page_num}/{max_pages} 页 ...") if page_num > 1: page_advance_result = await advance_search_page( page=page, page_num=page_num, ) if not page_advance_result.advanced: break current_response = page_advance_result.response if not (current_response and current_response.ok): log_time(f"第 {page_num} 页响应无效,跳过。") continue basic_items = await _parse_search_results_json( await current_response.json(), f"第 {page_num} 页" ) if not basic_items: 
break historical_snapshots.extend( record_market_snapshots( keyword=keyword, task_name=task_config.get("task_name", "Untitled Task"), items=basic_items, run_id=history_run_id, snapshot_time=datetime.now().isoformat(), seen_item_ids=history_seen_item_ids, ) ) total_items_on_page = len(basic_items) for i, item_data in enumerate(basic_items, 1): if debug_limit > 0 and processed_item_count >= debug_limit: log_time( f"已达到调试上限 ({debug_limit}),停止获取新商品。" ) stop_scraping = True break unique_key = get_link_unique_key(item_data["商品链接"]) if unique_key in processed_links: log_time( f"[页内进度 {i}/{total_items_on_page}] 商品 '{item_data['商品标题'][:20]}...' 已存在,跳过。" ) continue log_time( f"[页内进度 {i}/{total_items_on_page}] 发现新商品,获取详情: {item_data['商品标题'][:30]}..." ) # --- 修改: 访问详情页前的等待时间,模拟用户在列表页上看了一会儿 --- await random_sleep(2, 4) # 原来是 (2, 4) detail_page = await context.new_page() try: async with detail_page.expect_response( lambda r: DETAIL_API_URL_PATTERN in r.url, timeout=25000 ) as detail_info: await detail_page.goto( item_data["商品链接"], wait_until="domcontentloaded", timeout=25000, ) detail_response = await detail_info.value if detail_response.ok: detail_json = await detail_response.json() ret_string = str( await safe_get(detail_json, "ret", default=[]) ) if "FAIL_SYS_USER_VALIDATE" in ret_string: print( "\n==================== CRITICAL BLOCK DETECTED ====================" ) print( "检测到闲鱼反爬虫验证 (FAIL_SYS_USER_VALIDATE),程序将终止。" ) long_sleep_duration = random.randint(3, 60) print( f"为避免账户风险,将执行一次长时间休眠 ({long_sleep_duration} 秒) 后再退出..." 
) await asyncio.sleep(long_sleep_duration) print("长时间休眠结束,现在将安全退出。") print( "===================================================================" ) raise RiskControlError("FAIL_SYS_USER_VALIDATE") # 解析商品详情数据并更新 item_data item_do = await safe_get( detail_json, "data", "itemDO", default={} ) seller_do = await safe_get( detail_json, "data", "sellerDO", default={} ) reg_days_raw = await safe_get( seller_do, "userRegDay", default=0 ) registration_duration_text = format_registration_days( reg_days_raw ) # --- START: 新增代码块 --- # 1. 提取卖家的芝麻信用信息 zhima_credit_text = await safe_get( seller_do, "zhimaLevelInfo", "levelName" ) # 2. 提取该商品的完整图片列表 image_infos = await safe_get( item_do, "imageInfos", default=[] ) if image_infos: # 使用列表推导式获取所有有效的图片URL all_image_urls = [ img.get("url") for img in image_infos if img.get("url") ] if all_image_urls: # 用新的字段存储图片列表,替换掉旧的单个链接 item_data["商品图片列表"] = all_image_urls # (可选) 仍然保留主图链接,以防万一 item_data["商品主图链接"] = all_image_urls[0] # --- END: 新增代码块 --- item_data["“想要”人数"] = await safe_get( item_do, "wantCnt", default=item_data.get("“想要”人数", "NaN"), ) item_data["浏览量"] = await safe_get( item_do, "browseCnt", default="-" ) # ...[此处可添加更多从详情页解析出的商品信息]... 
user_id = await safe_get(seller_do, "sellerId") # 构建基础记录 final_record = { "爬取时间": datetime.now().isoformat(), "搜索关键字": keyword, "任务名称": task_config.get( "task_name", "Untitled Task" ), "商品信息": item_data, "卖家信息": {}, } price_reference = build_market_reference( keyword=keyword, item=item_data, current_market_items=basic_items, historical_snapshots=historical_snapshots, ) final_record["价格参考"] = price_reference final_record["price_insight"] = price_reference.get( "本商品价格位置", {} ) analysis_dispatcher.submit( ItemAnalysisJob( keyword=keyword, task_name=task_config.get( "task_name", "Untitled Task" ), decision_mode=decision_mode, analyze_images=analyze_images, prompt_text=ai_prompt_text, keyword_rules=tuple(keyword_rules or []), final_record=final_record, seller_id=str(user_id) if user_id else None, zhima_credit_text=zhima_credit_text, registration_duration_text=registration_duration_text, ) ) processed_links.add(unique_key) processed_item_count += 1 log_time( f"商品已提交后台分析。累计处理 {processed_item_count} 个新商品。" ) # --- 修改: 增加单个商品处理后的主要延迟 --- log_time( "[反爬] 执行一次主要的随机延迟以模拟用户浏览间隔..." ) await random_sleep(5, 10) else: print( f" 错误: 获取商品详情API响应失败,状态码: {detail_response.status}" ) if AI_DEBUG_MODE: print( f"--- [DETAIL DEBUG] FAILED RESPONSE from {item_data['商品链接']} ---" ) try: print(await detail_response.text()) except Exception as e: print(f"无法读取响应内容: {e}") print( "----------------------------------------------------" ) except PlaywrightTimeoutError: print(f" 错误: 访问商品详情页或等待API响应超时。") except Exception as e: print(f" 错误: 处理商品详情时发生未知错误: {e}") finally: await detail_page.close() # --- 修改: 增加关闭页面后的短暂整理时间 --- await random_sleep(2, 4) # 原来是 (1, 2.5) # --- 新增: 在处理完一页所有商品后,翻页前,增加一个更长的“休息”时间 --- if not stop_scraping and page_num < max_pages: print( f"--- 第 {page_num} 页处理完毕,准备翻页。执行一次页面间的长时休息... 
---" ) await random_sleep(10, 15) except PlaywrightTimeoutError as e: if _is_login_url(page.url): raise LoginRequiredError( f"Login required: redirected to {page.url} (cookies/state likely expired)" ) from e print(f"\n操作超时错误: 页面元素或网络响应未在规定时间内出现。\n{e}") raise except asyncio.CancelledError: log_time("收到取消信号,正在终止当前爬虫任务...") raise except Exception as e: if type(e).__name__ == "TargetClosedError": log_time("浏览器已关闭,忽略后续异常(可能是任务被停止)。") return processed_item_count if "passport.goofish.com" in str(e): raise LoginRequiredError( f"Login required: redirected to passport flow ({e})" ) from e print(f"\n爬取过程中发生未知错误: {e}") raise finally: if analysis_dispatcher is not None: log_time("等待后台分析任务完成...") await analysis_dispatcher.join() log_time("任务执行完毕,浏览器将在5秒后自动关闭...") await asyncio.sleep(5) if debug_limit: input("按回车键关闭浏览器...") await browser.close() return processed_item_count processed_item_count = 0 attempt_limit = max( rotation_settings["account_retry_limit"], rotation_settings["proxy_retry_limit"], 1, ) last_error = "" last_state_path: Optional[str] = None # If this task is already in a paused state, skip immediately. 
task_name_for_guard = task_config.get("task_name", "未命名任务") pause_cookie_path = None if ( isinstance(task_config.get("account_state_file"), str) and task_config.get("account_state_file").strip() ): pause_cookie_path = task_config.get("account_state_file").strip() elif os.path.exists(STATE_FILE): pause_cookie_path = STATE_FILE decision = FAILURE_GUARD.should_skip_start( task_name_for_guard, cookie_path=pause_cookie_path ) if decision.skip: print( f"[FailureGuard] 任务 '{task_name_for_guard}' 已暂停重试 (连续失败 {decision.consecutive_failures}/{FAILURE_GUARD.threshold})" ) if decision.should_notify: try: await send_ntfy_notification( { "商品标题": f"[任务暂停] {task_name_for_guard}", "当前售价": "N/A", "商品链接": "#", }, "任务处于暂停状态,将跳过执行。\n" f"原因: {decision.reason}\n" f"连续失败: {decision.consecutive_failures}/{FAILURE_GUARD.threshold}\n" f"暂停到: {decision.paused_until.strftime('%Y-%m-%d %H:%M:%S') if decision.paused_until else 'N/A'}\n" "修复方法: 更新登录态/cookies文件后会自动恢复。", ) except Exception as e: print(f"发送任务暂停通知失败: {e}") cleanup_task_images(task_config.get("task_name", "default")) return 0 for attempt in range(1, attempt_limit + 1): if attempt == 1: selected_account = _select_account() selected_proxy = _select_proxy() else: if ( rotation_settings["account_enabled"] and rotation_settings["account_mode"] == "on_failure" ): account_pool.mark_bad(selected_account, last_error) selected_account = _select_account(force_new=True) if ( rotation_settings["proxy_enabled"] and rotation_settings["proxy_mode"] == "on_failure" ): proxy_pool.mark_bad(selected_proxy, last_error) selected_proxy = _select_proxy(force_new=True) if rotation_settings["account_enabled"] and not selected_account: last_error = "未找到可用的登录状态文件,无法继续执行任务。" print(last_error) break if not rotation_settings["account_enabled"] and not selected_account: last_error = "未找到可用的登录状态文件,无法继续执行任务。" print(last_error) break if rotation_settings["proxy_enabled"] and not selected_proxy: last_error = "未找到可用的代理地址,无法继续执行任务。" print(last_error) break state_path = 
# Strategy names that normalize_account_strategy accepts verbatim.
ACCOUNT_STRATEGIES = {"auto", "fixed", "rotate"}


def clean_account_state_file(value: Optional[str]) -> Optional[str]:
    """Return the trimmed account state file value, or None when it is blank.

    None, whitespace-only text, and the literal placeholders "null" and
    "undefined" all collapse to None. Any other value is passed through
    str() and stripped of surrounding whitespace.
    """
    if value is None:
        return None
    stripped = str(value).strip()
    return None if stripped in {"", "null", "undefined"} else stripped


def normalize_account_strategy(
    strategy: Optional[str],
    account_state_file: Optional[str] = None,
) -> str:
    """Map a raw strategy value onto one of ACCOUNT_STRATEGIES.

    A recognised strategy (case-insensitive, surrounding whitespace ignored)
    is returned lowercased. Otherwise the result is "fixed" when a usable
    account state file is configured and "auto" when it is not.
    """
    candidate = str(strategy or "").strip().lower()
    if candidate in ACCOUNT_STRATEGIES:
        return candidate
    return "fixed" if clean_account_state_file(account_state_file) else "auto"


def resolve_account_runtime_plan(
    *,
    strategy: Optional[str],
    account_state_file: Optional[str],
    has_root_state_file: bool,
    available_account_files: list[str],
) -> dict:
    """Decide which login state source a task run should use.

    Returns a dict with four keys:
        strategy: the normalized strategy name.
        forced_account: the cleaned account state file for "fixed", else None.
        use_account_pool: whether to pick from the available account files.
        prefer_root_state: whether the root state file takes priority.
    """
    mode = normalize_account_strategy(strategy, account_state_file)
    pinned_account = clean_account_state_file(account_state_file)
    pool_available = len(available_account_files) > 0

    if mode == "fixed":
        forced, use_pool, prefer_root = pinned_account, False, False
    elif mode == "rotate":
        forced, use_pool, prefer_root = None, pool_available, False
    else:
        # "auto": the root state file wins whenever it exists; the pool is
        # only a fallback for when it does not.
        forced = None
        use_pool = (not has_root_state_file) and pool_available
        prefer_root = has_root_state_file

    return {
        "strategy": mode,
        "forced_account": forced,
        "use_account_pool": use_pool,
        "prefer_root_state": prefer_root,
    }
def add_json_response_format(
    request_params: Dict[str, Any],
    enabled: bool,
) -> Dict[str, Any]:
    """Return a copy of the params, adding Chat Completions JSON output if enabled.

    The input mapping is never mutated. When ``enabled`` is false the copy is
    returned unchanged.
    """
    next_params = dict(request_params)
    if enabled:
        next_params["response_format"] = {"type": JSON_OUTPUT_TYPE}
    return next_params


def is_json_output_unsupported_error(error: Exception) -> bool:
    """Tell whether an error means the model rejects structured JSON output.

    The message must contain "not supported" and at least one entry of
    UNSUPPORTED_JSON_OUTPUT_MARKERS. Both checks are case-insensitive, which
    matches how the other error detectors in this module compare messages.
    """
    lowered = str(error).lower()
    if "not supported" not in lowered:
        return False
    # Compare against the lowercased message: the markers are lowercase, so
    # matching them against the raw text missed differently-cased errors.
    return any(marker.lower() in lowered for marker in UNSUPPORTED_JSON_OUTPUT_MARKERS)


def is_responses_api_unsupported_error(error: Exception) -> bool:
    """Tell whether an OpenAI-compatible service lacks the Responses API."""
    return _is_api_unsupported_error(error, RESPONSES_API_UNSUPPORTED_MARKERS)


def is_chat_completions_api_unsupported_error(error: Exception) -> bool:
    """Tell whether an OpenAI-compatible service lacks the Chat Completions API."""
    return _is_api_unsupported_error(error, CHAT_COMPLETIONS_API_UNSUPPORTED_MARKERS)
def _pick_create_endpoint(client: Any, api_mode: str) -> Any:
    """Return the client's ``create`` callable for the given API mode.

    Raises:
        ValueError: if ``api_mode`` is neither of the two known modes.
    """
    if api_mode == RESPONSES_API_MODE:
        return client.responses.create
    if api_mode == CHAT_COMPLETIONS_API_MODE:
        return client.chat.completions.create
    raise ValueError(f"不支持的 AI API 模式: {api_mode}")


async def create_ai_response_async(
    client: Any,
    api_mode: str,
    request_params: Dict[str, Any],
) -> Any:
    """Send the request asynchronously through the endpoint for ``api_mode``.

    Raises:
        ValueError: if ``api_mode`` is not a supported mode.
    """
    create = _pick_create_endpoint(client, api_mode)
    return await create(**request_params)


def create_ai_response_sync(
    client: Any,
    api_mode: str,
    request_params: Dict[str, Any],
) -> Any:
    """Send the request synchronously through the endpoint for ``api_mode``.

    Raises:
        ValueError: if ``api_mode`` is not a supported mode.
    """
    create = _pick_create_endpoint(client, api_mode)
    return create(**request_params)


def is_temperature_unsupported_error(error: Exception) -> bool:
    """Tell whether an error means the ``temperature`` parameter was rejected.

    The lowercased message must contain a rejection hint and also one of
    UNSUPPORTED_TEMPERATURE_MARKERS.
    """
    lowered = str(error).lower()
    rejection_hints = ("not supported", "unsupported", "invalid", "参数错误")
    if not any(hint in lowered for hint in rejection_hints):
        return False
    return any(marker in lowered for marker in UNSUPPORTED_TEMPERATURE_MARKERS)


def remove_temperature_param(request_params: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of the params without ``temperature``; input is untouched."""
    return {
        name: value
        for name, value in request_params.items()
        if name != "temperature"
    }


def _is_api_unsupported_error(
    error: Exception,
    markers: tuple[str, ...],
) -> bool:
    """Tell whether an error indicates the whole API endpoint is missing.

    True when the lowercased message contains any marker. Otherwise true only
    for a bare 404: status code 404, a message that is exactly
    "error code: 404", and no body or response text.
    """
    text = str(error).lower()
    for marker in markers:
        if marker in text:
            return True

    status_code = getattr(error, "status_code", None)
    body = getattr(error, "body", None)
    response = getattr(error, "response", None)
    response_text = getattr(response, "text", None) if response else None

    bare_404 = status_code == 404 and text.strip() == "error code: 404"
    return bare_404 and not (body or response_text)
class EmptyAIResponseError(ValueError):
    """Raised when the AI returned no usable content."""


def extract_ai_response_content(response: Any) -> str:
    """Pull the text out of an AI response, whatever shape it arrives in.

    Accepted shapes, checked in order: bytes/bytearray (decoded as UTF-8 with
    replacement), str, an object with a string ``output_text`` attribute, and
    an object with non-empty ``choices`` whose first entry has a ``message``.

    Raises:
        EmptyAIResponseError: the response is None, the first choice has no
            message, or the extracted text is blank.
        ValueError: the response matches none of the accepted shapes.
    """
    if response is None:
        raise EmptyAIResponseError("AI响应对象为空。")

    if isinstance(response, (bytes, bytearray)):
        return _normalize_text_content(response.decode("utf-8", errors="replace"))
    if isinstance(response, str):
        return _normalize_text_content(response)

    direct_text = getattr(response, "output_text", None)
    if isinstance(direct_text, str):
        return _normalize_text_content(direct_text)

    choices = getattr(response, "choices", None)
    if not choices:
        raise ValueError(f"无法识别的AI响应类型: {type(response).__name__}")

    first_message = getattr(choices[0], "message", None)
    if first_message is None:
        raise EmptyAIResponseError("AI响应缺少 message。")
    raw_content = getattr(first_message, "content", None)
    return _normalize_text_content(_coerce_content_parts(raw_content))


def parse_ai_response_json(content: str) -> dict:
    """Parse the JSON carried in an AI text response.

    Code fences are stripped first. If the remainder is not valid JSON, the
    first decodable object or array embedded in it is returned instead.

    Raises:
        json.JSONDecodeError: no JSON value could be decoded.
    """
    payload = _strip_code_fences(content)
    try:
        return json.loads(payload)
    except json.JSONDecodeError as parse_error:
        return _extract_first_json_value(payload, parse_error)


def _coerce_content_parts(content: Any) -> str:
    """Flatten message content into one string.

    None becomes "", str is returned as is, bytes are decoded as UTF-8 with
    replacement. A list is joined from its string items plus the string
    ``text`` of its dict items and attribute-bearing items; anything else in
    the list is ignored.

    Raises:
        ValueError: content is of any other type.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, (bytes, bytearray)):
        return content.decode("utf-8", errors="replace")
    if not isinstance(content, list):
        raise ValueError(f"AI响应内容类型不受支持: {type(content).__name__}")

    def _text_of(part: Any) -> Any:
        # Plain strings are their own text; dicts expose it under the "text"
        # key, other objects under a ``text`` attribute.
        if isinstance(part, str):
            return part
        if isinstance(part, dict):
            return part.get("text")
        return getattr(part, "text", None)

    texts = (_text_of(part) for part in content)
    return "".join(text for text in texts if isinstance(text, str))


def _normalize_text_content(content: str) -> str:
    """Strip the text and reject it when nothing is left.

    Raises:
        EmptyAIResponseError: the stripped text is empty.
    """
    stripped = str(content).strip()
    if stripped:
        return stripped
    raise EmptyAIResponseError("AI响应内容为空。")


def _strip_code_fences(content: str) -> str:
    """Remove a leading ```json / ``` fence and a trailing ``` fence."""
    text = content.strip()
    # Both prefixes are tried in turn, the longer one first.
    for opening_fence in ("```json", "```"):
        if text.startswith(opening_fence):
            text = text[len(opening_fence):]
    if text.endswith("```"):
        text = text[: -len("```")]
    return text.strip()


def _extract_first_json_value(
    content: str,
    fallback_error: json.JSONDecodeError,
) -> Any:
    """Return the first JSON object or array that decodes from within content.

    Decoding is attempted at every "{" or "[" in order. If every attempt
    fails the last decode error is raised; if there was no bracket at all,
    ``fallback_error`` is raised.
    """
    decoder = json.JSONDecoder()
    most_recent_failure = None
    candidate_starts = (
        position for position, symbol in enumerate(content) if symbol in "{["
    )
    for position in candidate_starts:
        try:
            value, _consumed = decoder.raw_decode(content[position:])
        except json.JSONDecodeError as decode_error:
            most_recent_failure = decode_error
        else:
            return value
    raise fallback_error if most_recent_failure is None else most_recent_failure
class AIAnalysisService:
    """AI analysis service wrapping an AI client with result validation."""

    def __init__(self, ai_client: "AIClient"):
        self.ai_client = ai_client

    async def analyze_product(
        self,
        product_data: Dict,
        image_paths: List[str],
        prompt_text: str
    ) -> Optional[Dict]:
        """
        Analyze one product with the AI client.

        Args:
            product_data: product data
            image_paths: list of image paths
            prompt_text: analysis prompt

        Returns:
            The analysis result, or None when the client is unavailable, the
            result fails validation, or the client raises.
        """
        if not self.ai_client.is_available():
            print("AI 客户端不可用,跳过分析")
            return None

        try:
            outcome = await self.ai_client.analyze(product_data, image_paths, prompt_text)
            if not outcome or not self._validate_result(outcome):
                print("AI 分析结果验证失败")
                return None
            return outcome
        except Exception as exc:
            # Best effort: any failure is reported and turned into None.
            print(f"AI 分析服务出错: {exc}")
            return None

    def _validate_result(self, result: Dict) -> bool:
        """Check that an AI analysis result has the expected fields and types."""
        required_fields = (
            "prompt_version",
            "is_recommended",
            "reason",
            "risk_tags",
            "criteria_analysis",
        )

        # Report only the first missing required field.
        missing = next((name for name in required_fields if name not in result), None)
        if missing is not None:
            print(f"AI 响应缺少必需字段: {missing}")
            return False

        # Each entry: field name, required type, message printed on mismatch.
        type_rules = (
            ("is_recommended", bool, "is_recommended 字段不是布尔类型"),
            ("risk_tags", list, "risk_tags 字段不是列表类型"),
        )
        for name, expected_type, complaint in type_rules:
            if not isinstance(result.get(name), expected_type):
                print(complaint)
                return False

        criteria = result.get("criteria_analysis", {})
        if not (isinstance(criteria, dict) and criteria):
            print("criteria_analysis 必须是非空字典")
            return False

        return True
def normalize_text(value: str | None) -> str:
    """Return ``value`` stripped and lower-cased; ``None`` becomes ``""``."""
    return (value or "").strip().lower()


def parse_timestamp(value: str | None) -> datetime | None:
    """Parse an ISO-8601-like timestamp string.

    The stripped text is tried as-is, then with ``Z`` replaced by ``+00:00``,
    then with spaces replaced by ``T``.

    Returns:
        The parsed ``datetime``, or ``None`` when ``value`` is empty or none of
        the variants can be parsed.
    """
    if not value:
        return None
    normalized = value.strip()
    candidates = (
        normalized,
        normalized.replace("Z", "+00:00"),
        normalized.replace(" ", "T"),
    )
    for candidate in candidates:
        try:
            return datetime.fromisoformat(candidate)
        except ValueError:
            continue
    return None


def serialize_timestamp(value: datetime | None) -> str | None:
    """Return the ISO string of ``value``, or ``None`` when it is falsy."""
    return value.isoformat() if value else None


def build_empty_summary(task: Task) -> dict[str, Any]:
    """Build a dashboard summary for ``task`` with all result counters at zero."""
    return {
        "task_id": task.id,
        "task_name": task.task_name,
        "keyword": task.keyword,
        "filename": None,
        "enabled": task.enabled,
        "is_running": task.is_running,
        "account_strategy": task.account_strategy,
        "cron": task.cron,
        "region": task.region,
        "total_items": 0,
        "recommended_items": 0,
        "ai_recommended_items": 0,
        "keyword_recommended_items": 0,
        "latest_crawl_time": None,
        "latest_recommended_title": None,
        "latest_recommended_price": None,
    }


def build_activity(
    *,
    activity_id: str,
    activity_type: str,
    task_name: str,
    keyword: str,
    title: str,
    status: str,
    timestamp: datetime | None,
    detail: str | None = None,
    filename: str | None = None,
) -> dict[str, Any]:
    """Assemble one activity entry; ``timestamp`` is stored as an ISO string."""
    return {
        "id": activity_id,
        "type": activity_type,
        "task_name": task_name,
        "keyword": keyword,
        "title": title,
        "status": status,
        "detail": detail,
        "filename": filename,
        "timestamp": serialize_timestamp(timestamp),
    }


def sort_key_by_latest_time(item: dict[str, Any]) -> tuple[float, str]:
    """Sort key: (epoch seconds of ``latest_crawl_time`` or 0.0, task name)."""
    timestamp = parse_timestamp(item.get("latest_crawl_time"))
    return (timestamp.timestamp() if timestamp else 0.0, item.get("task_name", ""))


def sort_key_by_activity_time(item: dict[str, Any]) -> tuple[float, str]:
    """Sort key: (epoch seconds of ``timestamp`` or 0.0, activity id)."""
    timestamp = parse_timestamp(item.get("timestamp"))
    return (timestamp.timestamp() if timestamp else 0.0, item.get("id", ""))


def _build_fallback_summary(task_name: str, keyword: str) -> dict[str, Any]:
    """Build a zeroed summary for a result file that matches no known task."""
    return {
        "task_id": None,
        "task_name": task_name,
        "keyword": keyword,
        "filename": None,
        "enabled": False,
        "is_running": False,
        "account_strategy": "auto",
        "cron": None,
        "region": None,
        "total_items": 0,
        "recommended_items": 0,
        "ai_recommended_items": 0,
        "keyword_recommended_items": 0,
        "latest_crawl_time": None,
        "latest_recommended_title": None,
        "latest_recommended_price": None,
    }


def _resolve_task(
    task_lookup: dict[str, Task],
    latest_record: dict[str, Any] | None,
    keyword: str,
) -> Task | None:
    """Find the task owning a result file.

    The normalized keyword is looked up first; if that misses and a latest
    record exists, the record's "任务名称" is matched against task names.
    """
    task = task_lookup.get(normalize_text(keyword))
    if task is not None or latest_record is None:
        return task
    fallback_name = str(latest_record.get("任务名称") or "")
    return next(
        (candidate for candidate in task_lookup.values() if candidate.task_name == fallback_name),
        None,
    )


def _collect_record_metrics(records: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate crawl/recommendation statistics over result records.

    Returns a dict with the latest crawl time and its record, the latest
    recommended record, and recommendation counters (total, AI, keyword).

    A recommended record replaces the current "latest recommendation" when it
    is the first one seen, or when it has a crawl time and the current one has
    none or an older one. (Previously a first recommendation without a
    timestamp could never be replaced by a later, timestamped one.)
    """
    latest_crawl_time: datetime | None = None
    latest_record: dict[str, Any] | None = None
    latest_recommendation: dict[str, Any] | None = None
    latest_recommendation_time: datetime | None = None
    recommended_items = 0
    ai_recommended_items = 0
    keyword_recommended_items = 0

    for record in records:
        crawl_time = parse_timestamp(record.get("爬取时间"))
        if crawl_time and (latest_crawl_time is None or crawl_time > latest_crawl_time):
            latest_crawl_time = crawl_time
            latest_record = record

        analysis = record.get("ai_analysis", {}) or {}
        if analysis.get("is_recommended") is not True:
            continue

        recommended_items += 1
        source = analysis.get("analysis_source")
        if source == "ai":
            ai_recommended_items += 1
        elif source == "keyword":
            keyword_recommended_items += 1

        is_newer = crawl_time is not None and (
            latest_recommendation_time is None
            or crawl_time > latest_recommendation_time
        )
        if latest_recommendation is None or is_newer:
            latest_recommendation = record
            latest_recommendation_time = crawl_time

    return {
        "latest_crawl_time": latest_crawl_time,
        "latest_record": latest_record,
        "latest_recommendation": latest_recommendation,
        "recommended_items": recommended_items,
        "ai_recommended_items": ai_recommended_items,
        "keyword_recommended_items": keyword_recommended_items,
    }


def _build_recommendation_activity(
    *,
    filename: str,
    task_name: str,
    keyword: str,
    latest_recommendation: dict[str, Any] | None,
) -> tuple[dict[str, Any] | None, str | None, float | None]:
    """Build the "recommendation" activity for a result file.

    Returns:
        ``(activity, title, price)``; all three are ``None`` when there is no
        recommended record.
    """
    if not latest_recommendation:
        return None, None, None

    product = latest_recommendation.get("商品信息", {}) or {}
    analysis = latest_recommendation.get("ai_analysis", {}) or {}
    title = str(product.get("商品标题") or "发现推荐商品")
    price = parse_price_value(product.get("当前售价"))
    status = "AI 推荐" if analysis.get("analysis_source") == "ai" else "关键词命中"
    detail = f"当前价 ¥{price:.0f}" if isinstance(price, (int, float)) else None
    activity = build_activity(
        activity_id=f"{filename}:recommended",
        activity_type="recommendation",
        task_name=task_name,
        keyword=keyword,
        title=title,
        status=status,
        timestamp=parse_timestamp(latest_recommendation.get("爬取时间")),
        detail=detail,
        filename=filename,
    )
    return activity, title, price


def _build_scan_activity(
    *,
    filename: str,
    task_name: str,
    keyword: str,
    latest_record: dict[str, Any] | None,
    total_items: int,
) -> dict[str, Any] | None:
    """Build the "scan" activity for a result file, or ``None`` without a record."""
    if not latest_record:
        return None

    product = latest_record.get("商品信息", {}) or {}
    title = str(product.get("商品标题") or task_name)
    return build_activity(
        activity_id=f"{filename}:scan",
        activity_type="scan",
        task_name=task_name,
        keyword=keyword,
        title=title,
        status="结果已更新",
        timestamp=parse_timestamp(latest_record.get("爬取时间")),
        detail=f"已累计 {total_items} 条样本",
        filename=filename,
    )


async def summarize_result_file(
    filename: str,
    task_lookup: dict[str, Task],
) -> tuple[dict[str, Any] | None, list[dict[str, Any]], datetime | None]:
    """Summarize one result file for the dashboard.

    Args:
        filename: Result file to summarize.
        task_lookup: Tasks keyed by normalized keyword.

    Returns:
        ``(summary, activities, latest_crawl_time)``; ``(None, [], None)`` when
        ``load_result_summary`` yields nothing for the file.
    """
    metrics = await load_result_summary(filename)
    if not metrics:
        return None, [], None

    latest_record = metrics["latest_record"]
    latest_crawl_time = parse_timestamp(metrics["latest_crawl_time"])
    # Prefer the keyword stored in the record; fall back to the filename.
    keyword = (
        str((latest_record or {}).get("搜索关键字") or "")
        or normalize_keyword_from_filename(filename)
    )
    task = _resolve_task(task_lookup, latest_record, keyword)
    task_name = task.task_name if task else str((latest_record or {}).get("任务名称") or keyword)
    summary = build_empty_summary(task) if task else _build_fallback_summary(task_name, keyword)

    activities: list[dict[str, Any]] = []
    recommendation, title, price = _build_recommendation_activity(
        filename=filename,
        task_name=task_name,
        keyword=keyword,
        latest_recommendation=metrics["latest_recommendation"],
    )
    if recommendation:
        activities.append(recommendation)

    scan_activity = _build_scan_activity(
        filename=filename,
        task_name=task_name,
        keyword=keyword,
        latest_record=latest_record,
        total_items=metrics["total_items"],
    )
    if scan_activity:
        activities.append(scan_activity)

    summary.update(
        {
            "filename": filename,
            "total_items": metrics["total_items"],
            "recommended_items": metrics["recommended_items"],
            "ai_recommended_items": metrics["ai_recommended_items"],
            "keyword_recommended_items": metrics["keyword_recommended_items"],
            "latest_crawl_time": serialize_timestamp(latest_crawl_time),
            "latest_recommended_title": title,
            "latest_recommended_price": price,
        }
    )
    return summary, activities, latest_crawl_time


def build_task_state_activities(tasks: list[Task]) -> list[dict[str, Any]]:
    """Build one "task" activity per running or enabled task.

    Tasks that are neither running nor enabled are skipped.
    """
    activities: list[dict[str, Any]] = []
    for task in tasks:
        if not task.is_running and not task.enabled:
            continue
        status = "运行中" if task.is_running else "已启用"
        detail = "任务正在轮询闲鱼结果" if task.is_running else "等待下一次调度执行"
        activities.append(
            build_activity(
                activity_id=f"task:{task.id}:{'running' if task.is_running else 'ready'}",
                activity_type="task",
                task_name=task.task_name,
                keyword=task.keyword,
                title=task.task_name,
                status=status,
                timestamp=None,
                detail=detail,
            )
        )
    return activities
MAX_RECENT_ACTIVITIES = 8


def _build_summary_metrics(
    tasks: list[Task],
    summary_list: list[dict[str, Any]],
    last_updated_at: Any,
) -> dict[str, Any]:
    """Compute the headline counters shown at the top of the dashboard.

    Task counters come from ``tasks``; item counters are summed over the
    per-task summaries; ``last_updated_at`` is serialized to a string.
    """

    def column_total(field: str) -> int:
        # Sum one numeric summary column across all task summaries.
        return sum(int(entry[field]) for entry in summary_list)

    enabled_count = sum(1 for task in tasks if task.enabled)
    running_count = sum(1 for task in tasks if task.is_running)
    file_count = sum(1 for entry in summary_list if entry.get("filename"))
    return {
        "enabled_tasks": enabled_count,
        "running_tasks": running_count,
        "result_files": file_count,
        "scanned_items": column_total("total_items"),
        "recommended_items": column_total("recommended_items"),
        "ai_recommended_items": column_total("ai_recommended_items"),
        "keyword_recommended_items": column_total("keyword_recommended_items"),
        "last_updated_at": serialize_timestamp(last_updated_at),
    }


async def build_dashboard_snapshot(tasks: list[Task]) -> dict[str, Any]:
    """Aggregate tasks, result files and recent activity for the home page.

    Returns a dict with ``summary`` (headline counters), ``task_summaries``
    (newest crawl first), ``recent_activities`` (newest first, capped at
    ``MAX_RECENT_ACTIVITIES``) and ``focus_file`` (first summary that has a
    result file, if any).
    """
    task_lookup = {normalize_text(task.keyword): task for task in tasks}
    task_summaries: dict[str, dict[str, Any]] = {}
    for task in tasks:
        task_summaries[task.task_name] = build_empty_summary(task)

    recent_activities = build_task_state_activities(tasks)
    latest_updated_at = None

    filenames = await list_result_filenames()
    for filename in filenames:
        summary, activities, file_latest_time = await summarize_result_file(filename, task_lookup)
        if summary:
            # A file summary supersedes the empty placeholder of its task.
            task_summaries[summary["task_name"]] = summary
        recent_activities.extend(activities)
        if not file_latest_time:
            continue
        if latest_updated_at is None or file_latest_time > latest_updated_at:
            latest_updated_at = file_latest_time

    summary_list = sorted(task_summaries.values(), key=sort_key_by_latest_time, reverse=True)

    focus_file = None
    for entry in summary_list:
        if entry.get("filename"):
            focus_file = entry["filename"]
            break

    ordered_activities = sorted(
        recent_activities,
        key=sort_key_by_activity_time,
        reverse=True,
    )
    return {
        "summary": _build_summary_metrics(tasks, summary_list, latest_updated_at),
        "task_summaries": summary_list,
        "recent_activities": ordered_activities[:MAX_RECENT_ACTIVITIES],
        "focus_file": focus_file,
    }
SellerLoader = Callable[[str], Awaitable[dict]]
ImageDownloader = Callable[[str, list[str], str], Awaitable[list[str]]]
AIAnalyzer = Callable[[dict, list[str], str], Awaitable[Optional[dict]]]
Notifier = Callable[[dict, str], Awaitable[None]]
Saver = Callable[[dict, str], Awaitable[bool]]


@dataclass(frozen=True)
class ItemAnalysisJob:
    """Immutable description of one item to analyze and persist."""

    keyword: str
    task_name: str
    decision_mode: str
    analyze_images: bool
    prompt_text: str
    keyword_rules: tuple[str, ...]
    final_record: dict
    seller_id: Optional[str]
    zhima_credit_text: Optional[str]
    registration_duration_text: str


class ItemAnalysisDispatcher:
    """Process item analysis and persistence with bounded concurrency."""

    def __init__(
        self,
        *,
        concurrency: int,
        skip_ai_analysis: bool,
        seller_loader: SellerLoader,
        image_downloader: ImageDownloader,
        ai_analyzer: AIAnalyzer,
        notifier: Notifier,
        saver: Saver,
    ) -> None:
        # At least one job is always allowed to run.
        self._semaphore = asyncio.Semaphore(max(1, concurrency))
        self._skip_ai_analysis = skip_ai_analysis
        self._seller_loader = seller_loader
        self._image_downloader = image_downloader
        self._ai_analyzer = ai_analyzer
        self._notifier = notifier
        self._saver = saver
        self._tasks: set[asyncio.Task] = set()
        self.completed_count = 0

    def submit(self, job: ItemAnalysisJob) -> None:
        """Schedule ``job`` on the running event loop and track its task."""
        task = asyncio.create_task(self._process_with_limit(job))
        self._tasks.add(task)
        # Finished tasks remove themselves so join() can terminate.
        task.add_done_callback(self._tasks.discard)

    async def join(self) -> None:
        """Wait until every submitted job, including late ones, has finished."""
        while self._tasks:
            await asyncio.gather(*tuple(self._tasks))

    async def _process_with_limit(self, job: ItemAnalysisJob) -> None:
        """Run ``job`` once a concurrency slot is available."""
        async with self._semaphore:
            await self._process_job(job)

    async def _process_job(self, job: ItemAnalysisJob) -> None:
        """Enrich, analyze, save and (if recommended) notify for one job."""
        # Work on a copy so the job's record is never mutated.
        record = copy.deepcopy(job.final_record)
        item_data = record.get("商品信息", {}) or {}
        record["卖家信息"] = await self._load_seller_info(job)
        record["ai_analysis"] = await self._build_analysis_result(job, record)
        if await self._saver(record, job.keyword):
            self.completed_count += 1
        await self._notify_if_recommended(item_data, record["ai_analysis"])

    async def _load_seller_info(self, job: ItemAnalysisJob) -> dict:
        """Load seller info (best effort) and merge the job's credit fields."""
        seller_info = {}
        if job.seller_id:
            try:
                seller_info = await self._seller_loader(job.seller_id)
            except Exception as exc:
                # A failed seller lookup must not abort the item.
                print(f" [卖家] 采集卖家 {job.seller_id} 信息失败: {exc}")
        merged = copy.deepcopy(seller_info or {})
        merged["卖家芝麻信用"] = job.zhima_credit_text
        merged["卖家注册时长"] = job.registration_duration_text
        return merged

    async def _build_analysis_result(self, job: ItemAnalysisJob, record: dict) -> dict:
        """Pick keyword rules, the skip-AI shortcut, or real AI analysis."""
        if job.decision_mode == "keyword":
            return self._build_keyword_result(job, record)
        if self._skip_ai_analysis:
            return self._build_skip_ai_result()
        return await self._run_ai_analysis(job, record)

    def _build_keyword_result(self, job: ItemAnalysisJob, record: dict) -> dict:
        """Evaluate the job's keyword rules against the record's search text."""
        search_text = build_search_text(record)
        return evaluate_keyword_rules(list(job.keyword_rules), search_text)

    def _build_skip_ai_result(self) -> dict:
        """Result used when AI analysis is disabled: always recommended."""
        return {
            "analysis_source": "ai",
            "is_recommended": True,
            "reason": "商品已跳过AI分析,直接通知",
            "keyword_hit_count": 0,
        }

    def _build_ai_error_result(self, reason: str, *, error: str = "") -> dict:
        """Build a non-recommended AI result, optionally carrying ``error``."""
        payload = {
            "analysis_source": "ai",
            "is_recommended": False,
            "reason": reason,
            "keyword_hit_count": 0,
        }
        if error:
            payload["error"] = error
        return payload

    async def _run_ai_analysis(self, job: ItemAnalysisJob, record: dict) -> dict:
        """Download images, call the AI analyzer and normalize its result.

        Any exception is converted into a non-recommended error result, and
        downloaded images are always removed afterwards.
        """
        # Check the prompt before downloading: without a prompt no analysis
        # runs, so fetching (and then deleting) images would be wasted work.
        if not job.prompt_text:
            return self._build_ai_error_result("任务未配置AI prompt,跳过分析。")

        image_paths: list[str] = []
        try:
            image_paths = await self._download_images(job, record)
            ai_result = await self._ai_analyzer(record, image_paths, job.prompt_text)
            if not ai_result:
                return self._build_ai_error_result(
                    "AI analysis returned None after retries.",
                    error="AI analysis returned None after retries.",
                )
            ai_result.setdefault("analysis_source", "ai")
            ai_result.setdefault("keyword_hit_count", 0)
            return ai_result
        except Exception as exc:
            return self._build_ai_error_result(
                f"AI分析异常: {exc}",
                error=str(exc),
            )
        finally:
            self._cleanup_images(image_paths)

    async def _download_images(self, job: ItemAnalysisJob, record: dict) -> list[str]:
        """Download the item's images when image analysis is enabled."""
        if not job.analyze_images:
            return []
        item_data = record.get("商品信息", {}) or {}
        image_urls = item_data.get("商品图片列表", [])
        if not image_urls:
            return []
        return await self._image_downloader(
            item_data["商品ID"],
            image_urls,
            job.task_name,
        )

    def _cleanup_images(self, image_paths: list[str]) -> None:
        """Delete downloaded image files, logging (not raising) failures."""
        for img_path in image_paths:
            try:
                if os.path.exists(img_path):
                    os.remove(img_path)
            except Exception as exc:
                print(f" [图片] 删除图片文件时出错: {exc}")

    async def _notify_if_recommended(self, item_data: dict, analysis_result: dict) -> None:
        """Send a notification for recommended items; failures are only logged."""
        if not analysis_result.get("is_recommended"):
            return
        try:
            await self._notifier(item_data, analysis_result.get("reason", "无"))
        except Exception as exc:
            print(f" [通知] 发送推荐通知失败: {exc}")
"webhook_content_type", "WEBHOOK_QUERY_PARAMETERS": "webhook_query_parameters", "WEBHOOK_BODY": "webhook_body", "PCURL_TO_MOBILE": "pcurl_to_mobile", } SECRET_NOTIFICATION_FIELDS = { "BARK_URL", "GOTIFY_TOKEN", "WX_BOT_URL", "TELEGRAM_BOT_TOKEN", "WEBHOOK_URL", "WEBHOOK_HEADERS", } JSON_NOTIFICATION_FIELDS = { "WEBHOOK_HEADERS": True, "WEBHOOK_QUERY_PARAMETERS": True, "WEBHOOK_BODY": False, } URL_FIELDS = { "NTFY_TOPIC_URL", "GOTIFY_URL", "BARK_URL", "WX_BOT_URL", "TELEGRAM_API_BASE_URL", "WEBHOOK_URL", } ALLOWED_WEBHOOK_METHODS = {"GET", "POST"} ALLOWED_WEBHOOK_CONTENT_TYPES = {"JSON", "FORM"} class NotificationSettingsValidationError(ValueError): """通知配置校验错误""" def model_dump(model, *, exclude_unset: bool = False) -> dict: if hasattr(model, "model_dump"): return model.model_dump(exclude_unset=exclude_unset) return model.dict(exclude_unset=exclude_unset) def build_notification_settings_response( settings: NotificationSettings | None = None, ) -> dict: notification_settings = settings or load_notification_settings() response = { "NTFY_TOPIC_URL": notification_settings.ntfy_topic_url or "", "GOTIFY_URL": notification_settings.gotify_url or "", "GOTIFY_TOKEN": "", "BARK_URL": "", "WX_BOT_URL": "", "TELEGRAM_BOT_TOKEN": "", "TELEGRAM_CHAT_ID": notification_settings.telegram_chat_id or "", "TELEGRAM_API_BASE_URL": ( notification_settings.telegram_api_base_url or DEFAULT_TELEGRAM_API_BASE_URL ), "WEBHOOK_URL": "", "WEBHOOK_METHOD": notification_settings.webhook_method, "WEBHOOK_HEADERS": "", "WEBHOOK_CONTENT_TYPE": notification_settings.webhook_content_type, "WEBHOOK_QUERY_PARAMETERS": notification_settings.webhook_query_parameters or "", "WEBHOOK_BODY": notification_settings.webhook_body or "", "PCURL_TO_MOBILE": notification_settings.pcurl_to_mobile, } for field in SECRET_NOTIFICATION_FIELDS: attr_name = NOTIFICATION_FIELD_MAP[field] response[f"{field}_SET"] = bool(getattr(notification_settings, attr_name)) response["CONFIGURED_CHANNELS"] = 
build_configured_channels(notification_settings) return response def build_notification_status_flags( settings: NotificationSettings | None = None, ) -> dict: notification_settings = settings or load_notification_settings() return { "ntfy_topic_url_set": bool(notification_settings.ntfy_topic_url), "gotify_url_set": bool(notification_settings.gotify_url), "gotify_token_set": bool(notification_settings.gotify_token), "bark_url_set": bool(notification_settings.bark_url), "wx_bot_url_set": bool(notification_settings.wx_bot_url), "telegram_bot_token_set": bool(notification_settings.telegram_bot_token), "telegram_chat_id_set": bool(notification_settings.telegram_chat_id), "webhook_url_set": bool(notification_settings.webhook_url), "webhook_headers_set": bool(notification_settings.webhook_headers), } def build_configured_channels( settings: NotificationSettings | None = None, ) -> list[str]: notification_settings = settings or load_notification_settings() channels = [] if notification_settings.ntfy_topic_url: channels.append("ntfy") if notification_settings.bark_url: channels.append("bark") if notification_settings.gotify_url and notification_settings.gotify_token: channels.append("gotify") if notification_settings.wx_bot_url: channels.append("wecom") if notification_settings.telegram_bot_token and notification_settings.telegram_chat_id: channels.append("telegram") if notification_settings.webhook_url: channels.append("webhook") return channels def prepare_notification_settings_update( patch_payload: dict, existing_settings: NotificationSettings | None = None, ) -> tuple[dict[str, str], list[str], NotificationSettings]: current_settings = existing_settings or load_notification_settings() merged_values = _notification_settings_to_values(current_settings) for env_name, raw_value in patch_payload.items(): attr_name = NOTIFICATION_FIELD_MAP.get(env_name) if attr_name is None: continue merged_values[attr_name] = _normalize_patch_value(env_name, raw_value) normalized_values = 
_normalize_notification_values(merged_values) candidate_settings = _build_notification_settings_model(normalized_values) _validate_notification_settings(candidate_settings) updates = {} deletions = [] for env_name, raw_value in patch_payload.items(): attr_name = NOTIFICATION_FIELD_MAP.get(env_name) if attr_name is None: continue value = normalized_values[attr_name] if isinstance(value, bool): updates[env_name] = "true" if value else "false" continue if value is None: deletions.append(env_name) continue updates[env_name] = value return updates, deletions, candidate_settings def _notification_settings_to_values(settings: NotificationSettings) -> dict: return { attr_name: getattr(settings, attr_name) for attr_name in NOTIFICATION_FIELD_MAP.values() } def load_notification_settings() -> NotificationSettings: return _build_notification_settings_model( { "ntfy_topic_url": _normalize_existing_text(env_manager.get_value("NTFY_TOPIC_URL")), "gotify_url": _normalize_existing_text(env_manager.get_value("GOTIFY_URL")), "gotify_token": _normalize_existing_text(env_manager.get_value("GOTIFY_TOKEN")), "bark_url": _normalize_existing_text(env_manager.get_value("BARK_URL")), "wx_bot_url": _normalize_existing_text(env_manager.get_value("WX_BOT_URL")), "telegram_bot_token": _normalize_existing_text(env_manager.get_value("TELEGRAM_BOT_TOKEN")), "telegram_chat_id": _normalize_existing_text(env_manager.get_value("TELEGRAM_CHAT_ID")), "telegram_api_base_url": ( _normalize_existing_text(env_manager.get_value("TELEGRAM_API_BASE_URL")) or DEFAULT_TELEGRAM_API_BASE_URL ), "webhook_url": _normalize_existing_text(env_manager.get_value("WEBHOOK_URL")), "webhook_method": _normalize_existing_text(env_manager.get_value("WEBHOOK_METHOD")) or "POST", "webhook_headers": _normalize_existing_text(env_manager.get_value("WEBHOOK_HEADERS")), "webhook_content_type": _normalize_existing_text(env_manager.get_value("WEBHOOK_CONTENT_TYPE")) or "JSON", "webhook_query_parameters": 
_normalize_existing_text(env_manager.get_value("WEBHOOK_QUERY_PARAMETERS")), "webhook_body": _normalize_existing_text(env_manager.get_value("WEBHOOK_BODY")), "pcurl_to_mobile": _env_bool(env_manager.get_value("PCURL_TO_MOBILE"), True), } ) def _build_notification_settings_model(values: dict) -> NotificationSettings: if hasattr(NotificationSettings, "model_construct"): return NotificationSettings.model_construct(**values) return NotificationSettings.construct(**values) def _normalize_patch_value(env_name: str, value): if env_name == "PCURL_TO_MOBILE": return bool(value) if value is None: return None text = str(value).strip() return text or None def _normalize_existing_text(value: str | None) -> str | None: if value is None: return None text = str(value).strip() return text or None def _env_bool(value: str | None, default: bool) -> bool: if value is None: return default return str(value).strip().lower() in {"1", "true", "yes", "y", "on"} def _normalize_notification_values(values: dict) -> dict: normalized = dict(values) normalized["webhook_method"] = ( (normalized.get("webhook_method") or "POST").strip().upper() ) normalized["webhook_content_type"] = ( (normalized.get("webhook_content_type") or "JSON").strip().upper() ) for env_name, expect_dict in JSON_NOTIFICATION_FIELDS.items(): attr_name = NOTIFICATION_FIELD_MAP[env_name] raw_value = normalized.get(attr_name) if raw_value is None: continue parsed = _parse_json_field(env_name, raw_value, expect_dict=expect_dict) normalized[attr_name] = json.dumps( parsed, ensure_ascii=False, separators=(",", ":"), ) return normalized def _validate_notification_settings(settings: NotificationSettings) -> None: for field_name in URL_FIELDS: value = getattr(settings, NOTIFICATION_FIELD_MAP[field_name]) if value is not None: _validate_http_url(field_name, value) _validate_pair( "GOTIFY_URL", settings.gotify_url, "GOTIFY_TOKEN", settings.gotify_token, ) _validate_pair( "TELEGRAM_BOT_TOKEN", settings.telegram_bot_token, 
"TELEGRAM_CHAT_ID", settings.telegram_chat_id, ) if settings.webhook_method not in ALLOWED_WEBHOOK_METHODS: allowed = ", ".join(sorted(ALLOWED_WEBHOOK_METHODS)) raise NotificationSettingsValidationError( f"WEBHOOK_METHOD 仅支持: {allowed}" ) if settings.webhook_content_type not in ALLOWED_WEBHOOK_CONTENT_TYPES: allowed = ", ".join(sorted(ALLOWED_WEBHOOK_CONTENT_TYPES)) raise NotificationSettingsValidationError( f"WEBHOOK_CONTENT_TYPE 仅支持: {allowed}" ) has_webhook_extras = any( [ settings.webhook_headers, settings.webhook_query_parameters, settings.webhook_body, ] ) if has_webhook_extras and not settings.webhook_url: raise NotificationSettingsValidationError( "配置 Webhook 高级参数前必须先填写 WEBHOOK_URL" ) if settings.webhook_content_type == "FORM" and settings.webhook_body: parsed_body = json.loads(settings.webhook_body) if not isinstance(parsed_body, dict): raise NotificationSettingsValidationError( "WEBHOOK_BODY 在 FORM 模式下必须是 JSON 对象" ) def _validate_http_url(field_name: str, value: str) -> None: parsed = urlparse(value) if parsed.scheme not in {"http", "https"} or not parsed.netloc: raise NotificationSettingsValidationError( f"{field_name} 必须是合法的 HTTP/HTTPS URL" ) def _validate_pair( left_name: str, left_value: str | None, right_name: str, right_value: str | None, ) -> None: if bool(left_value) == bool(right_value): return raise NotificationSettingsValidationError( f"{left_name} 与 {right_name} 必须成对配置" ) def _parse_json_field( field_name: str, raw_value: str, expect_dict: bool, ): try: parsed = json.loads(raw_value) except json.JSONDecodeError as exc: raise NotificationSettingsValidationError( f"{field_name} 不是合法 JSON: {exc.msg}" ) from exc if expect_dict and not isinstance(parsed, dict): raise NotificationSettingsValidationError( f"{field_name} 必须是 JSON 对象" ) return parsed ================================================ FILE: src/services/notification_service.py ================================================ """ 通知服务 统一管理所有通知渠道 """ import asyncio from typing import Dict, 
class NotificationService:
    """Fan a notification out to every enabled notification client."""

    def __init__(self, clients: List[NotificationClient]):
        # Disabled clients are dropped once, at construction time.
        enabled_clients = []
        for candidate in clients:
            if candidate.is_enabled():
                enabled_clients.append(candidate)
        self.clients = enabled_clients

    async def send_notification(
        self,
        product_data: Dict,
        reason: str,
    ) -> Dict[str, Dict[str, str | bool]]:
        """Send one notification through all enabled channels concurrently.

        Returns:
            Per-channel results keyed by channel key, each holding the channel
            label, a success flag and a message. Empty when no client is
            enabled.
        """
        if not self.clients:
            return {}

        pending = [
            self._send_with_result(client, product_data, reason)
            for client in self.clients
        ]
        outcomes = await asyncio.gather(*pending)

        by_channel = {}
        for outcome in outcomes:
            by_channel[outcome["channel"]] = outcome
        return by_channel

    async def send_test_notification(self) -> Dict[str, Dict[str, str | bool]]:
        """Send a fixed test product through all enabled channels."""
        test_product = {
            "商品标题": "[测试通知] 闲鱼智能监控",
            "当前售价": "0",
            "商品链接": "https://www.goofish.com/",
        }
        test_reason = "这是一条测试通知,用于验证推送渠道是否可用。"
        return await self.send_notification(test_product, test_reason)

    async def _send_with_result(
        self,
        client: NotificationClient,
        product_data: Dict,
        reason: str,
    ) -> Dict[str, str | bool]:
        """Send through one client and report success or the error text."""
        try:
            await client.send(product_data, reason)
        except Exception as exc:
            succeeded = False
            message = str(exc)
        else:
            succeeded = True
            message = "发送成功"
        return {
            "channel": client.channel_key,
            "label": client.display_name,
            "success": succeeded,
            "message": message,
        }


def build_notification_service(
    settings: NotificationSettings | None = None,
) -> NotificationService:
    """Create a service from ``settings``, loading them when not provided."""
    notification_settings = settings or load_notification_settings()
    clients = build_notification_clients(notification_settings)
    return NotificationService(clients)
def parse_price_value(value: Any) -> Optional[float]:
    """Convert a raw price into a float rounded to two decimals.

    Args:
        value: A number, or display text such as ``"¥1,299"`` or ``"1.5万"``.
            The currency sign and thousands separators are stripped, and a
            trailing ``万`` multiplies the number by 10,000.

    Returns:
        The parsed price, or ``None`` when the value is missing, is one of
        the known placeholder texts, or cannot be parsed as a number.
    """
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return round(float(value), 2)

    text = str(value).strip().replace("¥", "").replace(",", "")
    if not text or text in {"价格异常", "暂无", "-", "N/A"}:
        return None

    multiplier = 1.0
    if text.endswith("万"):
        text = text[:-1]
        multiplier = 10000.0

    # The numeric conversion for the "万" form happens inside the try block as
    # well, so malformed text such as "abc万" yields None instead of raising.
    try:
        return round(float(text) * multiplier, 2)
    except (TypeError, ValueError):
        return None
def record_market_snapshots(
    *,
    keyword: str,
    task_name: str,
    items: Iterable[dict],
    run_id: str,
    snapshot_time: Optional[str] = None,
    seen_item_ids: Optional[set[str]] = None,
) -> list[dict]:
    """Persist one price snapshot per new item and return the stored records.

    Items that cannot be turned into a snapshot record, or whose id is already
    in ``seen_item_ids``, are skipped. ``seen_item_ids`` is updated in place
    when provided, so a caller can share it across several calls.

    Returns:
        The snapshot records built in this call (empty when nothing new).
    """
    captured_at = _safe_iso_datetime(snapshot_time)
    known_ids = set() if seen_item_ids is None else seen_item_ids

    fresh_records: list[dict] = []
    for raw_item in items:
        candidate = _build_snapshot_record(
            keyword=keyword,
            task_name=task_name,
            item=raw_item,
            run_id=run_id,
            snapshot_time=captured_at,
        )
        if candidate is None:
            continue
        candidate_id = candidate["item_id"]
        if candidate_id in known_ids:
            continue
        known_ids.add(candidate_id)
        fresh_records.append(candidate)

    if not fresh_records:
        return []

    bootstrap_sqlite_storage()
    slug = normalize_keyword_slug(keyword)
    insert_sql = """
        INSERT OR IGNORE INTO price_snapshots (
            keyword_slug, keyword, task_name, snapshot_time, snapshot_day,
            run_id, item_id, title, price, price_display, tags_json,
            region, seller, publish_time, link
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
    with sqlite_connection() as conn:
        for entry in fresh_records:
            row = (
                slug,
                entry.get("keyword", keyword),
                entry.get("task_name", task_name),
                entry.get("snapshot_time", captured_at),
                entry.get("snapshot_day", _to_day(captured_at)),
                entry.get("run_id", run_id),
                entry.get("item_id", ""),
                entry.get("title", ""),
                entry.get("price"),
                entry.get("price_display", ""),
                json.dumps(entry.get("tags") or [], ensure_ascii=False),
                entry.get("region", ""),
                entry.get("seller", ""),
                entry.get("publish_time", ""),
                entry.get("link", ""),
            )
            conn.execute(insert_sql, row)
        conn.commit()
    return fresh_records
def _summarize_prices(records: Iterable[dict]) -> dict:
    """Aggregate the ``price`` field of ``records`` into summary statistics.

    Each price is normalized with ``parse_price_value`` and the parsed number
    is what gets aggregated. Previously the parser was only used as a filter
    and the raw value was then passed to ``float()``, which raised on display
    strings (for example ``"¥1,200"``) that the parser accepts.

    Returns:
        A dict with ``sample_count``, ``avg_price``, ``median_price``,
        ``min_price`` and ``max_price``. The four price fields are ``None``
        when no record carries a usable price.
    """
    prices: list[float] = []
    for record in records:
        parsed = parse_price_value(record.get("price"))
        if parsed is not None:
            prices.append(parsed)

    if not prices:
        return {
            "sample_count": 0,
            "avg_price": None,
            "median_price": None,
            "min_price": None,
            "max_price": None,
        }
    return {
        "sample_count": len(prices),
        "avg_price": round(sum(prices) / len(prices), 2),
        "median_price": round(float(median(prices)), 2),
        "min_price": round(min(prices), 2),
        "max_price": round(max(prices), 2),
    }
summary = _summarize_prices(day_records) summary["day"] = day points.append(summary) return points def _recent_window_snapshots(snapshots: list[dict], window_days: int) -> list[dict]: if not snapshots: return [] latest_time = max(str(record.get("snapshot_time") or "") for record in snapshots) latest_dt = datetime.fromisoformat(latest_time) filtered = [] for record in snapshots: current_time = datetime.fromisoformat(str(record.get("snapshot_time") or latest_time)) if (latest_dt - current_time).days <= max(0, window_days): filtered.append(record) return filtered def _resolve_deal_label(score: int) -> str: if score >= 65: return "高性价比" if score >= 50: return "值得关注" if score >= 40: return "价格正常" return "价格偏高" def build_item_price_context( snapshots: list[dict], *, item_id: str, current_price: Optional[float], ) -> dict: if not item_id: return {"observation_count": 0, "deal_score": None, "deal_label": "暂无数据"} item_snapshots = [record for record in snapshots if str(record.get("item_id")) == str(item_id)] if not item_snapshots: return {"observation_count": 0, "deal_score": None, "deal_label": "暂无数据"} latest_item_snapshot = item_snapshots[-1] price_now = current_price if current_price is not None else parse_price_value(latest_item_snapshot.get("price")) historical_prices = [float(record["price"]) for record in item_snapshots if parse_price_value(record.get("price")) is not None] latest_run_id = str(snapshots[-1].get("run_id") or "") latest_market = _dedupe_latest( [record for record in snapshots if str(record.get("run_id") or "") == latest_run_id], "item_id", ) market_summary = _summarize_prices(latest_market) market_avg = market_summary.get("avg_price") market_median = market_summary.get("median_price") score = 50 if price_now is not None and market_avg: score += int(((market_avg - price_now) / market_avg) * 60) if price_now is not None and historical_prices: historical_max = max(historical_prices) if historical_max > 0: score += int(((historical_max - price_now) / 
def build_market_reference(
    *,
    keyword: str,
    item: dict,
    current_market_items: list[dict],
    historical_snapshots: list[dict],
) -> dict:
    """Assemble the price reference data for one item.

    Args:
        keyword: Search keyword, echoed back in the result.
        item: The item being evaluated (reads ``商品ID``, ``商品链接`` and
            ``当前售价``).
        current_market_items: Items from the current search; those without a
            parseable ``当前售价`` are ignored.
        historical_snapshots: Previously recorded snapshot dicts.

    Returns:
        A dict with the current-search price summary, the historical price
        summary (latest record per ``item_id``), this item's price context,
        and the keyword.
    """
    current_market_records = []
    for market_item in current_market_items:
        price = parse_price_value(market_item.get("当前售价"))
        if price is not None:
            current_market_records.append({"price": price})

    market_snapshot = _summarize_prices(current_market_records)
    history_summary = _summarize_prices(_dedupe_latest(historical_snapshots, "item_id"))

    # Snapshots are stored under the stripped item ID, or the link when the ID
    # is missing (see _build_snapshot_record). The lookup uses the same rule so
    # that items without an ID can still be matched to their own history.
    lookup_id = (
        str(item.get("商品ID") or "").strip()
        or str(item.get("商品链接") or "").strip()
    )
    item_context = build_item_price_context(
        historical_snapshots,
        item_id=lookup_id,
        current_price=parse_price_value(item.get("当前售价")),
    )
    return {
        "当前搜索样本": market_snapshot,
        "历史价格概览": history_summary,
        "本商品价格位置": item_context,
        "关键词": keyword,
    }
class ProcessService:
    """Starts, tracks and stops the spider subprocess of each task."""

    def __init__(self):
        # All runtime maps are keyed by task id.
        self.processes: Dict[int, asyncio.subprocess.Process] = {}
        self.log_paths: Dict[int, str] = {}
        self.log_handles: Dict[int, TextIO] = {}
        self.task_names: Dict[int, str] = {}
        self.exit_watchers: Dict[int, asyncio.Task] = {}
        self.failure_guard = FailureGuard()
        self._on_started: LifecycleHook | None = None
        self._on_stopped: LifecycleHook | None = None

    def set_lifecycle_hooks(
        self,
        *,
        on_started: LifecycleHook | None = None,
        on_stopped: LifecycleHook | None = None,
    ) -> None:
        """Register callbacks invoked with the task id on start and on stop."""
        self._on_started = on_started
        self._on_stopped = on_stopped

    async def _invoke_hook(self, hook: LifecycleHook | None, task_id: int) -> None:
        """Call ``hook`` if set, awaiting its result when it is a coroutine."""
        if hook is None:
            return
        result = hook(task_id)
        if asyncio.iscoroutine(result):
            await result

    def _resolve_cookie_path(self, task_name: str) -> str | None:
        """Best-effort cookie/state path for a task."""
        try:
            task = find_task_by_name_sync(task_name)
            if task and isinstance(task.account_state_file, str) and task.account_state_file.strip():
                return task.account_state_file.strip()
        except Exception:
            # Deliberate best effort: any lookup failure falls back to the
            # global state file below.
            pass
        return STATE_FILE if os.path.exists(STATE_FILE) else None

    def is_running(self, task_id: int) -> bool:
        """Return True when the task has a process that has not exited yet."""
        process = self.processes.get(task_id)
        return process is not None and process.returncode is None

    async def _drain_finished_process(self, task_id: int) -> None:
        """Finish bookkeeping for a process that has already exited."""
        process = self.processes.get(task_id)
        if process is None or process.returncode is None:
            return
        watcher = self.exit_watchers.get(task_id)
        if watcher is not None:
            # The watcher performs the cleanup itself; shield it so that a
            # cancellation of this caller does not cancel the watcher.
            await asyncio.shield(watcher)
            return
        self._cleanup_runtime(task_id, process)
        await self._invoke_hook(self._on_stopped, task_id)

    def _open_log_file(self, task_id: int, task_name: str) -> tuple[str, TextIO]:
        """Open the task's log file for appending and return (path, handle)."""
        os.makedirs("logs", exist_ok=True)
        log_file_path = build_task_log_path(task_id, task_name)
        log_file_handle = open(log_file_path, "a", encoding="utf-8")
        return log_file_path, log_file_handle

    def _build_spawn_command(self, task_name: str) -> list[str]:
        """Build the spider command line for ``task_name``.

        ``--debug-limit`` is appended only when the SPIDER_DEBUG_LIMIT
        environment variable holds a positive integer.
        """
        command = [
            sys.executable,
            "-u",
            "spider_v2.py",
            "--task-name",
            task_name,
        ]
        debug_limit = str(os.getenv(SPIDER_DEBUG_LIMIT_ENV, "")).strip()
        if debug_limit.isdigit() and int(debug_limit) > 0:
            command.extend(["--debug-limit", debug_limit])
        return command

    async def _spawn_process(
        self,
        task_name: str,
        log_file_handle: TextIO,
    ) -> asyncio.subprocess.Process:
        """Spawn the spider with stdout and stderr redirected to the log file."""
        # On POSIX the child gets its own session so that the whole process
        # group can be signalled on stop.
        preexec_fn = os.setsid if sys.platform != "win32" else None
        child_env = os.environ.copy()
        child_env["PYTHONIOENCODING"] = "utf-8"
        child_env["PYTHONUTF8"] = "1"
        return await asyncio.create_subprocess_exec(
            *self._build_spawn_command(task_name),
            stdout=log_file_handle,
            stderr=log_file_handle,
            preexec_fn=preexec_fn,
            env=child_env,
        )

    def _register_runtime(
        self,
        task_id: int,
        task_name: str,
        process: asyncio.subprocess.Process,
        log_file_path: str,
        log_file_handle: TextIO,
    ) -> None:
        """Record a freshly spawned process and start its exit watcher."""
        self.processes[task_id] = process
        self.log_paths[task_id] = log_file_path
        self.log_handles[task_id] = log_file_handle
        self.task_names[task_id] = task_name
        self.exit_watchers[task_id] = asyncio.create_task(self._watch_process_exit(process))

    async def start_task(self, task_id: int, task_name: str) -> bool:
        """Start the spider process for a task.

        Returns:
            True when a new process was started; False when the task is
            already running, the failure guard asked to skip it, or spawning
            failed.
        """
        await self._drain_finished_process(task_id)
        if self.is_running(task_id):
            print(f"任务 '{task_name}' (ID: {task_id}) 已在运行中")
            return False

        decision = self.failure_guard.should_skip_start(
            task_name,
            cookie_path=self._resolve_cookie_path(task_name),
        )
        if decision.skip:
            await self._notify_skip(task_name, decision)
            return False

        log_file_path = ""
        log_file_handle = None
        try:
            log_file_path, log_file_handle = self._open_log_file(task_id, task_name)
            process = await self._spawn_process(task_name, log_file_handle)
        except Exception as exc:
            # Do not leak the log handle when the spawn itself fails.
            self._close_log_handle(log_file_handle)
            print(f"启动任务 '{task_name}' 失败: {exc}")
            return False

        self._register_runtime(task_id, task_name, process, log_file_path, log_file_handle)
        print(f"启动任务 '{task_name}' (PID: {process.pid})")
        await self._invoke_hook(self._on_started, task_id)
        return True

    async def _notify_skip(self, task_name: str, decision) -> None:
        """Log a skipped start and, when requested by ``decision``, notify."""
        print(
            f"[FailureGuard] 跳过启动任务 '{task_name}',已暂停重试 "
            f"(连续失败 {decision.consecutive_failures}/{self.failure_guard.threshold})"
        )
        if not decision.should_notify:
            return
        try:
            await send_ntfy_notification(
                {
                    "商品标题": f"[任务暂停] {task_name}",
                    "当前售价": "N/A",
                    "商品链接": "#",
                },
                "任务处于暂停状态,将跳过执行。\n"
                f"原因: {decision.reason}\n"
                f"连续失败: {decision.consecutive_failures}/{self.failure_guard.threshold}\n"
                f"暂停到: {decision.paused_until.strftime('%Y-%m-%d %H:%M:%S') if decision.paused_until else 'N/A'}\n"
                "修复方法: 更新登录态/cookies文件后会自动恢复。",
            )
        except Exception as exc:
            print(f"发送任务暂停通知失败: {exc}")

    async def _watch_process_exit(self, process: asyncio.subprocess.Process) -> None:
        """Wait for ``process`` to exit, then clean up and fire the stop hook."""
        await process.wait()
        # Look the id up by process identity: ids may have been shifted by
        # reindex_after_delete while the process was running.
        task_id = self._find_task_id_by_process(process)
        if task_id is None:
            return
        self._cleanup_runtime(task_id, process)
        await self._invoke_hook(self._on_stopped, task_id)

    def _find_task_id_by_process(self, process: asyncio.subprocess.Process) -> int | None:
        """Return the task id currently mapped to ``process``, if any."""
        for task_id, current_process in self.processes.items():
            if current_process is process:
                return task_id
        return None

    def _cleanup_runtime(
        self,
        task_id: int,
        process: asyncio.subprocess.Process,
    ) -> None:
        """Drop all runtime state of ``task_id`` if it still maps to ``process``."""
        if self.processes.get(task_id) is not process:
            return
        self.processes.pop(task_id, None)
        self.log_paths.pop(task_id, None)
        self.task_names.pop(task_id, None)
        self._close_log_handle(self.log_handles.pop(task_id, None))
        self.exit_watchers.pop(task_id, None)

    def _close_log_handle(self, log_handle: TextIO | None) -> None:
        """Close a log handle, ignoring errors and ``None``."""
        if log_handle is None:
            return
        with contextlib.suppress(Exception):
            log_handle.close()

    def _append_stop_marker(self, log_path: str | None) -> None:
        """Append a timestamped "terminated" marker line to the task log."""
        if not log_path:
            return
        try:
            timestamp = datetime.now().strftime(" %Y-%m-%d %H:%M:%S")
            with open(log_path, "a", encoding="utf-8") as log_file:
                log_file.write(f"[{timestamp}] !!! 任务已被终止 !!!\n")
        except Exception as exc:
            print(f"写入任务终止标记失败: {exc}")

    async def stop_task(self, task_id: int) -> bool:
        """Stop the process of a task.

        Returns:
            True when a running process was terminated by this call; False
            when there was nothing to stop or stopping failed.
        """
        await self._drain_finished_process(task_id)
        process = self.processes.get(task_id)
        if process is None:
            print(f"任务 ID {task_id} 没有正在运行的进程")
            return False
        if process.returncode is not None:
            await self._await_exit_watcher(task_id)
            print(f"任务进程 {process.pid} (ID: {task_id}) 已退出,略过停止")
            return False

        # Capture the log path before terminating. The exit watcher wakes up
        # as soon as the process exits and removes the entry from
        # self.log_paths, so reading it afterwards would find nothing and the
        # stop marker would silently be skipped.
        log_path = self.log_paths.get(task_id)
        try:
            await self._terminate_process(process, task_id)
            self._append_stop_marker(log_path)
            await self._await_exit_watcher(task_id)
            print(f"任务进程 {process.pid} (ID: {task_id}) 已终止")
            return True
        except ProcessLookupError:
            print(f"进程 (ID: {task_id}) 已不存在")
            return False
        except Exception as exc:
            print(f"停止任务进程 (ID: {task_id}) 时出错: {exc}")
            return False

    async def _terminate_process(
        self,
        process: asyncio.subprocess.Process,
        task_id: int,
    ) -> None:
        """Ask the process to terminate, force-killing it after the timeout."""
        if sys.platform != "win32":
            os.killpg(os.getpgid(process.pid), signal.SIGTERM)
        else:
            process.terminate()

        try:
            await asyncio.wait_for(process.wait(), timeout=STOP_TIMEOUT_SECONDS)
            return
        except asyncio.TimeoutError:
            print(
                f"任务进程 {process.pid} (ID: {task_id}) 未在 "
                f"{STOP_TIMEOUT_SECONDS} 秒内退出,准备强制终止..."
            )

        if sys.platform != "win32":
            # The group may have exited between the timeout and this call.
            with contextlib.suppress(ProcessLookupError):
                os.killpg(os.getpgid(process.pid), signal.SIGKILL)
        else:
            process.kill()
        await process.wait()

    async def _await_exit_watcher(self, task_id: int) -> None:
        """Wait for the task's exit watcher, if one is still registered."""
        watcher = self.exit_watchers.get(task_id)
        if watcher is None:
            return
        await asyncio.shield(watcher)

    def reindex_after_delete(self, deleted_task_id: int) -> None:
        """Re-key the runtime maps after a task is deleted.

        The deleted id is dropped and every larger id is shifted down by one.
        """
        self.processes = self._reindex_mapping(self.processes, deleted_task_id)
        self.log_paths = self._reindex_mapping(self.log_paths, deleted_task_id)
        self.log_handles = self._reindex_mapping(self.log_handles, deleted_task_id)
        self.task_names = self._reindex_mapping(self.task_names, deleted_task_id)
        self.exit_watchers = self._reindex_mapping(self.exit_watchers, deleted_task_id)

    def _reindex_mapping(self, mapping: Dict[int, object], deleted_task_id: int) -> Dict[int, object]:
        """Return a copy of ``mapping`` without the deleted id and with larger ids shifted down."""
        reindexed: Dict[int, object] = {}
        for task_id, value in mapping.items():
            if task_id == deleted_task_id:
                continue
            next_task_id = task_id - 1 if task_id > deleted_task_id else task_id
            reindexed[next_task_id] = value
        return reindexed

    async def stop_all(self) -> None:
        """Stop every tracked task process, one after another."""
        # Snapshot the ids first: stopping a task mutates self.processes.
        task_ids = list(self.processes.keys())
        for task_id in task_ids:
            await self.stop_task(task_id)
"任务名称": record.get("任务名称", ""), "搜索关键字": record.get("搜索关键字", ""), "商品ID": item.get("商品ID", ""), "商品标题": item.get("商品标题", ""), "当前售价": item.get("当前售价", ""), "发布时间": item.get("发布时间", ""), "卖家昵称": seller.get("卖家昵称") or item.get("卖家昵称", ""), "AI是否推荐": "是" if ai_analysis.get("is_recommended") else "否", "分析来源": ai_analysis.get("analysis_source", ""), "原因": ai_analysis.get("reason", ""), "价格观察次数": price_insight.get("observation_count", ""), "价格最低值": price_insight.get("min_price", ""), "价格最高值": price_insight.get("max_price", ""), "市场均价": price_insight.get("market_avg_price", ""), "性价比分数": ai_analysis.get("value_score", price_insight.get("deal_score", "")), "性价比标签": ai_analysis.get("value_summary", price_insight.get("deal_label", "")), "商品链接": item.get("商品链接", ""), } ) return buffer.getvalue() ================================================ FILE: src/services/result_file_service.py ================================================ """ 结果记录富化与文件名校验服务 """ from src.infrastructure.persistence.storage_names import normalize_keyword_from_filename from src.services.price_history_service import ( build_item_price_context, load_price_snapshots, parse_price_value, ) def validate_result_filename(filename: str) -> None: if not filename.endswith(".jsonl") or "/" in filename or ".." 
in filename: raise ValueError("无效的文件名") def enrich_records_with_price_insight(records: list[dict], filename: str) -> list[dict]: snapshots = load_price_snapshots(normalize_keyword_from_filename(filename)) if not snapshots: return records enriched = [] for record in records: info = record.get("商品信息", {}) or {} clone = dict(record) clone["price_insight"] = build_item_price_context( snapshots, item_id=str(info.get("商品ID") or ""), current_price=parse_price_value(info.get("当前售价")), ) enriched.append(clone) return enriched ================================================ FILE: src/services/result_storage_service.py ================================================ """ 结果数据的 SQLite 读写服务。 """ from __future__ import annotations import asyncio import hashlib import json from src.infrastructure.persistence.sqlite_bootstrap import bootstrap_sqlite_storage from src.infrastructure.persistence.sqlite_connection import sqlite_connection from src.infrastructure.persistence.storage_names import build_result_filename from src.services.price_history_service import parse_price_value SORT_COLUMN_MAP = { "crawl_time": "crawl_time", "publish_time": "COALESCE(publish_time, '')", "price": "COALESCE(price, 0)", "keyword_hit_count": "keyword_hit_count", } def _get_link_unique_key(link: str) -> str: return link.split("&", 1)[0] def _fallback_unique_key(record: dict, item: dict) -> str: item_id = str(item.get("商品ID") or "").strip() if item_id: return f"item:{item_id}" digest = hashlib.sha1( json.dumps(record, ensure_ascii=False, sort_keys=True).encode("utf-8") ).hexdigest() return f"hash:{digest}" def _parse_raw_record(raw_json: str) -> dict: return json.loads(raw_json) def _build_query_conditions( *, filename: str, ai_recommended_only: bool, keyword_recommended_only: bool, ) -> tuple[str, list]: conditions = ["result_filename = ?"] params: list = [filename] if ai_recommended_only: conditions.append("is_recommended = 1") conditions.append("analysis_source = ?") params.append("ai") if 
def _save_result_record_sync(record: dict, keyword: str) -> bool:
    """Insert one result record into ``result_items`` (blocking).

    The statement is ``INSERT OR IGNORE``. The dedupe key is derived from the
    item link when there is one, otherwise from the fallback key. The full
    record is stored as JSON in ``raw_json``.

    Returns:
        Always True once the statement has been committed.
    """
    bootstrap_sqlite_storage()

    item_info = record.get("商品信息", {}) or {}
    ai_analysis = record.get("ai_analysis", {}) or {}

    link = str(item_info.get("商品链接") or "")
    if link:
        unique_key = _get_link_unique_key(link)
    else:
        unique_key = _fallback_unique_key(record, item_info)

    # A missing or non-numeric hit count is stored as 0.
    try:
        hit_count = int(ai_analysis.get("keyword_hit_count", 0))
    except (TypeError, ValueError):
        hit_count = 0

    insert_sql = """
        INSERT OR IGNORE INTO result_items (
            result_filename, keyword, task_name, crawl_time, publish_time,
            price, price_display, item_id, title, link, link_unique_key,
            seller_nickname, is_recommended, analysis_source,
            keyword_hit_count, raw_json
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
    with sqlite_connection() as conn:
        seller_info = record.get("卖家信息", {}) or {}
        row = (
            build_result_filename(keyword),
            record.get("搜索关键字", keyword),
            record.get("任务名称", ""),
            record.get("爬取时间", ""),
            item_info.get("发布时间"),
            parse_price_value(item_info.get("当前售价")),
            item_info.get("当前售价"),
            item_info.get("商品ID"),
            item_info.get("商品标题"),
            link,
            unique_key,
            seller_info.get("卖家昵称") or item_info.get("卖家昵称"),
            1 if ai_analysis.get("is_recommended") else 0,
            ai_analysis.get("analysis_source"),
            hit_count,
            json.dumps(record, ensure_ascii=False),
        )
        conn.execute(insert_sql, row)
        conn.commit()
    return True
def _query_result_records_sync(
    filename: str,
    ai_recommended_only: bool,
    keyword_recommended_only: bool,
    sort_by: str,
    sort_order: str,
    page: int,
    limit: int,
) -> tuple[int, list[dict]]:
    """Fetch one page of result records for ``filename`` (blocking).

    Returns:
        ``(total, records)``: the number of rows matching the filters and the
        decoded records of the requested page. ``page`` is 1-based; values
        below 1 are treated as the first page.
    """
    bootstrap_sqlite_storage()
    where_sql, filter_params = _build_query_conditions(
        filename=filename,
        ai_recommended_only=ai_recommended_only,
        keyword_recommended_only=keyword_recommended_only,
    )
    skip_rows = limit * max(page - 1, 0)
    order_sql = _sort_expression(sort_by, sort_order)

    count_sql = f"SELECT COUNT(1) AS total FROM result_items WHERE {where_sql}"
    page_sql = f"""
        SELECT raw_json
        FROM result_items
        WHERE {where_sql}
        ORDER BY {order_sql}
        LIMIT ? OFFSET ?
        """
    with sqlite_connection() as conn:
        count_row = conn.execute(count_sql, tuple(filter_params)).fetchone()
        page_rows = conn.execute(
            page_sql,
            (*filter_params, limit, skip_rows),
        ).fetchall()

    if count_row:
        total = int(count_row["total"])
    else:
        total = 0
    records = []
    for page_row in page_rows:
        records.append(_parse_raw_record(str(page_row["raw_json"])))
    return total, records
class SchedulerService:
    """Schedules cron-triggered task runs on an APScheduler asyncio scheduler."""

    def __init__(self, process_service: ProcessService):
        self.scheduler = AsyncIOScheduler(timezone="Asia/Shanghai")
        self.process_service = process_service

    def start(self):
        """Start the scheduler unless it is already running."""
        if self.scheduler.running:
            return
        self.scheduler.start()
        print("调度器已启动")

    def stop(self):
        """Shut the scheduler down if it is running."""
        if not self.scheduler.running:
            return
        self.scheduler.shutdown()
        print("调度器已停止")

    def get_next_run_time(self, task_id: int):
        """Return the next fire time of the task's job, or None if unknown."""
        job = self.scheduler.get_job(f"task_{task_id}")
        if job is None:
            return None

        scheduled = getattr(job, "next_run_time", None)
        if scheduled is not None:
            return scheduled

        # No next run time on the job: ask the trigger directly when it
        # supports that.
        trigger = getattr(job, "trigger", None)
        if trigger is not None and hasattr(trigger, "get_next_fire_time"):
            try:
                current_moment = datetime.now(self.scheduler.timezone)
                return trigger.get_next_fire_time(None, current_moment)
            except Exception:
                return None
        return None

    async def reload_jobs(self, tasks: List[Task]):
        """Replace all scheduled jobs with one job per enabled task that has a cron rule."""
        print("正在重新加载定时任务...")
        self.scheduler.remove_all_jobs()
        for task in tasks:
            if not (task.enabled and task.cron):
                continue
            try:
                cron_trigger = build_cron_trigger(
                    task.cron,
                    timezone=self.scheduler.timezone,
                )
                self.scheduler.add_job(
                    self._run_task,
                    trigger=cron_trigger,
                    args=[task.id, task.task_name],
                    id=f"task_{task.id}",
                    name=f"Scheduled: {task.task_name}",
                    replace_existing=True,
                )
                print(f" -> 已为任务 '{task.task_name}' 添加定时规则: '{task.cron}'")
            except ValueError as e:
                # An invalid cron expression only disables that one task.
                print(f" -> [警告] 任务 '{task.task_name}' 的 Cron 表达式无效: {e}")
        print("定时任务加载完成")

    async def _run_task(self, task_id: int, task_name: str):
        """Job callback: start the spider process for the task."""
        print(f"定时任务触发: 正在为任务 '{task_name}' 启动爬虫...")
        await self.process_service.start_task(task_id, task_name)
async def advance_search_page(
    *,
    page: Any,
    page_num: int,
    logger: Callable[[str], None] = log_time,
    wait_after_click: Callable[[float, float], Awaitable[None]] = random_sleep,
    retry_sleep: Callable[[float], Awaitable[None]] = asyncio.sleep,
    max_retries: int = PAGE_RETRY_COUNT,
) -> PageAdvanceResult:
    """Click the "next page" button and wait for the search-results response.

    Args:
        page: Page object exposing ``locator`` and ``expect_response``
            (presumably a Playwright page; tests may pass a stand-in).
        page_num: Page number, used only in log messages.
        logger: Callable that receives progress and failure messages.
        wait_after_click: Awaited with the min/max pause after a click.
        retry_sleep: Awaited with the delay before retrying a timed-out wait.
        max_retries: Maximum number of click-and-wait attempts.

    Returns:
        ``PageAdvanceResult(advanced=True, response=...)`` on success.
        Otherwise ``advanced=False`` with ``stop_reason`` set to one of
        ``"no_next_button"``, ``"click_timeout"``, ``"response_timeout"`` or
        ``"unknown"`` (the last only when ``max_retries`` allows no attempt).
    """
    next_button = page.locator(NEXT_PAGE_SELECTOR).first
    if not await next_button.count():
        logger("已到达最后一页,未找到可用的'下一页'按钮,停止翻页。")
        return PageAdvanceResult(advanced=False, stop_reason="no_next_button")

    for retry_index in range(max_retries):
        click_timed_out = False
        try:
            await next_button.scroll_into_view_if_needed()
            async with page.expect_response(
                is_search_results_response,
                timeout=PAGE_REQUEST_TIMEOUT_MS,
            ) as response_info:
                try:
                    await next_button.click(timeout=PAGE_CLICK_TIMEOUT_MS)
                except PlaywrightTimeoutError:
                    # Leave the context through the exception. Returning from
                    # inside it would exit normally, and the context manager
                    # would then keep waiting for a response that was never
                    # requested; the resulting timeout used to be reported as
                    # a response timeout and retried.
                    click_timed_out = True
                    raise
                await wait_after_click(
                    PAGE_CLICK_SLEEP_MIN_SECONDS,
                    PAGE_CLICK_SLEEP_MAX_SECONDS,
                )
                response = await response_info.value
            return PageAdvanceResult(advanced=True, response=response)
        except PlaywrightTimeoutError:
            if click_timed_out:
                # A click that cannot be performed is not retried.
                logger(f"第 {page_num} 页下一页按钮点击超时,停止翻页。")
                return PageAdvanceResult(
                    advanced=False,
                    stop_reason="click_timeout",
                )
            if retry_index < max_retries - 1:
                logger(
                    f"等待第 {page_num} 页搜索响应超时,"
                    f"{PAGE_RETRY_DELAY_SECONDS}秒后重试..."
                )
                await retry_sleep(PAGE_RETRY_DELAY_SECONDS)
                continue
            logger(f"等待第 {page_num} 页搜索响应超时 {max_retries} 次,停止翻页。")
            return PageAdvanceResult(advanced=False, stop_reason="response_timeout")

    return PageAdvanceResult(advanced=False, stop_reason="unknown")
def build_criteria_filename(keyword: str) -> str:
    """Return the criteria-file path under ``prompts/`` for a search keyword.

    The keyword is lower-cased, runs of whitespace (including tabs and
    newlines) become a single underscore, and every character that is not
    alphanumeric, ``_`` or ``-`` is dropped, so path separators and dots
    cannot reach the file name. When nothing usable remains, ``task`` is
    used so the result is never the degenerate ``prompts/_criteria.txt``.

    Args:
        keyword: Raw search keyword as entered by the user.

    Returns:
        A relative path of the form ``prompts/<safe_keyword>_criteria.txt``.
    """
    # split() without arguments trims the ends and collapses whitespace runs.
    normalized = "_".join(keyword.lower().split())
    safe_keyword = "".join(
        char for char in normalized if char.isalnum() or char in "_-"
    )
    return f"prompts/{safe_keyword or 'task'}_criteria.txt"
async def run_ai_generation_job(
    *,
    job_id: str,
    req: TaskGenerateRequest,
    task_service: TaskService,
    scheduler_service: SchedulerService,
    generation_service: TaskGenerationService,
) -> None:
    """Run one AI task-generation job from start to finish.

    Steps, each reported through ``generation_service``: generate the
    analysis criteria, save them to the criteria file, create the task
    record, reload the scheduler, and mark the job completed.

    Any exception marks the job as failed instead of propagating. On
    failure the criteria file is removed only if this job created it; a
    file that existed before the job started is left alone because another
    task may still reference it.

    Args:
        job_id: Identifier of the generation job being tracked.
        req: The task-generation request.
        task_service: Service used to create the task record.
        scheduler_service: Scheduler reloaded after the task is created.
        generation_service: Tracker that receives progress and the outcome.
    """
    output_filename = build_criteria_filename(req.keyword)
    # Remember ownership up front so the failure path never deletes a
    # criteria file that was already there before this job ran.
    created_by_job = not os.path.exists(output_filename)
    try:
        await advance_job(
            generation_service,
            job_id,
            "prepare",
            "已接收请求,开始准备分析标准。",
        )

        async def report_progress(step_key: str, message: str) -> None:
            await advance_job(generation_service, job_id, step_key, message)

        generated_criteria = await generate_criteria(
            user_description=req.description or "",
            reference_file_path="prompts/macbook_criteria.txt",
            progress_callback=report_progress,
        )
        await advance_job(
            generation_service,
            job_id,
            "persist",
            f"正在保存分析标准到 {output_filename}。",
        )
        await save_generated_criteria(output_filename, generated_criteria)
        await advance_job(
            generation_service,
            job_id,
            "task",
            "分析标准已生成,正在创建任务记录。",
        )
        task = await task_service.create_task(build_task_create(req, output_filename))
        await reload_scheduler(task_service, scheduler_service)
        await generation_service.complete(job_id, task, f"任务“{req.task_name}”创建完成。")
    except Exception as exc:
        if created_by_job:
            try:
                os.remove(output_filename)
            except FileNotFoundError:
                # The job failed before anything was written.
                pass
            except OSError as cleanup_error:
                # A cleanup problem must not stop the failure from being recorded.
                print(f"清理分析标准文件失败: {output_filename} ({cleanup_error})")
        await generation_service.fail(job_id, f"AI 任务生成失败: {exc}")
async def advance(self, job_id: str, step_key: str, message: str) -> TaskGenerationJob:
    """Move a job to ``step_key`` and return a detached copy of its state.

    The job becomes ``running`` with ``step_key`` as its current step.
    Steps before the target are marked ``completed``, the target is marked
    ``running`` and receives ``message``, and later steps that are not
    already ``pending`` are reset to ``pending`` with an empty message.
    Steps whose status is ``failed`` are never touched.

    Raises:
        KeyError: Propagated from the job or step lookup, before any state
            is modified.
    """
    with self._lock:
        tracked_job = self._require_job(job_id)
        active_position = self._find_step_index(tracked_job, step_key)

        tracked_job.status = "running"
        tracked_job.current_step = step_key
        tracked_job.message = message

        for position, entry in enumerate(tracked_job.steps):
            if entry.status == "failed":
                continue
            if position < active_position:
                entry.status = "completed"
                continue
            if position == active_position:
                entry.status = "running"
                entry.message = message
                continue
            if entry.status != "pending":
                entry.status = "pending"
                entry.message = ""

        # Copy while still holding the lock so the snapshot is consistent.
        snapshot = deepcopy(tracked_job)
    return snapshot
def cleanup_task_logs(
    logs_dir: str = "logs",
    *,
    keep_days: int = 7,
    now: datetime | None = None,
) -> list[str]:
    """Delete ``*.log`` files in ``logs_dir`` older than ``keep_days`` days.

    A file is removed when its modification time is strictly earlier than
    ``now - keep_days``. Only regular files directly inside ``logs_dir``
    are considered. Files whose timestamp cannot be read or that cannot be
    deleted are reported and skipped.

    Args:
        logs_dir: Directory that holds the task log files.
        keep_days: Retention in days; values below 1 disable the cleanup.
        now: Reference time, defaulting to the current local time. It may
            be naive or timezone-aware; file times are converted to match,
            so an aware value no longer raises ``TypeError``.

    Returns:
        Paths of the deleted files as strings, in sorted order.
    """
    if keep_days < 1:
        print(f"任务日志清理已跳过:保留天数配置无效 ({keep_days})")
        return []

    root = Path(logs_dir)
    if not root.is_dir():
        return []

    current_time = now if now is not None else datetime.now()
    cutoff = current_time - timedelta(days=keep_days)
    removed_files: list[str] = []

    # Sorted so the returned list and the log output are deterministic.
    for path in sorted(root.glob("*.log")):
        if not path.is_file():
            continue
        try:
            # tz=None yields naive local time, matching a naive cutoff.
            modified_at = datetime.fromtimestamp(
                path.stat().st_mtime,
                tz=cutoff.tzinfo,
            )
        except (OSError, OverflowError, ValueError) as exc:
            print(f"读取任务日志时间失败,已跳过: {path} ({exc})")
            continue
        if modified_at >= cutoff:
            continue
        try:
            path.unlink()
        except OSError as exc:
            print(f"删除历史任务日志失败,已跳过: {path} ({exc})")
        else:
            removed_files.append(str(path))

    if removed_files:
        print(
            f"任务日志清理完成:已删除 {len(removed_files)} 个超过 {keep_days} 天的历史日志文件。"
        )
    return removed_files
{task_id} 不存在") updated_task = task.apply_update(task_update) return await self.repository.save(updated_task) async def delete_task(self, task_id: int) -> bool: """删除任务""" return await self.repository.delete(task_id) async def update_task_status(self, task_id: int, is_running: bool) -> Task: """更新任务运行状态""" task_update = TaskUpdate(is_running=is_running) return await self.update_task(task_id, task_update) ================================================ FILE: src/utils.py ================================================ import asyncio import json import math import os import random import re import glob from datetime import datetime from functools import wraps from urllib.parse import quote from openai import APIStatusError from requests.exceptions import HTTPError from src.services.result_storage_service import save_result_record def retry_on_failure(retries=3, delay=5): """ 一个通用的异步重试装饰器,增加了对HTTP错误的详细日志记录。 """ def decorator(func): @wraps(func) async def wrapper(*args, **kwargs): for i in range(retries): try: return await func(*args, **kwargs) except (APIStatusError, HTTPError) as e: print(f"函数 {func.__name__} 第 {i + 1}/{retries} 次尝试失败,发生HTTP错误。") if hasattr(e, 'status_code'): print(f" - 状态码 (Status Code): {e.status_code}") if hasattr(e, 'response') and hasattr(e.response, 'text'): response_text = e.response.text print( f" - 返回值 (Response): {response_text[:300]}{'...' 
def log_time(message: str, prefix: str = "") -> None:
    """Print ``message`` prefixed with a ``[YYYY-MM-DD HH:MM:SS]`` timestamp.

    Args:
        message: Text to print.
        prefix: Optional text inserted between the timestamp and the message.
    """
    try:
        # No leading space in the format, so the bracket reads
        # "[2025-01-01 12:00:00]" rather than "[ 2025-01-01 12:00:00]".
        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    except Exception:
        # Logging is best effort: never let timestamp formatting break a caller.
        ts = "--:--:--"
    print(f"[{ts}] {prefix}{message}")
def format_registration_days(total_days: int) -> str:
    """Format a day count as a "X years Y months on Xianyu" Chinese label.

    A year is 365.25 days and a month is one twelfth of that. Months are
    rounded to the nearest whole number, and a rounded value of 12 rolls
    over into an extra year. Anything that is not a positive ``int``
    yields the "unknown" label.
    """
    if not isinstance(total_days, int) or total_days <= 0:
        return '未知'

    year_length = 365.25
    month_length = year_length / 12

    years = math.floor(total_days / year_length)
    leftover_days = total_days - (years * year_length)
    months = round(leftover_days / month_length)

    # Rounding can produce a full 12 months; carry it into the year count.
    if months == 12:
        years, months = years + 1, 0

    if years > 0:
        if months > 0:
            return f"来闲鱼{years}年{months}个月"
        return f"来闲鱼{years}年整"
    if months > 0:
        return f"来闲鱼{months}个月"
    return "来闲鱼不足一个月"
/etc/os-release LINUX_ID="$ID" LINUX_LIKE="$ID_LIKE" fi case "$(uname -s 2>/dev/null || echo unknown)" in Darwin) OS_FAMILY="macos" ;; Linux) if grep -qi microsoft /proc/version 2>/dev/null; then OS_FAMILY="wsl" else OS_FAMILY="linux" fi ;; MINGW*|MSYS*|CYGWIN*) OS_FAMILY="windows" ;; *) OS_FAMILY="unknown" ;; esac MISSING_ITEMS=() if ! command -v python3 >/dev/null 2>&1; then MISSING_ITEMS+=("python3(>=3.10)") else if ! python3 -c 'import sys; raise SystemExit(0 if sys.version_info >= (3, 10) else 1)' >/dev/null 2>&1; then MISSING_ITEMS+=("python3(>=3.10)") fi fi if ! python3 -m pip --version >/dev/null 2>&1; then MISSING_ITEMS+=("pip") fi if ! command -v node >/dev/null 2>&1; then MISSING_ITEMS+=("node") fi if ! command -v npm >/dev/null 2>&1; then MISSING_ITEMS+=("npm") fi if ! python3 -m playwright --version >/dev/null 2>&1; then MISSING_ITEMS+=("playwright") fi has_browser=false case "$OS_FAMILY" in macos) if [ -d "/Applications/Google Chrome.app" ] || [ -d "/Applications/Microsoft Edge.app" ]; then has_browser=true fi ;; linux|wsl) if command -v google-chrome >/dev/null 2>&1 \ || command -v google-chrome-stable >/dev/null 2>&1 \ || command -v chromium >/dev/null 2>&1 \ || command -v chromium-browser >/dev/null 2>&1 \ || command -v microsoft-edge >/dev/null 2>&1 \ || command -v microsoft-edge-stable >/dev/null 2>&1; then has_browser=true fi ;; windows) if [ -d "/c/Program Files/Google/Chrome/Application" ] \ || [ -d "/c/Program Files (x86)/Google/Chrome/Application" ] \ || [ -d "/c/Program Files (x86)/Microsoft/Edge/Application" ] \ || [ -d "/c/Program Files/Microsoft/Edge/Application" ]; then has_browser=true fi ;; esac if [ "$has_browser" = false ]; then MISSING_ITEMS+=("浏览器(Chrome 或 Edge)") fi print_solution_macos() { cat <<'EOF' macOS 解决办法: 1) 安装 Homebrew: /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" 2) 安装 Python 与 Node: brew install python@3.11 node 3) 安装 Playwright: python3 -m pip install playwright 
python3 -m playwright install chromium 4) 安装浏览器: brew install --cask google-chrome # 或 brew install --cask microsoft-edge 5) 配置文件(可选): cp .env.example .env cp config.json.example config.json EOF } print_solution_linux_deb() { cat <<'EOF' Linux (Debian/Ubuntu) 解决办法: 1) 安装 Python 与 pip: sudo apt-get update sudo apt-get install -y python3 python3-pip python3-venv 2) 安装 Node.js 与 npm (LTS): curl -fsSL https://deb.nodesource.com/setup_lts.x | sudo -E bash - sudo apt-get install -y nodejs 3) 安装 Playwright: python3 -m pip install playwright python3 -m playwright install chromium python3 -m playwright install-deps chromium 4) 安装浏览器: sudo apt-get install -y chromium-browser || sudo apt-get install -y chromium # 或安装 Edge: sudo apt-get install -y microsoft-edge-stable 5) 配置文件(可选): cp .env.example .env cp config.json.example config.json EOF } print_solution_linux_rpm() { cat <<'EOF' Linux (RHEL/CentOS/Fedora) 解决办法: 1) 安装 Python 与 pip: sudo dnf install -y python3 python3-pip 2) 安装 Node.js 与 npm (LTS): sudo dnf install -y nodejs 3) 安装 Playwright: python3 -m pip install playwright python3 -m playwright install chromium python3 -m playwright install-deps chromium 4) 安装浏览器: sudo dnf install -y chromium # 或安装 Edge: sudo dnf install -y microsoft-edge-stable 5) 配置文件(可选): cp .env.example .env cp config.json.example config.json EOF } print_solution_linux_arch() { cat <<'EOF' Linux (Arch) 解决办法: 1) 安装 Python 与 pip: sudo pacman -S --noconfirm python python-pip 2) 安装 Node.js 与 npm: sudo pacman -S --noconfirm nodejs npm 3) 安装 Playwright: python3 -m pip install playwright python3 -m playwright install chromium python3 -m playwright install-deps chromium 4) 安装浏览器: sudo pacman -S --noconfirm chromium # 或安装 Edge: yay -S microsoft-edge-stable 5) 配置文件: cp .env.example .env cp config.json.example config.json EOF } print_solution_wsl() { cat <<'EOF' WSL 解决办法: 1) 安装 Python 与 pip: sudo apt-get update sudo apt-get install -y python3 python3-pip python3-venv 2) 安装 Node.js 与 npm (LTS): curl -fsSL 
https://deb.nodesource.com/setup_lts.x | sudo -E bash - sudo apt-get install -y nodejs 3) 安装 Playwright: python3 -m pip install playwright python3 -m playwright install chromium python3 -m playwright install-deps chromium 4) 安装浏览器: sudo apt-get install -y chromium-browser || sudo apt-get install -y chromium # 或在 Windows 安装 Chrome/Edge 并在 WSL 使用 Linux 版本浏览器 5) 配置文件: cp .env.example .env cp config.json.example config.json EOF } print_solution_windows() { cat <<'EOF' Windows (PowerShell) 解决办法: 1) 安装 Python 与 Node: winget install Python.Python.3.11 winget install OpenJS.NodeJS.LTS 2) 安装 Playwright: py -m pip install playwright py -m playwright install chromium 3) 安装浏览器: winget install Google.Chrome # 或 winget install Microsoft.Edge 4) 配置文件(可选): Copy-Item .env.example .env Copy-Item config.json.example config.json EOF } print_solution_generic() { cat <<'EOF' 通用解决办法: 1) 安装 Python 3.10+ 与 pip 2) 安装 Node.js 与 npm 3) 安装 Playwright: python3 -m pip install playwright python3 -m playwright install chromium 4) 安装浏览器 Chrome 或 Edge 5) 配置文件(可选): cp .env.example .env cp config.json.example config.json EOF } if [ "${#MISSING_ITEMS[@]}" -ne 0 ]; then echo -e "${RED}✗ 检测到缺失的环境/依赖:${NC}" for item in "${MISSING_ITEMS[@]}"; do echo " - $item" done echo "" case "$OS_FAMILY" in macos) print_solution_macos ;; linux) if [ "$LINUX_ID" = "arch" ] || echo "$LINUX_LIKE" | grep -qi "arch"; then print_solution_linux_arch elif [ "$LINUX_ID" = "fedora" ] || [ "$LINUX_ID" = "rhel" ] || [ "$LINUX_ID" = "centos" ] || echo "$LINUX_LIKE" | grep -qi "rhel\|fedora"; then print_solution_linux_rpm else print_solution_linux_deb fi ;; wsl) print_solution_wsl ;; windows) print_solution_windows ;; *) print_solution_generic ;; esac exit 1 fi echo -e "${GREEN}✓ 环境与依赖检查通过${NC}" # 1. 清理旧的 dist 目录 echo -e "\n${YELLOW}[2/6] 清理旧的构建产物...${NC}" if [ -d "dist" ]; then rm -rf dist echo -e "${GREEN}✓ 已删除旧的 dist 目录${NC}" else echo -e "${GREEN}✓ dist 目录不存在,跳过清理${NC}" fi # 2. 
检查并安装 Python 依赖 echo -e "\n${YELLOW}[3/6] 检查 Python 依赖...${NC}" if [ ! -f "requirements.txt" ]; then echo -e "${RED}✗ 错误: requirements.txt 文件不存在${NC}" exit 1 fi echo "正在安装 Python 依赖..." python3 -m pip install -r requirements.txt --quiet echo -e "${GREEN}✓ Python 依赖安装完成${NC}" # 3. 构建前端 echo -e "\n${YELLOW}[4/6] 构建前端项目...${NC}" if [ ! -d "web-ui" ]; then echo -e "${RED}✗ 错误: web-ui 目录不存在${NC}" exit 1 fi cd web-ui # 检查 node_modules 是否存在 if [ ! -d "node_modules" ]; then echo "首次运行,正在安装前端依赖..." npm install fi echo "正在构建前端..." npm run build cd "$SCRIPT_DIR" if [ ! -d "dist" ]; then echo -e "${RED}✗ 错误: 前端构建失败,dist 目录未生成${NC}" exit 1 fi echo -e "${GREEN}✓ 前端构建完成,产物已输出到项目根目录 dist/${NC}" # 4. 校验构建产物 echo -e "\n${YELLOW}[5/6] 校验构建产物...${NC}" echo -e "${GREEN}✓ 已确认构建产物位于项目根目录 dist/${NC}" # 5. 启动后端服务 echo -e "\n${YELLOW}[6/6] 启动后端服务...${NC}" echo -e "${GREEN}========================================${NC}" echo -e "${GREEN}服务启动中...${NC}" echo -e "${GREEN}访问地址: http://localhost:8000${NC}" echo -e "${GREEN}API 文档: http://localhost:8000/docs${NC}" echo -e "${GREEN}========================================${NC}\n" python3 -m src.app ================================================ FILE: tests/README.md ================================================ # 测试指南 本项目使用 pytest 作为测试框架。以下是运行测试的指南。 ## 安装依赖 在运行测试之前,请确保已安装所有开发依赖项: ```bash pip install -r requirements.txt ``` ## 运行测试 ### 运行所有测试 ```bash pytest ``` ### 运行特定测试文件 ```bash pytest tests/integration/test_api_tasks.py ``` ### 运行特定测试函数 ```bash pytest tests/unit/test_utils.py::test_safe_get_nested_and_default ``` ### 生成覆盖率报告 ```bash coverage run -m pytest coverage report coverage html # 生成 HTML 报告 ``` ## 测试文件结构 ``` tests/ ├── __init__.py ├── conftest.py # 共享 fixtures(API/CLI/样例数据) ├── fixtures/ # 贴近真实的样例数据(搜索/用户/评价/任务配置) │ ├── config.sample.json │ ├── ratings.json │ ├── search_results.json │ ├── state.sample.json │ ├── user_head.json │ └── user_items.json ├── integration/ # 关键链路集成测试(API/CLI/解析器) │ ├── test_api_tasks.py │ ├── 
test_cli_spider.py │ └── test_pipeline_parse.py └── unit/ # 核心纯函数单元测试 ├── test_domain_task.py └── test_utils.py ``` ## 编写新测试 1. 新增测试放在 `tests/integration/` 或 `tests/unit/` 2. 文件名以 `test_` 开头,函数名以 `test_` 开头 3. 测试采用同步执行(不依赖 pytest-asyncio) 4. 外部依赖(Playwright/AI/通知/网络)统一 mock 5. 使用 `tests/fixtures/` 的样例数据,避免依赖真实网络 ## 注意事项 1. 目标是离线可跑且稳定复现 2. 集成测试优先覆盖真实运行链路(API、CLI、解析器) 3. 如需新增真实场景样例,统一补充到 `tests/fixtures/` ## Live smoke(真实冒烟测试) - 目录:`tests/live/` - 默认关闭;仅当显式设置 `RUN_LIVE_TESTS=1` 时才会执行 - 推荐命令: ```bash RUN_LIVE_TESTS=1 \ LIVE_TEST_ACCOUNT_STATE_FILE=/absolute/path/to/account.json \ LIVE_TEST_KEYWORD="MacBook Pro M2" \ pytest tests/live -m live -v ``` - 一键脚本: ```bash ./run_live_smoke.sh ./run_live_smoke.sh --without-generation ``` - 可选环境变量: - `LIVE_TEST_TASK_NAME` - `LIVE_EXPECT_MIN_ITEMS`(默认 `1`) - `LIVE_TEST_DEBUG_LIMIT`(默认 `1`,只抓取/分析前 N 个新商品) - `LIVE_TIMEOUT_SECONDS`(默认 `180`) - `LIVE_ENABLE_TASK_GENERATION`(脚本默认 `1`;设为 `0` 或使用 `--without-generation` 可关闭真实 AI 任务生成慢用例) - live 套件会在临时工作目录中启动真实 `uvicorn`,并清空通知相关 env,避免污染仓库根目录或向真实通知通道发消息。 ================================================ FILE: tests/__init__.py ================================================ ================================================ FILE: tests/conftest.py ================================================ import json import os import sys from datetime import datetime, timedelta from pathlib import Path from zoneinfo import ZoneInfo import pytest from fastapi import FastAPI from fastapi.testclient import TestClient # Add repository root to the path so package imports work consistently repo_root = Path(__file__).resolve().parents[1] sys.path.insert(0, str(repo_root)) from src.api import dependencies as deps from src.api.routes import tasks from src.infrastructure.persistence.sqlite_task_repository import SqliteTaskRepository from src.services.task_service import TaskService from src.services.task_generation_service import TaskGenerationService @pytest.fixture() def fixtures_dir() -> Path: return 
class FakeSchedulerService:
    """Test double for the scheduler: counts reloads and fakes next-run times."""

    def __init__(self):
        self.reload_calls = 0
        self.next_run_times = {}

    async def reload_jobs(self, tasks):
        """Record the call and rebuild the fake next-run table.

        Each task with an id, the enabled flag and a cron expression gets a
        time 15 minutes after the previous list position, counted from a
        fixed Asia/Shanghai base time. Positions of skipped tasks still
        consume a slot.
        """
        self.reload_calls += 1
        start = datetime(2026, 3, 19, 8, 0, tzinfo=ZoneInfo("Asia/Shanghai"))

        schedule = {}
        for slot, task in enumerate(tasks, start=1):
            if task.id is None:
                continue
            if not (task.enabled and task.cron):
                continue
            schedule[task.id] = start + timedelta(minutes=slot * 15)
        self.next_run_times = schedule

    def get_next_run_time(self, task_id: int):
        """Return the fake next-run time for ``task_id``, or ``None``."""
        return self.next_run_times.get(task_id)
@pytest.fixture()
def api_client(api_context):
    """Return a ``TestClient`` wrapping the app stored under ``api_context["app"]``."""
    app_under_test = api_context["app"]
    return TestClient(app_under_test)
"enabled": false, "keyword": "canon r6", "description": "body only", "max_pages": 1, "personal_only": true, "min_price": "5000", "max_price": "12000", "cron": "*/30 * * * *", "ai_prompt_base_file": "prompts/base_prompt.txt", "ai_prompt_criteria_file": "prompts/canon_r6_criteria.txt", "decision_mode": "ai", "keyword_rules": [] } ] ================================================ FILE: tests/fixtures/ratings.json ================================================ [ { "cardData": { "rateId": "r1", "feedback": "Great seller", "rate": 1, "rateTagList": [{"text": "\u5356\u5bb6"}], "raterUserNick": "buyer_01", "gmtCreate": "2024-01-01", "pictCdnUrlList": [] } }, { "cardData": { "rateId": "r2", "feedback": "Nice buyer", "rate": 1, "rateTagList": [{"text": "\u4e70\u5bb6"}], "raterUserNick": "seller_02", "gmtCreate": "2024-02-01", "pictCdnUrlList": ["https://img.example.com/pic.jpg"] } }, { "cardData": { "rateId": "r3", "feedback": "Average", "rate": 0, "rateTagList": [{"text": "\u5356\u5bb6"}], "raterUserNick": "buyer_02", "gmtCreate": "2024-03-01", "pictCdnUrlList": [] } } ] ================================================ FILE: tests/fixtures/search_results.json ================================================ { "data": { "resultList": [ { "data": { "item": { "main": { "exContent": { "title": "Sony A7M4 Body", "price": [ {"text": "\u00a5"}, {"text": "13999"} ], "area": "Shanghai", "userNickName": "seller_01", "picUrl": "https://img.example.com/a7m4.jpg", "itemId": "123456", "oriPrice": "\u00a516999", "fishTags": { "r1": { "tagList": [ {"data": {"content": "\u9a8c\u8d27\u5b9d"}} ] } } }, "clickParam": { "args": { "publishTime": "1710000000000", "wantNum": 12, "tag": "freeship" } }, "targetUrl": "fleamarket://item?id=123456" } } } } ] } } ================================================ FILE: tests/fixtures/state.sample.json ================================================ { "session": "dummy" } ================================================ FILE: 
tests/fixtures/user_head.json ================================================ { "data": { "module": { "base": { "displayName": "seller_01", "avatar": {"avatar": "https://img.example.com/avatar.jpg"}, "introduction": "Trusted seller", "ylzTags": [ {"attributes": {"role": "seller", "level": "3"}, "text": "S3"}, {"attributes": {"role": "buyer", "level": "2"}, "text": "B2"} ] }, "tabs": { "item": {"number": 12}, "rate": {"number": 88} } } } } ================================================ FILE: tests/fixtures/user_items.json ================================================ [ { "cardData": { "itemStatus": 0, "id": "10001", "title": "Lens 24-70", "priceInfo": {"price": "3500"}, "picInfo": {"picUrl": "https://img.example.com/lens.jpg"} } }, { "cardData": { "itemStatus": 1, "id": "10002", "title": "Battery Grip", "priceInfo": {"price": "600"}, "picInfo": {"picUrl": "https://img.example.com/grip.jpg"} } } ] ================================================ FILE: tests/integration/test_api_dashboard.py ================================================ import json from fastapi import FastAPI from fastapi.testclient import TestClient from src.api import dependencies as deps from src.api.routes import dashboard from src.domain.models.task import TaskCreate from src.infrastructure.persistence.sqlite_task_repository import SqliteTaskRepository from src.services.task_service import TaskService def _write_jsonl(path, records): with open(path, "w", encoding="utf-8") as file: for record in records: file.write(json.dumps(record, ensure_ascii=False) + "\n") def test_dashboard_summary_aggregates_tasks_and_results(tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) jsonl_dir = tmp_path / "jsonl" jsonl_dir.mkdir(parents=True, exist_ok=True) repository = SqliteTaskRepository( db_path=str(tmp_path / "app.sqlite3"), legacy_config_file=None, ) task_service = TaskService(repository) app = FastAPI() app.include_router(dashboard.router) app.dependency_overrides[deps.get_task_service] = lambda: 
task_service client = TestClient(app) first = TaskCreate( task_name="Apple Watch 任务", keyword="apple watch", description="只关注价格合适且成色好的 Apple Watch。", max_pages=3, personal_only=True, ) second = TaskCreate( task_name="iPad 任务", keyword="ipad pro", description="关注 2024 款 iPad Pro。", max_pages=2, personal_only=True, ) created_first = task_service.create_task(first) created_second = task_service.create_task(second) import asyncio created_first = asyncio.run(created_first) created_second = asyncio.run(created_second) asyncio.run(task_service.update_task_status(created_second.id, True)) records = [ { "爬取时间": "2026-03-10T10:00:00", "搜索关键字": "apple watch", "任务名称": "Apple Watch 任务", "商品信息": { "商品ID": "watch-1", "商品标题": "Apple Watch S10", "商品链接": "https://www.goofish.com/item?id=watch-1", "当前售价": "¥1800", }, "ai_analysis": { "analysis_source": "ai", "is_recommended": True, "reason": "价格低于均价", }, }, { "爬取时间": "2026-03-10T11:00:00", "搜索关键字": "apple watch", "任务名称": "Apple Watch 任务", "商品信息": { "商品ID": "watch-2", "商品标题": "Apple Watch S10 蜂窝版", "商品链接": "https://www.goofish.com/item?id=watch-2", "当前售价": "¥2100", }, "ai_analysis": { "analysis_source": "keyword", "is_recommended": False, "reason": "未命中规则", }, }, ] _write_jsonl(jsonl_dir / "apple_watch_full_data.jsonl", records) response = client.get("/api/dashboard/summary") assert response.status_code == 200 payload = response.json() assert payload["summary"]["enabled_tasks"] == 2 assert payload["summary"]["running_tasks"] == 1 assert payload["summary"]["result_files"] == 1 assert payload["summary"]["scanned_items"] == 2 assert payload["summary"]["recommended_items"] == 1 assert payload["summary"]["ai_recommended_items"] == 1 assert payload["summary"]["keyword_recommended_items"] == 0 assert payload["focus_file"] == "apple_watch_full_data.jsonl" watch_summary = next( item for item in payload["task_summaries"] if item["task_name"] == "Apple Watch 任务" ) assert watch_summary["filename"] == "apple_watch_full_data.jsonl" assert 
watch_summary["total_items"] == 2 assert watch_summary["latest_recommended_title"] == "Apple Watch S10" ipad_summary = next( item for item in payload["task_summaries"] if item["task_name"] == "iPad 任务" ) assert ipad_summary["filename"] is None assert ipad_summary["is_running"] is True statuses = {item["status"] for item in payload["recent_activities"]} assert "AI 推荐" in statuses assert "结果已更新" in statuses assert "运行中" in statuses ================================================ FILE: tests/integration/test_api_results.py ================================================ import json from fastapi import FastAPI from fastapi.testclient import TestClient from src.api.routes import results from src.services.price_history_service import record_market_snapshots def _write_jsonl(path, records): with open(path, "w", encoding="utf-8") as f: for record in records: f.write(json.dumps(record, ensure_ascii=False) + "\n") def test_results_filter_and_sort_for_keyword_recommendations(tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) jsonl_dir = tmp_path / "jsonl" jsonl_dir.mkdir(parents=True, exist_ok=True) target_file = jsonl_dir / "demo_full_data.jsonl" records = [ { "爬取时间": "2026-01-01T01:00:00", "商品信息": {"当前售价": "¥1000", "发布时间": "2026-01-01 10:00"}, "ai_analysis": { "analysis_source": "keyword", "is_recommended": True, "keyword_hit_count": 3, "reason": "命中 3 个关键词", }, }, { "爬取时间": "2026-01-01T02:00:00", "商品信息": {"当前售价": "¥2000", "发布时间": "2026-01-01 11:00"}, "ai_analysis": { "analysis_source": "keyword", "is_recommended": True, "keyword_hit_count": 1, "reason": "命中 1 个关键词", }, }, { "爬取时间": "2026-01-01T03:00:00", "商品信息": {"当前售价": "¥3000", "发布时间": "2026-01-01 12:00"}, "ai_analysis": { "analysis_source": "ai", "is_recommended": True, "reason": "AI推荐", }, }, ] _write_jsonl(target_file, records) app = FastAPI() app.include_router(results.router) client = TestClient(app) resp = client.get( "/api/results/demo_full_data.jsonl", params={"keyword_recommended_only": True, "sort_by": 
def test_results_insights_and_export_csv(tmp_path, monkeypatch):
    """Insights, listing and CSV export all work for one result file.

    Seeds a two-record JSONL result file plus two market snapshot runs, then
    checks the insights endpoint, the price insight attached to listed items,
    and the CSV export header/content.
    """
    monkeypatch.chdir(tmp_path)
    results_dir = tmp_path / "jsonl"
    results_dir.mkdir(parents=True, exist_ok=True)
    result_file = results_dir / "demo_full_data.jsonl"

    recommended_record = {
        "爬取时间": "2026-01-02T09:00:00",
        "搜索关键字": "demo",
        "任务名称": "Demo 任务",
        "商品信息": {
            "商品ID": "1001",
            "商品标题": "Demo One",
            "商品链接": "https://www.goofish.com/item?id=1001",
            "当前售价": "¥950",
            "发布时间": "2026-01-02 08:30",
        },
        "卖家信息": {"卖家昵称": "卖家A"},
        "ai_analysis": {
            "analysis_source": "ai",
            "is_recommended": True,
            "reason": "价格低于近期均价",
        },
    }
    rejected_record = {
        "爬取时间": "2026-01-02T09:05:00",
        "搜索关键字": "demo",
        "任务名称": "Demo 任务",
        "商品信息": {
            "商品ID": "1002",
            "商品标题": "Demo Two",
            "商品链接": "https://www.goofish.com/item?id=1002",
            "当前售价": "¥1200",
            "发布时间": "2026-01-02 08:45",
        },
        "卖家信息": {"卖家昵称": "卖家B"},
        "ai_analysis": {
            "analysis_source": "keyword",
            "is_recommended": False,
            "reason": "未命中",
            "keyword_hit_count": 0,
        },
    }
    _write_jsonl(result_file, [recommended_record, rejected_record])

    # Two snapshot runs on consecutive days: (run id, snapshot time, items).
    snapshot_runs = [
        (
            "run-1",
            "2026-01-01T10:00:00",
            [
                {
                    "商品ID": "1001",
                    "商品标题": "Demo One",
                    "当前售价": "¥1000",
                    "商品链接": "https://www.goofish.com/item?id=1001",
                },
                {
                    "商品ID": "1002",
                    "商品标题": "Demo Two",
                    "当前售价": "¥1200",
                    "商品链接": "https://www.goofish.com/item?id=1002",
                },
            ],
        ),
        (
            "run-2",
            "2026-01-02T10:00:00",
            [
                {
                    "商品ID": "1001",
                    "商品标题": "Demo One",
                    "当前售价": "¥950",
                    "商品链接": "https://www.goofish.com/item?id=1001",
                },
                {
                    "商品ID": "1002",
                    "商品标题": "Demo Two",
                    "当前售价": "¥1180",
                    "商品链接": "https://www.goofish.com/item?id=1002",
                },
            ],
        ),
    ]
    for run_id, taken_at, snapshot_items in snapshot_runs:
        record_market_snapshots(
            keyword="demo",
            task_name="Demo 任务",
            items=snapshot_items,
            run_id=run_id,
            snapshot_time=taken_at,
            seen_item_ids=set(),
        )

    app = FastAPI()
    app.include_router(results.router)
    client = TestClient(app)

    # Insights: both sampled items and both snapshot days are reported.
    insights_resp = client.get("/api/results/demo_full_data.jsonl/insights")
    assert insights_resp.status_code == 200
    insights = insights_resp.json()
    assert insights["market_summary"]["sample_count"] == 2
    assert len(insights["daily_trend"]) == 2

    # Listing: the first item carries a price insight with observations.
    list_resp = client.get("/api/results/demo_full_data.jsonl")
    assert list_resp.status_code == 200
    listed_items = list_resp.json()["items"]
    assert listed_items[0]["price_insight"]["observation_count"] >= 1

    # Export: CSV content type, expected header prefix and a known title.
    export_resp = client.get(
        "/api/results/demo_full_data.jsonl/export",
        params={"sort_by": "price", "sort_order": "asc"},
    )
    assert export_resp.status_code == 200
    assert "text/csv" in export_resp.headers["content-type"]
    csv_text = export_resp.text
    assert "任务名称,搜索关键字,商品ID,商品标题" in csv_text
    assert "Demo One" in csv_text
# Environment variables removed before each settings test (see
# _clear_settings_env) so values from the host process cannot leak in.
_SETTINGS_ENV_KEYS = [
    "ACCOUNT_ROTATION_ENABLED",
    "ACCOUNT_ROTATION_MODE",
    "ACCOUNT_ROTATION_RETRY_LIMIT",
    "ACCOUNT_BLACKLIST_TTL",
    "ACCOUNT_STATE_DIR",
    "PROXY_ROTATION_ENABLED",
    "PROXY_ROTATION_MODE",
    "PROXY_POOL",
    "PROXY_ROTATION_RETRY_LIMIT",
    "PROXY_BLACKLIST_TTL",
    "OPENAI_API_KEY",
    "OPENAI_BASE_URL",
    "OPENAI_MODEL_NAME",
    "SKIP_AI_ANALYSIS",
    "PROXY_URL",
    "NTFY_TOPIC_URL",
    "GOTIFY_URL",
    "GOTIFY_TOKEN",
    "BARK_URL",
    "WX_BOT_URL",
    "TELEGRAM_BOT_TOKEN",
    "TELEGRAM_CHAT_ID",
    "TELEGRAM_API_BASE_URL",
    "WEBHOOK_URL",
    "WEBHOOK_METHOD",
    "WEBHOOK_HEADERS",
    "WEBHOOK_CONTENT_TYPE",
    "WEBHOOK_QUERY_PARAMETERS",
    "WEBHOOK_BODY",
    "PCURL_TO_MOBILE",
]


class _IdleProcessService:
    """Process-service stub exposing only an empty ``processes`` mapping."""

    def __init__(self) -> None:
        self.processes = {}


def _build_settings_client() -> TestClient:
    """Return a ``TestClient`` for an app that mounts only the settings router.

    The process-service dependency is overridden with ``_IdleProcessService``.
    """
    settings_app = FastAPI()
    settings_app.include_router(settings.router)
    settings_app.dependency_overrides[deps.get_process_service] = _IdleProcessService
    return TestClient(settings_app)


def _clear_settings_env(monkeypatch) -> None:
    """Delete every variable in ``_SETTINGS_ENV_KEYS``; missing ones are ignored."""
    for env_name in _SETTINGS_ENV_KEYS:
        monkeypatch.delenv(env_name, raising=False)
"PROXY_BLACKLIST_TTL=300", ] ), encoding="utf-8", ) monkeypatch.setattr(env_manager, "env_file", env_file) client = _build_settings_client() response = client.get("/api/settings/rotation") assert response.status_code == 200 payload = response.json() assert payload["ACCOUNT_ROTATION_ENABLED"] is False assert payload["ACCOUNT_ROTATION_MODE"] == "per_task" assert payload["ACCOUNT_STATE_DIR"] == "state" update_response = client.put( "/api/settings/rotation", json={ "ACCOUNT_ROTATION_ENABLED": True, "ACCOUNT_ROTATION_MODE": "on_failure", "ACCOUNT_ROTATION_RETRY_LIMIT": 4, "ACCOUNT_BLACKLIST_TTL": 900, "ACCOUNT_STATE_DIR": "accounts", }, ) assert update_response.status_code == 200 latest = env_file.read_text(encoding="utf-8") assert "ACCOUNT_ROTATION_ENABLED=true" in latest assert "ACCOUNT_ROTATION_MODE=on_failure" in latest assert "ACCOUNT_ROTATION_RETRY_LIMIT=4" in latest assert "ACCOUNT_BLACKLIST_TTL=900" in latest assert "ACCOUNT_STATE_DIR=accounts" in latest def test_notification_settings_redact_sensitive_values_and_expose_flags(tmp_path, monkeypatch): _clear_settings_env(monkeypatch) env_file = tmp_path / ".env" env_file.write_text( "\n".join( [ "NTFY_TOPIC_URL=https://ntfy.sh/demo-topic", "GOTIFY_URL=https://gotify.example.com", "GOTIFY_TOKEN=secret-token", "BARK_URL=https://api.day.app/private-key/", "WX_BOT_URL=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=secret", "TELEGRAM_BOT_TOKEN=telegram-secret", "TELEGRAM_CHAT_ID=123456", "TELEGRAM_API_BASE_URL=https://tg.example.com/proxy", "WEBHOOK_URL=https://hooks.example.com/notify?token=secret", 'WEBHOOK_HEADERS={"Authorization":"Bearer secret"}', 'WEBHOOK_BODY={"message":"{{content}}"}', ] ), encoding="utf-8", ) monkeypatch.setattr(env_manager, "env_file", env_file) client = _build_settings_client() response = client.get("/api/settings/notifications") assert response.status_code == 200 payload = response.json() assert payload["NTFY_TOPIC_URL"] == "https://ntfy.sh/demo-topic" assert payload["GOTIFY_URL"] == 
def test_update_notification_settings_rejects_invalid_channel_config(tmp_path, monkeypatch):
    """Each invalid notification update is rejected with 422 naming the field."""
    _clear_settings_env(monkeypatch)
    env_path = tmp_path / ".env"
    env_path.write_text("", encoding="utf-8")
    monkeypatch.setattr(env_manager, "env_file", env_path)
    client = _build_settings_client()

    # (request body, field name expected in the error text), sent in order.
    rejected_updates = [
        # Gotify URL supplied without a token.
        ({"GOTIFY_URL": "https://gotify.example.com"}, "GOTIFY_TOKEN"),
        # Telegram API base that is not a URL.
        ({"TELEGRAM_API_BASE_URL": "not-a-url"}, "TELEGRAM_API_BASE_URL"),
        # Webhook headers JSON is missing its closing brace.
        (
            {
                "WEBHOOK_URL": "https://hooks.example.com/notify",
                "WEBHOOK_METHOD": "POST",
                "WEBHOOK_CONTENT_TYPE": "JSON",
                "WEBHOOK_HEADERS": '{"Authorization": "Bearer secret"',
            },
            "WEBHOOK_HEADERS",
        ),
    ]
    for body, offending_field in rejected_updates:
        result = client.put("/api/settings/notifications", json=body)
        assert result.status_code == 422
        assert offending_field in result.text
"GOTIFY_URL=https://gotify.example.com", "GOTIFY_TOKEN=secret-token", "BARK_URL=https://api.day.app/private-key/", "WX_BOT_URL=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=secret", "TELEGRAM_BOT_TOKEN=telegram-secret", "TELEGRAM_CHAT_ID=123456", "WEBHOOK_URL=https://hooks.example.com/notify", ] ), encoding="utf-8", ) monkeypatch.setattr(env_manager, "env_file", env_file) client = _build_settings_client() response = client.get("/api/settings/status") assert response.status_code == 200 env_payload = response.json()["env_file"] assert env_payload["ntfy_topic_url_set"] is True assert env_payload["gotify_url_set"] is True assert env_payload["gotify_token_set"] is True assert env_payload["bark_url_set"] is True assert env_payload["wx_bot_url_set"] is True assert env_payload["telegram_bot_token_set"] is True assert env_payload["telegram_chat_id_set"] is True assert env_payload["webhook_url_set"] is True def test_notification_test_endpoint_merges_stored_secret_values(tmp_path, monkeypatch): _clear_settings_env(monkeypatch) env_file = tmp_path / ".env" env_file.write_text( "\n".join( [ "TELEGRAM_BOT_TOKEN=stored-token", "TELEGRAM_CHAT_ID=10001", "TELEGRAM_API_BASE_URL=https://tg-proxy.example.com/base", ] ), encoding="utf-8", ) monkeypatch.setattr(env_manager, "env_file", env_file) client = _build_settings_client() captured = {} class _FakeResponse: status_code = 200 def raise_for_status(self): return None def json(self): return {"ok": True} def _fake_post(url, json=None, headers=None, timeout=None): captured["url"] = url captured["json"] = json return _FakeResponse() monkeypatch.setattr("requests.post", _fake_post) response = client.post( "/api/settings/notifications/test", json={ "channel": "telegram", "settings": { "TELEGRAM_CHAT_ID": "20002", }, }, ) assert response.status_code == 200 payload = response.json() assert payload["results"]["telegram"]["success"] is True assert captured["url"] == "https://tg-proxy.example.com/base/botstored-token/sendMessage" assert 
def test_ai_settings_fall_back_to_runtime_environment_when_env_file_missing(tmp_path, monkeypatch):
    """AI settings are read from process env vars when the .env file is absent."""
    _clear_settings_env(monkeypatch)
    # The path is never written, so the env file does not exist on disk.
    missing_env_file = tmp_path / ".env"
    monkeypatch.setattr(env_manager, "env_file", missing_env_file)

    runtime_env = {
        "OPENAI_API_KEY": "runtime-key",
        "OPENAI_BASE_URL": "https://runtime.example.com/v1",
        "OPENAI_MODEL_NAME": "runtime-model",
        "PROXY_URL": "http://127.0.0.1:7890",
    }
    for name, value in runtime_env.items():
        monkeypatch.setenv(name, value)

    client = _build_settings_client()

    ai_response = client.get("/api/settings/ai")
    assert ai_response.status_code == 200
    # The expected payload has no OPENAI_API_KEY entry.
    expected_ai_settings = {
        "OPENAI_BASE_URL": "https://runtime.example.com/v1",
        "OPENAI_MODEL_NAME": "runtime-model",
        "SKIP_AI_ANALYSIS": False,
        "PROXY_URL": "http://127.0.0.1:7890",
    }
    assert ai_response.json() == expected_ai_settings

    status_response = client.get("/api/settings/status")
    assert status_response.status_code == 200
    env_status = status_response.json()["env_file"]
    assert env_status["exists"] is False
    for flag in ("openai_api_key_set", "openai_base_url_set", "openai_model_name_set"):
        assert env_status[flag] is True
def test_ai_test_endpoint_falls_back_to_responses_when_chat_completions_api_404(
    tmp_path, monkeypatch
):
    """The AI test endpoint retries via the Responses API after a chat 404.

    ``openai.OpenAI`` is replaced by a fake whose chat-completions call raises
    a 404-style exception and whose responses call succeeds with "OK".
    """
    _clear_settings_env(monkeypatch)
    env_path = tmp_path / ".env"
    env_path.write_text("", encoding="utf-8")
    monkeypatch.setattr(env_manager, "env_file", env_path)
    client = _build_settings_client()

    # Ordered log of (api_name, kwargs) for every call made on the fake client.
    request_history = []

    class _Response:
        output_text = "OK"

    class _FakeOpenAI:
        def __init__(self, **_kwargs):
            owner = self

            class _Responses:
                def create(self, **kwargs):
                    return owner._responses_create(**kwargs)

            class _Completions:
                def create(self, **kwargs):
                    return owner._chat_create(**kwargs)

            class _Chat:
                completions = _Completions()

            self.responses = _Responses()
            self.chat = _Chat()

        def _responses_create(self, **kwargs):
            request_history.append(("responses", kwargs))
            return _Response()

        def _chat_create(self, **kwargs):
            request_history.append(("chat", kwargs))
            raise Exception("Error code: 404 - page not found")

    import openai

    monkeypatch.setattr(openai, "OpenAI", _FakeOpenAI)

    response = client.post(
        "/api/settings/ai/test",
        json={
            "OPENAI_API_KEY": "demo",
            "OPENAI_BASE_URL": "https://example.com/v1/",
            "OPENAI_MODEL_NAME": "demo-model",
        },
    )
    assert response.status_code == 200
    payload = response.json()
    assert payload["success"] is True
    assert payload["response"] == "OK"

    # Chat completions is attempted first, then the Responses API.
    first_call, second_call = request_history[0], request_history[1]
    assert first_call[0] == "chat"
    assert first_call[1]["messages"][0]["content"] == settings.AI_TEST_PROMPT
    assert second_call[0] == "responses"
    assert second_call[1]["input"][0]["content"][0]["text"] == settings.AI_TEST_PROMPT
def test_start_stop_task_updates_status(api_client, api_context, sample_task_payload):
    """Starting then stopping task 0 flips ``is_running`` and is recorded once each."""
    created = api_client.post("/api/tasks/", json=sample_task_payload)
    assert created.status_code == 200

    # (action endpoint, is_running value expected right after the action).
    transitions = (
        ("/api/tasks/start/0", True),
        ("/api/tasks/stop/0", False),
    )
    for action_url, expected_running in transitions:
        assert api_client.post(action_url).status_code == 200
        detail = api_client.get("/api/tasks/0")
        assert detail.status_code == 200
        assert detail.json()["is_running"] is expected_running

    recorder = api_context["process_service"]
    assert recorder.started == [(0, sample_task_payload["task_name"])]
    assert recorder.stopped == [0]
def test_generate_ai_task_returns_job_and_completes_async(api_client, api_context, monkeypatch):
    """AI-mode generation returns 202 with a job that later completes.

    Criteria generation is replaced by a short async fake. The job status
    endpoint is then polled up to 50 times, 20 ms apart, until "completed".
    """
    payload = {
        "task_name": "Apple Watch S10",
        "keyword": "apple watch s10",
        "description": "只看国行蜂窝版,电池健康高于 95%,拒绝维修机。",
        "analyze_images": False,
        "decision_mode": "ai",
        "max_pages": 2,
        "personal_only": True,
    }

    async def fake_generate_criteria(*_args, **_kwargs):
        await asyncio.sleep(0.05)
        return "[V6.3 核心升级]\\nApple Watch criteria"

    monkeypatch.setattr(
        "src.services.task_generation_runner.generate_criteria",
        fake_generate_criteria,
    )

    response = api_client.post("/api/tasks/generate", json=payload)
    assert response.status_code == 202
    job = response.json()["job"]
    assert isinstance(job["job_id"], str)
    assert job["status"] in {"queued", "running"}
    assert job["task"] is None

    job_url = f"/api/tasks/generate-jobs/{job['job_id']}"
    assert api_client.get(job_url).status_code == 200

    polls_left = 50
    while True:
        if polls_left == 0:
            raise AssertionError("任务生成作业未在预期时间内完成")
        polls_left -= 1
        latest_job = api_client.get(job_url).json()["job"]
        if latest_job["status"] == "completed":
            break
        time.sleep(0.02)

    generated_task = latest_job["task"]
    assert generated_task["task_name"] == payload["task_name"]
    assert generated_task["ai_prompt_criteria_file"].endswith("_criteria.txt")
    assert generated_task["analyze_images"] is False
    assert api_context["scheduler_service"].reload_calls == 1
def test_create_task_rejects_fixed_account_strategy_without_state_file(api_client, sample_task_payload):
    """A "fixed" account strategy on the otherwise-default payload yields 422."""
    body = {**sample_task_payload, "account_strategy": "fixed"}
    assert api_client.post("/api/tasks/", json=body).status_code == 422


def test_create_task_accepts_rotate_account_strategy(api_client, sample_task_payload):
    """A "rotate" account strategy is accepted and echoed back on the task."""
    body = {**sample_task_payload, "account_strategy": "rotate"}
    created = api_client.post("/api/tasks/", json=body)
    assert created.status_code == 200
    assert created.json()["task"]["account_strategy"] == "rotate"


def test_update_task_accepts_six_field_cron_expression(api_client, sample_task_payload):
    """Patching a six-field cron expression succeeds and is stored unchanged."""
    six_field_cron = "0 0 8 * * *"
    assert api_client.post("/api/tasks/", json=sample_task_payload).status_code == 200
    assert api_client.patch("/api/tasks/0", json={"cron": six_field_cron}).status_code == 200

    fetched = api_client.get("/api/tasks/0")
    assert fetched.status_code == 200
    assert fetched.json()["cron"] == six_field_cron


def test_create_task_rejects_invalid_cron_expression(api_client, sample_task_payload):
    """A free-text cron value is rejected with 422."""
    body = {**sample_task_payload, "cron": "every day at 8"}
    assert api_client.post("/api/tasks/", json=body).status_code == 422
process_service.stopped == [0] assert process_service.reindexed == [] ================================================ FILE: tests/integration/test_cli_spider.py ================================================ import asyncio import importlib import json import sys import types def test_cli_runs_single_task_with_prompt(tmp_path, load_json_fixture, monkeypatch): fake_scraper = types.ModuleType("src.scraper") async def placeholder_scrape(task_config, debug_limit): return 0 fake_scraper.scrape_xianyu = placeholder_scrape monkeypatch.setitem(sys.modules, "src.scraper", fake_scraper) sys.modules.pop("spider_v2", None) spider_v2 = importlib.import_module("spider_v2") config_data = load_json_fixture("config.sample.json") base_prompt = "Base prompt. " + ("x" * 120) + " {{CRITERIA_SECTION}}" criteria_prompt = "Criteria text for A7M4." base_path = tmp_path / "base_prompt.txt" criteria_path = tmp_path / "criteria_prompt.txt" base_path.write_text(base_prompt, encoding="utf-8") criteria_path.write_text(criteria_prompt, encoding="utf-8") config_data[0]["ai_prompt_base_file"] = str(base_path) config_data[0]["ai_prompt_criteria_file"] = str(criteria_path) config_data[1]["ai_prompt_base_file"] = str(base_path) config_data[1]["ai_prompt_criteria_file"] = str(criteria_path) config_path = tmp_path / "config.json" config_path.write_text(json.dumps(config_data, ensure_ascii=False), encoding="utf-8") state_path = tmp_path / "state.json" state_path.write_text("{}", encoding="utf-8") monkeypatch.setattr(spider_v2, "STATE_FILE", str(state_path)) called = [] async def fake_scrape_xianyu(task_config, debug_limit): called.append(task_config["task_name"]) assert "{{CRITERIA_SECTION}}" not in task_config["ai_prompt_text"] assert "Criteria text for A7M4." 
def test_cli_runs_keyword_mode_without_prompt_files(tmp_path, load_json_fixture, monkeypatch):
    """Run the CLI for one keyword-mode task whose prompt files are bogus names.

    The task is switched to ``decision_mode == "keyword"`` and pointed at
    ``missing_base.txt`` / ``missing_criteria.txt``; the test then checks that
    the scraper is still invoked exactly once and receives an empty
    ``ai_prompt_text``.
    """
    # Register a stub ``src.scraper`` module *before* (re-)importing spider_v2,
    # then drop any cached spider_v2 so the import below is a fresh one.
    fake_scraper = types.ModuleType("src.scraper")

    async def placeholder_scrape(task_config, debug_limit):
        return 0

    fake_scraper.scrape_xianyu = placeholder_scrape
    monkeypatch.setitem(sys.modules, "src.scraper", fake_scraper)
    sys.modules.pop("spider_v2", None)
    spider_v2 = importlib.import_module("spider_v2")

    # Turn the first sample task into an enabled keyword-mode task whose
    # prompt file entries are names this test never creates.
    config_data = load_json_fixture("config.sample.json")
    config_data[0]["enabled"] = True
    config_data[0]["decision_mode"] = "keyword"
    config_data[0]["keyword_rules"] = ["a7m4", "验货宝"]
    config_data[0]["ai_prompt_base_file"] = "missing_base.txt"
    config_data[0]["ai_prompt_criteria_file"] = "missing_criteria.txt"

    # Write the config and an empty JSON state file into the temp directory
    # and point spider_v2.STATE_FILE at the latter.
    config_path = tmp_path / "config.json"
    config_path.write_text(json.dumps(config_data, ensure_ascii=False), encoding="utf-8")
    state_path = tmp_path / "state.json"
    state_path.write_text("{}", encoding="utf-8")
    monkeypatch.setattr(spider_v2, "STATE_FILE", str(state_path))

    # Records every task_config the CLI hands to the scraper.
    captured = []

    async def fake_scrape_xianyu(task_config, debug_limit):
        captured.append(task_config)
        return 1

    monkeypatch.setattr(spider_v2, "scrape_xianyu", fake_scrape_xianyu)
    monkeypatch.setattr(sys, "argv", ["spider_v2.py", "--config", str(config_path), "--task-name", "Sony A7M4"])

    asyncio.run(spider_v2.main())

    assert len(captured) == 1
    assert captured[0]["decision_mode"] == "keyword"
    assert captured[0]["ai_prompt_text"] == ""
def test_parse_user_head_and_items(load_json_fixture):
    """The seller-header and seller-item fixtures parse into the expected fields."""
    head_payload = load_json_fixture("user_head.json")
    items_payload = load_json_fixture("user_items.json")

    seller = asyncio.run(parse_user_head_data(head_payload))
    assert seller["卖家昵称"] == "seller_01"
    assert seller["卖家收到的评价总数"] == 88

    listing = asyncio.run(_parse_user_items_data(items_payload))
    # Check the first two entries in order against their expected status strings.
    for position, expected_status in enumerate(("在售", "已售")):
        assert listing[position]["商品状态"] == expected_status
def env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment variable *name*.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        The parsed integer, or *default* when the variable is missing, empty
        or whitespace-only.

    Raises:
        ValueError: The variable holds non-blank text that is not an integer.
            The message names the variable so a bad live-test setting is easy
            to locate.
    """
    raw = os.getenv(name)
    # A variable that is exported but left blank (``NAME=``) means "not
    # configured"; previously this crashed with an opaque ``int('')`` error.
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError as exc:
        raise ValueError(
            f"environment variable {name} must be an integer, got {raw!r}"
        ) from exc
def mirror_path(source: Path, destination: Path) -> None:
    """Expose *source* at *destination*, preferring a symlink over a copy.

    Nothing happens when *destination* already exists (including as a dangling
    symlink). If creating the symlink raises ``OSError``, the source is copied
    instead: ``shutil.copytree`` for directories, ``shutil.copy2`` otherwise.
    """
    already_present = destination.exists() or destination.is_symlink()
    if not already_present:
        try:
            destination.symlink_to(source, target_is_directory=source.is_dir())
        except OSError:
            # Symlink creation failed; fall back to a real copy.
            fallback_copy = shutil.copytree if source.is_dir() else shutil.copy2
            fallback_copy(source, destination)
def wait_for_server_ready(base_url: str, process: subprocess.Popen, log_path: Path) -> None:
    """Poll ``{base_url}/health`` until it answers 200 or the wait is abandoned.

    Args:
        base_url: Root URL of the server under test (no trailing slash).
        process: The server subprocess; polling stops as soon as it has exited.
        log_path: Server log file whose tail is attached to the failure message.

    Returns:
        ``None`` as soon as a health request returns HTTP 200.

    Raises:
        RuntimeError: The server process exited, or ``HEALTH_TIMEOUT_SECONDS``
            elapsed, before a healthy response was seen. The message carries
            the last observed error and up to the final 4000 characters of the
            server log.
    """
    deadline = time.monotonic() + HEALTH_TIMEOUT_SECONDS
    last_error = "unknown"
    while time.monotonic() < deadline:
        exit_code = process.poll()
        if exit_code is not None:
            # The server died; record that explicitly so the failure is not
            # reported as a plain timeout with a stale or "unknown" error.
            last_error = (
                f"server process exited with code {exit_code}; previous error: {last_error}"
            )
            break
        try:
            response = requests.get(f"{base_url}/health", timeout=2)
            if response.status_code == 200:
                return
            last_error = f"health status={response.status_code} body={response.text[:200]}"
        except requests.RequestException as exc:
            last_error = str(exc)
        time.sleep(1)

    # Attach the tail of the server log (when present) to aid diagnosis.
    log_excerpt = ""
    if log_path.exists():
        log_excerpt = log_path.read_text(encoding="utf-8", errors="ignore")[-4000:]
    raise RuntimeError(
        "Live app 未在预期时间内启动。"
        f" last_error={last_error}\nserver_log={log_path}\n{log_excerpt}"
    )
@pytest.fixture(scope="session")
def live_settings():
    """Session fixture that loads and sanity-checks the live-test settings.

    Skips unless ``RUN_LIVE_TESTS`` is exactly ``"1"``. Fails when the account
    state file does not exist, or when the AI test payload lacks a non-empty
    ``OPENAI_BASE_URL`` or ``OPENAI_MODEL_NAME``.
    """
    live_enabled = os.environ.get("RUN_LIVE_TESTS") == "1"
    if not live_enabled:
        pytest.skip(LIVE_SKIP_REASON)

    # This file sits two directory levels below the repository root.
    project_root = Path(__file__).resolve().parents[2]
    loaded = load_live_settings(project_root)

    account_file = loaded.account_source_path
    if not account_file.exists():
        pytest.fail(f"live 登录态文件不存在: {account_file}")

    ai_payload = loaded.ai_test_payload
    ai_configured = ai_payload.get("OPENAI_BASE_URL") and ai_payload.get("OPENAI_MODEL_NAME")
    if not ai_configured:
        pytest.fail("live 测试需要 OPENAI_BASE_URL 与 OPENAI_MODEL_NAME。")

    return loaded
pytest.mark.live REQUEST_TIMEOUT_SECONDS = 60 TASK_POLL_INTERVAL_SECONDS = 2 FORBIDDEN_LOG_MARKERS = ( "Login required", "passport.goofish.com", "FAIL_SYS_USER_VALIDATE", "AI客户端未初始化", "未找到可用的登录状态文件", "检测到登录失效/重定向", ) def api_request(session: requests.Session, method: str, url: str, **kwargs) -> requests.Response: kwargs.setdefault("timeout", REQUEST_TIMEOUT_SECONDS) return session.request(method=method, url=url, **kwargs) def fetch_task(session: requests.Session, base_url: str, task_id: int) -> dict: response = api_request(session, "get", f"{base_url}/api/tasks/{task_id}") assert response.status_code == 200, response.text return response.json() def fetch_results_or_none( session: requests.Session, base_url: str, filename: str, *, limit: int = 5, ) -> dict | None: response = api_request( session, "get", f"{base_url}/api/results/{filename}", params={"page": 1, "limit": limit}, ) if response.status_code == 404: return None assert response.status_code == 200, response.text return response.json() def find_task_log(workspace: Path, task_id: int) -> Path | None: log_dir = workspace / "logs" matches = sorted(log_dir.glob(f"*_{task_id}.log")) return matches[0] if matches else None def read_task_log(workspace: Path, task_id: int) -> tuple[Path | None, str]: log_path = find_task_log(workspace, task_id) if log_path is None: return None, "" return log_path, log_path.read_text(encoding="utf-8", errors="ignore") def assert_log_is_clean(log_text: str, log_path: Path | None) -> None: assert log_path is not None, "live 任务日志不存在。" for marker in FORBIDDEN_LOG_MARKERS: assert marker not in log_text, f"日志包含失败标记 {marker},请检查 {log_path}" def wait_for_task_running( session: requests.Session, base_url: str, task_id: int, timeout_seconds: int, ) -> dict: deadline = time.monotonic() + timeout_seconds last_task = {} while time.monotonic() < deadline: last_task = fetch_task(session, base_url, task_id) if last_task.get("is_running"): return last_task time.sleep(TASK_POLL_INTERVAL_SECONDS) 
def build_live_task_payload(account_state_file: Path, task_name: str, keyword: str) -> dict:
    """Assemble the JSON body used to create the live smoke task.

    Only the task name, keyword and account state file vary; every other field
    is a fixed value (single page, no image analysis, fixed account strategy,
    AI decision mode). *account_state_file* is stored as a string.
    """
    payload = dict(task_name=task_name, enabled=True, keyword=keyword)
    payload["description"] = (
        "Live smoke task for real Goofish traffic and real AI response validation."
    )
    payload.update(analyze_images=False, max_pages=1, personal_only=True)
    payload.update(
        ai_prompt_base_file="prompts/base_prompt.txt",
        ai_prompt_criteria_file="prompts/macbook_criteria.txt",
    )
    payload.update(
        account_state_file=str(account_state_file),
        account_strategy="fixed",
        decision_mode="ai",
    )
    return payload
health_response.text assert health_response.json()["status"] == "healthy" ai_response = api_request( session, "post", f"{live_server.base_url}/api/settings/ai/test", json=live_server.settings.ai_test_payload, ) assert ai_response.status_code == 200, ai_response.text ai_result = ai_response.json() assert ai_result["success"] is True, ai_result assert live_server.account_state_file.exists() def test_live_real_traffic_task_smoke(live_server): task_name = live_server.settings.task_name keyword = live_server.settings.keyword filename = build_result_filename(keyword) payload = build_live_task_payload(live_server.account_state_file, task_name, keyword) with requests.Session() as session: create_response = api_request( session, "post", f"{live_server.base_url}/api/tasks/", json=payload, ) assert create_response.status_code == 200, create_response.text created_task = create_response.json()["task"] task_id = created_task["id"] try: start_response = api_request( session, "post", f"{live_server.base_url}/api/tasks/start/{task_id}", ) assert start_response.status_code == 200, start_response.text final_task, result_data = wait_for_task_completion( session, live_server.base_url, task_id, filename, live_server.settings.expect_min_items, live_server.settings.timeout_seconds, live_server.workspace, ) assert final_task["is_running"] is False files_response = api_request(session, "get", f"{live_server.base_url}/api/results/files") assert files_response.status_code == 200, files_response.text assert filename in files_response.json()["files"] if result_data is None: result_data = fetch_results_or_none(session, live_server.base_url, filename) assert result_data is not None, f"结果文件 {filename} 未生成。" assert result_data["total_items"] >= live_server.settings.expect_min_items item = result_data["items"][0] product = item.get("商品信息", {}) analysis = item.get("ai_analysis", {}) assert product.get("商品标题"), item assert product.get("商品链接"), item assert product.get("当前售价"), item assert analysis, 
item assert analysis.get("analysis_source") == "ai", item log_path, log_text = read_task_log(live_server.workspace, task_id) assert_log_is_clean(log_text, log_path) finally: delete_task_safely(session, live_server.base_url, task_id) @pytest.mark.live_slow def test_live_ai_task_generation_job(live_server): if not live_server.settings.enable_task_generation: pytest.skip("未设置 LIVE_ENABLE_TASK_GENERATION=1,跳过真实 AI 任务生成测试。") payload = { "task_name": f"{live_server.settings.task_name} Generated", "keyword": live_server.settings.keyword, "description": "Generate a practical second-hand inspection criteria for live smoke validation.", "analyze_images": False, "max_pages": 1, "personal_only": True, "account_state_file": str(live_server.account_state_file), "account_strategy": "fixed", "decision_mode": "ai", } with requests.Session() as session: response = api_request( session, "post", f"{live_server.base_url}/api/tasks/generate", json=payload, ) assert response.status_code == 202, response.text job = response.json()["job"] job_id = job["job_id"] deadline = time.monotonic() + live_server.settings.timeout_seconds latest_job = job while time.monotonic() < deadline: status_response = api_request( session, "get", f"{live_server.base_url}/api/tasks/generate-jobs/{job_id}", ) assert status_response.status_code == 200, status_response.text latest_job = status_response.json()["job"] if latest_job["status"] == "completed": break if latest_job["status"] == "failed": pytest.fail(f"真实 AI 任务生成失败: {latest_job}") time.sleep(TASK_POLL_INTERVAL_SECONDS) else: pytest.fail(f"真实 AI 任务生成超时: {latest_job}") task = latest_job["task"] assert task["ai_prompt_criteria_file"] task_id = task["id"] try: start_response = api_request( session, "post", f"{live_server.base_url}/api/tasks/start/{task_id}", ) assert start_response.status_code == 200, start_response.text wait_for_task_running( session, live_server.base_url, task_id, timeout_seconds=min(live_server.settings.timeout_seconds, 30), ) stop_response 
= api_request( session, "post", f"{live_server.base_url}/api/tasks/stop/{task_id}", ) assert stop_response.status_code == 200, stop_response.text final_task, _ = wait_for_task_completion( session, live_server.base_url, task_id, build_result_filename(live_server.settings.keyword), expect_min_items=0, timeout_seconds=min(live_server.settings.timeout_seconds, 60), workspace=live_server.workspace, ) assert final_task["is_running"] is False finally: delete_task_safely(session, live_server.base_url, task_id) ================================================ FILE: tests/test_failure_guard.py ================================================ from __future__ import annotations from datetime import datetime, timedelta from src.failure_guard import FailureGuard def test_failure_guard_opens_circuit_after_threshold_and_rate_limits(tmp_path): guard_path = tmp_path / "guard.json" cookie_path = tmp_path / "xianyu_state.json" cookie_path.write_text("{}", encoding="utf-8") guard = FailureGuard( path=str(guard_path), threshold=3, pause_seconds=3 * 24 * 60 * 60, tz_name="Asia/Shanghai", ) base = datetime(2026, 3, 4, 12, 0, 0) r1 = guard.record_failure("task-a", "err-1", cookie_path=str(cookie_path), now=base) assert r1["should_notify"] is False assert r1["opened_circuit"] is False r2 = guard.record_failure("task-a", "err-2", cookie_path=str(cookie_path), now=base) assert r2["should_notify"] is False assert r2["opened_circuit"] is False r3 = guard.record_failure("task-a", "err-3", cookie_path=str(cookie_path), now=base) assert r3["should_notify"] is True assert r3["opened_circuit"] is True assert r3["paused_until"] is not None d0 = guard.should_skip_start("task-a", cookie_path=str(cookie_path), now=base) assert d0.skip is True assert d0.should_notify is False next_day = base + timedelta(days=1, minutes=1) d1 = guard.should_skip_start("task-a", cookie_path=str(cookie_path), now=next_day) assert d1.skip is True assert d1.should_notify is True d1b = guard.should_skip_start("task-a", 
def test_failure_guard_auto_recovers_on_cookie_change(tmp_path):
    """After the cookie file content changes, a paused task is no longer skipped."""
    cookie_file = tmp_path / "xianyu_state.json"
    cookie_file.write_text("{}", encoding="utf-8")
    cookie_arg = str(cookie_file)

    guard = FailureGuard(
        path=str(tmp_path / "guard.json"),
        threshold=2,
        pause_seconds=3 * 24 * 60 * 60,
        tz_name="Asia/Shanghai",
    )
    started_at = datetime(2026, 3, 4, 12, 0, 0)

    # Two failures reach the configured threshold of 2.
    for error_text in ("err-1", "err-2"):
        guard.record_failure("task-a", error_text, cookie_path=cookie_arg, now=started_at)

    while_paused = guard.should_skip_start("task-a", cookie_path=cookie_arg, now=started_at)
    assert while_paused.skip is True

    # Rewrite the cookie file with different content, then ask again a minute later.
    cookie_file.write_text('{"updated": true}', encoding="utf-8")
    one_minute_later = started_at + timedelta(minutes=1)
    after_change = guard.should_skip_start(
        "task-a",
        cookie_path=cookie_arg,
        now=one_minute_later,
    )
    assert after_change.skip is False
def _build_fake_client(responses_create_impl, chat_create_impl=None):
    """Build a stand-in client exposing ``responses.create`` and ``chat.completions.create``.

    ``responses.create`` is always *responses_create_impl*.
    ``chat.completions.create`` is *chat_create_impl* when that is truthy,
    otherwise it also falls back to *responses_create_impl*.
    """
    chat_handler = chat_create_impl if chat_create_impl else responses_create_impl
    return SimpleNamespace(
        responses=SimpleNamespace(create=responses_create_impl),
        chat=SimpleNamespace(completions=SimpleNamespace(create=chat_handler)),
    )
assert result == [ { "role": "user", "content": [ { "type": "input_image", "image_url": "data:image/jpeg;base64,ZmFrZQ==", "detail": "auto", }, {"type": "input_text", "text": "hello"}, ], } ] def test_call_ai_retries_without_structured_output_when_model_rejects_it(): client = AIClient.__new__(AIClient) client.settings = SimpleNamespace( model_name="fake-model", enable_response_format=True, enable_thinking=False, ) request_history = [] async def fake_create(**kwargs): request_history.append(kwargs) if len(request_history) == 1: raise Exception( "Error code: 400 - {'error': {'code': 'InvalidParameter', " "'message': 'The parameter `response_format.type` specified in " "the request are not valid: `json_object` is not supported by " "this model.', 'param': 'response_format.type'}}" ) return SimpleNamespace( choices=[ SimpleNamespace( message=SimpleNamespace(content='{"ok":true}') ) ] ) client.client = _build_fake_client(fake_create) response = asyncio.run(client._call_ai([{"role": "user", "content": "hi"}])) assert response == '{"ok":true}' assert request_history[0]["messages"][0]["content"] == "hi" assert request_history[0]["response_format"]["type"] == "json_object" assert "response_format" not in request_history[1] def test_call_ai_falls_back_to_responses_when_chat_completions_api_is_missing(): client = AIClient.__new__(AIClient) client.settings = SimpleNamespace( model_name="fake-model", enable_response_format=True, enable_thinking=False, ) request_history = [] async def fake_chat_create(**kwargs): request_history.append(("chat", kwargs)) raise Exception("Error code: 404 - page not found") async def fake_responses_create(**kwargs): request_history.append(("responses", kwargs)) if len([item for item in request_history if item[0] == "responses"]) == 1: raise Exception( "Error code: 400 - {'error': {'code': 'InvalidParameter', " "'message': 'The parameter `text.format.type` specified in " "the request are not valid: `json_object` is not supported by " "this model.', 
'param': 'text.format.type'}}" ) return SimpleNamespace(output_text='{"ok":true}') client.client = _build_fake_client(fake_responses_create, fake_chat_create) response = asyncio.run(client._call_ai([{"role": "user", "content": "hi"}])) assert response == '{"ok":true}' assert request_history[0][0] == "chat" assert request_history[1][0] == "responses" assert request_history[1][1]["text"]["format"]["type"] == "json_object" assert request_history[2][0] == "responses" assert "text" not in request_history[2][1] def test_call_ai_retries_without_temperature_when_gateway_rejects_it(): client = AIClient.__new__(AIClient) client.settings = SimpleNamespace( model_name="fake-model", enable_response_format=False, enable_thinking=False, ) request_history = [] async def fake_create(**kwargs): request_history.append(kwargs) if len(request_history) == 1: raise Exception("temperature is not supported by this gateway") return SimpleNamespace( choices=[ SimpleNamespace( message=SimpleNamespace(content='{"ok":true}') ) ] ) client.client = _build_fake_client(fake_create) response = asyncio.run(client._call_ai([{"role": "user", "content": "hi"}])) assert response == '{"ok":true}' assert request_history[0]["temperature"] == 0.1 assert "temperature" not in request_history[1] def test_call_ai_retries_when_response_content_is_empty(): client = AIClient.__new__(AIClient) client.settings = SimpleNamespace( model_name="fake-model", enable_response_format=False, enable_thinking=False, ) request_history = [] async def fake_create(**kwargs): request_history.append(kwargs) if len(request_history) < 4: return SimpleNamespace(output_text="") return SimpleNamespace(output_text='{"ok":true}') client.client = _build_fake_client(fake_create) response = asyncio.run(client._call_ai([{"role": "user", "content": "hi"}])) assert response == '{"ok":true}' assert len(request_history) == 4 def test_call_ai_raises_after_all_empty_response_retries_are_exhausted(): client = AIClient.__new__(AIClient) client.settings 
def _build_fake_client(responses_create_impl, chat_create_impl=None):
    """Return a namespace-based fake client for the AI handler tests.

    The result exposes two call paths: ``responses.create``, bound to
    *responses_create_impl*, and ``chat.completions.create``, bound to
    *chat_create_impl* or, when that is falsy, to *responses_create_impl*.
    """
    if chat_create_impl:
        completions_create = chat_create_impl
    else:
        completions_create = responses_create_impl

    fake_client = SimpleNamespace()
    fake_client.responses = SimpleNamespace(create=responses_create_impl)
    fake_client.chat = SimpleNamespace(
        completions=SimpleNamespace(create=completions_create)
    )
    return fake_client
monkeypatch.setattr(ai_handler, "ENABLE_RESPONSE_FORMAT", True) monkeypatch.setattr(app_config, "ENABLE_RESPONSE_FORMAT", True) with pytest.raises(ValueError, match="AI响应内容为空"): asyncio.run( ai_handler.get_ai_analysis( {"商品信息": {"商品ID": "1", "商品标题": "测试商品"}}, image_paths=[], prompt_text="请输出 JSON", ) ) assert call_count["value"] == 4 def test_get_ai_analysis_returns_parsed_json(monkeypatch, tmp_path): monkeypatch.chdir(tmp_path) call_count = {"value": 0} async def fake_create(**_kwargs): call_count["value"] += 1 return SimpleNamespace( output_text=( '{"prompt_version":"v1","is_recommended":true,' '"reason":"ok","risk_tags":[],"criteria_analysis":{"seller_type":"个人"}}' ) ) monkeypatch.setattr(ai_handler, "client", _build_fake_client(fake_create)) monkeypatch.setattr(ai_handler, "MODEL_NAME", "fake-model") monkeypatch.setattr(ai_handler, "ENABLE_RESPONSE_FORMAT", True) monkeypatch.setattr(app_config, "ENABLE_RESPONSE_FORMAT", True) result = asyncio.run( ai_handler.get_ai_analysis( {"商品信息": {"商品ID": "2", "商品标题": "测试商品2"}}, image_paths=[], prompt_text="请输出 JSON", ) ) assert result["is_recommended"] is True assert call_count["value"] == 1 def test_get_ai_analysis_retries_without_structured_output_when_model_rejects_it( monkeypatch, tmp_path ): monkeypatch.chdir(tmp_path) request_history = [] async def fake_create(**kwargs): request_history.append(kwargs) if len(request_history) == 1: raise Exception( "Error code: 400 - {'error': {'code': 'InvalidParameter', " "'message': 'The parameter `response_format.type` specified in " "the request are not valid: `json_object` is not supported by " "this model.', 'param': 'response_format.type'}}" ) return SimpleNamespace( choices=[ SimpleNamespace( message=SimpleNamespace( content=( '{"prompt_version":"v1","is_recommended":true,' '"reason":"ok","risk_tags":[],"criteria_analysis":{"seller_type":"个人"}}' ) ) ) ] ) monkeypatch.setattr(ai_handler, "client", _build_fake_client(fake_create)) monkeypatch.setattr(ai_handler, "MODEL_NAME", 
"fake-model") monkeypatch.setattr(ai_handler, "ENABLE_RESPONSE_FORMAT", True) monkeypatch.setattr(app_config, "ENABLE_RESPONSE_FORMAT", True) result = asyncio.run( ai_handler.get_ai_analysis( {"商品信息": {"商品ID": "3", "商品标题": "测试商品3"}}, image_paths=[], prompt_text="请输出 JSON", ) ) assert result["reason"] == "ok" assert request_history[0]["messages"][0]["role"] == "user" assert request_history[0]["response_format"]["type"] == "json_object" assert "response_format" not in request_history[1] assert ai_handler.ENABLE_RESPONSE_FORMAT is True def test_get_ai_analysis_falls_back_to_responses_when_chat_completions_api_is_missing( monkeypatch, tmp_path ): monkeypatch.chdir(tmp_path) request_history = [] async def fake_chat_create(**kwargs): request_history.append(("chat", kwargs)) raise Exception("Error code: 404 - page not found") async def fake_responses_create(**kwargs): request_history.append(("responses", kwargs)) if len([item for item in request_history if item[0] == "responses"]) == 1: raise Exception( "Error code: 400 - {'error': {'code': 'InvalidParameter', " "'message': 'The parameter `text.format.type` specified in " "the request are not valid: `json_object` is not supported by " "this model.', 'param': 'text.format.type'}}" ) return SimpleNamespace( output_text=( '{"prompt_version":"v1","is_recommended":true,' '"reason":"ok","risk_tags":[],"criteria_analysis":{"seller_type":"个人"}}' ) ) monkeypatch.setattr( ai_handler, "client", _build_fake_client(fake_responses_create, fake_chat_create), ) monkeypatch.setattr(ai_handler, "MODEL_NAME", "fake-model") monkeypatch.setattr(ai_handler, "ENABLE_RESPONSE_FORMAT", True) monkeypatch.setattr(app_config, "ENABLE_RESPONSE_FORMAT", True) result = asyncio.run( ai_handler.get_ai_analysis( {"商品信息": {"商品ID": "4", "商品标题": "测试商品4"}}, image_paths=[], prompt_text="请输出 JSON", ) ) assert result["reason"] == "ok" assert request_history[0][0] == "chat" assert request_history[0][1]["messages"][0]["role"] == "user" assert request_history[1][0] 
== "responses" assert request_history[1][1]["text"]["format"]["type"] == "json_object" assert request_history[2][0] == "responses" assert "text" not in request_history[2][1] def test_get_ai_analysis_retries_without_temperature_when_gateway_rejects_it( monkeypatch, tmp_path ): monkeypatch.chdir(tmp_path) request_history = [] async def fake_create(**kwargs): request_history.append(kwargs) if len(request_history) == 1: raise Exception("temperature is unsupported for this model") return SimpleNamespace( choices=[ SimpleNamespace( message=SimpleNamespace( content=( '{"prompt_version":"v1","is_recommended":true,' '"reason":"ok","risk_tags":[],"criteria_analysis":{"seller_type":"个人"}}' ) ) ) ] ) monkeypatch.setattr(ai_handler, "client", _build_fake_client(fake_create)) monkeypatch.setattr(ai_handler, "MODEL_NAME", "fake-model") monkeypatch.setattr(ai_handler, "ENABLE_RESPONSE_FORMAT", True) monkeypatch.setattr(app_config, "ENABLE_RESPONSE_FORMAT", True) result = asyncio.run( ai_handler.get_ai_analysis( {"商品信息": {"商品ID": "4", "商品标题": "测试商品4"}}, image_paths=[], prompt_text="请输出 JSON", ) ) assert result["reason"] == "ok" assert request_history[0]["temperature"] == 0.1 assert "temperature" not in request_history[1] def test_get_ai_analysis_uses_first_json_object_when_model_returns_multiple_objects( monkeypatch, tmp_path ): monkeypatch.chdir(tmp_path) async def fake_create(**_kwargs): return SimpleNamespace( output_text="""```json {"prompt_version":"v1","is_recommended":true,"reason":"first","risk_tags":[],"criteria_analysis":{"seller_type":"个人"}} {"prompt_version":"v1","is_recommended":false,"reason":"second","risk_tags":[],"criteria_analysis":{"seller_type":"商家"}} ```""" ) monkeypatch.setattr(ai_handler, "client", _build_fake_client(fake_create)) monkeypatch.setattr(ai_handler, "MODEL_NAME", "fake-model") monkeypatch.setattr(ai_handler, "ENABLE_RESPONSE_FORMAT", True) monkeypatch.setattr(app_config, "ENABLE_RESPONSE_FORMAT", True) result = asyncio.run( 
ai_handler.get_ai_analysis( {"商品信息": {"商品ID": "5", "商品标题": "测试商品5"}}, image_paths=[], prompt_text="请输出 JSON", ) ) assert result["is_recommended"] is True assert result["reason"] == "first" ================================================ FILE: tests/unit/test_ai_handler_downloads.py ================================================ import asyncio from pathlib import Path import src.ai_handler as ai_handler def test_download_all_images_runs_with_concurrency(tmp_path, monkeypatch): monkeypatch.setattr(ai_handler, "IMAGE_SAVE_DIR", str(tmp_path / "images")) active_downloads = 0 max_active_downloads = 0 async def fake_download(url, save_path): nonlocal active_downloads, max_active_downloads active_downloads += 1 max_active_downloads = max(max_active_downloads, active_downloads) await asyncio.sleep(0.02) Path(save_path).parent.mkdir(parents=True, exist_ok=True) Path(save_path).write_text("ok", encoding="utf-8") active_downloads -= 1 return save_path monkeypatch.setattr(ai_handler, "_download_single_image", fake_download) async def run(): return await ai_handler.download_all_images( "product-1", [ "https://example.com/1.jpg", "https://example.com/2.jpg", "https://example.com/3.jpg", ], task_name="demo", concurrency=3, ) paths = asyncio.run(run()) assert len(paths) == 3 assert max_active_downloads == 3 ================================================ FILE: tests/unit/test_ai_request_compat.py ================================================ from src.services.ai_request_compat import ( is_responses_api_unsupported_error, is_temperature_unsupported_error, remove_temperature_param, ) def test_is_temperature_unsupported_error_detects_unsupported_message(): err = Exception("temperature is not supported by this gateway") assert is_temperature_unsupported_error(err) is True def test_remove_temperature_param_removes_only_temperature(): params = {"model": "x", "temperature": 0.5, "max_output_tokens": 128} result = remove_temperature_param(params) assert "temperature" not in result 
assert result["model"] == "x" assert result["max_output_tokens"] == 128 def test_is_responses_api_unsupported_error_detects_gemini_plain_404(): class _Resp: text = "" class _Err(Exception): status_code = 404 body = "" response = _Resp() def __str__(self): return "Error code: 404" assert is_responses_api_unsupported_error(_Err()) is True ================================================ FILE: tests/unit/test_ai_response_parser.py ================================================ import pytest from src.services.ai_response_parser import parse_ai_response_json def test_parse_ai_response_json_uses_first_object_when_multiple_json_objects_are_concatenated(): content = """```json {"is_recommended": true, "reason": "first"} {"is_recommended": false, "reason": "second"} ```""" result = parse_ai_response_json(content) assert result == {"is_recommended": True, "reason": "first"} def test_parse_ai_response_json_extracts_json_from_wrapped_text(): content = """分析结果如下: ```json {"is_recommended": true, "reason": "wrapped"} ``` 请按第一份结果处理。""" result = parse_ai_response_json(content) assert result == {"is_recommended": True, "reason": "wrapped"} def test_parse_ai_response_json_raises_when_no_json_exists(): with pytest.raises(ValueError): parse_ai_response_json("没有任何 JSON 内容") ================================================ FILE: tests/unit/test_app_lifespan.py ================================================ import asyncio import src.app as app_module class _FakeTaskService: def __init__(self, _repo): self.updated = [] async def get_all_tasks(self): return [] async def update_task_status(self, task_id, is_running): self.updated.append((task_id, is_running)) class _FakeSchedulerService: def __init__(self): self.started = False self.stopped = False self.reload_payload = None async def reload_jobs(self, tasks): self.reload_payload = list(tasks) def start(self): self.started = True def stop(self): self.stopped = True class _FakeProcessService: def __init__(self): self.stop_all_called = 
False async def stop_all(self): self.stop_all_called = True def test_lifespan_cleans_task_logs_on_startup(monkeypatch): called = {} fake_scheduler = _FakeSchedulerService() fake_process = _FakeProcessService() monkeypatch.setattr(app_module, "scheduler_service", fake_scheduler) monkeypatch.setattr(app_module, "process_service", fake_process) monkeypatch.setattr(app_module, "TaskService", _FakeTaskService) monkeypatch.setattr(app_module, "SqliteTaskRepository", lambda: object()) monkeypatch.setattr(app_module, "bootstrap_sqlite_storage", lambda: called.setdefault("bootstrapped", True)) monkeypatch.setattr( app_module, "cleanup_task_logs", lambda *args, **kwargs: called.setdefault("keep_days", kwargs.get("keep_days")), ) monkeypatch.setattr(app_module.app_settings, "task_log_retention_days", 9) async def _run(): async with app_module.lifespan(None): assert fake_scheduler.started is True assert fake_scheduler.reload_payload == [] asyncio.run(_run()) assert called["bootstrapped"] is True assert called["keep_days"] == 9 assert fake_scheduler.stopped is True assert fake_process.stop_all_called is True ================================================ FILE: tests/unit/test_cron_utils.py ================================================ from src.core.cron_utils import build_cron_trigger, validate_cron_expression def test_validate_cron_expression_normalizes_alias(): assert validate_cron_expression("@daily") == "0 0 * * *" def test_validate_cron_expression_accepts_six_fields(): assert validate_cron_expression("0 0 8 * * *") == "0 0 8 * * *" def test_build_cron_trigger_accepts_alias_and_timezone(): trigger = build_cron_trigger("@hourly", timezone="Asia/Shanghai") assert trigger is not None assert str(trigger.timezone) == "Asia/Shanghai" def test_validate_cron_expression_rejects_invalid_value(): try: validate_cron_expression("not-a-cron") except ValueError as exc: assert "支持 5 段" in str(exc) return raise AssertionError("非法 cron 应该抛出 ValueError") 
================================================ FILE: tests/unit/test_domain_task.py ================================================ from src.domain.models.task import Task, TaskGenerateRequest, TaskUpdate def test_task_can_start_and_stop(): task = Task( id=1, task_name="Sony A7M4", enabled=True, keyword="sony a7m4", description="body", max_pages=2, personal_only=True, min_price=None, max_price=None, cron=None, ai_prompt_base_file="prompts/base_prompt.txt", ai_prompt_criteria_file="prompts/sony_a7m4_criteria.txt", is_running=False, ) assert task.can_start() is True assert task.can_stop() is False running = task.model_copy(update={"is_running": True}) assert running.can_start() is False assert running.can_stop() is True def test_task_apply_update(): task = Task( id=1, task_name="Sony A7M4", enabled=True, keyword="sony a7m4", description="body", max_pages=2, personal_only=True, min_price=None, max_price=None, cron=None, ai_prompt_base_file="prompts/base_prompt.txt", ai_prompt_criteria_file="prompts/sony_a7m4_criteria.txt", is_running=False, ) update = TaskUpdate(enabled=False, max_pages=5) updated = task.apply_update(update) assert updated.enabled is False assert updated.max_pages == 5 assert updated.task_name == task.task_name def test_legacy_keyword_groups_are_flattened_to_keyword_rules(): task = Task( id=1, task_name="Sony A7M4", enabled=True, keyword="sony a7m4", description="body", max_pages=2, personal_only=True, min_price=None, max_price=None, cron=None, ai_prompt_base_file="prompts/base_prompt.txt", ai_prompt_criteria_file="prompts/sony_a7m4_criteria.txt", decision_mode="keyword", keyword_rule_groups=[ {"name": "组1", "include_keywords": ["a7m4", "验货宝"], "exclude_keywords": ["瑕疵"]}, {"name": "组2", "include_keywords": ["全画幅", "a7m4"], "exclude_keywords": ["拆修"]}, ], is_running=False, ) assert task.keyword_rules == ["a7m4", "验货宝", "全画幅"] def test_generate_request_accepts_legacy_group_payload(): req = TaskGenerateRequest( task_name="legacy", keyword="sony 
a7m4", description="", decision_mode="keyword", keyword_rule_groups=[{"include_keywords": ["a7m4", "验货宝"], "exclude_keywords": ["瑕疵"]}], ) assert req.keyword_rules == ["a7m4", "验货宝"] def test_generate_request_enables_image_analysis_by_default(): req = TaskGenerateRequest( task_name="Sony A7M4", keyword="sony a7m4", description="只看机身成色和卖家信用。", decision_mode="ai", ) assert req.analyze_images is True def test_generate_request_infers_fixed_account_strategy_from_state_file(): req = TaskGenerateRequest( task_name="Sony A7M4", keyword="sony a7m4", description="只看机身成色和卖家信用。", decision_mode="ai", account_state_file="state/acc_1.json", ) assert req.account_strategy == "fixed" def test_generate_request_requires_state_file_for_fixed_account_strategy(): try: TaskGenerateRequest( task_name="Sony A7M4", keyword="sony a7m4", description="只看机身成色和卖家信用。", decision_mode="ai", account_strategy="fixed", ) except ValueError as exc: assert "固定账号模式下必须选择账号" in str(exc) return raise AssertionError("固定账号模式应要求 account_state_file") ================================================ FILE: tests/unit/test_item_analysis_dispatcher.py ================================================ import asyncio from src.services.item_analysis_dispatcher import ( ItemAnalysisDispatcher, ItemAnalysisJob, ) def test_item_analysis_dispatcher_uses_bounded_concurrency(): active_ai_calls = 0 max_active_ai_calls = 0 saved_records = [] notifications = [] async def seller_loader(user_id: str): await asyncio.sleep(0.005) return {"卖家ID": user_id} async def image_downloader(product_id: str, image_urls: list[str], task_name: str): return [] async def ai_analyzer(record: dict, image_paths: list[str], prompt_text: str): nonlocal active_ai_calls, max_active_ai_calls active_ai_calls += 1 max_active_ai_calls = max(max_active_ai_calls, active_ai_calls) await asyncio.sleep(0.03) active_ai_calls -= 1 return { "analysis_source": "ai", "is_recommended": True, "reason": f"推荐 {record['商品信息']['商品ID']}", "keyword_hit_count": 0, } async def 
notifier(item_data: dict, reason: str): notifications.append((item_data["商品ID"], reason)) async def saver(record: dict, keyword: str): saved_records.append((keyword, record)) return True async def run(): dispatcher = ItemAnalysisDispatcher( concurrency=2, skip_ai_analysis=False, seller_loader=seller_loader, image_downloader=image_downloader, ai_analyzer=ai_analyzer, notifier=notifier, saver=saver, ) for index in range(3): dispatcher.submit( ItemAnalysisJob( keyword="demo", task_name="Demo", decision_mode="ai", analyze_images=False, prompt_text="prompt", keyword_rules=(), final_record={ "商品信息": {"商品ID": str(index), "商品图片列表": []}, "卖家信息": {}, }, seller_id=f"seller-{index}", zhima_credit_text="优秀", registration_duration_text="来闲鱼1年", ) ) await dispatcher.join() return dispatcher dispatcher = asyncio.run(run()) assert dispatcher.completed_count == 3 assert len(saved_records) == 3 assert len(notifications) == 3 assert max_active_ai_calls == 2 assert saved_records[0][1]["卖家信息"]["卖家ID"].startswith("seller-") def test_item_analysis_dispatcher_supports_keyword_mode_without_ai(): saved_records = [] async def seller_loader(user_id: str): return {"卖家标签": "个人闲置"} async def image_downloader(product_id: str, image_urls: list[str], task_name: str): raise AssertionError("关键词模式不应下载图片") async def ai_analyzer(record: dict, image_paths: list[str], prompt_text: str): raise AssertionError("关键词模式不应调用 AI") async def notifier(item_data: dict, reason: str): return None async def saver(record: dict, keyword: str): saved_records.append(record) return True async def run(): dispatcher = ItemAnalysisDispatcher( concurrency=1, skip_ai_analysis=False, seller_loader=seller_loader, image_downloader=image_downloader, ai_analyzer=ai_analyzer, notifier=notifier, saver=saver, ) dispatcher.submit( ItemAnalysisJob( keyword="demo", task_name="Demo", decision_mode="keyword", analyze_images=False, prompt_text="", keyword_rules=("个人闲置",), final_record={ "商品信息": {"商品ID": "1", "商品标题": "演示商品"}, "卖家信息": {}, }, 
seller_id="seller-1", zhima_credit_text="优秀", registration_duration_text="来闲鱼1年", ) ) await dispatcher.join() asyncio.run(run()) assert saved_records[0]["ai_analysis"]["analysis_source"] == "keyword" assert saved_records[0]["ai_analysis"]["is_recommended"] is True ================================================ FILE: tests/unit/test_keyword_rule_engine.py ================================================ from src.keyword_rule_engine import build_search_text, evaluate_keyword_rules def _sample_record(): return { "商品信息": { "商品标题": "Sony A7M4 全画幅相机", "当前售价": "10000", "商品标签": ["验货宝", "包邮"], }, "卖家信息": { "卖家昵称": "摄影器材店", "卖家个性签名": "可验机,支持同城面交", }, } def test_build_search_text_contains_product_and_seller_fields(): text = build_search_text(_sample_record()) assert "sony a7m4" in text assert "摄影器材店" in text assert "支持同城面交" in text def test_keyword_rules_or_match_any_keyword(): text = build_search_text(_sample_record()) result = evaluate_keyword_rules(["a7m4", "佳能"], text) assert result["is_recommended"] is True assert result["analysis_source"] == "keyword" assert result["keyword_hit_count"] == 1 assert result["matched_keywords"] == ["a7m4"] def test_keyword_rules_count_multiple_hits(): text = build_search_text(_sample_record()) result = evaluate_keyword_rules(["a7m4", "验货宝", "摄影器材店"], text) assert result["is_recommended"] is True assert result["keyword_hit_count"] == 3 def test_keyword_rules_case_insensitive_contains(): text = build_search_text(_sample_record()) result = evaluate_keyword_rules(["SONY", "A7M4"], text) assert result["is_recommended"] is True assert result["keyword_hit_count"] == 2 def test_keyword_rules_no_match(): text = build_search_text(_sample_record()) result = evaluate_keyword_rules(["佳能", "单反"], text) assert result["is_recommended"] is False assert result["keyword_hit_count"] == 0 def test_keyword_rules_do_not_partially_match_alphanumeric_prefixes(): result = evaluate_keyword_rules(["q1"], "富士 q1r5 旗舰相机") assert result["is_recommended"] is False 
assert result["keyword_hit_count"] == 0 def test_keyword_rules_still_match_full_alphanumeric_token(): result = evaluate_keyword_rules(["q1r5"], "富士 q1r5 旗舰相机") assert result["is_recommended"] is True assert result["keyword_hit_count"] == 1 ================================================ FILE: tests/unit/test_notification_service.py ================================================ import asyncio from src.infrastructure.external.notification_clients.base import NotificationClient from src.infrastructure.external.notification_clients.webhook_client import WebhookClient from src.services.notification_service import NotificationService class _OkClient(NotificationClient): channel_key = "ok" display_name = "OK" async def send(self, product_data, reason): return None class _FailClient(NotificationClient): channel_key = "fail" display_name = "FAIL" async def send(self, product_data, reason): raise RuntimeError("boom") def test_notification_service_collects_success_and_failure_results(): service = NotificationService([_OkClient(enabled=True), _FailClient(enabled=True)]) results = asyncio.run( service.send_notification({"商品标题": "Sony A7M4"}, "价格合适") ) assert results["ok"]["success"] is True assert results["ok"]["message"] == "发送成功" assert results["fail"]["success"] is False assert results["fail"]["message"] == "boom" def test_webhook_client_renders_json_templates(monkeypatch): captured = {} class _FakeResponse: def raise_for_status(self): return None def _fake_post(url, headers=None, json=None, data=None, timeout=None): captured["url"] = url captured["headers"] = headers captured["json"] = json captured["data"] = data return _FakeResponse() monkeypatch.setattr("requests.post", _fake_post) client = WebhookClient( webhook_url="https://hooks.example.com/notify", webhook_method="POST", webhook_headers='{"Authorization":"Bearer token"}', webhook_content_type="JSON", webhook_query_parameters='{"task":"{{title}}"}', 
webhook_body='{"message":"{{content}}","link":"{{desktop_link}}"}', pcurl_to_mobile=False, ) asyncio.run( client.send( { "商品标题": "Sony A7M4", "当前售价": "9999", "商品链接": "https://www.goofish.com/item/123", }, "价格合适", ) ) assert "task=%F0%9F%9A%A8+%E6%96%B0%E6%8E%A8%E8%8D%90%21+Sony+A7M4" in captured["url"] assert captured["headers"]["Authorization"] == "Bearer token" assert captured["json"]["message"].startswith("价格: 9999") assert captured["json"]["link"] == "https://www.goofish.com/item/123" assert captured["data"] is None ================================================ FILE: tests/unit/test_price_history_service.py ================================================ from src.services.price_history_service import ( build_item_price_context, build_price_history_insights, load_price_snapshots, record_market_snapshots, ) def test_record_market_snapshots_and_build_price_history_insights(tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) seen_item_ids = set() run1_items = [ { "商品ID": "1001", "商品标题": "Sony A7M4 单机", "当前售价": "¥10000", "商品标签": ["验货宝"], "发货地区": "上海", "卖家昵称": "卖家A", "商品链接": "https://www.goofish.com/item?id=1001", "发布时间": "2026-01-01 09:00", }, { "商品ID": "1002", "商品标题": "Sony A7M4 套机", "当前售价": "¥12000", "商品标签": ["包邮"], "发货地区": "杭州", "卖家昵称": "卖家B", "商品链接": "https://www.goofish.com/item?id=1002", "发布时间": "2026-01-01 10:00", }, ] run2_items = [ { "商品ID": "1001", "商品标题": "Sony A7M4 单机", "当前售价": "¥9500", "商品标签": ["验货宝"], "发货地区": "上海", "卖家昵称": "卖家A", "商品链接": "https://www.goofish.com/item?id=1001", "发布时间": "2026-01-02 09:00", }, { "商品ID": "1003", "商品标题": "Sony A7M4 全套", "当前售价": "¥13000", "商品标签": ["同城"], "发货地区": "南京", "卖家昵称": "卖家C", "商品链接": "https://www.goofish.com/item?id=1003", "发布时间": "2026-01-02 11:00", }, ] inserted_run1 = record_market_snapshots( keyword="sony a7m4", task_name="Sony A7M4 监控", items=run1_items, run_id="run-1", snapshot_time="2026-01-01T12:00:00", seen_item_ids=seen_item_ids, ) assert len(inserted_run1) == 2 inserted_run2 = record_market_snapshots( 
keyword="sony a7m4", task_name="Sony A7M4 监控", items=run2_items, run_id="run-2", snapshot_time="2026-01-02T12:00:00", seen_item_ids=set(), ) assert len(inserted_run2) == 2 snapshots = load_price_snapshots("sony a7m4") assert len(snapshots) == 4 insights = build_price_history_insights("sony a7m4") assert insights["market_summary"]["sample_count"] == 2 assert insights["market_summary"]["avg_price"] == 11250.0 assert insights["market_summary"]["min_price"] == 9500.0 assert insights["history_summary"]["unique_items"] == 3 assert len(insights["daily_trend"]) == 2 assert insights["daily_trend"][0]["day"] == "2026-01-01" assert insights["daily_trend"][1]["day"] == "2026-01-02" item_context = build_item_price_context( snapshots, item_id="1001", current_price=9500.0, ) assert item_context["observation_count"] == 2 assert item_context["min_price"] == 9500.0 assert item_context["max_price"] == 10000.0 assert item_context["price_change_amount"] == -500.0 assert item_context["deal_label"] == "高性价比" ================================================ FILE: tests/unit/test_process_service.py ================================================ import asyncio import sys from types import SimpleNamespace from src.services.process_service import ProcessService class FakeProcess: def __init__(self, pid: int): self.pid = pid self.returncode = None self._done = asyncio.Event() async def wait(self): await self._done.wait() return self.returncode def finish(self, returncode: int = 0): self.returncode = returncode self._done.set() def terminate(self): self.finish(-15) def kill(self): self.finish(-9) def test_process_service_marks_task_stopped_when_process_exits(monkeypatch, tmp_path): fake_process = FakeProcess(pid=4321) events = [] async def run_scenario(): service = ProcessService() service.failure_guard.should_skip_start = lambda *args, **kwargs: SimpleNamespace( skip=False, should_notify=False, reason="", consecutive_failures=0, paused_until=None, ) stopped = asyncio.Event() async def 
on_started(task_id: int): events.append(("started", task_id)) async def on_stopped(task_id: int): events.append(("stopped", task_id)) stopped.set() service.set_lifecycle_hooks(on_started=on_started, on_stopped=on_stopped) async def fake_create_subprocess_exec(*_args, **_kwargs): return fake_process monkeypatch.setattr( "src.services.process_service.build_task_log_path", lambda task_id, _task_name: str(tmp_path / f"task-{task_id}.log"), ) monkeypatch.setattr(asyncio, "create_subprocess_exec", fake_create_subprocess_exec) started = await service.start_task(0, "task-a") assert started is True assert events == [("started", 0)] assert service.is_running(0) is True fake_process.finish(0) await asyncio.wait_for(stopped.wait(), timeout=1) assert ("stopped", 0) in events assert service.is_running(0) is False asyncio.run(run_scenario()) def test_process_service_reindexes_runtime_maps_after_delete(): service = ProcessService() proc_a = object() proc_c = object() watcher_a = object() watcher_c = object() service.processes = {0: proc_a, 2: proc_c} service.log_paths = {0: "a.log", 2: "c.log"} service.task_names = {0: "A", 2: "C"} service.exit_watchers = {0: watcher_a, 2: watcher_c} service.reindex_after_delete(1) assert service.processes == {0: proc_a, 1: proc_c} assert service.log_paths == {0: "a.log", 1: "c.log"} assert service.task_names == {0: "A", 1: "C"} assert service.exit_watchers == {0: watcher_a, 1: watcher_c} def test_process_service_adds_debug_limit_arg_when_env_enabled(monkeypatch): monkeypatch.setenv("SPIDER_DEBUG_LIMIT", "1") service = ProcessService() command = service._build_spawn_command("task-a") assert command == [ sys.executable, "-u", "spider_v2.py", "--task-name", "task-a", "--debug-limit", "1", ] ================================================ FILE: tests/unit/test_prompt_utils.py ================================================ import asyncio import pytest import src.prompt_utils as prompt_utils from src.services.ai_response_parser import 
EmptyAIResponseError def test_generate_criteria_closes_ai_client_after_success(monkeypatch, tmp_path): close_state = {"closed": False} reference_file = tmp_path / "reference.txt" reference_file.write_text("reference", encoding="utf-8") class FakeAIClient: def is_available(self): return True def refresh(self): raise AssertionError("refresh should not be called") async def _call_ai(self, *_args, **_kwargs): return "generated criteria" async def close(self): close_state["closed"] = True monkeypatch.setattr(prompt_utils, "AIClient", FakeAIClient) result = asyncio.run( prompt_utils.generate_criteria("need a gpu", str(reference_file)) ) assert result == "generated criteria" assert close_state["closed"] is True def test_generate_criteria_closes_ai_client_after_ai_failure(monkeypatch, tmp_path): close_state = {"closed": False} reference_file = tmp_path / "reference.txt" reference_file.write_text("reference", encoding="utf-8") class FakeAIClient: def is_available(self): return True def refresh(self): raise AssertionError("refresh should not be called") async def _call_ai(self, *_args, **_kwargs): raise EmptyAIResponseError("AI响应内容为空。") async def close(self): close_state["closed"] = True monkeypatch.setattr(prompt_utils, "AIClient", FakeAIClient) with pytest.raises(EmptyAIResponseError, match="AI响应内容为空"): asyncio.run(prompt_utils.generate_criteria("need a gpu", str(reference_file))) assert close_state["closed"] is True ================================================ FILE: tests/unit/test_scraper_browser_channel.py ================================================ import importlib def _load_scraper(monkeypatch, *, login_is_edge: bool, running_in_docker: bool): monkeypatch.setenv("LOGIN_IS_EDGE", "true" if login_is_edge else "false") monkeypatch.setenv("RUNNING_IN_DOCKER", "true" if running_in_docker else "false") import src.config as config_module import src.scraper as scraper_module importlib.reload(config_module) reloaded_scraper = importlib.reload(scraper_module) 
reloaded_scraper.EDGE_DOCKER_WARNING_PRINTED = False return reloaded_scraper def test_resolve_browser_channel_uses_chromium_in_docker_even_when_edge_requested(monkeypatch, capsys): scraper = _load_scraper(monkeypatch, login_is_edge=True, running_in_docker=True) assert scraper._resolve_browser_channel() == "chromium" assert "Docker 镜像未内置 Edge" in capsys.readouterr().out def test_resolve_browser_channel_uses_msedge_locally_when_requested(monkeypatch): scraper = _load_scraper(monkeypatch, login_is_edge=True, running_in_docker=False) assert scraper._resolve_browser_channel() == "msedge" ================================================ FILE: tests/unit/test_search_pagination.py ================================================ import asyncio from playwright.async_api import TimeoutError as PlaywrightTimeoutError from src.services.search_pagination import advance_search_page from src.services.search_pagination import is_search_results_response class FakeRequest: def __init__(self, method: str = "POST"): self.method = method class FakeResponse: def __init__(self, url: str, ok: bool = True, method: str = "POST"): self.url = url self.ok = ok self.request = FakeRequest(method) class FakeLocator: def __init__(self, count: int, click_error: Exception | None = None): self._count = count self.clicks = 0 self.scrolls = 0 self.click_timeout = None self._click_error = click_error @property def first(self): return self async def count(self) -> int: return self._count async def scroll_into_view_if_needed(self) -> None: self.scrolls += 1 async def click(self, timeout: int | None = None) -> None: self.clicks += 1 self.click_timeout = timeout if self._click_error is not None: raise self._click_error class FakeResponseContext: def __init__(self, outcome): self._outcome = outcome async def __aenter__(self): return self async def __aexit__(self, exc_type, exc, tb): return False @property def value(self): return self._resolve() async def _resolve(self): if isinstance(self._outcome, 
Exception): raise self._outcome return self._outcome class FakePage: def __init__( self, next_button_count: int, outcomes: list[object], click_error: Exception | None = None, ): self.locator_stub = FakeLocator(next_button_count, click_error=click_error) self._outcomes = list(outcomes) def locator(self, _selector: str) -> FakeLocator: return self.locator_stub def expect_response(self, _predicate, timeout: int): assert timeout == 20000 if not self._outcomes: raise AssertionError("missing fake response outcome") return FakeResponseContext(self._outcomes.pop(0)) async def _noop_random_sleep(_min_seconds: float, _max_seconds: float) -> None: return None async def _noop_sleep(_seconds: float) -> None: return None def test_advance_search_page_stops_when_no_next_button() -> None: page = FakePage(next_button_count=0, outcomes=[]) logs: list[str] = [] result = asyncio.run( advance_search_page( page=page, page_num=2, logger=logs.append, wait_after_click=_noop_random_sleep, retry_sleep=_noop_sleep, ) ) assert result.advanced is False assert result.response is None assert result.stop_reason == "no_next_button" assert page.locator_stub.clicks == 0 assert logs == ["已到达最后一页,未找到可用的'下一页'按钮,停止翻页。"] def test_advance_search_page_stops_after_timeout_retries() -> None: page = FakePage( next_button_count=1, outcomes=[ PlaywrightTimeoutError("page 2 timeout"), PlaywrightTimeoutError("page 2 timeout"), ], ) logs: list[str] = [] result = asyncio.run( advance_search_page( page=page, page_num=2, logger=logs.append, wait_after_click=_noop_random_sleep, retry_sleep=_noop_sleep, ) ) assert result.advanced is False assert result.response is None assert result.stop_reason == "response_timeout" assert page.locator_stub.clicks == 2 assert page.locator_stub.scrolls == 2 assert logs == [ "等待第 2 页搜索响应超时,5秒后重试...", "等待第 2 页搜索响应超时 2 次,停止翻页。", ] def test_advance_search_page_returns_new_response_on_success() -> None: response = FakeResponse( 
url="https://example.com/h5/mtop.taobao.idlemtopsearch.pc.search/1.0/?page=2" ) page = FakePage(next_button_count=1, outcomes=[response]) result = asyncio.run( advance_search_page( page=page, page_num=2, logger=lambda _message: None, wait_after_click=_noop_random_sleep, retry_sleep=_noop_sleep, ) ) assert result.advanced is True assert result.response is response assert result.stop_reason is None assert page.locator_stub.clicks == 1 assert page.locator_stub.scrolls == 1 assert page.locator_stub.click_timeout == 10000 def test_advance_search_page_stops_when_click_times_out() -> None: page = FakePage( next_button_count=1, outcomes=[FakeResponse(url="https://example.com/unused")], click_error=PlaywrightTimeoutError("click timeout"), ) logs: list[str] = [] result = asyncio.run( advance_search_page( page=page, page_num=2, logger=logs.append, wait_after_click=_noop_random_sleep, retry_sleep=_noop_sleep, ) ) assert result.advanced is False assert result.response is None assert result.stop_reason == "click_timeout" assert page.locator_stub.clicks == 1 assert logs == ["第 2 页下一页按钮点击超时,停止翻页。"] def test_is_search_results_response_matches_exact_search_api() -> None: response = FakeResponse( url="https://h5api.m.goofish.com/h5/mtop.taobao.idlemtopsearch.pc.search/1.0/?foo=bar", method="POST", ) assert is_search_results_response(response) is True def test_is_search_results_response_rejects_search_shade_api() -> None: response = FakeResponse( url="https://h5api.m.goofish.com/h5/mtop.taobao.idlemtopsearch.pc.search.shade/1.0/?foo=bar", method="POST", ) assert is_search_results_response(response) is False def test_is_search_results_response_rejects_non_post_request() -> None: response = FakeResponse( url="https://h5api.m.goofish.com/h5/mtop.taobao.idlemtopsearch.pc.search/1.0/?foo=bar", method="GET", ) assert is_search_results_response(response) is False ================================================ FILE: tests/unit/test_seller_profile_cache.py 
================================================ import asyncio from src.services.seller_profile_cache import SellerProfileCache def test_seller_profile_cache_reuses_value_and_returns_copy(): clock = {"value": 100.0} cache = SellerProfileCache(ttl_seconds=60, time_source=lambda: clock["value"]) loader_calls = 0 async def loader(user_id: str): nonlocal loader_calls loader_calls += 1 return {"user_id": user_id, "items": []} async def run(): first = await cache.get_or_load("seller-1", loader) first["items"].append("mutated") second = await cache.get_or_load("seller-1", loader) return second second = asyncio.run(run()) assert loader_calls == 1 assert second == {"user_id": "seller-1", "items": []} def test_seller_profile_cache_coalesces_inflight_requests(): cache = SellerProfileCache(ttl_seconds=60) loader_calls = 0 async def loader(user_id: str): nonlocal loader_calls loader_calls += 1 await asyncio.sleep(0.02) return {"user_id": user_id} async def run(): return await asyncio.gather( cache.get_or_load("seller-2", loader), cache.get_or_load("seller-2", loader), ) results = asyncio.run(run()) assert loader_calls == 1 assert results == [{"user_id": "seller-2"}, {"user_id": "seller-2"}] ================================================ FILE: tests/unit/test_task_log_cleanup_service.py ================================================ from __future__ import annotations from datetime import datetime from pathlib import Path from src.services.task_log_cleanup_service import cleanup_task_logs def _write_file(path: Path, content: str = "log") -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(content, encoding="utf-8") def _set_mtime(path: Path, when: datetime) -> None: timestamp = when.timestamp() path.touch() path.chmod(0o644) import os os.utime(path, (timestamp, timestamp)) def test_cleanup_task_logs_removes_only_expired_top_level_logs(tmp_path): logs_dir = tmp_path / "logs" old_log = logs_dir / "old_task.log" recent_log = logs_dir / "recent_task.log" 
nested_ai_log = logs_dir / "ai" / "old_ai.log" state_file = logs_dir / "task-failure-guard.json" _write_file(old_log) _write_file(recent_log) _write_file(nested_ai_log) _write_file(state_file, "{}") now = datetime(2026, 3, 19, 12, 0, 0) _set_mtime(old_log, datetime(2026, 3, 1, 0, 0, 0)) _set_mtime(recent_log, datetime(2026, 3, 18, 23, 0, 0)) _set_mtime(nested_ai_log, datetime(2026, 3, 1, 0, 0, 0)) _set_mtime(state_file, datetime(2026, 3, 1, 0, 0, 0)) removed = cleanup_task_logs(str(logs_dir), keep_days=7, now=now) assert removed == [str(old_log)] assert not old_log.exists() assert recent_log.exists() assert nested_ai_log.exists() assert state_file.exists() def test_cleanup_task_logs_skips_when_retention_is_invalid(tmp_path): logs_dir = tmp_path / "logs" task_log = logs_dir / "task.log" _write_file(task_log) removed = cleanup_task_logs(str(logs_dir), keep_days=0, now=datetime(2026, 3, 19, 12, 0, 0)) assert removed == [] assert task_log.exists() ================================================ FILE: tests/unit/test_utils.py ================================================ import asyncio from src.services.result_storage_service import load_all_result_records from src.utils import ( format_registration_days, get_link_unique_key, safe_get, save_to_jsonl, ) def test_safe_get_nested_and_default(): data = {"a": {"b": [{"c": "value"}]}} assert asyncio.run(safe_get(data, "a", "b", 0, "c")) == "value" assert asyncio.run(safe_get(data, "a", "b", 1, "c", default="missing")) == "missing" def test_format_registration_days(): assert format_registration_days(400).startswith("\u6765\u95f2\u9c7c") assert format_registration_days(-1) == "\u672a\u77e5" def test_get_link_unique_key(): link = "https://www.goofish.com/item?id=123&foo=bar" assert get_link_unique_key(link) == "https://www.goofish.com/item?id=123" def test_save_to_jsonl(tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) record = { "爬取时间": "2026-01-01T10:00:00", "搜索关键字": "sony a7m4", "任务名称": "Sony A7M4", "商品信息": { "商品ID": 
"1", "商品标题": "Sony A7M4", "商品链接": "https://www.goofish.com/item?id=1", "当前售价": "¥10000", }, } ok = asyncio.run(save_to_jsonl(record, keyword="sony a7m4")) assert ok is True records = asyncio.run( load_all_result_records( "sony_a7m4_full_data.jsonl", ai_recommended_only=False, keyword_recommended_only=False, sort_by="crawl_time", sort_order="asc", ) ) assert records == [record] ================================================ FILE: web-ui/.gitignore ================================================ # Logs logs *.log npm-debug.log* yarn-debug.log* yarn-error.log* pnpm-debug.log* lerna-debug.log* node_modules dist dist-ssr *.local # Editor directories and files .vscode/* !.vscode/extensions.json .idea .DS_Store *.suo *.ntvs* *.njsproj *.sln *.sw? ================================================ FILE: web-ui/.vscode/extensions.json ================================================ { "recommendations": ["Vue.volar"] } ================================================ FILE: web-ui/Dockerfile ================================================ # Stage 1: Build the Vue application FROM node:22-alpine AS builder WORKDIR /app # Copy package files and install dependencies COPY package*.json ./ RUN npm install # Copy source code and build COPY . . RUN npm run build # Stage 2: Serve with Nginx FROM nginx:alpine # Remove default nginx static assets RUN rm -rf /usr/share/nginx/html/* # Vite 输出到容器根目录 /dist,而不是 /app/dist COPY --from=builder /dist /usr/share/nginx/html # Copy custom nginx configuration COPY nginx.conf /etc/nginx/conf.d/default.conf EXPOSE 80 CMD ["nginx", "-g", "daemon off;"] ================================================ FILE: web-ui/README.md ================================================ # Vue 3 + TypeScript + Vite This template should help get you started developing with Vue 3 and TypeScript in Vite. 
The template uses Vue 3 ` ================================================ FILE: web-ui/nginx.conf ================================================ server { listen 80; server_name localhost; root /usr/share/nginx/html; index index.html; # Serve static frontend files location / { try_files $uri $uri/ /index.html; } # Proxy API requests to the backend service location /api/ { proxy_pass http://app:8000/api/; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; } # Proxy Auth requests location /auth/ { proxy_pass http://app:8000/auth/; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; } # Proxy static files from backend if needed (though we serve frontend here) # But backend might serve user images or other static assets via /static location /static/ { proxy_pass http://app:8000/static/; } # Proxy WebSocket requests location /ws { proxy_pass http://app:8000/ws; proxy_http_version 1.1; proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "Upgrade"; proxy_set_header Host $host; } } ================================================ FILE: web-ui/package.json ================================================ { "name": "web-ui", "private": true, "version": "0.0.0", "type": "module", "scripts": { "dev": "vite", "build": "vue-tsc -b && vite build", "preview": "vite preview" }, "dependencies": { "@vueuse/core": "^14.1.0", "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "lucide-vue-next": "^0.562.0", "reka-ui": "^2.7.0", "tailwind-merge": "^3.4.0", "vue": "^3.5.24", "vue-i18n": "^10.0.7", "vue-router": "^4.6.4" }, "devDependencies": { "@types/node": "^24.10.4", "@vitejs/plugin-vue": "^6.0.1", "@vue/tsconfig": "^0.8.1", "autoprefixer": "^10.4.23", "postcss": "^8.5.6", "tailwindcss": "^3.4.17", "tailwindcss-animate": "^1.0.7", "typescript": "~5.9.3", "vite": "^7.2.4", "vue-tsc": "^3.1.4" } } ================================================ FILE: 
web-ui/postcss.config.cjs
================================================
module.exports = {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
}

================================================
FILE: web-ui/src/App.vue
================================================

================================================
FILE: web-ui/src/api/accounts.ts
================================================
// NOTE(review): the bare `Promise` return types below look like generic
// arguments lost when this dump was extracted — restore them from the
// repository before compiling.
import { http } from '@/lib/http'

export interface AccountItem {
  name: string
  path: string
}

export interface AccountDetail extends AccountItem {
  content: string
}

/** GET /api/accounts. */
export async function listAccounts(): Promise {
  return await http('/api/accounts')
}

/** GET /api/accounts/{name}; the name is URL-encoded as a single path segment. */
export async function getAccount(name: string): Promise {
  return await http(`/api/accounts/${encodeURIComponent(name)}`)
}

/** POST /api/accounts with the payload serialised as JSON. */
export async function createAccount(payload: { name: string; content: string }): Promise {
  return await http('/api/accounts', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload),
  })
}

/** PUT /api/accounts/{name} with `{ content }` as the JSON body. */
export async function updateAccount(name: string, content: string): Promise {
  return await http(`/api/accounts/${encodeURIComponent(name)}`, {
    method: 'PUT',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ content }),
  })
}

/** DELETE /api/accounts/{name}. */
export async function deleteAccount(name: string): Promise<{ message: string }> {
  return await http(`/api/accounts/${encodeURIComponent(name)}`, { method: 'DELETE' })
}

================================================
FILE: web-ui/src/api/dashboard.ts
================================================
// NOTE(review): bare `Promise` — generic argument presumably lost in extraction
// (likely the imported DashboardSnapshot); confirm against the repository.
import { http } from '@/lib/http'
import type { DashboardSnapshot } from '@/types/dashboard.d.ts'

/** GET /api/dashboard/summary. */
export async function getDashboardSummary(): Promise {
  return await http('/api/dashboard/summary')
}

================================================
FILE: web-ui/src/api/logs.ts
================================================
// NOTE(review): bare `Record` / `Promise` below look like generic arguments
// lost in extraction — restore them from the repository before compiling.
import { http } from '@/lib/http'

/**
 * GET /api/logs starting at `fromPos`.
 * `task_id` is only sent when `taskId` is neither null nor undefined.
 */
export async function getLogs(fromPos: number = 0, taskId?: number | null): Promise<{ new_content: string; new_pos: number }> {
  const params: Record = { from_pos: fromPos }
  if (taskId !== null && taskId !== undefined) {
    params.task_id = taskId
  }
  return await http('/api/logs', { params })
}

/** DELETE /api/logs, sending `task_id` only when a task id is given. */
export async function clearLogs(taskId?: number | null): Promise {
  const params: Record = {}
  if (taskId !== null && taskId !== undefined) {
    params.task_id = taskId
  }
  await http('/api/logs', { method: 'DELETE', params })
}

/** GET /api/logs/tail for one task, paged by `offsetLines` / `limitLines`. */
export async function getLogTail(
  taskId: number,
  offsetLines: number = 0,
  limitLines: number = 50
): Promise<{ content: string; has_more: boolean; next_offset: number; new_pos: number }> {
  return await http('/api/logs/tail', {
    params: {
      task_id: taskId,
      offset_lines: offsetLines,
      limit_lines: limitLines,
    },
  })
}

================================================
FILE: web-ui/src/api/prompts.ts
================================================
// NOTE(review): bare `Promise` return types below look like generic arguments
// lost in extraction — restore them from the repository before compiling.
import { http } from '@/lib/http'

export interface PromptContent {
  filename: string
  content: string
}

/** GET /api/prompts. */
export async function listPrompts(): Promise {
  return await http('/api/prompts')
}

/**
 * GET /api/prompts/{filename}.
 * The filename is URL-encoded so characters such as spaces, `#`, `?` or `%`
 * cannot alter the request path.
 */
export async function getPromptContent(filename: string): Promise {
  return await http(`/api/prompts/${encodeURIComponent(filename)}`)
}

/** PUT /api/prompts/{filename} with `{ content }` as the JSON body. */
export async function updatePrompt(filename: string, content: string): Promise<{ message: string }> {
  return await http(`/api/prompts/${encodeURIComponent(filename)}`, {
    method: 'PUT',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ content }),
  })
}

================================================
FILE: web-ui/src/api/results.ts
================================================
// NOTE(review): bare `Promise` / `Record` below look like generic arguments
// lost in extraction — restore them from the repository before compiling.
import type { ResultInsights, ResultItem } from '@/types/result.d.ts'
import { http } from '@/lib/http'

export interface GetResultContentParams {
  recommended_only?: boolean;
  ai_recommended_only?: boolean;
  keyword_recommended_only?: boolean;
  sort_by?: 'crawl_time' | 'publish_time' | 'price' | 'keyword_hit_count';
  sort_order?: 'asc' | 'desc';
  page?: number;
  limit?: number;
}

/** GET /api/results/files; resolves to `data.files`, or `[]` when that is falsy. */
export async function getResultFiles(): Promise {
  const data = await http('/api/results/files')
  return data.files || []
}

/** DELETE /api/results/files/{filename}; the filename is URL-encoded. */
export async function deleteResultFile(filename: string): Promise<{ message: string }> {
  return await http(`/api/results/files/${encodeURIComponent(filename)}`, { method: 'DELETE' })
}

/** GET /api/results/{filename} with the filter/sort/paging options as query params. */
export async function getResultContent(
  filename: string,
  params: GetResultContentParams = {}
): Promise<{ total_items: number; items: ResultItem[] }> {
  return await http(`/api/results/${encodeURIComponent(filename)}`, { params: params as Record })
}

/** GET /api/results/{filename}/insights; the filename is URL-encoded. */
export async function getResultInsights(filename: string): Promise {
  return await http(`/api/results/${encodeURIComponent(filename)}/insights`)
}

/**
 * Build the export URL for a result file.
 * Params that are undefined or null are left out of the query string.
 */
export function buildResultExportUrl(filename: string, params: GetResultContentParams = {}): string {
  const searchParams = new URLSearchParams()
  Object.entries(params).forEach(([key, value]) => {
    if (value !== undefined && value !== null) {
      searchParams.set(key, String(value))
    }
  })
  const queryString = searchParams.toString()
  return `/api/results/${encodeURIComponent(filename)}/export${queryString ? `?${queryString}` : ''}`
}

/** Trigger a browser download by clicking a temporary anchor that points at the export URL. */
export function downloadResultExport(filename: string, params: GetResultContentParams = {}) {
  const url = buildResultExportUrl(filename, params)
  const link = document.createElement('a')
  link.href = url
  link.download = ''
  document.body.appendChild(link)
  link.click()
  document.body.removeChild(link)
}

================================================
FILE: web-ui/src/api/settings.ts
================================================
// NOTE(review): bare `Promise` / `Record` below look like generic arguments
// lost in extraction — restore them from the repository before compiling.
import { http } from '@/lib/http'

export interface NotificationSettings {
  NTFY_TOPIC_URL?: string
  GOTIFY_URL?: string
  GOTIFY_TOKEN?: string
  BARK_URL?: string
  WX_BOT_URL?: string
  TELEGRAM_BOT_TOKEN?: string
  TELEGRAM_CHAT_ID?: string
  TELEGRAM_API_BASE_URL?: string
  WEBHOOK_URL?: string
  WEBHOOK_METHOD?: string
  WEBHOOK_HEADERS?: string
  WEBHOOK_CONTENT_TYPE?: string
  WEBHOOK_QUERY_PARAMETERS?: string
  WEBHOOK_BODY?: string
  PCURL_TO_MOBILE?: boolean
  BARK_URL_SET?: boolean
  GOTIFY_TOKEN_SET?: boolean
  WX_BOT_URL_SET?: boolean
  TELEGRAM_BOT_TOKEN_SET?: boolean
  WEBHOOK_URL_SET?: boolean
  WEBHOOK_HEADERS_SET?: boolean
  CONFIGURED_CHANNELS?: string[]
}

export interface NotificationSettingsUpdate {
  NTFY_TOPIC_URL?: string | null
  GOTIFY_URL?: string | null
  GOTIFY_TOKEN?: string | null
  BARK_URL?: string | null
  WX_BOT_URL?: string | null
  TELEGRAM_BOT_TOKEN?: string | null
  TELEGRAM_CHAT_ID?: string | null
  TELEGRAM_API_BASE_URL?: string | null
  WEBHOOK_URL?: string | null
  WEBHOOK_METHOD?: string | null
  WEBHOOK_HEADERS?: string | null
  WEBHOOK_CONTENT_TYPE?: string | null
  WEBHOOK_QUERY_PARAMETERS?: string | null
  WEBHOOK_BODY?: string | null
  PCURL_TO_MOBILE?: boolean
}

export interface NotificationTestResponse {
  message: string
  results: Record
}

export interface AiSettings {
  OPENAI_API_KEY?: string
  OPENAI_BASE_URL?: string
  OPENAI_MODEL_NAME?: string
  PROXY_URL?: string
}

export interface RotationSettings {
  ACCOUNT_ROTATION_ENABLED?: boolean
  ACCOUNT_ROTATION_MODE?: string
  ACCOUNT_ROTATION_RETRY_LIMIT?: number
  ACCOUNT_BLACKLIST_TTL?: number
  ACCOUNT_STATE_DIR?: string
  PROXY_ROTATION_ENABLED?: boolean
  PROXY_ROTATION_MODE?: string
  PROXY_POOL?: string
  PROXY_ROTATION_RETRY_LIMIT?: number
  PROXY_BLACKLIST_TTL?: number
}

export interface SystemStatus {
  scraper_running: boolean
  running_task_ids?: number[]
  ai_configured?: boolean
  notification_configured?: boolean
  headless_mode?: boolean
  running_in_docker?: boolean
  login_state_file: {
    exists: boolean
    path: string
  }
  env_file: {
    exists: boolean
    openai_api_key_set: boolean
    openai_base_url_set: boolean
    openai_model_name_set: boolean
    ntfy_topic_url_set: boolean
    gotify_url_set: boolean
    gotify_token_set: boolean
    bark_url_set: boolean
    wx_bot_url_set: boolean
    telegram_bot_token_set: boolean
    telegram_chat_id_set: boolean
    webhook_url_set: boolean
    webhook_headers_set: boolean
  }
  configured_notification_channels?: string[]
}

/** GET /api/settings/notifications. */
export async function getNotificationSettings(): Promise {
  return await http('/api/settings/notifications')
}

/** PUT /api/settings/notifications with the settings as the JSON body. */
export async function updateNotificationSettings(settings: NotificationSettingsUpdate): Promise<{ message: string; configured_channels: string[] }> {
  return await http('/api/settings/notifications', {
    method: 'PUT',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(settings)
  })
}

/** POST /api/settings/notifications/test with an optional channel plus the settings to try. */
export async function testNotificationSettings(
  payload: { channel?: string; settings: NotificationSettingsUpdate }
): Promise {
  return await http('/api/settings/notifications/test', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload)
  })
}

/** GET /api/settings/ai. */
export async function getAiSettings(): Promise {
  return await http('/api/settings/ai')
}

/** PUT /api/settings/ai; the response body is discarded. */
export async function updateAiSettings(settings: AiSettings): Promise {
  await http('/api/settings/ai', {
    method: 'PUT',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(settings)
  })
}

/** GET /api/settings/rotation. */
export async function getRotationSettings(): Promise {
  return await http('/api/settings/rotation')
}

/** PUT /api/settings/rotation; the response body is discarded. */
export async function updateRotationSettings(settings: RotationSettings): Promise {
  await http('/api/settings/rotation', {
    method: 'PUT',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(settings)
  })
}

/** POST /api/settings/ai/test with the settings to try. */
export async function testAiSettings(settings: AiSettings): Promise<{ success: boolean; message: string; response?: string }> {
  return await http('/api/settings/ai/test', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(settings)
  })
}

/** GET /api/settings/status. */
export async function getSystemStatus(): Promise {
  return await http('/api/settings/status')
}

/** POST /api/login-state with `{ content }` as the JSON body. */
export async function updateLoginState(content: string): Promise<{ message: string }> {
  return await http('/api/login-state', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ content })
  })
}

/** DELETE /api/login-state. */
export async function deleteLoginState(): Promise<{ message: string }> {
  return await http('/api/login-state', { method: 'DELETE' })
}

================================================
FILE: web-ui/src/api/tasks.ts
================================================
// NOTE(review): bare `Promise` return types below look like generic arguments
// lost in extraction — restore them from the repository before compiling.
import type {
  Task,
  TaskCreateResponse,
  TaskGenerateRequest,
  TaskGenerationJob,
  TaskUpdate,
} from '@/types/task.d.ts'
import { http } from '@/lib/http'

/** GET /api/tasks. */
export async function getAllTasks(): Promise {
  return await http('/api/tasks')
}

/** POST /api/tasks/generate with the generation request as the JSON body. */
export async function createTaskWithAI(data: TaskGenerateRequest): Promise {
  return await http('/api/tasks/generate', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(data),
  })
}

/**
 * GET /api/tasks/generate-jobs/{jobId} and return the `job` field of the response.
 * The job id is a free-form string, so it is URL-encoded as a single path segment.
 */
export async function getTaskGenerationJob(jobId: string): Promise {
  const result = await http(`/api/tasks/generate-jobs/${encodeURIComponent(jobId)}`)
  return result.job
}

/** PATCH /api/tasks/{taskId} and return the `task` field of the response. */
export async function updateTask(taskId: number, data: TaskUpdate): Promise {
  const result = await http(`/api/tasks/${taskId}`, {
    method: 'PATCH',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify(data),
  })
  return result.task
}

/** POST /api/tasks/start/{taskId}; the response body is discarded. */
export async function startTask(taskId: number): Promise {
  await http(`/api/tasks/start/${taskId}`, { method: 'POST' })
}

/** POST /api/tasks/stop/{taskId}; the response body is discarded. */
export async function stopTask(taskId: number): Promise {
  await http(`/api/tasks/stop/${taskId}`, { method: 'POST' })
}

/** DELETE /api/tasks/{taskId}; the response body is discarded. */
export async function deleteTask(taskId: number): Promise {
  await http(`/api/tasks/${taskId}`, { method: 'DELETE' })
}

================================================
FILE: web-ui/src/assets/main.css
================================================
@tailwind base;
@tailwind components;
@tailwind utilities;

@layer base {
  :root {
    --background: 0 0% 100%;
    --foreground: 222.2 84% 4.9%;
    --card: 0 0% 100%;
    --card-foreground: 222.2 84% 4.9%;
    --popover: 0 0% 100%;
    --popover-foreground: 222.2 84% 4.9%;
    --primary: 222.2 47.4% 11.2%;
    --primary-foreground: 210 40% 98%;
    --secondary: 210 40% 96.1%;
    --secondary-foreground: 222.2 47.4% 11.2%;
    --muted: 210 40% 96.1%;
    --muted-foreground: 215.4 16.3% 46.9%;
    --accent: 210 40% 96.1%;
    --accent-foreground: 222.2 47.4% 11.2%;
    --destructive: 0 84.2% 60.2%;
    --destructive-foreground: 210 40% 98%;
    --border: 214.3 31.8% 91.4%;
    --input: 214.3 31.8% 91.4%;
    --ring: 222.2 84% 4.9%;
    --radius: 0.5rem;
  }

  .dark {
    --background: 222.2 84% 4.9%;
    --foreground: 210 40% 98%;
    --card: 222.2 84% 4.9%;
    --card-foreground: 210 40% 98%;
    --popover: 222.2 84% 4.9%;
    --popover-foreground: 210 40% 98%;
    --primary: 210 40% 98%;
    --primary-foreground: 222.2 47.4% 11.2%;
    --secondary: 217.2 32.6% 17.5%;
    --secondary-foreground: 210 40% 98%;
    --muted: 217.2 32.6% 17.5%;
    --muted-foreground: 215 20.2% 65.1%;
    --accent: 217.2 32.6% 17.5%;
    --accent-foreground: 210 40% 98%;
    --destructive: 0 62.8% 30.6%;
    --destructive-foreground: 210 40% 98%;
    --border: 217.2 32.6% 17.5%;
    --input: 217.2 32.6% 17.5%;
    --ring: 212.7 26.8% 83.9%;
  }
}

@layer base {
  * {
    @apply border-border;
  }
  body {
    @apply bg-background text-foreground;
  }
}

================================================
FILE: web-ui/src/components/HelloWorld.vue
================================================

================================================
FILE: web-ui/src/components/layout/DashboardTaskSearch.vue
================================================

================================================
FILE: web-ui/src/components/layout/LocaleToggle.vue
================================================

================================================
FILE: web-ui/src/components/layout/TheHeader.vue
================================================

================================================
FILE: web-ui/src/components/layout/TheSidebar.vue
================================================

================================================
FILE: web-ui/src/components/results/PriceTrendChart.vue
================================================

================================================
FILE: web-ui/src/components/results/ResultCard.vue
================================================

================================================
FILE: web-ui/src/components/results/ResultsFilterBar.vue
================================================ FILE: web-ui/src/components/results/ResultsGrid.vue ================================================ ================================================ FILE: web-ui/src/components/results/ResultsInsightsPanel.vue ================================================ ================================================ FILE: web-ui/src/components/settings/NotificationSettingsPanel.vue ================================================