Repository: jdrbc/podly_pure_podcasts Branch: main Commit: 8584ec4a8f99 Files: 260 Total size: 1.3 MB Directory structure: gitextract_mp86zz6d/ ├── .cursor/ │ └── rules/ │ └── testing-conventions.mdc ├── .dockerignore ├── .github/ │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ └── feature_request.md │ ├── pull_request_template.md │ └── workflows/ │ ├── conventional-commit-check.yml │ ├── docker-publish.yml │ ├── lint-and-format.yml │ └── release.yml ├── .gitignore ├── .pylintrc ├── .releaserc.cjs ├── .worktrees/ │ └── .gitignore ├── AGENTS.md ├── Dockerfile ├── LICENCE ├── Pipfile ├── Pipfile.lite ├── README.md ├── SECURITY.md ├── compose.dev.cpu.yml ├── compose.dev.nvidia.yml ├── compose.dev.rocm.yml ├── compose.yml ├── docker-entrypoint.sh ├── docs/ │ ├── contributors.md │ ├── how_to_run_beginners.md │ ├── how_to_run_railway.md │ └── todo.txt ├── frontend/ │ ├── .gitignore │ ├── README.md │ ├── eslint.config.js │ ├── index.html │ ├── package.json │ ├── postcss.config.js │ ├── src/ │ │ ├── App.css │ │ ├── App.tsx │ │ ├── components/ │ │ │ ├── AddFeedForm.tsx │ │ │ ├── AudioPlayer.tsx │ │ │ ├── DiagnosticsModal.tsx │ │ │ ├── DownloadButton.tsx │ │ │ ├── EpisodeProcessingStatus.tsx │ │ │ ├── FeedDetail.tsx │ │ │ ├── FeedList.tsx │ │ │ ├── PlayButton.tsx │ │ │ ├── ProcessingStatsButton.tsx │ │ │ ├── ReprocessButton.tsx │ │ │ └── config/ │ │ │ ├── ConfigContext.tsx │ │ │ ├── ConfigTabs.tsx │ │ │ ├── index.ts │ │ │ ├── sections/ │ │ │ │ ├── AppSection.tsx │ │ │ │ ├── LLMSection.tsx │ │ │ │ ├── OutputSection.tsx │ │ │ │ ├── ProcessingSection.tsx │ │ │ │ ├── WhisperSection.tsx │ │ │ │ └── index.ts │ │ │ ├── shared/ │ │ │ │ ├── ConnectionStatusCard.tsx │ │ │ │ ├── EnvOverrideWarningModal.tsx │ │ │ │ ├── EnvVarHint.tsx │ │ │ │ ├── Field.tsx │ │ │ │ ├── SaveButton.tsx │ │ │ │ ├── Section.tsx │ │ │ │ ├── TestButton.tsx │ │ │ │ ├── constants.ts │ │ │ │ └── index.ts │ │ │ └── tabs/ │ │ │ ├── AdvancedTab.tsx │ │ │ ├── DefaultTab.tsx │ │ │ ├── DiscordTab.tsx │ 
│ │ ├── UserManagementTab.tsx │ │ │ └── index.ts │ │ ├── contexts/ │ │ │ ├── AudioPlayerContext.tsx │ │ │ ├── AuthContext.tsx │ │ │ └── DiagnosticsContext.tsx │ │ ├── hooks/ │ │ │ ├── useConfigState.ts │ │ │ └── useEpisodeStatus.ts │ │ ├── index.css │ │ ├── main.tsx │ │ ├── pages/ │ │ │ ├── BillingPage.tsx │ │ │ ├── ConfigPage.tsx │ │ │ ├── HomePage.tsx │ │ │ ├── JobsPage.tsx │ │ │ ├── LandingPage.tsx │ │ │ └── LoginPage.tsx │ │ ├── services/ │ │ │ └── api.ts │ │ ├── types/ │ │ │ └── index.ts │ │ ├── utils/ │ │ │ ├── clipboard.ts │ │ │ ├── diagnostics.ts │ │ │ └── httpError.ts │ │ └── vite-env.d.ts │ ├── tailwind.config.js │ ├── tsconfig.app.json │ ├── tsconfig.json │ ├── tsconfig.node.json │ └── vite.config.ts ├── pyproject.toml ├── run_podly_docker.sh ├── scripts/ │ ├── ci.sh │ ├── create_migration.sh │ ├── downgrade_db.sh │ ├── generate_lockfiles.sh │ ├── manual_publish.sh │ ├── new_worktree.sh │ ├── start_services.sh │ ├── test_full_workflow.py │ └── upgrade_db.sh ├── src/ │ ├── app/ │ │ ├── __init__.py │ │ ├── auth/ │ │ │ ├── __init__.py │ │ │ ├── bootstrap.py │ │ │ ├── discord_service.py │ │ │ ├── discord_settings.py │ │ │ ├── feed_tokens.py │ │ │ ├── guards.py │ │ │ ├── middleware.py │ │ │ ├── passwords.py │ │ │ ├── rate_limiter.py │ │ │ ├── service.py │ │ │ ├── settings.py │ │ │ └── state.py │ │ ├── background.py │ │ ├── config_store.py │ │ ├── db_commit.py │ │ ├── db_guard.py │ │ ├── extensions.py │ │ ├── feeds.py │ │ ├── ipc.py │ │ ├── job_manager.py │ │ ├── jobs_manager.py │ │ ├── jobs_manager_run_service.py │ │ ├── logger.py │ │ ├── models.py │ │ ├── post_cleanup.py │ │ ├── posts.py │ │ ├── processor.py │ │ ├── routes/ │ │ │ ├── __init__.py │ │ │ ├── auth_routes.py │ │ │ ├── billing_routes.py │ │ │ ├── config_routes.py │ │ │ ├── discord_routes.py │ │ │ ├── feed_routes.py │ │ │ ├── jobs_routes.py │ │ │ ├── main_routes.py │ │ │ ├── post_routes.py │ │ │ └── post_stats_utils.py │ │ ├── runtime_config.py │ │ ├── static/ │ │ │ └── .gitignore │ │ ├── 
templates/ │ │ │ └── index.html │ │ ├── timeout_decorator.py │ │ └── writer/ │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── actions/ │ │ │ ├── __init__.py │ │ │ ├── cleanup.py │ │ │ ├── feeds.py │ │ │ ├── jobs.py │ │ │ ├── processor.py │ │ │ ├── system.py │ │ │ └── users.py │ │ ├── client.py │ │ ├── executor.py │ │ ├── model_ops.py │ │ ├── protocol.py │ │ └── service.py │ ├── boundary_refinement_prompt.jinja │ ├── main.py │ ├── migrations/ │ │ ├── README │ │ ├── alembic.ini │ │ ├── env.py │ │ ├── script.py.mako │ │ └── versions/ │ │ ├── 0d954a44fa8e_feed_access.py │ │ ├── 16311623dd58_env_hash.py │ │ ├── 185d3448990e_stripe.py │ │ ├── 18c2402c9202_cleanup_retention_days.py │ │ ├── 2e25a15d11de_per_feed_auto_whitelist.py │ │ ├── 31d767deb401_credits.py │ │ ├── 35b12b2d9feb_landing_page.py │ │ ├── 3c7f5f7640e4_add_counters_reset_timestamp.py │ │ ├── 3d232f215842_migration.py │ │ ├── 3eb0a3a0870b_discord.py │ │ ├── 401071604e7b_config_tables.py │ │ ├── 58b4eedd4c61_add_last_active_to_user.py │ │ ├── 5bccc39c9685_zero_initial_allowance.py │ │ ├── 608e0b27fcda_stronger_access_token.py │ │ ├── 611dcb5d7f12_add_image_url_to_post_model_for_episode_.py │ │ ├── 6e0e16299dcb_alternate_feed_id.py │ │ ├── 73a6b9f9b643_allow_null_feed_id_for_aggregate_tokens.py │ │ ├── 770771437280_episode_whitelist.py │ │ ├── 7de4e57ec4bb_discord_settings.py │ │ ├── 802a2365976d_gruanular_credits.py │ │ ├── 82cfcc8e0326_refined_cuts.py │ │ ├── 89d86978f407_limit_users.py │ │ ├── 91ff431c832e_download_count.py │ │ ├── 999b921ffc58_migration.py │ │ ├── a6f5df1a50ac_add_users_table.py │ │ ├── ab643af6472e_add_manual_feed_allowance_to_user.py │ │ ├── b038c2f99086_add_processingjob_table_for_async_.py │ │ ├── b92e47a03bb2_refactor_transcripts_to_db_tables_.py │ │ ├── bae70e584468_.py │ │ ├── c0f8893ce927_add_skipped_jobs_columns.py │ │ ├── ded4b70feadb_add_image_metadata_to_feed.py │ │ ├── e1325294473b_add_autoprocess_on_download.py │ │ ├── eb51923af483_multiple_supporters.py │ │ ├── 
f6d5fee57cc3_tz_fix.py │ │ ├── f7a4195e0953_add_enable_boundary_refinement_to_llm_.py │ │ └── fa3a95ecd67d_audio_processing_paths.py │ ├── podcast_processor/ │ │ ├── __init__.py │ │ ├── ad_classifier.py │ │ ├── ad_merger.py │ │ ├── audio.py │ │ ├── audio_processor.py │ │ ├── boundary_refiner.py │ │ ├── cue_detector.py │ │ ├── llm_concurrency_limiter.py │ │ ├── llm_error_classifier.py │ │ ├── llm_model_call_utils.py │ │ ├── model_output.py │ │ ├── podcast_downloader.py │ │ ├── podcast_processor.py │ │ ├── processing_status_manager.py │ │ ├── prompt.py │ │ ├── token_rate_limiter.py │ │ ├── transcribe.py │ │ ├── transcription_manager.py │ │ └── word_boundary_refiner.py │ ├── shared/ │ │ ├── __init__.py │ │ ├── config.py │ │ ├── defaults.py │ │ ├── interfaces.py │ │ ├── llm_utils.py │ │ ├── processing_paths.py │ │ └── test_utils.py │ ├── system_prompt.txt │ ├── tests/ │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── test_ad_classifier.py │ │ ├── test_ad_classifier_rate_limiting_integration.py │ │ ├── test_aggregate_feed.py │ │ ├── test_audio_processor.py │ │ ├── test_config_error_handling.py │ │ ├── test_feeds.py │ │ ├── test_filenames.py │ │ ├── test_helpers.py │ │ ├── test_llm_concurrency_limiter.py │ │ ├── test_llm_error_classifier.py │ │ ├── test_parse_model_output.py │ │ ├── test_podcast_downloader.py │ │ ├── test_podcast_processor_cleanup.py │ │ ├── test_post_cleanup.py │ │ ├── test_post_routes.py │ │ ├── test_posts.py │ │ ├── test_process_audio.py │ │ ├── test_rate_limiting_config.py │ │ ├── test_rate_limiting_edge_cases.py │ │ ├── test_session_auth.py │ │ ├── test_token_limit_config.py │ │ ├── test_token_rate_limiter.py │ │ ├── test_transcribe.py │ │ └── test_transcription_manager.py │ ├── user_prompt.jinja │ └── word_boundary_refinement_prompt.jinja └── tests/ └── test_cue_detector.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: 
.cursor/rules/testing-conventions.mdc ================================================ --- description: Writing tests globs: alwaysApply: false --- # Testing Conventions This document describes testing conventions used in the Podly project. ## Fixtures and Dependency Injection The project uses pytest fixtures for dependency injection and test setup. Common fixtures are defined in [src/tests/conftest.py](mdc:src/tests/conftest.py). Key fixtures include: - `app` - Flask application context for testing - `test_config` - Configuration loaded from config.yml - `mock_db_session` - Mock database session - Mock classes for core components (TranscriptionManager, AdClassifier, etc.) ## SQLAlchemy Model Mocking When testing code that uses SQLAlchemy models, prefer creating custom mock classes over using `MagicMock(spec=ModelClass)` to avoid Flask context issues: ```python # Example from test_podcast_downloader.py class MockPost: """A mock Post class that doesn't require Flask context.""" def __init__(self, id=1, title="Test Episode", download_url="https://example.com/podcast.mp3"): self.id = id self.title = title self.download_url = download_url ``` See [src/tests/test_podcast_downloader.py](mdc:src/tests/test_podcast_downloader.py) for a complete example. ## Dependency Injection Prefer injecting dependencies via the constructor rather than patching. See [src/tests/test_podcast_processor.py](mdc:src/tests/test_podcast_processor.py) for examples of: - Creating test fixtures with mock dependencies - Testing error handling with failing components - Using Flask app context when needed ## Improving Coverage When writing tests to improve coverage: 1. Focus on one module at a time 2. Create mock objects for dependencies 3. Test successful and error paths 4. Use `monkeypatch` to replace functions that access external resources 5. Use `tmp_path` fixture for file operations See [src/tests/test_feeds.py](mdc:src/tests/test_feeds.py) for comprehensive examples of these patterns. 
================================================ FILE: .dockerignore ================================================ # Python cache files __pycache__/ *.py[cod] *$py.class .pytest_cache/ .mypy_cache/ # Git .git/ .github/ .gitignore # Editor files .vscode/ .idea/ *.swp *.swo # Virtual environments venv/ .env/ .venv/ env/ ENV/ # Build artifacts *.so *.egg-info/ dist/ build/ # Input/Output directories (these can be mounted as volumes instead) in/ processing/ # App instance data src/app/instance/ src/instance/ # Logs *.log # Database files *.db *.sqlite *.sqlite3 # Local configuration files .env .env.* !.env.example # Node / JS node_modules/ .DS_Store *.DS_Store # Frontend specific frontend/node_modules/ frontend/dist/ frontend/.vite/ frontend/coverage/ frontend/.nyc_output/ frontend/.eslintcache # Documentation docs/ *.md !README.md # Coverage / lint caches .coverage coverage.xml htmlcov/ .ruff_cache/ ================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms github: jdrbc ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: Bug report about: Report a problem or regression title: "[Bug]: " labels: bug assignees: "" --- ## Summary - ## Steps to reproduce 1. 
## Expected behavior - ## Actual behavior - ## Environment - App version/commit: - OS: - Deployment: local / docker / other ## Logs or screenshots - ## Additional context - ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: Feature request about: Suggest an idea or enhancement title: "[Feature]: " labels: enhancement assignees: "" --- ## Summary - ## Problem to solve - ## Proposed solution - ## Alternatives considered - ## Additional context - ================================================ FILE: .github/pull_request_template.md ================================================ ## Summary - ## Type of change - [ ] Bug fix - [ ] New feature - [ ] Refactor - [ ] Docs - [ ] Other ## Testing - [ ] `scripts/ci.sh` - [ ] Not run (explain below) ## Docs - [ ] Not needed - [ ] Updated (details below) ## Related issues - ## Notes - ## Checklist - [ ] Target branch is `Preview` - [ ] Docs updated if needed - [ ] Tests run or explicitly skipped with reasoning - [ ] If merging to main, at least one commit in this PR follows Conventional Commits (e.g., `feat:`, `fix:`, `chore:`) Please refer to https://www.conventionalcommits.org/en/v1.0.0/#summary for more details. 
================================================ FILE: .github/workflows/conventional-commit-check.yml ================================================ name: Conventional Commit Check on: pull_request: branches: - main permissions: contents: read jobs: conventional-commit: runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 0 - name: Ensure at least one Conventional Commit env: BASE_SHA: ${{ github.event.pull_request.base.sha }} HEAD_SHA: ${{ github.event.pull_request.head.sha }} run: | set -euo pipefail echo "Checking commit subjects between $BASE_SHA and $HEAD_SHA" subjects=$(git log --format=%s "$BASE_SHA..$HEAD_SHA") if [ -z "$subjects" ]; then echo "No commits found in range." exit 1 fi if echo "$subjects" | grep -Eq '^(feat|fix|docs|style|refactor|perf|test|build|ci|chore|revert)(\([^)]+\))?(!)?: .+'; then echo "Conventional Commit found." else echo "No Conventional Commit found in this PR." echo "Add at least one commit like: feat: ..., fix(scope): ..., chore: ..." echo "Please refer to https://www.conventionalcommits.org/en/v1.0.0/#summary for more details." exit 1 fi ================================================ FILE: .github/workflows/docker-publish.yml ================================================ name: Build and Publish Docker Images on: push: branches: [main] tags: ["v*"] pull_request: branches: [main] release: types: [published] permissions: contents: read packages: write env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository_owner }}/podly-pure-podcasts jobs: changes: runs-on: ubuntu-latest outputs: skip: ${{ steps.check_files.outputs.skip }} steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 0 - name: Check for documentation-only changes id: check_files run: | # For PRs, compare against the base branch. For pushes, compare against the previous commit. 
if [ "${{ github.event_name }}" = "pull_request" ]; then BASE_REF="${{ github.event.pull_request.base.ref }}" echo "Fetching base branch origin/$BASE_REF" git fetch --no-tags origin "$BASE_REF" BASE_SHA=$(git rev-parse "origin/$BASE_REF") HEAD_SHA=$(git rev-parse "${{ github.sha }}") echo "Comparing PR commits: $BASE_SHA...$HEAD_SHA" files_changed=$(git diff --name-only "$BASE_SHA"..."$HEAD_SHA") elif [ "${{ github.event_name }}" = "release" ]; then echo "Release event detected; building images for release tag" TARGET_REF="${{ github.event.release.target_commitish }}" echo "Fetching release target origin/$TARGET_REF" git fetch --no-tags origin "$TARGET_REF" || true HEAD_SHA=$(git rev-parse "${{ github.sha }}") BASE_SHA=$(git rev-parse "origin/$TARGET_REF" 2>/dev/null || git rev-parse "$TARGET_REF" 2>/dev/null || echo "$HEAD_SHA") files_changed=$(git diff --name-only "$BASE_SHA"..."$HEAD_SHA" 2>/dev/null || echo "release-trigger") else echo "Comparing push commits: HEAD~1...HEAD" if git rev-parse HEAD~1 >/dev/null 2>&1; then files_changed=$(git diff --name-only HEAD~1 HEAD) else echo "Single commit push detected; using initial commit diff" files_changed=$(git diff-tree --no-commit-id --name-only -r HEAD) fi fi echo "Files changed:" echo "$files_changed" # If no files are documentation, then we should continue non_doc_files=$(echo "$files_changed" | grep -v -E '(\.md$|^docs/|LICENCE)') if [ "${{ github.event_name }}" = "release" ]; then echo "Release build detected. Skipping documentation-only shortcut." echo "skip=false" >> $GITHUB_OUTPUT elif [ -z "$non_doc_files" ]; then echo "Only documentation files were changed. Skipping build and publish." echo "skip=true" >> $GITHUB_OUTPUT else echo "Code files were changed. Proceeding with build and publish." 
echo "skip=false" >> $GITHUB_OUTPUT fi shell: bash ## test if build is successful, but don't run every permutation on PRs build-amd64-pr-lite: needs: changes if: ${{ needs.changes.outputs.skip == 'false' && github.event_name == 'pull_request' }} runs-on: ubuntu-latest strategy: matrix: variant: - name: "lite" base: "python:3.11-slim" gpu: "false" gpu_nvidia: "false" gpu_amd: "false" lite_build: "true" env: ARCH: amd64 steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 0 - name: Free up disk space if: ${{ matrix.variant.gpu == 'true' || matrix.variant.gpu_nvidia == 'true' || matrix.variant.gpu_amd == 'true' }} run: | echo "Available disk space before cleanup:" df -h sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /usr/local/share/boost sudo rm -rf /opt/microsoft/msedge /opt/microsoft/powershell /opt/pipx /usr/lib/mono sudo rm -rf /usr/local/.ghcup /usr/share/swift docker system prune -af echo "Available disk space after cleanup:" df -h - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: driver-opts: | image=moby/buildkit:v0.12.0 - name: Log in to Container Registry uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Extract metadata id: meta uses: docker/metadata-action@v5 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | type=ref,event=branch,suffix=-${{ matrix.variant.name }}-${{ env.ARCH }} type=ref,event=pr,suffix=-${{ matrix.variant.name }}-${{ env.ARCH }} type=semver,pattern={{version}},suffix=-${{ matrix.variant.name }}-${{ env.ARCH }} type=raw,value=${{ matrix.variant.name }}-${{ env.ARCH }},enable={{is_default_branch}} - name: Build and push uses: docker/build-push-action@v5 with: context: . 
file: ./Dockerfile push: true platforms: linux/${{ env.ARCH }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} build-args: | BASE_IMAGE=${{ matrix.variant.base }} USE_GPU=${{ matrix.variant.gpu }} USE_GPU_NVIDIA=${{ matrix.variant.gpu_nvidia }} USE_GPU_AMD=${{ matrix.variant.gpu_amd }} LITE_BUILD=${{ matrix.variant.lite_build }} # Temporarily disabled due to GitHub Actions Cache service outage # cache-from: type=gha # cache-to: type=gha,mode=max build-amd64: needs: changes if: ${{ needs.changes.outputs.skip == 'false' && github.event_name != 'pull_request' }} runs-on: ubuntu-latest strategy: matrix: variant: - name: "latest" base: "python:3.11-slim" gpu: "false" gpu_nvidia: "false" gpu_amd: "false" lite_build: "false" - name: "lite" base: "python:3.11-slim" gpu: "false" gpu_nvidia: "false" gpu_amd: "false" lite_build: "true" - name: "gpu-nvidia" base: "nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04" gpu: "true" gpu_nvidia: "true" gpu_amd: "false" lite_build: "false" - name: "gpu-amd" base: "rocm/dev-ubuntu-22.04:6.4-complete" gpu: "false" gpu_nvidia: "false" gpu_amd: "true" lite_build: "false" env: ARCH: amd64 steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 0 - name: Free up disk space if: ${{ matrix.variant.gpu == 'true' || matrix.variant.gpu_nvidia == 'true' || matrix.variant.gpu_amd == 'true' }} run: | echo "Available disk space before cleanup:" df -h sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /usr/local/share/boost sudo rm -rf /opt/microsoft/msedge /opt/microsoft/powershell /opt/pipx /usr/lib/mono sudo rm -rf /usr/local/.ghcup /usr/share/swift docker system prune -af echo "Available disk space after cleanup:" df -h - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: driver-opts: | image=moby/buildkit:v0.12.0 - name: Log in to Container Registry uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ 
secrets.GITHUB_TOKEN }} - name: Extract metadata id: meta uses: docker/metadata-action@v5 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | type=ref,event=branch,suffix=-${{ matrix.variant.name }}-${{ env.ARCH }} type=ref,event=pr,suffix=-${{ matrix.variant.name }}-${{ env.ARCH }} type=semver,pattern={{version}},suffix=-${{ matrix.variant.name }}-${{ env.ARCH }} type=raw,value=${{ matrix.variant.name }}-${{ env.ARCH }},enable={{is_default_branch}} - name: Build and push uses: docker/build-push-action@v5 with: context: . file: ./Dockerfile push: true platforms: linux/${{ env.ARCH }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} build-args: | BASE_IMAGE=${{ matrix.variant.base }} USE_GPU=${{ matrix.variant.gpu }} USE_GPU_NVIDIA=${{ matrix.variant.gpu_nvidia }} USE_GPU_AMD=${{ matrix.variant.gpu_amd }} LITE_BUILD=${{ matrix.variant.lite_build }} # Temporarily disabled due to GitHub Actions Cache service outage # cache-from: type=gha # cache-to: type=gha,mode=max build-arm64: needs: changes if: ${{ needs.changes.outputs.skip == 'false' && github.event_name != 'pull_request' }} runs-on: ubuntu-latest strategy: matrix: variant: - { name: "latest", base: "python:3.11-slim", gpu: "false", gpu_nvidia: "false", gpu_amd: "false", lite_build: "false", } - { name: "lite", base: "python:3.11-slim", gpu: "false", gpu_nvidia: "false", gpu_amd: "false", lite_build: "true", } env: ARCH: arm64 steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 0 - name: Free up disk space if: ${{ matrix.variant.gpu == 'true' || matrix.variant.gpu_nvidia == 'true' || matrix.variant.gpu_amd == 'true' }} run: | echo "Available disk space before cleanup:" df -h sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /usr/local/share/boost sudo rm -rf /opt/microsoft/msedge /opt/microsoft/powershell /opt/pipx /usr/lib/mono sudo rm -rf /usr/local/.ghcup /usr/share/swift docker system prune -af echo "Available disk space after 
cleanup:" df -h - name: Set up QEMU uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 with: driver-opts: | image=moby/buildkit:v0.12.0 - name: Log in to Container Registry uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Extract metadata id: meta uses: docker/metadata-action@v5 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | type=ref,event=branch,suffix=-${{ matrix.variant.name }}-${{ env.ARCH }} type=ref,event=pr,suffix=-${{ matrix.variant.name }}-${{ env.ARCH }} type=semver,pattern={{version}},suffix=-${{ matrix.variant.name }}-${{ env.ARCH }} type=raw,value=${{ matrix.variant.name }}-${{ env.ARCH }},enable={{is_default_branch}} - name: Build and push uses: docker/build-push-action@v5 with: context: . file: ./Dockerfile push: true platforms: linux/${{ env.ARCH }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} build-args: | BASE_IMAGE=${{ matrix.variant.base }} USE_GPU=${{ matrix.variant.gpu }} USE_GPU_NVIDIA=${{ matrix.variant.gpu_nvidia }} USE_GPU_AMD=${{ matrix.variant.gpu_amd }} LITE_BUILD=${{ matrix.variant.lite_build }} # Temporarily disabled due to GitHub Actions Cache service outage # cache-from: type=gha # cache-to: type=gha,mode=max manifest: needs: [changes, build-amd64, build-arm64] if: ${{ needs.changes.outputs.skip == 'false' }} runs-on: ubuntu-latest strategy: matrix: variant: - "latest" - "lite" steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 0 - name: Log in to Container Registry uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Extract metadata (manifest) id: meta uses: docker/metadata-action@v5 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | type=ref,event=branch,suffix=-${{ matrix.variant }} type=ref,event=pr,suffix=-${{ 
matrix.variant }} type=semver,pattern={{version}},suffix=-${{ matrix.variant }} type=raw,value=${{ matrix.variant }},enable={{is_default_branch}} - name: Create and push manifest run: | set -euo pipefail tags="${{ steps.meta.outputs.tags }}" while IFS= read -r tag; do [ -z "$tag" ] && continue echo "Creating manifest for ${tag}" docker buildx imagetools create \ -t "${tag}" \ "${tag}-amd64" \ "${tag}-arm64" done <<< "$tags" ================================================ FILE: .github/workflows/lint-and-format.yml ================================================ name: Python Linting, Formatting, and Testing on: push: branches: - main pull_request: branches: - main jobs: lint-format-test: runs-on: ubuntu-latest env: PIPENV_VENV_IN_PROJECT: "1" PIP_DISABLE_PIP_VERSION_CHECK: "1" steps: - name: Checkout code uses: actions/checkout@v4 - name: Set up Python id: python uses: actions/setup-python@v5 with: python-version: "3.11" cache: "pip" cache-dependency-path: "Pipfile.lock" - name: Install ffmpeg run: sudo apt-get update -y && sudo apt-get install -y --no-install-recommends ffmpeg - name: Install pipenv run: pip install pipenv - name: Cache pipenv virtualenv uses: actions/cache@v4 with: path: .venv key: ${{ runner.os }}-venv-${{ steps.python.outputs.python-version }}-${{ hashFiles('Pipfile.lock') }} restore-keys: | ${{ runner.os }}-venv-${{ steps.python.outputs.python-version }}- - name: Cache mypy uses: actions/cache@v4 with: path: .mypy_cache key: ${{ runner.os }}-mypy-${{ steps.python.outputs.python-version }}-${{ hashFiles('Pipfile.lock') }} restore-keys: | ${{ runner.os }}-mypy-${{ steps.python.outputs.python-version }}- - name: Install dependencies run: pipenv install --dev --deploy - name: Install dependencies for mypy run: pipenv run mypy . 
--install-types --non-interactive --explicit-package-bases --exclude 'migrations' --exclude 'build' --exclude 'scripts' --exclude 'src/tests' --exclude 'src/tests/test_routes.py' --exclude 'src/app/routes.py' - name: Run pylint run: pipenv run pylint src --ignore=migrations,tests - name: Run black run: pipenv run black --check src - name: Run isort run: pipenv run isort --check-only src - name: Run pytest run: pipenv run pytest --disable-warnings ================================================ FILE: .github/workflows/release.yml ================================================ name: Release on: push: branches: - main workflow_dispatch: permissions: contents: write issues: write pull-requests: write jobs: release: runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up Node.js uses: actions/setup-node@v4 with: node-version: "20" - name: Run semantic-release env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: > npx --yes -p semantic-release -p @semantic-release/changelog -p @semantic-release/git semantic-release ================================================ FILE: .gitignore ================================================ .worktrees/* __pycache__/ *.py[cod] *$py.class *.so .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg *.manifest *.spec pip-log.txt pip-delete-this-directory.txt htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ *.mo *.pot *.log out/* processing/* config/app.log .vscode/* in/**/*.mp3 srv/**/*.mp3 *.pickle .env .env.local config/config.yml *.db *.sqlite **/sqlite3.db-* **/*.sqlite-* .DS_Store src/instance/data/* # Frontend build logs frontend-build.log # Claude Code local notes (not committed) .claude-notes/ CLAUDE_NOTES.md ================================================ FILE: .pylintrc ================================================ [MASTER] 
ignore=frontend,migrations,scripts ignore-paths=^src/(migrations|tests)/ disable= C0114, # missing-module-docstring C0115, # missing-class-docstring C0116, # missing-function-docstring R0913, # too-many-arguments R0914, # too-many-locals R0903, # too-few-public-methods W1203, # logging-fstring-interpolation W1514, # unspecified-encoding E0401, # import-error C0301, # line-too-long R0911, # too-many-return-statements [DESIGN] # Allow more statements per function to accommodate complex processing routines max-statements=100 [MASTER:src/tests/*.py] disable= W0621, # redefined-outer-name W0212, # protected-access W0613, # Unused argument C0415, # Import outside toplevel W0622, R0902 [MASTER:scripts/*.py] disable= R0917, W0718 [SIMILARITIES] # Minimum lines number of a similarity. min-similarity-lines=10 # Ignore comments when computing similarities. ignore-comments=yes # Ignore docstrings when computing similarities. ignore-docstrings=yes # Ignore imports when computing similarities. ignore-imports=no ================================================ FILE: .releaserc.cjs ================================================ const { execSync } = require("node:child_process"); const resolveRepositoryUrl = () => { if (process.env.GITHUB_REPOSITORY) { return `https://github.com/${process.env.GITHUB_REPOSITORY}.git`; } try { return execSync("git remote get-url origin", { stdio: "pipe" }) .toString() .trim(); } catch { return undefined; } }; module.exports = { branches: ["main"], repositoryUrl: resolveRepositoryUrl(), tagFormat: "v${version}", plugins: [ "@semantic-release/commit-analyzer", "@semantic-release/release-notes-generator", ["@semantic-release/changelog", { changelogFile: "CHANGELOG.md" }], [ "@semantic-release/git", { assets: ["CHANGELOG.md"], message: "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}", }, ], "@semantic-release/github", ], }; ================================================ FILE: .worktrees/.gitignore 
================================================ * !.gitignore ================================================ FILE: AGENTS.md ================================================ Project-specific rules: - Do not create Alembic migrations yourself; request the user to generate migrations after model changes. - Only use ./scripts/ci.sh to run tests & lints - do not attempt to run directly - use pipenv - All database writes must go through the `writer` service. Do not use `db.session.commit()` directly in application code. Use `writer_client.action()` instead. ================================================ FILE: Dockerfile ================================================ # Multi-stage build for combined frontend and backend ARG BASE_IMAGE=python:3.11-slim FROM node:18-alpine AS frontend-build WORKDIR /app # Copy frontend package files COPY frontend/package*.json ./ RUN npm ci # Copy frontend source code COPY frontend/ ./ # Build frontend assets with explicit error handling RUN set -e && \ npm run build && \ test -d dist && \ echo "Frontend build successful - dist directory created" # Backend stage FROM ${BASE_IMAGE} AS backend # Environment variables ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONUNBUFFERED=1 ARG CUDA_VERSION=12.4.1 ARG ROCM_VERSION=6.4 ARG USE_GPU=false ARG USE_GPU_NVIDIA=${USE_GPU} ARG USE_GPU_AMD=false ARG LITE_BUILD=false WORKDIR /app # Install dependencies based on base image RUN if [ -f /etc/debian_version ]; then \ apt-get update && \ apt-get install -y ca-certificates && \ # Determine if we need to install Python 3.11 INSTALL_PYTHON=true && \ if command -v python3 >/dev/null 2>&1; then \ if python3 --version 2>&1 | grep -q "3.11"; then \ INSTALL_PYTHON=false; \ fi; \ fi && \ if [ "$INSTALL_PYTHON" = "true" ]; then \ apt-get install -y software-properties-common && \ if ! 
apt-cache show python3.11 > /dev/null 2>&1; then \ add-apt-repository ppa:deadsnakes/ppa -y && \ apt-get update; \ fi && \ DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ python3.11 \ python3.11-distutils \ python3.11-dev \ python3-pip && \ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \ update-alternatives --set python3 /usr/bin/python3.11; \ fi && \ # Install other dependencies DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ ffmpeg \ sqlite3 \ libsqlite3-dev \ build-essential \ gosu && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* ; \ fi # Install python3-tomli if Python version is less than 3.11 (separate step for ARM compatibility) RUN if [ -f /etc/debian_version ]; then \ PYTHON_MINOR=$(python3 --version 2>&1 | grep -o 'Python 3\.[0-9]*' | cut -d '.' -f2) && \ if [ "$PYTHON_MINOR" -lt 11 ]; then \ apt-get update && \ apt-get install -y python3-tomli && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* ; \ fi ; \ fi # Copy all Pipfiles/lock files COPY Pipfile Pipfile.lock Pipfile.lite Pipfile.lite.lock ./ # Remove problematic distutils-installed packages that may conflict RUN if [ -f /etc/debian_version ]; then \ apt-get remove -y python3-blinker 2>/dev/null || true; \ fi # Install pipenv and dependencies RUN if command -v pip >/dev/null 2>&1; then \ pip install --no-cache-dir pipenv; \ elif command -v pip3 >/dev/null 2>&1; then \ pip3 install --no-cache-dir pipenv; \ else \ python3 -m pip install --no-cache-dir pipenv; \ fi # Set pip timeout and retries for better reliability ENV PIP_DEFAULT_TIMEOUT=1000 ENV PIP_RETRIES=3 ENV PIP_DISABLE_PIP_VERSION_CHECK=1 ENV PIP_NO_CACHE_DIR=1 # Set pipenv configuration for better CI reliability ENV PIPENV_VENV_IN_PROJECT=1 ENV PIPENV_TIMEOUT=1200 # Install dependencies conditionally based on LITE_BUILD RUN set -e && \ if [ "${LITE_BUILD}" = "true" ]; then \ echo "Installing lite dependencies (without Whisper)"; \ 
echo "Using lite Pipfile:" && \ PIPENV_PIPFILE=Pipfile.lite pipenv install --deploy --system; \ else \ echo "Installing full dependencies (including Whisper)"; \ echo "Using full Pipfile:" && \ PIPENV_PIPFILE=Pipfile pipenv install --deploy --system; \ fi # Install PyTorch with CUDA support if using NVIDIA image (skip if LITE_BUILD) RUN if [ "${LITE_BUILD}" = "true" ]; then \ echo "Skipping PyTorch installation in lite mode"; \ elif [ "${USE_GPU}" = "true" ] || [ "${USE_GPU_NVIDIA}" = "true" ]; then \ if command -v pip >/dev/null 2>&1; then \ pip install --no-cache-dir nvidia-cudnn-cu12 torch; \ elif command -v pip3 >/dev/null 2>&1; then \ pip3 install --no-cache-dir nvidia-cudnn-cu12 torch; \ else \ python3 -m pip install --no-cache-dir nvidia-cudnn-cu12 torch; \ fi; \ elif [ "${USE_GPU_AMD}" = "true" ]; then \ if command -v pip >/dev/null 2>&1; then \ pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/rocm${ROCM_VERSION}; \ elif command -v pip3 >/dev/null 2>&1; then \ pip3 install --no-cache-dir torch --index-url https://download.pytorch.org/whl/rocm${ROCM_VERSION}; \ else \ python3 -m pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/rocm${ROCM_VERSION}; \ fi; \ else \ if command -v pip >/dev/null 2>&1; then \ pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu; \ elif command -v pip3 >/dev/null 2>&1; then \ pip3 install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu; \ else \ python3 -m pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu; \ fi; \ fi # Copy application code COPY src/ ./src/ RUN rm -rf ./src/instance COPY scripts/ ./scripts/ RUN chmod +x scripts/start_services.sh # Copy built frontend assets to Flask static folder COPY --from=frontend-build /app/dist ./src/app/static # Create non-root user for running the application RUN groupadd -r appuser && \ useradd --no-log-init -r -g appuser -d /home/appuser appuser && \ 
mkdir -p /home/appuser && \ chown -R appuser:appuser /home/appuser # Create necessary directories and set permissions RUN mkdir -p /app/processing /app/src/instance /app/src/instance/data /app/src/instance/data/in /app/src/instance/data/srv /app/src/instance/config /app/src/instance/db && \ chown -R appuser:appuser /app # Copy entrypoint script COPY docker-entrypoint.sh /docker-entrypoint.sh RUN chmod 755 /docker-entrypoint.sh EXPOSE 5001 # Run the application through the entrypoint script ENTRYPOINT ["/docker-entrypoint.sh"] CMD ["./scripts/start_services.sh"] ================================================ FILE: LICENCE ================================================ MIT License Copyright (c) 2024 John Rogers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: Pipfile ================================================ [[source]] url = "https://pypi.org/simple" verify_ssl = true name = "pypi" [packages] speechrecognition = "*" openai = "*" python-dotenv = "*" jinja2 = "*" flask = "*" pyrss2gen = "*" feedparser = "*" certifi = "*" cd = "*" pyyaml = "*" prompt-toolkit = "*" pypodcastparser = "*" werkzeug = "*" exceptiongroup = "*" zeroconf = "*" waitress = "*" validators = "*" beartype = "*" openai-whisper = "*" flask-sqlalchemy = "*" flask-migrate = "*" Flask-APScheduler = "*" ffmpeg-python = "*" litellm = "*" # Pin to avoid fastuuid dependency bleach = "*" types-bleach = "*" groq = "*" async_timeout = "*" pytest-cov = "*" flask-cors = "*" bcrypt = "*" httpx-aiohttp = "*" stripe = "*" [dev-packages] black = "*" mypy = "*" types-pyyaml = "*" types-requests = "*" types-waitress = "*" pylint = "*" pytest = "*" dill = "*" isort = "*" types-flask-migrate = "*" pytest-mock = "*" watchdog = "*" requests = "*" types-flask-cors = "*" [requires] python_version = "3.11" ================================================ FILE: Pipfile.lite ================================================ [[source]] url = "https://pypi.org/simple" verify_ssl = true name = "pypi" [packages] speechrecognition = "*" openai = "*" python-dotenv = "*" jinja2 = "*" flask = "*" pyrss2gen = "*" feedparser = "*" certifi = "*" cd = "*" pyyaml = "*" prompt-toolkit = "*" pypodcastparser = "*" werkzeug = "*" exceptiongroup = "*" zeroconf = "*" waitress = "*" validators = "*" beartype = "*" flask-sqlalchemy = "*" flask-migrate = "*" Flask-APScheduler = "*" ffmpeg-python = "*" litellm = ">=1.59.8,<1.75.0" # Pin to avoid fastuuid dependency bleach = "*" types-bleach = "*" groq = "*" async_timeout = "*" pytest-cov = "*" flask-cors = "*" bcrypt = "*" stripe = "*" [dev-packages] black = "*" mypy = "*" types-pyyaml = "*" types-requests = "*" types-waitress = "*" pylint = "*" pytest = "*" dill = "*" isort = "*" 
types-flask-migrate = "*" pytest-mock = "*" watchdog = "*" requests = "*" types-flask-cors = "*" [requires] python_version = "3.11" ================================================ FILE: README.md ================================================

Ad-block for podcasts. Create an ad-free RSS feed.

Discord

## Overview Podly uses Whisper and Chat GPT to remove ads from podcasts. ## How To Run You have a few options to get started: - [![Deploy on Railway](https://railway.com/button.svg)](https://railway.com/deploy/podly?referralCode=NMdeg5&utm_medium=integration&utm_source=template&utm_campaign=generic) - quick and easy setup in the cloud, follow our [Railway deployment guide](docs/how_to_run_railway.md). - Use this if you want to share your Podly server with others. - **Run Locally**: - For local development and customization, - see our [beginner's guide for running locally](docs/how_to_run_beginners.md). - Use this for the most cost-optimal & private setup. - **[Join The Preview Server](https://podly.up.railway.app/)**: - pay what you want (limited sign ups available) ## How it works: - You request an episode - Podly downloads the requested episode - Whisper transcribes the episode - LLM labels ad segments - Podly removes the ad segments - Podly delivers the ad-free version of the podcast to you ### Cost Breakdown *Monthly cost breakdown for 5 podcasts* | Cost | Hosting | Transcription | LLM | |---------|----------|---------------|--------| | **free**| local | local | local | | **$2** | local | local | remote | | **$5** | local | remote | remote | | **$10** | public (railway) | remote | remote | | **Pay What You Want** | [preview server](https://podly.up.railway.app/) | n/a | n/a | | **$5.99/mo** | https://zeroads.ai/ | production fork of podly | | ## Contributing See [contributing guide](docs/contributors.md) for local setup & contribution instructions. ================================================ FILE: SECURITY.md ================================================ # Security Policy ## Supported Versions We only support the latest on main & preview. ## Reporting a Vulnerability Please use the Private Vulnerability Reporting feature on GitHub: - Navigate to the Security tab of this repository. - Select "Vulnerability reporting" from the left-hand sidebar. 
- Click "Report a vulnerability" to open a private advisory. Include as much detail as possible: - Steps to reproduce. - Potential impact. - Any suggested fixes. This allows us to collaborate with you on a fix in a private workspace before the issue is made public. ================================================ FILE: compose.dev.cpu.yml ================================================ services: podly: container_name: podly-pure-podcasts image: podly-pure-podcasts volumes: - ./src/instance:/app/src/instance env_file: - ./.env.local build: context: . dockerfile: Dockerfile args: - BASE_IMAGE=${BASE_IMAGE:-python:3.11-slim} - CUDA_VERSION=${CUDA_VERSION:-12.4.1} - USE_GPU=${USE_GPU:-false} - USE_GPU_NVIDIA=${USE_GPU_NVIDIA:-false} - USE_GPU_AMD=${USE_GPU_AMD:-false} - LITE_BUILD=${LITE_BUILD:-false} ports: - "5001:5001" environment: - PUID=${PUID:-1000} - PGID=${PGID:-1000} - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:--1} - SERVER_THREADS=${SERVER_THREADS:-1} restart: unless-stopped healthcheck: test: [ "CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:5001/')", ] interval: 30s timeout: 10s retries: 3 start_period: 10s networks: default: name: podly-pure-podcasts-network ================================================ FILE: compose.dev.nvidia.yml ================================================ services: podly: extends: file: compose.dev.cpu.yml service: podly env_file: - ./.env.local environment: - PUID=${PUID:-1000} - PGID=${PGID:-1000} - CUDA_VISIBLE_DEVICES=0 - CORS_ORIGINS=* - SERVER_THREADS=${SERVER_THREADS:-1} deploy: resources: reservations: devices: - driver: nvidia count: 1 capabilities: [gpu] networks: default: name: podly-pure-podcasts-network ================================================ FILE: compose.dev.rocm.yml ================================================ services: podly: extends: file: compose.dev.cpu.yml service: podly env_file: - ./.env.local devices: - /dev/kfd - /dev/dri environment: - 
PUID=${PUID:-1000} - PGID=${PGID:-1000} - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:--1} - CORS_ORIGINS=* - SERVER_THREADS=${SERVER_THREADS:-1} # Don't ask me why this is needed for ROCM. See # https://github.com/openai/whisper/discussions/55#discussioncomment-3714528 - HSA_OVERRIDE_GFX_VERSION=10.3.0 security_opt: - seccomp=unconfined networks: default: name: podly-pure-podcasts-network # This would be ideal. Not currently supported, apparently. Or I just wasn't able to figure out the driver arg. # Tried: amdgpu, amd, rocm # deploy: # resources: # reservations: # devices: # - capabilities: [gpu] # driver: "amdgpu" # count: 1 ================================================ FILE: compose.yml ================================================ services: podly: container_name: podly-pure-podcasts ports: - "5001:5001" image: ghcr.io/podly-pure-podcasts/podly-pure-podcasts:${BRANCH:-main-latest} volumes: - ./src/instance:/app/src/instance env_file: - ./.env.local environment: - PUID=${PUID:-1000} - PGID=${PGID:-1000} - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:--1} - SERVER_THREADS=${SERVER_THREADS:-1} restart: unless-stopped healthcheck: test: [ "CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:5001/')", ] interval: 30s timeout: 10s retries: 3 start_period: 10s networks: default: name: podly-pure-podcasts-network ================================================ FILE: docker-entrypoint.sh ================================================ #!/bin/bash set -e # Check if PUID/PGID env variables are set if [ -n "${PUID}" ] && [ -n "${PGID}" ] && [ "$(id -u)" = "0" ]; then echo "Using custom UID:GID = ${PUID}:${PGID}" # Update user/group IDs if needed usermod -o -u "$PUID" appuser groupmod -o -g "$PGID" appuser # Ensure required directories exist mkdir -p /app/src/instance /app/src/instance/data /app/src/instance/data/in /app/src/instance/data/srv /app/src/instance/config /app/src/instance/db /app/src/instance/logs # Set permissions for 
all application directories APP_DIRS="/home/appuser /app/processing /app/src/instance /app/src/instance/data /app/src/instance/config /app/src/instance/db /app/src/instance/logs /app/scripts" chown -R appuser:appuser $APP_DIRS 2>/dev/null || true # Ensure log file exists and has correct permissions in new location touch /app/src/instance/logs/app.log chmod 664 /app/src/instance/logs/app.log chown appuser:appuser /app/src/instance/logs/app.log # Run as appuser export HOME=/home/appuser exec gosu appuser "$@" else # Run as current user (but don't assume it's appuser) exec "$@" fi ================================================ FILE: docs/contributors.md ================================================ # Contributor Guide ### Quick Start (Docker - recommended for local setup) 1. Make the script executable and run: ```bash chmod +x run_podly_docker.sh ./run_podly_docker.sh --build ./run_podly_docker.sh # foreground with logs ./run_podly_docker.sh -d # or detached ``` This automatically detects NVIDIA GPUs and uses them if available. After the server starts: - Open `http://localhost:5001` in your browser - Configure settings at `http://localhost:5001/config` - Add podcast feeds and start processing ## Usage Once the server is running: 1. Open `http://localhost:5001` 2. Configure settings in the Config page at `http://localhost:5001/config` 3. Add podcast RSS feeds through the web interface 4. Open your podcast app and subscribe to the Podly endpoint (e.g., `http://localhost:5001/feed/1`) 5. Select an episode and download ## Transcription Options Podly supports multiple options for audio transcription: 1. **Local Whisper (Default)** - Slower but self-contained 2. **OpenAI Hosted Whisper** - Fast and accurate; billed per-feed via Stripe 3. **Groq Hosted Whisper** - Fast and cost-effective Select your preferred method in the Config page (`/config`). ## Remote Setup Podly automatically detects reverse proxies and generates appropriate URLs via request headers. 
### Reverse Proxy Examples **Nginx:** ```nginx server { listen 443 ssl; server_name your-domain.com; location / { proxy_pass http://localhost:5001; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; proxy_set_header X-Forwarded-Host $host; } } ``` **Traefik (docker-compose.yml):** ```yaml labels: - "traefik.enable=true" - "traefik.http.routers.podly.rule=Host(`your-domain.com`)" - "traefik.http.routers.podly.tls.certresolver=letsencrypt" - "traefik.http.services.podly.loadbalancer.server.port=5001" ``` > **Note**: Most modern reverse proxies automatically set the required headers. No manual configuration is needed in most cases. ### Built-in Authentication Podly ships with built-in authentication so you can secure feeds without relying on a reverse proxy. - Set `REQUIRE_AUTH=true` to enable protection. By default it is `false`, preserving existing behaviour. - When auth is enabled, Podly fails fast on startup unless `PODLY_ADMIN_PASSWORD` is supplied and meets the strength policy (≥12 characters with upper, lower, digit, symbol). Override the initial username with `PODLY_ADMIN_USERNAME` (default `podly_admin`). - Provide a long, random `PODLY_SECRET_KEY` so Flask sessions remain valid across restarts. If you omit it, the app generates a new key on each boot and all users are signed out. - On first boot with an empty database, Podly seeds an admin user using the supplied credentials. **If you are enabling auth on an existing install, start from a fresh data volume.** - After signing in, open the Config page to rotate your password and manage additional users. When you change the admin password, update the corresponding environment variable in your deployment platform so restarts continue to succeed. 
- Use the "Copy protected feed" button to generate feed-specific access tokens that are embedded in subscription URLs so podcast clients can authenticate without your primary password. Rate limiting is still applied to repeated authentication failures. ## Ubuntu Service Add a service file to /etc/systemd/system/podly.service ``` [Unit] Description=Podly Podcast Service After=network.target [Service] User=yourusername Group=yourusername WorkingDirectory=/path/to/your/app ExecStart=/usr/bin/pipenv run python src/main.py Restart=always [Install] WantedBy=multi-user.target ``` enable the service ``` sudo systemctl daemon-reload sudo systemctl enable podly.service ``` ## Database Update The database auto-migrates on launch. To add a migration after data model change: ```bash pipenv run flask --app ./src/main.py db migrate -m "[change description]" ``` On next launch, the database updates automatically. ## Releases and Commit Messages This repo uses `semantic-release` to automate versioning and GitHub releases. It relies on Conventional Commits to determine the next version. For pull requests, include **at least one** commit that follows the Conventional Commit format: - `feat: add new episode filter` - `fix(api): handle empty feed` - `chore: update dependencies` If no Conventional Commit is present, the release pipeline will have nothing to publish. ## Docker Support Podly can be run in Docker with support for both NVIDIA GPU and non-NVIDIA environments. 
### Docker Options ```bash ./run_podly_docker.sh --dev # rebuild containers for local changes ./run_podly_docker.sh --production # use published images ./run_podly_docker.sh --lite # smaller image without local Whisper ./run_podly_docker.sh --cpu # force CPU mode ./run_podly_docker.sh --gpu # force GPU mode ./run_podly_docker.sh --build # build only ./run_podly_docker.sh --test-build # test build ./run_podly_docker.sh -d # detached ``` ### Development vs Production Modes **Development Mode** (default): - Uses local Docker builds - Requires rebuilding after code changes: `./run_podly_docker.sh --dev` - Mounts essential directories (config, input/output, database) and live code for development - Good for: development, testing, customization **Production Mode**: - Uses pre-built images from GitHub Container Registry - No building required - images are pulled automatically - Same volume mounts as development - Good for: deployment, quick setup, consistent environments ```bash # Start with existing local container ./run_podly_docker.sh # Rebuild and start after making code changes ./run_podly_docker.sh --dev # Use published images (no local building required) ./run_podly_docker.sh --production ``` ### Docker Environment Configuration **Environment Variables**: - `PUID`/`PGID`: User/group IDs for file permissions (automatically set by run script) - `CUDA_VISIBLE_DEVICES`: GPU device selection for CUDA acceleration - `CORS_ORIGINS`: Backend CORS configuration (defaults to accept requests from any origin) ## FAQ Q: What does "whitelisted" mean in the UI? A: It means an episode is eligible for download and ad removal. By default, new episodes are automatically whitelisted (`automatically_whitelist_new_episodes`), and only a limited number of old episodes are auto-whitelisted (`number_of_episodes_to_whitelist_from_archive_of_new_feed`). Adjust these settings in the Config page (/config). Q: How can I enable whisper GPU acceleration? 
A: There are two ways to enable GPU acceleration: 1. **Using Docker**: - Use the provided Docker setup with `run_podly_docker.sh` which automatically detects and uses NVIDIA GPUs if available - You can force GPU mode with `./run_podly_docker.sh --gpu` or force CPU mode with `./run_podly_docker.sh --cpu` 2. **In a local environment**: - Install the CUDA version of PyTorch to your virtual environment: ```bash pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 ``` ## Contributing We welcome contributions to Podly! Here's how you can help: ### Development Setup 1. Fork the repository 2. Clone your fork: ```bash git clone https://github.com/yourusername/podly.git ``` 3. Create a new branch for your feature: ```bash git checkout -b feature/your-feature-name ``` 4. Create a pull request with a target branch of Preview #### Application Ports Both local and Docker deployments provide a consistent experience: - **Application**: Runs on port 5001 (configurable via web UI at `/config`) - Serves both the web interface and API endpoints - Frontend is built as static assets and served by the backend - **Development**: `run_podly_docker.sh` serves everything on port 5001 - Local script builds frontend to static assets (like Docker) - Restart `./run_podly_docker.sh` after frontend changes to rebuild assets #### Development Modes Both scripts provide equivalent core functionality with some unique features: **Common Options (work in both scripts)**: - `-b/--background` or `-d/--detach`: Run in background mode - `-h/--help`: Show help information **Local Development** **Docker Development** (`./run_podly_docker.sh`): - **Development mode**: `./run_podly_docker.sh --dev` - rebuilds containers with code changes - **Production mode**: `./run_podly_docker.sh --production` - uses pre-built images - **Docker-specific options**: `--build`, `--test-build`, `--gpu`, `--cpu`, `--cuda=VERSION`, `--rocm=VERSION`, `--branch=BRANCH` **Functional 
Equivalence**: Both scripts provide the same core user experience: - Application runs on port 5001 (configurable) - Frontend served as static assets by Flask backend - Same web interface and API endpoints - Compatible background/detached modes ### Running Tests Before submitting a pull request, you can run the same tests that run in CI: To prep your pipenv environment to run this script, you will need to first run: ```bash pipenv install --dev ``` Then, to run the checks, ```bash scripts/ci.sh ``` This will run all the necessary checks including: - Type checking with mypy - Code formatting checks - Unit tests - Linting ### Pull Request Process 1. Ensure all tests pass locally 2. Update the documentation if needed 3. Create a Pull Request with a clear description of the changes 4. Link any related issues ### Code Style - We use black for code formatting - Type hints are required for all new code - Follow existing patterns in the codebase ================================================ FILE: docs/how_to_run_beginners.md ================================================ # How To Run: Ultimate Beginner's Guide This guide will walk you through setting up Podly from scratch using Docker. Podly creates ad-free RSS feeds for podcasts by automatically detecting and removing advertisement segments. ## Highly Recommend! Want an expert to guide you through the setup? Download an AI powered IDE like cursor https://www.cursor.com/ or windsurf https://windsurf.com/ Most IDEs have a free tier you can use to get started. Alternatively, you can use your own [LLM API key in Cursor](https://docs.cursor.com/settings/api-keys) (you'll need a key for Podly anyways). Open the AI chat in the IDE. Enable 'Agent' mode if available, which will allow the IDE to help you run commands, view the output, and debug or take corrective steps if necessary. Paste one of the prompts below into the chat box. 
If you don't have the repo downloaded: ``` Help me install docker and run Podly https://github.com/podly-pure-podcasts/podly_pure_podcasts After the project is cloned, help me: - install docker & docker compose - run `./run_podly_docker.sh --build` then `./run_podly_docker.sh -d` - configure the app via the web UI at http://localhost:5001/config Be sure to check if a dependency is already installed before downloading. We recommend Docker because installing ffmpeg & local whisper can be difficult. The Docker image has both ffmpeg & local whisper preconfigured. Podly works with many different LLMs, it does not require an OpenAI key. Check your work by retrieving the index page from localhost:5001 at the end. ``` If you do have the repo pulled, open this file and prompt: ``` Review this project, follow this guide and start Podly on my computer. Briefly, help me: - install docker & docker compose - run `./run_podly_docker.sh --build` and then `./run_podly_docker.sh -d` - configure the app via the web UI at http://localhost:5001/config Be sure to check if a dependency is already installed before downloading. We recommend docker because installing ffmpeg & local whisper can be difficult. The docker image has both ffmpeg & local whisper preconfigured. Podly works with many different LLMs; it does not need to work with OpenAI. Check your work by retrieving the index page from localhost:5001 at the end. ``` ## Prerequisites ### Install Docker and Docker Compose #### On Windows: 1. Download and install [Docker Desktop for Windows](https://docs.docker.com/desktop/install/windows-install/) 2. During installation, make sure "Use WSL 2 instead of Hyper-V" is checked 3. Restart your computer when prompted 4. Open Docker Desktop and wait for it to start completely #### On macOS: 1. Download and install [Docker Desktop for Mac](https://docs.docker.com/desktop/install/mac-install/) 2. Drag Docker to your Applications folder 3. Launch Docker Desktop from Applications 4. 
Follow the setup assistant #### On Linux (Ubuntu/Debian): ```bash # Update package index sudo apt update # Install Docker sudo apt install docker.io docker-compose-v2 # Add your user to the docker group sudo usermod -aG docker $USER # Log out and log back in for group changes to take effect ``` #### Verify Installation: Open a terminal/command prompt and run: ```bash docker --version docker compose version ``` You should see version information for both commands. ### 2. Get an OpenAI API Key 1. Go to [OpenAI's API platform](https://platform.openai.com/) 2. Sign up for an account or log in if you already have one 3. Navigate to the [API Keys section](https://platform.openai.com/api-keys) 4. Click "Create new secret key" 5. Give it a name (e.g., "Podly") 6. **Important**: Copy the key immediately and save it somewhere safe - you won't be able to see it again! 7. Your API key will look something like: `sk-proj-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx` > **Note**: OpenAI API usage requires payment. Make sure to set up billing and usage limits in your OpenAI account to avoid unexpected charges. ## Setup Podly ### Download the Project ```bash git clone https://github.com/normand1/podly_pure_podcasts.git cd podly_pure_podcasts ``` ## Running Podly ### Run the Application via Docker ```bash chmod +x run_podly_docker.sh ./run_podly_docker.sh --build ./run_podly_docker.sh # foreground ./run_podly_docker.sh -d # detached ``` ### Optional: Enable Authentication The Docker image reads environment variables from `.env` files or your shell. To require login: 1. Export the variables before running Podly, or add them to `config/.env`: ```bash export REQUIRE_AUTH=true export PODLY_ADMIN_USERNAME='podly_admin' export PODLY_ADMIN_PASSWORD='SuperSecurePass!2024' export PODLY_SECRET_KEY='replace-with-a-strong-64-char-secret' ``` 2. Start Podly as usual. On first boot with auth enabled and an empty database, the admin account is created automatically. 
If you are turning auth on for an existing volume, clear the `sqlite3.db` file so the bootstrap can succeed. 3. Sign in at `http://localhost:5001`, then visit the Config page to change your password, add users, and copy RSS URLs with the "Copy protected feed" button. Podly generates feed-specific access tokens and embeds them in the link so podcast players can subscribe without exposing your main password. Remember to update your environment variables whenever you rotate the admin password. ### First Run 1. Docker will download and build the necessary image (this may take 5-15 minutes) 2. Look for "Running on http://0.0.0.0:5001" 3. Open your browser to `http://localhost:5001` 4. Configure settings at `http://localhost:5001/config` - Alternatively, set secrets via Docker env file `.env.local` in the project root and restart the container. See .env.local.example ## Advanced Options ```bash # Force CPU-only processing (if you have GPU issues) ./run_podly_docker.sh --cpu # Force GPU processing ./run_podly_docker.sh --gpu # Just build the container without running ./run_podly_docker.sh --build # Test build from scratch (useful for troubleshooting) ./run_podly_docker.sh --test-build ``` ## Using Podly ### Adding Your First Podcast 1. In the web interface, look for an "Add Podcast" or similar button 2. Paste the RSS feed URL of your podcast 3. Podly will start processing new episodes automatically 4. Processed episodes will have advertisements removed ### Getting Your Ad-Free RSS Feed 1. After adding a podcast, Podly will generate a new RSS feed URL 2. Use this new URL in your podcast app instead of the original 3. Your podcast app will now download ad-free versions! ## Troubleshooting ### "Docker command not found" - Make sure Docker Desktop is running - On Windows, restart your terminal after installing Docker - On Linux, make sure you logged out and back in after adding yourself to the docker group ### Cannot connect to the Docker daemon. Is the docker daemon running? 
- If using docker desktop, open up the app, otherwise start the daemon ### "Permission denied" errors - On macOS/Linux, make sure the script is executable: `chmod +x run_podly_docker.sh` - On Windows, try running Command Prompt as Administrator ### OpenAI API errors - Double-check your API key in the Config page at `/config` - Make sure you have billing set up in your OpenAI account - Check your usage limits haven't been exceeded ### Port 5001 already in use - Another application is using port 5001 - **Docker users**: Either stop that application or modify the port in `compose.dev.cpu.yml` and `compose.yml` - **Native users**: Change the port in the Config page under App settings - To kill processes on that port run `lsof -i :5001 | grep LISTEN | awk '{print $2}' | xargs kill -9` ### Out of memory errors - Close other applications to free up RAM - Consider using `--cpu` flag if you have limited memory ## Stopping Podly To stop the application: If you have launched it in the foreground by omitting the `-d` parameter: 1. In the terminal where Podly is running, press `Ctrl+C` 2. Wait for the container to stop gracefully If you have launched it in the background using the `-d` parameter: 1. In the terminal where Podly is running, execute `docker compose down` 2. Wait for the container to stop gracefully In both cases this output should appear to indicate that it has stopped: ```sh [+] Running 2/2 ✔ Container podly-pure-podcasts Removed ✔ Network podly-pure-podcasts-network Removed ``` ## Upgrading Podly To upgrade the application while you are in the terminal where it is running: 1. [Stop it](#stopping-podly) 2. Execute `git pull` 3. [Run it again](#running-podly) ## Getting Help If you encounter issues ask in our discord, we're friendly! https://discord.gg/FRB98GtF6N ## What's Next? 
Once you have Podly running: - Explore the web interface to add more podcasts - Configure settings in the Config page - Consider setting up automatic background processing - Enjoy your ad-free podcasts! ================================================ FILE: docs/how_to_run_railway.md ================================================ # How to Run on Railway This guide will walk you through deploying Podly on Railway using the one-click template. ## 0. Important! Set Budgets Both Railway and Groq allow you to set budgets on your processing. Set a $10 (minimum possible, expect smaller bill) budget on Railway. Set a $5 budget for Groq. ## 1. Get Free Groq API Key Podly uses Groq to transcribe podcasts quickly and for free. 1. Go to [https://console.groq.com/keys](https://console.groq.com/keys). 2. Sign up for a free account. 3. Create a new API key. 4. Copy the key and paste it into the `GROQ_API_KEY` field during the Railway deployment. ## 2. Deploy Railway Template Click the button below to deploy Podly to Railway. This is a sponsored link that supports the project! [![Deploy on Railway](https://railway.com/button.svg)](https://railway.com/deploy/podly?referralCode=NMdeg5&utm_medium=integration&utm_source=template&utm_campaign=generic) If you want to be a beta-tester, you can deploy the preview branch instead: [![Deploy on Railway](https://railway.com/button.svg)](https://railway.com/deploy/podly-preview?referralCode=NMdeg5&utm_medium=integration&utm_source=template&utm_campaign=generic) ## 3. Configure Networking After the deployment is complete, you need to expose the service to the internet. 1. Click on the new deployment in your Railway dashboard. 2. Go to the **Settings** tab. 3. Under **Networking**, find the **Public Networking** section and click **Generate Domain**. 4. You can now access Podly at the generated URL. 5. (Optional) To change the domain name, click **Edit** and enter a new name. 
![Setting up Railway Networking](images/setting_up_railway_networking.png) ## 4. Set Budgets & Expected Pricing Set a $10 budget on Railway and a $5 budget on Groq (or use the free tier for Groq which will slow processing). Podly is designed to run efficiently on Railway's hobby plan. If you process a large volume of podcasts, you can check the **Config** page in your Podly deployment for estimated monthly costs based on your usage. ## 5. Secure Your Deployment Podly now uses secure session cookies for the web dashboard while keeping HTTP Basic authentication for RSS feeds and audio downloads. Before inviting listeners, secure the app: 1. In the Railway dashboard, open your Podly service and head to **Variables**. 2. Add `REQUIRE_AUTH` with value `true`. 3. Add a strong `PODLY_ADMIN_PASSWORD` (minimum 12 characters including uppercase, lowercase, digit, and symbol). Optionally set `PODLY_ADMIN_USERNAME`. 4. Provide a long, random `PODLY_SECRET_KEY` so session cookies survive restarts. (If you omit it, Podly will generate a new key each deploy and sign everyone out.) 5. Redeploy the service. On first boot Podly seeds the admin user and requires those credentials on every request. > **Important:** Enabling auth on an existing deployment requires a fresh data volume. Create a new Railway deployment or wipe the existing storage so the initial admin can be seeded. After signing in, use the Config page to change your password, add additional users, and copy RSS links via the "Copy protected feed" button. Podly issues feed-specific access tokens and embeds them in each URL so listeners can subscribe without knowing your main password. When you rotate passwords, update the corresponding Railway variables so restarts succeed. ## 6. Using Podly 1. Open your new Podly URL in a browser. 2. Navigate to the **Feeds** page. 3. Add the RSS feed URL of a podcast you want to process. 4. 
Go to your favorite podcast client and subscribe to the new feed URL provided by Podly (e.g., `https://your-podly-app.up.railway.app/feed/1`). 5. Download and enjoy ad-free episodes! ================================================ FILE: docs/todo.txt ================================================ - config audit & testing (advanced and basic) - move host/port/threads to docker config reaudit security + testing ci.sh test railway login for public facing podcast rss search 'basic' config page - just put in groq api key + test + save on populate also show if api key is set or blank test hide 'local' whisper in lite build ================================================ FILE: frontend/.gitignore ================================================ # Logs logs *.log npm-debug.log* yarn-debug.log* yarn-error.log* pnpm-debug.log* lerna-debug.log* node_modules dist dist-ssr *.local # Editor directories and files .vscode/* !.vscode/extensions.json .idea .DS_Store *.suo *.ntvs* *.njsproj *.sln *.sw? ================================================ FILE: frontend/README.md ================================================ # Podly Frontend This is the React + TypeScript + Vite frontend for Podly. The frontend is built and served as part of the main Podly application. ## Development The frontend is integrated into the main Podly application and served as static assets by the Flask backend on port 5001. ### Development Workflows 1. **Docker (recommended)**: The Docker build compiles the frontend during image creation and serves static assets from Flask. 2. **Direct Frontend Development**: You can run the frontend development server separately for advanced frontend work: ```bash cd frontend npm install npm run dev ``` This starts the Vite development server on port 5173 with hot reloading and proxies API calls to the backend on port 5001. 
### Build Process - **Direct Development** (`npm run dev`): Vite dev server serves files with hot reloading on port 5173 and proxies API calls to backend on port 5001 - **Docker**: Multi-stage build compiles frontend assets during image creation and copies them to the Flask static directory ## Technology Stack - **React 18+** with TypeScript - **Vite** for build tooling and development server - **Tailwind CSS** for styling - **React Router** for client-side routing - **Tanstack Query** for data fetching ## Configuration The frontend configuration is handled through: - **Environment Variables**: Set via Vite's environment variable system - **Vite Config**: `vite.config.ts` for build and development settings - Development server runs on port 5173 - Proxies API calls to backend on port 5001 (configurable via `BACKEND_TARGET`) - **Tailwind Config**: `tailwind.config.js` for styling configuration ================================================ FILE: frontend/eslint.config.js ================================================ import js from '@eslint/js' import globals from 'globals' import reactHooks from 'eslint-plugin-react-hooks' import reactRefresh from 'eslint-plugin-react-refresh' import tseslint from 'typescript-eslint' export default tseslint.config( { ignores: ['dist'] }, { extends: [js.configs.recommended, ...tseslint.configs.recommended], files: ['**/*.{ts,tsx}'], languageOptions: { ecmaVersion: 2020, globals: globals.browser, }, plugins: { 'react-hooks': reactHooks, 'react-refresh': reactRefresh, }, rules: { ...reactHooks.configs.recommended.rules, 'react-refresh/only-export-components': [ 'warn', { allowConstantExport: true }, ], }, }, ) ================================================ FILE: frontend/index.html ================================================ Podly
================================================ FILE: frontend/package.json ================================================ { "name": "frontend", "private": true, "version": "0.0.0", "type": "module", "scripts": { "dev": "vite", "build": "tsc -b && vite build", "lint": "eslint .", "preview": "vite preview" }, "dependencies": { "@tailwindcss/line-clamp": "^0.4.4", "@tanstack/react-query": "^5.77.0", "axios": "^1.9.0", "clsx": "^2.1.1", "react": "^19.1.0", "react-dom": "^19.1.0", "react-hot-toast": "^2.6.0", "react-router-dom": "^7.6.1", "tailwind-merge": "^3.3.0" }, "devDependencies": { "@eslint/js": "^9.25.0", "@types/react": "^19.1.2", "@types/react-dom": "^19.1.2", "@vitejs/plugin-react": "^4.4.1", "autoprefixer": "^10.4.21", "eslint": "^9.25.0", "eslint-plugin-react-hooks": "^5.2.0", "eslint-plugin-react-refresh": "^0.4.19", "globals": "^16.0.0", "postcss": "^8.5.3", "tailwindcss": "^3.4.17", "typescript": "~5.8.3", "typescript-eslint": "^8.30.1", "vite": "^6.3.5" } } ================================================ FILE: frontend/postcss.config.js ================================================ export default { plugins: { tailwindcss: {}, autoprefixer: {}, }, } ================================================ FILE: frontend/src/App.css ================================================ html, body { margin: 0 !important; padding: 0 !important; height: 100% !important; overflow: hidden !important; } #root { height: 100vh !important; overflow: hidden !important; max-width: none !important; margin: 0 !important; padding: 0 !important; } .logo { height: 6em; padding: 1.5em; will-change: filter; transition: filter 300ms; } .logo:hover { filter: drop-shadow(0 0 2em #646cffaa); } .logo.react:hover { filter: drop-shadow(0 0 2em #61dafbaa); } @keyframes logo-spin { from { transform: rotate(0deg); } to { transform: rotate(360deg); } } @media (prefers-reduced-motion: no-preference) { .logo { animation: logo-spin infinite 20s linear; } } .card { padding: 2em; } 
.read-the-docs { color: #888; } /* Audio Player Styles */ .audio-player-progress { transition: all 0.1s ease; } .audio-player-progress:hover { height: 6px; } .audio-player-progress-thumb { transition: all 0.2s ease; transform: scale(0); } .audio-player-progress:hover .audio-player-progress-thumb { transform: scale(1); } .audio-player-volume-slider { transition: all 0.2s ease; } /* Custom scrollbar for better UX */ ::-webkit-scrollbar { width: 6px; } ::-webkit-scrollbar-track { background: #f1f1f1; } ::-webkit-scrollbar-thumb { background: #c1c1c1; border-radius: 3px; } ::-webkit-scrollbar-thumb:hover { background: #a8a8a8; } ================================================ FILE: frontend/src/App.tsx ================================================ import { QueryClient, QueryClientProvider } from '@tanstack/react-query'; import { Toaster } from 'react-hot-toast'; import { BrowserRouter as Router, Routes, Route, Link, Navigate, useLocation } from 'react-router-dom'; import { AudioPlayerProvider } from './contexts/AudioPlayerContext'; import { AuthProvider, useAuth } from './contexts/AuthContext'; import { useQuery } from '@tanstack/react-query'; import { useState, useEffect, useRef } from 'react'; import HomePage from './pages/HomePage'; import JobsPage from './pages/JobsPage'; import ConfigPage from './pages/ConfigPage'; import LoginPage from './pages/LoginPage'; import LandingPage from './pages/LandingPage'; import BillingPage from './pages/BillingPage'; import AudioPlayer from './components/AudioPlayer'; import { billingApi } from './services/api'; import { DiagnosticsProvider, useDiagnostics } from './contexts/DiagnosticsContext'; import DiagnosticsModal from './components/DiagnosticsModal'; import './App.css'; const queryClient = new QueryClient({ defaultOptions: { queries: { staleTime: 0, gcTime: 0, refetchOnMount: 'always', refetchOnWindowFocus: 'always', refetchOnReconnect: 'always', }, }, }); function AppShell() { const { status, requireAuth, 
isAuthenticated, user, logout, landingPageEnabled } = useAuth(); const { open: openDiagnostics } = useDiagnostics(); const [mobileMenuOpen, setMobileMenuOpen] = useState(false); const mobileMenuRef = useRef(null); const location = useLocation(); const { data: billingSummary } = useQuery({ queryKey: ['billing', 'summary'], queryFn: billingApi.getSummary, enabled: !!user && requireAuth && isAuthenticated, retry: false, }); // Close mobile menu on route change useEffect(() => { setMobileMenuOpen(false); }, [location.pathname]); // Close mobile menu when clicking outside useEffect(() => { function handleClickOutside(event: MouseEvent) { if (mobileMenuRef.current && !mobileMenuRef.current.contains(event.target as Node)) { setMobileMenuOpen(false); } } if (mobileMenuOpen) { document.addEventListener('mousedown', handleClickOutside); return () => document.removeEventListener('mousedown', handleClickOutside); } }, [mobileMenuOpen]); if (status === 'loading') { return (

Loading authentication…

); } // Show landing page for unauthenticated users when auth is required // But allow access to /login route if (requireAuth && !isAuthenticated) { return ( } /> {landingPageEnabled ? ( } /> ) : ( <> } /> } /> )} ); } const isAdmin = !requireAuth || user?.role === 'admin'; const showConfigLink = !requireAuth || isAdmin; const showJobsLink = !requireAuth || isAdmin; const showBillingLink = requireAuth && !isAdmin; return (
Podly

Podly

{/* Desktop Navigation */} {/* Mobile: Credits + Hamburger */}
{requireAuth && user && billingSummary && !isAdmin && ( <>
Feeds {billingSummary.feeds_in_use}/{billingSummary.feed_allowance}
Change plan )} {/* Hamburger Button */}
{/* Mobile Menu Dropdown */} {mobileMenuOpen && (
Home {showBillingLink && ( Billing )} {showJobsLink && ( Jobs )} {showConfigLink && ( Config )} {requireAuth && user && ( <>
{user.username}
)}
)}
} /> {showBillingLink && } />} {showJobsLink && } />} {showConfigLink && } />} } />
); } function App() { return ( ); } export default App; ================================================ FILE: frontend/src/components/AddFeedForm.tsx ================================================ import { useState, useEffect, useCallback } from 'react'; import { feedsApi } from '../services/api'; import type { PodcastSearchResult } from '../types'; import { diagnostics, emitDiagnosticError } from '../utils/diagnostics'; import { getHttpErrorInfo } from '../utils/httpError'; interface AddFeedFormProps { onSuccess: () => void; onUpgradePlan?: () => void; planLimitReached?: boolean; } type AddMode = 'url' | 'search'; const PAGE_SIZE = 10; export default function AddFeedForm({ onSuccess, onUpgradePlan, planLimitReached }: AddFeedFormProps) { const [url, setUrl] = useState(''); const [activeMode, setActiveMode] = useState('search'); const [isSubmitting, setIsSubmitting] = useState(false); const [error, setError] = useState(''); const [addingFeedUrl, setAddingFeedUrl] = useState(null); const [upgradePrompt, setUpgradePrompt] = useState(null); const [searchTerm, setSearchTerm] = useState(''); const [searchResults, setSearchResults] = useState([]); const [searchError, setSearchError] = useState(''); const [isSearching, setIsSearching] = useState(false); const [searchPage, setSearchPage] = useState(1); const [totalResults, setTotalResults] = useState(0); const [hasSearched, setHasSearched] = useState(false); const resetSearchState = () => { setSearchResults([]); setSearchError(''); setSearchPage(1); setTotalResults(0); setHasSearched(false); }; const handleSubmitManual = async (e: React.FormEvent) => { e.preventDefault(); if (!url.trim()) return; diagnostics.add('info', 'Add feed (manual) submitted', { via: 'url', hasUrl: true }); setError(''); await addFeed(url.trim(), 'url'); }; const addFeed = async (feedUrl: string, source: AddMode) => { if (planLimitReached) { setUpgradePrompt('Your plan is full. 
Increase your feed allowance to add more.'); return; } setIsSubmitting(true); setAddingFeedUrl(source === 'url' ? 'manual' : feedUrl); setError(''); setUpgradePrompt(null); try { diagnostics.add('info', 'Add feed request', { source, hasUrl: !!feedUrl }); await feedsApi.addFeed(feedUrl); if (source === 'url') { setUrl(''); } diagnostics.add('info', 'Add feed success', { source }); onSuccess(); } catch (err) { console.error('Failed to add feed:', err); const { status, data, message } = getHttpErrorInfo(err); const code = data && typeof data === 'object' ? (data as { error?: unknown }).error : undefined; const errorCode = typeof code === 'string' ? code : undefined; emitDiagnosticError({ title: 'Failed to add feed', message, kind: status ? 'http' : 'network', details: { source, feedUrl, status, response: data, }, }); if (errorCode === 'FEED_LIMIT_REACHED') { setUpgradePrompt(message || 'Plan limit reached. Increase your feeds to add more.'); } else { setError(message || 'Failed to add feed. Please check the URL and try again.'); } } finally { setIsSubmitting(false); setAddingFeedUrl(null); } }; const performSearch = useCallback(async (term: string) => { if (!term.trim()) { setSearchResults([]); setTotalResults(0); setHasSearched(false); setSearchError(''); return; } setIsSearching(true); setSearchError(''); try { diagnostics.add('info', 'Search podcasts request', { term: term.trim() }); const response = await feedsApi.searchFeeds(term.trim()); setSearchResults(response.results); setTotalResults(response.total ?? response.results.length); setSearchPage(1); setHasSearched(true); diagnostics.add('info', 'Search podcasts success', { term: term.trim(), total: response.total ?? response.results.length, }); } catch (err) { console.error('Podcast search failed:', err); diagnostics.add('error', 'Search podcasts failed', { term: term.trim() }); setSearchError('Failed to search podcasts. 
Please try again.'); setSearchResults([]); } finally { setIsSearching(false); } }, []); useEffect(() => { const delayDebounceFn = setTimeout(() => { if (searchTerm.trim()) { performSearch(searchTerm); } else { setSearchResults([]); setTotalResults(0); setHasSearched(false); } }, 500); return () => clearTimeout(delayDebounceFn); }, [searchTerm, performSearch]); const handleSearchSubmit = async (e: React.FormEvent) => { e.preventDefault(); await performSearch(searchTerm); }; const handleAddFromSearch = async (result: PodcastSearchResult) => { await addFeed(result.feedUrl, 'search'); }; const totalPages = totalResults === 0 ? 1 : Math.max(1, Math.ceil(totalResults / PAGE_SIZE)); const startIndex = totalResults === 0 ? 0 : (searchPage - 1) * PAGE_SIZE + 1; const endIndex = totalResults === 0 ? 0 : Math.min(searchPage * PAGE_SIZE, totalResults); const displayedResults = searchResults.slice( (searchPage - 1) * PAGE_SIZE, (searchPage - 1) * PAGE_SIZE + PAGE_SIZE ); return (

Add New Podcast Feed

{planLimitReached && (
Your plan is full. Increase your feed allowance to add more.
)}
{activeMode === 'url' && (
setUrl(e.target.value)} placeholder="https://example.com/podcast/feed.xml" className="w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent" required disabled={!!planLimitReached} />
{error && (
{error}
)} {upgradePrompt && (
{upgradePrompt} {onUpgradePlan && ( )}
)}
)} {activeMode === 'search' && (
setSearchTerm(e.target.value)} placeholder="e.g. history, space, entrepreneurship" className="w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent" disabled={!!planLimitReached} />
{searchError && (
{searchError}
)} {isSearching && searchResults.length === 0 && (
Searching for podcasts...
)} {!isSearching && searchResults.length === 0 && totalResults === 0 && hasSearched && !searchError && (
No podcasts found. Try a different search term.
)} {searchResults.length > 0 && (
Showing {startIndex}-{endIndex} of {totalResults} results
    {displayedResults.map((result) => (
  • {result.artworkUrl ? ( {result.title} ) : (
    No Image
    )}

    {result.title}

    {result.author && (

    {result.author}

    )} {result.genres.length > 0 && (

    {result.genres.join(' · ')}

    )}

    {result.feedUrl}

  • ))}
)}
)}
); } ================================================ FILE: frontend/src/components/AudioPlayer.tsx ================================================ import React, { useState, useRef, useEffect } from 'react'; import { useAudioPlayer } from '../contexts/AudioPlayerContext'; // Simple SVG icons to replace Heroicons const PlayIcon = ({ className }: { className: string }) => ( ); const PauseIcon = ({ className }: { className: string }) => ( ); const SpeakerWaveIcon = ({ className }: { className: string }) => ( ); const SpeakerXMarkIcon = ({ className }: { className: string }) => ( ); const XMarkIcon = ({ className }: { className: string }) => ( ); export default function AudioPlayer() { const { currentEpisode, isPlaying, currentTime, duration, volume, isLoading, error, togglePlayPause, seekTo, setVolume } = useAudioPlayer(); const [isDragging, setIsDragging] = useState(false); const [dragTime, setDragTime] = useState(0); const [showVolumeSlider, setShowVolumeSlider] = useState(false); const [showKeyboardShortcuts, setShowKeyboardShortcuts] = useState(false); const [dismissedError, setDismissedError] = useState(null); const progressBarRef = useRef(null); const volumeSliderRef = useRef(null); // Reset dismissed error when a new error occurs useEffect(() => { if (error && error !== dismissedError) { setDismissedError(null); } }, [error, dismissedError]); // Close volume slider when clicking outside useEffect(() => { const handleClickOutside = (event: MouseEvent) => { if (volumeSliderRef.current && !volumeSliderRef.current.contains(event.target as Node)) { setShowVolumeSlider(false); } }; if (showVolumeSlider) { document.addEventListener('mousedown', handleClickOutside); return () => document.removeEventListener('mousedown', handleClickOutside); } }, [showVolumeSlider]); // Don't render if no episode is loaded if (!currentEpisode) { return null; } console.log('AudioPlayer rendering with:', { currentEpisode: currentEpisode?.title, isPlaying, isLoading, error, duration }); 
const formatTime = (seconds: number) => { if (isNaN(seconds)) return '0:00'; const hours = Math.floor(seconds / 3600); const minutes = Math.floor((seconds % 3600) / 60); const remainingSeconds = Math.floor(seconds % 60); if (hours > 0) { return `${hours}:${minutes.toString().padStart(2, '0')}:${remainingSeconds.toString().padStart(2, '0')}`; } return `${minutes}:${remainingSeconds.toString().padStart(2, '0')}`; }; const handleProgressClick = (e: React.MouseEvent) => { if (!progressBarRef.current || !duration) return; const rect = progressBarRef.current.getBoundingClientRect(); const clickX = e.clientX - rect.left; const newTime = (clickX / rect.width) * duration; seekTo(newTime); }; const handleProgressMouseDown = (e: React.MouseEvent) => { setIsDragging(true); handleProgressClick(e); }; const handleProgressMouseMove = (e: React.MouseEvent) => { if (!isDragging || !progressBarRef.current || !duration) return; const rect = progressBarRef.current.getBoundingClientRect(); const clickX = e.clientX - rect.left; const newTime = Math.max(0, Math.min((clickX / rect.width) * duration, duration)); setDragTime(newTime); }; const handleProgressMouseUp = () => { if (isDragging) { seekTo(dragTime); setIsDragging(false); } }; const handleVolumeChange = (e: React.MouseEvent) => { if (!volumeSliderRef.current) return; const rect = volumeSliderRef.current.getBoundingClientRect(); const clickX = e.clientX - rect.left; const newVolume = Math.max(0, Math.min(clickX / rect.width, 1)); setVolume(newVolume); }; const toggleMute = () => { setVolume(volume > 0 ? 0 : 1); }; const dismissError = () => { setDismissedError(error); }; const displayTime = isDragging ? dragTime : currentTime; const progressPercentage = duration > 0 ? (displayTime / duration) * 100 : 0; const shouldShowError = error && error !== dismissedError; return (
{shouldShowError && (
{error}
)}
{/* Episode Info */}
🎵

{currentEpisode.title}

Episode • {formatTime(duration)}

{/* Player Controls */}
{/* Control Buttons */}
setShowKeyboardShortcuts(true)} onMouseLeave={() => setShowKeyboardShortcuts(false)} > {/* Keyboard Shortcuts Tooltip */} {showKeyboardShortcuts && (
Space: Play/Pause
← →: Seek ±10s
↑ ↓: Volume ±10%
)}
{/* Progress Bar */}
{formatTime(displayTime)}
{formatTime(duration)}
{/* Volume Control */}
{showVolumeSlider && (
setShowVolumeSlider(true)} >
)}
); } ================================================ FILE: frontend/src/components/DiagnosticsModal.tsx ================================================ import { useEffect, useMemo, useState } from 'react'; import { useDiagnostics } from '../contexts/DiagnosticsContext'; import { DIAGNOSTIC_UPDATED_EVENT, diagnostics, type DiagnosticsEntry } from '../utils/diagnostics'; const GITHUB_NEW_ISSUE_URL = 'https://github.com/podly-pure-podcasts/podly_pure_podcasts/issues/new'; const buildIssueUrl = (title: string, body: string) => { const url = new URL(GITHUB_NEW_ISSUE_URL); url.searchParams.set('title', title); url.searchParams.set('body', body); return url.toString(); }; const formatTs = (ts: number) => { try { return new Date(ts).toISOString(); } catch { return String(ts); } }; export default function DiagnosticsModal() { const { isOpen, close, clear, getEntries, currentError } = useDiagnostics(); const [entries, setEntries] = useState(() => getEntries()); useEffect(() => { if (!isOpen) return; // Refresh immediately when opened setEntries(getEntries()); const handler = () => { setEntries(getEntries()); }; window.addEventListener(DIAGNOSTIC_UPDATED_EVENT, handler); return () => window.removeEventListener(DIAGNOSTIC_UPDATED_EVENT, handler); }, [getEntries, isOpen]); const recentEntries = useMemo(() => entries.slice(-80), [entries]); const issueTitle = currentError?.title ? `[FE] ${currentError.title}` : '[FE] Troubleshooting info'; const issueBody = useMemo(() => { const env = { userAgent: typeof navigator !== 'undefined' ? navigator.userAgent : null, url: typeof window !== 'undefined' ? 
window.location.href : null, time: new Date().toISOString(), }; const payload = { error: currentError, env, logs: recentEntries, }; const json = JSON.stringify(diagnostics.sanitize(payload), null, 2); return [ '## What happened', '(Describe what you clicked / expected / saw)', '', '## Diagnostics (auto-collected)', '```json', json, '```', ].join('\n'); }, [currentError, recentEntries]); const issueUrl = useMemo(() => buildIssueUrl(issueTitle, issueBody), [issueTitle, issueBody]); if (!isOpen) return null; return (

Troubleshooting

{currentError ? 'An error occurred. You can report it with logs.' : 'Use this to collect logs for a bug report.'}

{currentError && (
{currentError.title}
{currentError.message}
)}
Showing last {recentEntries.length} log entries (session only).
Report on GitHub
{recentEntries
  .map((e) => {
    const base = `[${formatTs(e.ts)}] ${e.level.toUpperCase()}: ${e.message}`;
    if (e.data === undefined) return base;
    try {
      return base + `\n  ${JSON.stringify(e.data)}`;
    } catch {
      return base;
    }
  })
  .join('\n')}
              
Sensitive fields like tokens/cookies are redacted.
); } ================================================ FILE: frontend/src/components/DownloadButton.tsx ================================================ import { useState } from 'react'; import { useQueryClient } from '@tanstack/react-query'; import axios from 'axios'; import { feedsApi } from '../services/api'; import ReprocessButton from './ReprocessButton'; import { configApi } from '../services/api'; import { toast } from 'react-hot-toast'; import { useEpisodeStatus } from '../hooks/useEpisodeStatus'; interface DownloadButtonProps { episodeGuid: string; isWhitelisted: boolean; hasProcessedAudio: boolean; feedId?: number; canModifyEpisodes?: boolean; className?: string; } export default function DownloadButton({ episodeGuid, isWhitelisted, hasProcessedAudio, feedId, canModifyEpisodes = true, className = '' }: DownloadButtonProps) { const [error, setError] = useState(null); const queryClient = useQueryClient(); const { data: status } = useEpisodeStatus(episodeGuid, isWhitelisted, hasProcessedAudio, feedId); const isProcessing = status?.status === 'pending' || status?.status === 'running' || status?.status === 'starting'; const isCompleted = hasProcessedAudio || status?.status === 'completed'; const downloadUrl = status?.download_url || (hasProcessedAudio ? `/api/posts/${episodeGuid}/download` : undefined); const handleDownloadClick = async () => { if (!isWhitelisted) { setError('Post must be whitelisted before processing'); return; } // Guard when LLM API key is not configured - use fresh server check try { const { configured } = await configApi.isConfigured(); if (!configured) { toast.error('Add an API key in Config before processing.'); return; } } catch (err) { if (!(axios.isAxiosError(err) && err.response?.status === 403)) { toast.error('Unable to verify configuration. 
Please try again.'); return; } } if (isCompleted && downloadUrl) { // Already processed, download directly try { await feedsApi.downloadPost(episodeGuid); } catch (err) { console.error('Error downloading file:', err); setError('Failed to download file'); } return; } try { setError(null); // Optimistically update status to show processing state immediately queryClient.setQueryData(['episode-status', episodeGuid], { status: 'starting', step: 0, step_name: 'Starting', total_steps: 4, message: 'Requesting processing...' }); const response = await feedsApi.processPost(episodeGuid); // Invalidate to trigger polling in the hook queryClient.invalidateQueries({ queryKey: ['episode-status', episodeGuid] }); if (response.status === 'not_started') { setError('No processing job found'); } } catch (err: unknown) { console.error('Error starting processing:', err); const errorMessage = err && typeof err === 'object' && 'response' in err ? (err as { response?: { data?: { error?: string; message?: string } } }).response?.data?.message || (err as { response?: { data?: { error?: string } } }).response?.data?.error || 'Failed to start processing' : 'Failed to start processing'; setError(errorMessage); // Invalidate to clear optimistic update if failed queryClient.invalidateQueries({ queryKey: ['episode-status', episodeGuid] }); } }; // Show completed state with download button only if (isCompleted && downloadUrl) { return (
{ queryClient.invalidateQueries({ queryKey: ['episode-status', episodeGuid] }); }} />
{error && (
{error}
)}
); } // If user can't modify episodes, don't show the Process button if (!canModifyEpisodes) { return null; } // If processing, hide the button (EpisodeProcessingStatus will show progress) if (isProcessing) { return null; } return (
{/* Error message */} {error && (
{error}
)}
); } ================================================ FILE: frontend/src/components/EpisodeProcessingStatus.tsx ================================================ import { useEpisodeStatus } from '../hooks/useEpisodeStatus'; interface EpisodeProcessingStatusProps { episodeGuid: string; isWhitelisted: boolean; hasProcessedAudio: boolean; feedId?: number; className?: string; } export default function EpisodeProcessingStatus({ episodeGuid, isWhitelisted, hasProcessedAudio, feedId, className = '' }: EpisodeProcessingStatusProps) { const { data: status } = useEpisodeStatus(episodeGuid, isWhitelisted, hasProcessedAudio, feedId); if (!status) return null; // Don't show anything if completed (DownloadButton handles this) or not started if (status.status === 'completed' || status.status === 'not_started') { return null; } const getProgressPercentage = () => { if (!status) return 0; return (status.step / status.total_steps) * 100; }; const getStepIcon = (stepNumber: number) => { if (!status) return '○'; if (status.step > stepNumber) { return '✓'; // Completed } else if (status.step === stepNumber) { return '●'; // Current } else { return '○'; // Not started } }; return (
{/* Progress indicator */}
{/* Progress bar */}
{/* Step indicators */}
{[1, 2, 3, 4].map((stepNumber) => (
stepNumber ? 'text-green-600' : '' }`} > {getStepIcon(stepNumber)} {stepNumber}/4
))}
{/* Current step name */}
{status.step_name}
{/* Error message */} {(status.error || status.status === 'failed' || status.status === 'error') && (
{status.error || 'Processing failed'}
)}
); } ================================================ FILE: frontend/src/components/FeedDetail.tsx ================================================ import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'; import { useState, useEffect, useRef, useMemo } from 'react'; import { toast } from 'react-hot-toast'; import type { Feed, Episode, PagedResult, ConfigResponse } from '../types'; import { feedsApi, configApi } from '../services/api'; import DownloadButton from './DownloadButton'; import PlayButton from './PlayButton'; import ProcessingStatsButton from './ProcessingStatsButton'; import EpisodeProcessingStatus from './EpisodeProcessingStatus'; import { useAuth } from '../contexts/AuthContext'; import { copyToClipboard } from '../utils/clipboard'; import { emitDiagnosticError } from '../utils/diagnostics'; import { getHttpErrorInfo } from '../utils/httpError'; interface FeedDetailProps { feed: Feed; onClose?: () => void; onFeedDeleted?: () => void; } type SortOption = 'newest' | 'oldest' | 'title'; interface ProcessingEstimate { post_guid: string; estimated_minutes: number; can_process: boolean; reason: string | null; } const EPISODES_PAGE_SIZE = 25; export default function FeedDetail({ feed, onClose, onFeedDeleted }: FeedDetailProps) { const { requireAuth, isAuthenticated, user } = useAuth(); const [sortBy, setSortBy] = useState('newest'); const [showStickyHeader, setShowStickyHeader] = useState(false); const [showHelp, setShowHelp] = useState(false); const [showMenu, setShowMenu] = useState(false); const queryClient = useQueryClient(); const scrollContainerRef = useRef(null); const feedHeaderRef = useRef(null); const [currentFeed, setCurrentFeed] = useState(feed); const [pendingEpisode, setPendingEpisode] = useState(null); const [showProcessingModal, setShowProcessingModal] = useState(false); const [processingEstimate, setProcessingEstimate] = useState(null); const [isEstimating, setIsEstimating] = useState(false); const [estimateError, 
setEstimateError] = useState(null); const [page, setPage] = useState(1); const isAdmin = !requireAuth || user?.role === 'admin'; const whitelistedOnly = requireAuth && !isAdmin; const { data: configResponse } = useQuery({ queryKey: ['config'], queryFn: configApi.getConfig, enabled: isAdmin, }); const { data: episodesPage, isLoading, isFetching, error, } = useQuery, Error, PagedResult, [string, number, number, boolean]>({ queryKey: ['episodes', currentFeed.id, page, whitelistedOnly], queryFn: () => feedsApi.getFeedPosts(currentFeed.id, { page, pageSize: EPISODES_PAGE_SIZE, whitelistedOnly, }), placeholderData: (previousData) => previousData, }); const whitelistMutation = useMutation({ mutationFn: ({ guid, whitelisted, triggerProcessing }: { guid: string; whitelisted: boolean; triggerProcessing?: boolean }) => feedsApi.togglePostWhitelist(guid, whitelisted, triggerProcessing), onSuccess: () => { queryClient.invalidateQueries({ queryKey: ['episodes', currentFeed.id] }); }, onError: (err) => { const { status, data, message } = getHttpErrorInfo(err); emitDiagnosticError({ title: 'Failed to update whitelist status', message, kind: status ? 'http' : 'network', details: { status, response: data, }, }); }, }); const bulkWhitelistMutation = useMutation({ mutationFn: () => feedsApi.toggleAllPostsWhitelist(currentFeed.id), onSuccess: () => { queryClient.invalidateQueries({ queryKey: ['episodes', currentFeed.id] }); }, }); const refreshFeedMutation = useMutation({ mutationFn: () => feedsApi.refreshFeed(currentFeed.id), onSuccess: (data) => { queryClient.invalidateQueries({ queryKey: ['feeds'] }); queryClient.invalidateQueries({ queryKey: ['episodes', currentFeed.id] }); toast.success(data?.message ?? 'Feed refreshed'); }, onError: (err) => { console.error('Failed to refresh feed', err); const { status, data, message } = getHttpErrorInfo(err); emitDiagnosticError({ title: 'Failed to refresh feed', message, kind: status ? 
'http' : 'network', details: { status, response: data, feedId: currentFeed.id, }, }); }, }); const updateFeedSettingsMutation = useMutation({ mutationFn: (override: boolean | null) => feedsApi.updateFeedSettings(currentFeed.id, { auto_whitelist_new_episodes_override: override, }), onSuccess: (data) => { setCurrentFeed(data); queryClient.invalidateQueries({ queryKey: ['feeds'] }); toast.success('Feed settings updated'); }, onError: (err) => { const { status, data, message } = getHttpErrorInfo(err); emitDiagnosticError({ title: 'Failed to update feed settings', message, kind: status ? 'http' : 'network', details: { status, response: data, feedId: currentFeed.id, }, }); toast.error('Failed to update feed settings'); }, }); const deleteFeedMutation = useMutation({ mutationFn: () => feedsApi.deleteFeed(currentFeed.id), onSuccess: () => { queryClient.invalidateQueries({ queryKey: ['feeds'] }); if (onFeedDeleted) { onFeedDeleted(); } }, onError: (err) => { console.error('Failed to delete feed', err); const { status, data, message } = getHttpErrorInfo(err); emitDiagnosticError({ title: 'Failed to delete feed', message, kind: status ? 'http' : 'network', details: { status, response: data, feedId: currentFeed.id, }, }); }, }); const joinFeedMutation = useMutation({ mutationFn: () => feedsApi.joinFeed(currentFeed.id), onSuccess: (data) => { toast.success('Joined feed'); setCurrentFeed(data); queryClient.invalidateQueries({ queryKey: ['feeds'] }); }, onError: (err) => { console.error('Failed to join feed', err); const { status, data, message } = getHttpErrorInfo(err); emitDiagnosticError({ title: 'Failed to join feed', message, kind: status ? 'http' : 'network', details: { status, response: data, feedId: currentFeed.id, }, }); }, }); const leaveFeedMutation = useMutation({ mutationFn: () => feedsApi.leaveFeed(currentFeed.id), onSuccess: () => { toast.success('Removed from your feeds'); setCurrentFeed((prev) => (prev ? 
{ ...prev, is_member: false, is_active_subscription: false } : prev)); queryClient.invalidateQueries({ queryKey: ['feeds'] }); if (onFeedDeleted && !isAdmin) { onFeedDeleted(); } }, onError: (err) => { console.error('Failed to leave feed', err); const { status, data, message } = getHttpErrorInfo(err); emitDiagnosticError({ title: 'Failed to remove feed', message, kind: status ? 'http' : 'network', details: { status, response: data, feedId: currentFeed.id, }, }); }, }); useEffect(() => { setCurrentFeed(feed); }, [feed]); useEffect(() => { setPage(1); }, [feed.id, whitelistedOnly]); // Handle scroll to show/hide sticky header useEffect(() => { const scrollContainer = scrollContainerRef.current; const feedHeader = feedHeaderRef.current; if (!scrollContainer || !feedHeader) return; const handleScroll = () => { const scrollTop = scrollContainer.scrollTop; const feedHeaderHeight = feedHeader.offsetHeight; // Show sticky header when scrolled past the feed header setShowStickyHeader(scrollTop > feedHeaderHeight - 100); }; scrollContainer.addEventListener('scroll', handleScroll); return () => scrollContainer.removeEventListener('scroll', handleScroll); }, []); // Handle click outside to close menu useEffect(() => { const handleClickOutside = (event: MouseEvent) => { if (showMenu && !(event.target as Element).closest('.menu-container')) { setShowMenu(false); } }; document.addEventListener('mousedown', handleClickOutside); return () => document.removeEventListener('mousedown', handleClickOutside); }, [showMenu]); const handleWhitelistToggle = (episode: Episode) => { if (!episode.whitelisted) { setPendingEpisode(episode); setShowProcessingModal(true); setProcessingEstimate(null); setEstimateError(null); setIsEstimating(true); feedsApi .getProcessingEstimate(episode.guid) .then((estimate) => { setProcessingEstimate(estimate); }) .catch((err) => { console.error('Failed to load processing estimate', err); const { status, data, message } = getHttpErrorInfo(err); 
emitDiagnosticError({ title: 'Failed to load processing estimate', message, kind: status ? 'http' : 'network', details: { status, response: data, postGuid: episode.guid, }, }); setEstimateError(message ?? 'Unable to estimate processing time'); }) .finally(() => setIsEstimating(false)); return; } whitelistMutation.mutate({ guid: episode.guid, whitelisted: false, }); }; const handleConfirmProcessing = () => { if (!pendingEpisode) return; whitelistMutation.mutate( { guid: pendingEpisode.guid, whitelisted: true, triggerProcessing: true, }, { onSuccess: () => { setShowProcessingModal(false); setPendingEpisode(null); setProcessingEstimate(null); }, } ); }; const handleCancelProcessing = () => { setShowProcessingModal(false); setPendingEpisode(null); setProcessingEstimate(null); setEstimateError(null); }; const handleAutoWhitelistOverrideChange = (value: string) => { const override = value === 'inherit' ? null : value === 'on'; updateFeedSettingsMutation.mutate(override); }; const isMember = Boolean(currentFeed.is_member); const isActiveSubscription = currentFeed.is_active_subscription !== false; // Admins can manage everything; regular users are read-only. const canDeleteFeed = isAdmin; // only admins can delete feeds const canModifyEpisodes = !requireAuth ? true : Boolean(isAdmin); const canBulkModifyEpisodes = !requireAuth ? true : Boolean(isAdmin); const canSubscribe = !requireAuth || isMember; const showPodlyRssButton = !(requireAuth && isAdmin && !isMember); const showWhitelistUi = canModifyEpisodes && isAdmin; const appAutoWhitelistDefault = configResponse?.config?.app?.automatically_whitelist_new_episodes; const autoWhitelistDefaultLabel = appAutoWhitelistDefault === undefined ? 'Unknown' : appAutoWhitelistDefault ? 'On' : 'Off'; const autoWhitelistOverrideValue = currentFeed.auto_whitelist_new_episodes_override ?? null; const autoWhitelistSelectValue = autoWhitelistOverrideValue === true ? 'on' : autoWhitelistOverrideValue === false ? 
'off' : 'inherit'; const episodes = episodesPage?.items ?? []; const totalCount = episodesPage?.total ?? 0; const whitelistedCount = episodesPage?.whitelisted_total ?? episodes.filter((ep: Episode) => ep.whitelisted).length; const totalPages = Math.max( 1, episodesPage?.total_pages ?? Math.ceil(totalCount / EPISODES_PAGE_SIZE) ); const hasEpisodes = totalCount > 0; const visibleStart = hasEpisodes ? (page - 1) * EPISODES_PAGE_SIZE + 1 : 0; const visibleEnd = hasEpisodes ? Math.min(totalCount, page * EPISODES_PAGE_SIZE) : 0; useEffect(() => { if (page > totalPages && totalPages > 0) { setPage(totalPages); } }, [page, totalPages]); const handleBulkWhitelistToggle = () => { if (requireAuth && !isAdmin) { toast.error('Only admins can bulk toggle whitelist status.'); return; } bulkWhitelistMutation.mutate(); }; const handleDeleteFeed = () => { if (confirm(`Are you sure you want to delete "${currentFeed.title}"? This action cannot be undone.`)) { deleteFeedMutation.mutate(); } }; const episodesToShow = useMemo(() => episodes, [episodes]); const sortedEpisodes = useMemo(() => { const list = [...episodesToShow]; return list.sort((a, b) => { switch (sortBy) { case 'newest': return new Date(b.release_date || 0).getTime() - new Date(a.release_date || 0).getTime(); case 'oldest': return new Date(a.release_date || 0).getTime() - new Date(b.release_date || 0).getTime(); case 'title': return a.title.localeCompare(b.title); default: return 0; } }); }, [episodesToShow, sortBy]); // Calculate whitelist status for bulk button const allWhitelisted = totalCount > 0 && whitelistedCount === totalCount; const formatDate = (dateString: string | null) => { if (!dateString) return 'Unknown date'; return new Date(dateString).toLocaleDateString('en-US', { year: 'numeric', month: 'short', day: 'numeric' }); }; const formatDuration = (seconds: number | null) => { if (!seconds) return ''; const hours = Math.floor(seconds / 3600); const minutes = Math.floor((seconds % 3600) / 60); if (hours > 0) { 
return `${hours}h ${minutes}m`; } return `${minutes}m`; }; const handleCopyRssToClipboard = async () => { if (requireAuth && !isAuthenticated) { toast.error('Please sign in to copy a protected RSS URL.'); return; } try { let rssUrl: string; if (requireAuth) { const response = await feedsApi.createProtectedFeedShareLink(currentFeed.id); rssUrl = response.url; } else { rssUrl = new URL(`/feed/${currentFeed.id}`, window.location.origin).toString(); } await copyToClipboard(rssUrl, 'Copy the Feed RSS URL:', 'Feed URL copied to clipboard!'); } catch (err) { console.error('Failed to copy feed URL', err); toast.error('Failed to copy feed URL'); } }; const handleCopyOriginalRssToClipboard = async () => { try { const rssUrl = currentFeed.rss_url || ''; if (!rssUrl) throw new Error('No RSS URL'); await copyToClipboard(rssUrl, 'Copy the Original RSS URL:', 'Original RSS URL copied to clipboard'); } catch (err) { console.error('Failed to copy original RSS URL', err); toast.error('Failed to copy original RSS URL'); } }; return (
{/* Mobile Header */}

Podcast Details

{onClose && ( )}
{/* Sticky Header - appears when scrolling */}
{currentFeed.image_url && ( {currentFeed.title} )}

{currentFeed.title}

{currentFeed.author && (

by {currentFeed.author}

)}
{/* do not add additional controls to sticky headers */}
{/* Scrollable Content */}
{/* Feed Info Header */}
{/* Top Section: Image and Title */}
{/* Podcast Image */}
{currentFeed.image_url ? ( {currentFeed.title} ) : (
)}
{/* Title aligned to bottom-left of image */}

{currentFeed.title}

{currentFeed.author && (

by {currentFeed.author}

)}
{totalCount} episodes visible
{requireAuth && isAdmin && (
{isMember ? 'Joined' : 'Not joined'} {isMember && !isActiveSubscription && ( Paused )}
)}
{/* RSS Button and Menu */}
{/* Podly RSS Subscribe Button */} {showPodlyRssButton && ( )} {requireAuth && isAdmin && !isMember && ( )} {canModifyEpisodes && ( )} {/* Ellipsis Menu */}
{/* Dropdown Menu */} {showMenu && (
{canBulkModifyEpisodes && ( <> )} {isAdmin && ( )} {requireAuth && isAdmin && isMember && ( <>
)} {canDeleteFeed && ( <>
)}
)}
{/* Feed Description */} {currentFeed.description && (

{currentFeed.description.replace(/<[^>]*>/g, '')}

)} {isAdmin && (

Overrides the global setting. Global default: {autoWhitelistDefaultLabel}.

)}
{/* Inactive Subscription Warning */} {currentFeed.is_member && currentFeed.is_active_subscription === false && (

Processing Paused

This feed exceeds your plan's allowance. New episodes will not be processed automatically until you upgrade your plan or leave other feeds.

)} {/* Episodes Header with Sort Only */}

Episodes

{/* Help Explainer (admins only) */} {showHelp && isAdmin && (

About Enabling & Disabling Ad Removal

Enabled episodes are processed by Podly to automatically detect and remove advertisements, giving you a clean, ad-free listening experience.

Disabled episodes are not processed and won't be available for download through Podly. This is useful for episodes you don't want to listen to.

Why whitelist episodes? Processing takes time and computational resources. Enable only the episodes you want to hear to keep your feed focused. This is useful when adding a new feed with a large back catalog.

)} {/* Episodes List */}
{isLoading ? (
{[...Array(5)].map((_, i) => (
))}
) : error ? (

Failed to load episodes

) : sortedEpisodes.length === 0 ? (

No episodes found

) : (
{sortedEpisodes.map((episode) => (
{/* Top Section: Thumbnail and Title */}
{/* Episode/Podcast Thumbnail */}
{(episode.image_url || currentFeed.image_url) ? ( {episode.title} ) : (
)}
{/* Title and Feed Name */}

{episode.title}

{currentFeed.title}

{/* Episode Description */} {episode.description && (

{episode.description.replace(/<[^>]*>/g, '').substring(0, 300)}...

)} {/* Metadata: Status, Date and Duration */}
{showWhitelistUi && ( <> )} {formatDate(episode.release_date)} {episode.duration && ( <> {formatDuration(episode.duration)} )} <> {episode.download_count ? episode.download_count : 0} {episode.download_count === 1 ? 'download' : 'downloads'}
{/* Bottom Controls - only show if episode is whitelisted */} {episode.whitelisted && (
{/* Left side: Download buttons */}
{/* Right side: Play button */}
{episode.has_processed_audio && ( )}
)}
))}
)}
{totalCount > 0 && (
Showing {visibleStart}-{visibleEnd} of {totalCount} episodes
Page {page} of {totalPages}
)}
{showProcessingModal && pendingEpisode && (
event.stopPropagation()} >

Enable episode

{pendingEpisode.title}

{isEstimating && (
Estimating processing time…
)} {!isEstimating && estimateError && (

{estimateError}

)} {!isEstimating && processingEstimate && (

Estimated minutes: {processingEstimate.estimated_minutes.toFixed(2)}

{!processingEstimate.can_process && (

Processing not available for this episode.

)}
)}
)}
); } ================================================ FILE: frontend/src/components/FeedList.tsx ================================================ import { useMemo, useState } from 'react'; import { useAuth } from '../contexts/AuthContext'; import type { Feed } from '../types'; interface FeedListProps { feeds: Feed[]; onFeedDeleted: () => void; onFeedSelected: (feed: Feed) => void; selectedFeedId?: number; } export default function FeedList({ feeds, onFeedDeleted: _onFeedDeleted, onFeedSelected, selectedFeedId }: FeedListProps) { const [searchTerm, setSearchTerm] = useState(''); const { requireAuth, user } = useAuth(); const showMembership = Boolean(requireAuth && user?.role === 'admin'); // Ensure feeds is an array const feedsArray = Array.isArray(feeds) ? feeds : []; const filteredFeeds = useMemo(() => { const term = searchTerm.trim().toLowerCase(); if (!term) { return feedsArray; } return feedsArray.filter((feed) => { const title = feed.title?.toLowerCase() ?? ''; const author = feed.author?.toLowerCase() ?? ''; return title.includes(term) || author.includes(term); }); }, [feedsArray, searchTerm]); if (feedsArray.length === 0) { return (

No podcast feeds added yet.

Click "Add Feed" to get started.

); } return (
setSearchTerm(event.target.value)} className="w-full rounded-lg border border-gray-300 bg-white px-3 py-2 text-sm text-gray-900 placeholder:text-gray-500 focus:border-blue-500 focus:outline-none focus:ring-2 focus:ring-blue-200" />
{filteredFeeds.length === 0 ? (

No podcasts match "{searchTerm}"

) : ( filteredFeeds.map((feed) => (
onFeedSelected(feed)} >
{/* Podcast Image */}
{feed.image_url ? ( {feed.title} ) : (
)}
{/* Feed Info */}

{feed.title}

{feed.author && (

by {feed.author}

)}
{feed.posts_count} episodes {showMembership && (
{feed.is_member ? 'Joined' : 'Not joined'} {feed.is_member && feed.is_active_subscription === false && ( Paused )}
)}
)) )}
); } ================================================ FILE: frontend/src/components/PlayButton.tsx ================================================ import { useAudioPlayer } from '../contexts/AudioPlayerContext'; import type { Episode } from '../types'; interface PlayButtonProps { episode: Episode; className?: string; } const PlayIcon = ({ className }: { className: string }) => ( ); const PauseIcon = ({ className }: { className: string }) => ( ); export default function PlayButton({ episode, className = '' }: PlayButtonProps) { const { currentEpisode, isPlaying, isLoading, playEpisode, togglePlayPause } = useAudioPlayer(); const isCurrentEpisode = currentEpisode?.id === episode.id; const canPlay = episode.has_processed_audio; console.log(`PlayButton for "${episode.title}":`, { has_processed_audio: episode.has_processed_audio, whitelisted: episode.whitelisted, canPlay }); const getDisabledReason = () => { if (!episode.has_processed_audio) { return 'Episode not processed yet'; } return ''; }; const handleClick = () => { console.log('PlayButton clicked for episode:', episode.title); console.log('canPlay:', canPlay); console.log('isCurrentEpisode:', isCurrentEpisode); if (!canPlay) return; if (isCurrentEpisode) { console.log('Toggling play/pause for current episode'); togglePlayPause(); } else { console.log('Playing new episode'); playEpisode(episode); } }; const isDisabled = !canPlay || (isLoading && isCurrentEpisode); const disabledReason = getDisabledReason(); const title = isDisabled && disabledReason ? disabledReason : isCurrentEpisode ? (isPlaying ? 
'Pause' : 'Play') : 'Play episode'; return ( ); } ================================================ FILE: frontend/src/components/ProcessingStatsButton.tsx ================================================ import { useState } from 'react'; import { useQuery } from '@tanstack/react-query'; import { feedsApi } from '../services/api'; interface ProcessingStatsButtonProps { episodeGuid: string; hasProcessedAudio: boolean; className?: string; } export default function ProcessingStatsButton({ episodeGuid, hasProcessedAudio, className = '' }: ProcessingStatsButtonProps) { const [showModal, setShowModal] = useState(false); const [activeTab, setActiveTab] = useState<'overview' | 'model-calls' | 'transcript' | 'identifications'>('overview'); const [expandedModelCalls, setExpandedModelCalls] = useState>(new Set()); const { data: stats, isLoading, error } = useQuery({ queryKey: ['episode-stats', episodeGuid], queryFn: () => feedsApi.getPostStats(episodeGuid), enabled: showModal && hasProcessedAudio, // Only fetch when modal is open and episode is processed }); const formatDuration = (seconds: number) => { const hours = Math.floor(seconds / 3600); const minutes = Math.floor((seconds % 3600) / 60); const secs = Math.round(seconds % 60); // Round to nearest whole second if (hours > 0) { return `${hours}h ${minutes}m ${secs}s`; } return `${minutes}m ${secs}s`; }; const formatTimestamp = (timestamp: string | null) => { if (!timestamp) return 'N/A'; return new Date(timestamp).toLocaleString(); }; const toggleModelCallDetails = (callId: number) => { const newExpanded = new Set(expandedModelCalls); if (newExpanded.has(callId)) { newExpanded.delete(callId); } else { newExpanded.add(callId); } setExpandedModelCalls(newExpanded); }; if (!hasProcessedAudio) { return null; } return ( <> {/* Modal */} {showModal && (
{/* Header */}

Processing Statistics & Debug

{/* Tabs */}
{/* Content */}
{isLoading ? (
Loading stats...
) : error ? (

Failed to load processing statistics

) : stats ? ( <> {/* Overview Tab */} {activeTab === 'overview' && (
{/* Episode Info */}

Episode Information

Title: {stats.post?.title || 'Unknown'}
Duration: {stats.post?.duration ? formatDuration(stats.post.duration) : 'Unknown'}
{/* Key Metrics */}

Key Metrics

{stats.processing_stats?.total_segments || 0}
Transcript Segments
{stats.processing_stats?.content_segments || 0}
Content Segments
{stats.processing_stats?.ad_segments_count || 0}
Ad Segments Removed
{/* Model Performance */}

AI Model Performance

{/* Model Call Status */}

Processing Status

{Object.entries(stats.processing_stats?.model_call_statuses || {}).map(([status, count]) => (
{status} {count}
))}
{/* Model Types */}

Models Used

{Object.entries(stats.processing_stats?.model_types || {}).map(([model, count]) => (
{model} {count} calls
))}
)} {/* Model Calls Tab */} {activeTab === 'model-calls' && (

Model Calls ({stats.model_calls?.length || 0})

{(stats.model_calls || []).map((call) => ( <> {expandedModelCalls.has(call.id) && ( )} ))}
ID Model Segment Range Status Timestamp Retries Actions
{call.id} {call.model_name} {call.segment_range} {call.status} {formatTimestamp(call.timestamp)} {call.retry_attempts}
{call.prompt && (
Prompt:
{call.prompt}
)} {call.error_message && (
Error Message:
{call.error_message}
)} {call.response && (
Response:
{call.response}
)}
)} {/* Transcript Segments Tab */} {activeTab === 'transcript' && (

Transcript Segments ({stats.transcript_segments?.length || 0})

{(stats.transcript_segments || []).map((segment) => ( ))}
Seq # Time Range Label Text
{segment.sequence_num} {segment.start_time}s - {segment.end_time}s {segment.primary_label === 'ad' ? (segment.mixed ? 'Ad (mixed)' : 'Ad') : 'Content'}
{segment.text}
)} {/* Identifications Tab */} {activeTab === 'identifications' && (

Identifications ({stats.identifications?.length || 0})

{(stats.identifications || []).map((identification) => ( ))}
ID Segment ID Time Range Label Confidence Model Call Text
{identification.id} {identification.transcript_segment_id} {identification.segment_start_time}s - {identification.segment_end_time}s {identification.label === 'ad' ? (identification.mixed ? 'ad (mixed)' : 'ad') : identification.label} {identification.confidence ? identification.confidence.toFixed(2) : 'N/A'} {identification.model_call_id}
{identification.segment_text}
)} ) : null}
)} ); } ================================================ FILE: frontend/src/components/ReprocessButton.tsx ================================================ import { useState } from 'react'; import { useQueryClient } from '@tanstack/react-query'; import { feedsApi } from '../services/api'; interface ReprocessButtonProps { episodeGuid: string; isWhitelisted: boolean; feedId?: number; canModifyEpisodes?: boolean; className?: string; onReprocessStart?: () => void; } export default function ReprocessButton({ episodeGuid, isWhitelisted, feedId, canModifyEpisodes = true, className = '', onReprocessStart }: ReprocessButtonProps) { const [isReprocessing, setIsReprocessing] = useState(false); const [error, setError] = useState(null); const [showModal, setShowModal] = useState(false); const queryClient = useQueryClient(); const handleReprocessClick = async () => { if (!isWhitelisted) { setError('Post must be whitelisted before reprocessing'); return; } setShowModal(true); }; const handleConfirmReprocess = async () => { setShowModal(false); setIsReprocessing(true); setError(null); try { const response = await feedsApi.reprocessPost(episodeGuid); if (response.status === 'started') { // Notify parent component that reprocessing started onReprocessStart?.(); // Invalidate queries to refresh the UI if (feedId) { queryClient.invalidateQueries({ queryKey: ['episodes', feedId] }); } queryClient.invalidateQueries({ queryKey: ['episode-stats', episodeGuid] }); } else { setError(response.message || 'Failed to start reprocessing'); } } catch (err: unknown) { console.error('Error starting reprocessing:', err); const errorMessage = err && typeof err === 'object' && 'response' in err ? (err as { response?: { data?: { message?: string } } }).response?.data?.message || 'Failed to start reprocessing' : 'Failed to start reprocessing'; setError(errorMessage); } finally { setIsReprocessing(false); } }; if (!isWhitelisted || !canModifyEpisodes) { return null; } return (
{error && (
{error}
)} {/* Confirmation Modal */} {showModal && (
{/* Header */}

Confirm Reprocess

{/* Content */}

Are you sure you want to reprocess this episode? This will delete the existing processed data and start fresh processing.

{/* Action Buttons */}
)}
); } ================================================ FILE: frontend/src/components/config/ConfigContext.tsx ================================================ import { createContext, useContext } from 'react'; import type { UseConfigStateReturn } from '../../hooks/useConfigState'; export type ConfigTabId = 'default' | 'advanced' | 'users' | 'discord'; export type AdvancedSubtab = 'llm' | 'whisper' | 'processing' | 'output' | 'app'; export interface ConfigContextValue extends UseConfigStateReturn { activeTab: ConfigTabId; setActiveTab: (tab: ConfigTabId) => void; activeSubtab: AdvancedSubtab; setActiveSubtab: (subtab: AdvancedSubtab) => void; isAdmin: boolean; showSecurityControls: boolean; } export const ConfigContext = createContext(null); export function useConfigContext(): ConfigContextValue { const context = useContext(ConfigContext); if (!context) { throw new Error('useConfigContext must be used within ConfigProvider'); } return context; } ================================================ FILE: frontend/src/components/config/ConfigTabs.tsx ================================================ import { useMemo, useEffect, useCallback } from 'react'; import { useSearchParams } from 'react-router-dom'; import { useAuth } from '../../contexts/AuthContext'; import useConfigState from '../../hooks/useConfigState'; import { ConfigContext, type ConfigTabId, type AdvancedSubtab } from './ConfigContext'; import { EnvOverrideWarningModal } from './shared'; import DefaultTab from './tabs/DefaultTab'; import AdvancedTab from './tabs/AdvancedTab'; import UserManagementTab from './tabs/UserManagementTab'; import DiscordTab from './tabs/DiscordTab'; const TABS: { id: ConfigTabId; label: string; adminOnly?: boolean }[] = [ { id: 'default', label: 'Default' }, { id: 'advanced', label: 'Advanced' }, { id: 'users', label: 'User Management', adminOnly: true }, { id: 'discord', label: 'Discord', adminOnly: true }, ]; export default function ConfigTabs() { const [searchParams, 
setSearchParams] = useSearchParams(); const { user, requireAuth } = useAuth(); const configState = useConfigState(); const showSecurityControls = requireAuth && !!user; const isAdmin = !requireAuth || (showSecurityControls && user?.role === 'admin'); // Get tab from URL or default const activeTab = useMemo(() => { const urlTab = searchParams.get('tab') as ConfigTabId | null; if (urlTab && TABS.some((t) => t.id === urlTab)) { // Check admin-only tabs const tab = TABS.find((t) => t.id === urlTab); if (tab?.adminOnly && !isAdmin) { return 'default'; } if (urlTab === 'users' && !requireAuth) { return 'default'; } return urlTab; } return 'default'; }, [searchParams, isAdmin, requireAuth]); const activeSubtab = useMemo(() => { const urlSubtab = searchParams.get('section') as AdvancedSubtab | null; if (urlSubtab && ['llm', 'whisper', 'processing', 'output', 'app'].includes(urlSubtab)) { return urlSubtab; } return 'llm'; }, [searchParams]); const setActiveTab = useCallback((tab: ConfigTabId) => { setSearchParams((prev) => { const newParams = new URLSearchParams(prev); newParams.set('tab', tab); if (tab !== 'advanced') { newParams.delete('section'); } return newParams; }, { replace: true }); }, [setSearchParams]); const setActiveSubtab = useCallback((subtab: AdvancedSubtab) => { setSearchParams((prev) => { const newParams = new URLSearchParams(prev); newParams.set('section', subtab); return newParams; }, { replace: true }); }, [setSearchParams]); // Redirect if on admin-only tab without permission useEffect(() => { const tab = TABS.find((t) => t.id === activeTab); if (tab?.adminOnly && !isAdmin) { setActiveTab('default'); } }, [isAdmin, activeTab, setActiveTab]); const contextValue = useMemo( () => ({ ...configState, activeTab, setActiveTab, activeSubtab, setActiveSubtab, isAdmin, showSecurityControls, }), [configState, activeTab, setActiveTab, activeSubtab, setActiveSubtab, isAdmin, showSecurityControls] ); const visibleTabs = TABS.filter((tab) => { if (tab.id === 'users' 
&& !requireAuth) return false; return !tab.adminOnly || isAdmin; }); if (configState.isLoading || !configState.pending) { return
Loading configuration...
; } return (

Configuration

{/* Tab Navigation */}
{/* Tab Content */}
{activeTab === 'default' && } {activeTab === 'advanced' && } {activeTab === 'users' && isAdmin && } {activeTab === 'discord' && isAdmin && }
{/* Env Warning Modal */} {configState.showEnvWarning && configState.envWarningPaths.length > 0 && ( )} {/* Extra padding to prevent audio player overlay from obscuring bottom settings */}
); } ================================================ FILE: frontend/src/components/config/index.ts ================================================ export { default as ConfigTabs } from './ConfigTabs'; export { ConfigContext, useConfigContext } from './ConfigContext'; export type { ConfigTabId, AdvancedSubtab, ConfigContextValue } from './ConfigContext'; // Re-export tabs export * from './tabs'; // Re-export sections export * from './sections'; // Re-export shared components export * from './shared'; ================================================ FILE: frontend/src/components/config/sections/AppSection.tsx ================================================ import { useConfigContext } from '../ConfigContext'; import { Section, Field, SaveButton } from '../shared'; export default function AppSection() { const { pending, setField, handleSave, isSaving } = useConfigContext(); if (!pending) return null; return (
setField( ['app', 'background_update_interval_minute'], e.target.value === '' ? null : Number(e.target.value) ) } /> setField( ['app', 'post_cleanup_retention_days'], e.target.value === '' ? null : Number(e.target.value) ) } /> setField(['app', 'automatically_whitelist_new_episodes'], e.target.checked) } /> setField( ['app', 'number_of_episodes_to_whitelist_from_archive_of_new_feed'], Number(e.target.value) ) } />
); } ================================================ FILE: frontend/src/components/config/sections/LLMSection.tsx ================================================ import { useState } from 'react'; import { toast } from 'react-hot-toast'; import { configApi } from '../../../services/api'; import { useConfigContext } from '../ConfigContext'; import { Section, Field, SaveButton, TestButton } from '../shared'; import type { LLMConfig } from '../../../types'; const LLM_MODEL_ALIASES: string[] = [ 'openai/gpt-4', 'openai/gpt-4o', 'anthropic/claude-3.5-sonnet', 'anthropic/claude-3.5-haiku', 'gemini/gemini-3-flash-preview', 'gemini/gemini-2.0-flash', 'gemini/gemini-1.5-pro', 'gemini/gemini-1.5-flash', 'groq/openai/gpt-oss-120b', ]; export default function LLMSection() { const { pending, setField, getEnvHint, handleSave, isSaving } = useConfigContext(); const [showBaseUrlInfo, setShowBaseUrlInfo] = useState(false); if (!pending) return null; const handleTestLLM = () => { toast.promise(configApi.testLLM({ llm: pending.llm as LLMConfig }), { loading: 'Testing LLM connection...', success: (res: { ok: boolean; message?: string }) => res?.message || 'LLM connection OK', error: (err: unknown) => { const e = err as { response?: { data?: { error?: string; message?: string } }; message?: string; }; return ( e?.response?.data?.error || e?.response?.data?.message || e?.message || 'LLM connection failed' ); }, }); }; return (
setField(['llm', 'llm_api_key'], e.target.value)} />
setField(['llm', 'llm_model'], e.target.value)} placeholder="e.g. groq/openai/gpt-oss-120b" />
setField(['llm', 'openai_timeout'], Number(e.target.value))} /> setField(['llm', 'openai_max_tokens'], Number(e.target.value))} /> setField(['llm', 'llm_max_concurrent_calls'], Number(e.target.value))} /> setField(['llm', 'llm_max_retry_attempts'], Number(e.target.value))} /> setField(['llm', 'llm_enable_token_rate_limiting'], e.target.checked)} /> setField(['llm', 'enable_boundary_refinement'], e.target.checked)} /> setField(['llm', 'enable_word_level_boundary_refinder'], e.target.checked) } /> setField( ['llm', 'llm_max_input_tokens_per_call'], e.target.value === '' ? null : Number(e.target.value) ) } /> setField( ['llm', 'llm_max_input_tokens_per_minute'], e.target.value === '' ? null : Number(e.target.value) ) } />
{/* Datalist for model suggestions */} {LLM_MODEL_ALIASES.map((m) => (
); } function BaseUrlInfoBox() { return (

When is Base URL used?

The Base URL is only used for models without a provider prefix. LiteLLM automatically routes provider-prefixed models to their respective APIs.

✅ Base URL is IGNORED for:

  • groq/openai/gpt-oss-120b → Groq API
  • anthropic/claude-3.5-sonnet → Anthropic API
  • gemini/gemini-3-flash-preview → Google API
  • gemini/gemini-2.0-flash → Google API

⚙️ Base URL is USED for:

  • Unprefixed models like gpt-4o
  • Self-hosted OpenAI-compatible endpoints
  • LiteLLM proxy servers or local LLMs

For the default Groq setup, you don't need to set this.

); } ================================================ FILE: frontend/src/components/config/sections/OutputSection.tsx ================================================ import { useConfigContext } from '../ConfigContext'; import { Section, Field, SaveButton } from '../shared'; export default function OutputSection() { const { pending, setField, handleSave, isSaving } = useConfigContext(); if (!pending) return null; return (
setField(['output', 'fade_ms'], Number(e.target.value))} /> setField(['output', 'min_ad_segement_separation_seconds'], Number(e.target.value)) } /> setField(['output', 'min_ad_segment_length_seconds'], Number(e.target.value)) } /> setField(['output', 'min_confidence'], Number(e.target.value))} />
); } ================================================ FILE: frontend/src/components/config/sections/ProcessingSection.tsx ================================================ import { useConfigContext } from '../ConfigContext'; import { Section, Field, SaveButton } from '../shared'; export default function ProcessingSection() { const { pending, setField, handleSave, isSaving } = useConfigContext(); if (!pending) return null; return (
setField(['processing', 'num_segments_to_input_to_prompt'], Number(e.target.value)) } />
); } ================================================ FILE: frontend/src/components/config/sections/WhisperSection.tsx ================================================ import { useMemo } from 'react'; import { toast } from 'react-hot-toast'; import { configApi } from '../../../services/api'; import { useConfigContext } from '../ConfigContext'; import { Section, Field, SaveButton, TestButton } from '../shared'; import type { WhisperConfig } from '../../../types'; export default function WhisperSection() { const { pending, setField, getEnvHint, handleSave, isSaving, localWhisperAvailable, handleWhisperTypeChange, getWhisperApiKey, envOverrides, } = useConfigContext(); const whisperApiKeyPreview = pending?.whisper?.whisper_type === 'remote' || pending?.whisper?.whisper_type === 'groq' ? (pending.whisper as { api_key_preview?: string }).api_key_preview : undefined; const whisperApiKeyPlaceholder = useMemo(() => { if (pending?.whisper?.whisper_type === 'remote' || pending?.whisper?.whisper_type === 'groq') { if (whisperApiKeyPreview) { return whisperApiKeyPreview; } const override = envOverrides['whisper.api_key']; if (override) { return override.value_preview || override.value || ''; } } return ''; }, [whisperApiKeyPreview, pending?.whisper?.whisper_type, envOverrides]); if (!pending) return null; const handleTestWhisper = () => { toast.promise(configApi.testWhisper({ whisper: pending.whisper as WhisperConfig }), { loading: 'Testing Whisper...', success: (res: { ok: boolean; message?: string }) => res?.message || 'Whisper OK', error: (err: unknown) => { const e = err as { response?: { data?: { error?: string; message?: string } }; message?: string; }; return ( e?.response?.data?.error || e?.response?.data?.message || e?.message || 'Whisper test failed' ); }, }); }; const whisperType = pending?.whisper?.whisper_type ?? (localWhisperAvailable === false ? 'remote' : 'local'); return (
{/* Local Whisper Options */} {pending?.whisper?.whisper_type === 'local' && ( setField(['whisper', 'model'], e.target.value)} /> )} {/* Remote Whisper Options */} {pending?.whisper?.whisper_type === 'remote' && (
setField(['whisper', 'api_key'], e.target.value)} /> setField(['whisper', 'model'], e.target.value)} /> setField(['whisper', 'base_url'], e.target.value)} /> setField(['whisper', 'language'], e.target.value)} /> setField(['whisper', 'timeout_sec'], Number(e.target.value))} /> setField(['whisper', 'chunksize_mb'], Number(e.target.value))} />
)} {/* Groq Whisper Options */} {pending?.whisper?.whisper_type === 'groq' && (
setField(['whisper', 'api_key'], e.target.value)} /> setField(['whisper', 'model'], e.target.value)} /> setField(['whisper', 'language'], e.target.value)} /> setField(['whisper', 'max_retries'], Number(e.target.value))} />
)}
); } ================================================ FILE: frontend/src/components/config/sections/index.ts ================================================ export { default as LLMSection } from './LLMSection'; export { default as WhisperSection } from './WhisperSection'; export { default as ProcessingSection } from './ProcessingSection'; export { default as OutputSection } from './OutputSection'; export { default as AppSection } from './AppSection'; ================================================ FILE: frontend/src/components/config/shared/ConnectionStatusCard.tsx ================================================ interface ConnectionStatusCardProps { title: string; status: 'loading' | 'ok' | 'error'; message: string; error?: string; onRetry: () => void; } export default function ConnectionStatusCard({ title, status, message, error, onRetry, }: ConnectionStatusCardProps) { const statusColor = status === 'ok' ? 'text-green-700' : status === 'error' ? 'text-red-700' : 'text-gray-600'; const displayMessage = status === 'loading' ? 'Testing...' : status === 'ok' ? message || `${title} connection OK` : error || `${title} connection failed`; return (
{title}
{displayMessage}
); } ================================================ FILE: frontend/src/components/config/shared/EnvOverrideWarningModal.tsx ================================================ import type { EnvOverrideMap } from '../../../types'; import { ENV_FIELD_LABELS } from './constants'; interface EnvOverrideWarningModalProps { paths: string[]; overrides: EnvOverrideMap; onConfirm: () => void; onCancel: () => void; } export default function EnvOverrideWarningModal({ paths, overrides, onConfirm, onCancel, }: EnvOverrideWarningModalProps) { if (!paths.length) { return null; } return (

Environment-managed settings

These fields are controlled by environment variables. Update the referenced variables in your .env (or deployment secrets) to make the change persistent. Your manual change will be saved, but will be overwritten if you modify your environment variables in the future.

    {paths.map((path) => { const meta = overrides[path]; const label = ENV_FIELD_LABELS[path] ?? path; return (
  • {label}
    {meta?.env_var ? (

    Managed by {meta.env_var} {meta?.value_preview && ( ({meta.value_preview}) )} {!meta?.value_preview && meta?.value && ( ({meta.value}) )}

    ) : (

    Managed by deployment environment

    )}
  • ); })}
); } ================================================ FILE: frontend/src/components/config/shared/EnvVarHint.tsx ================================================ import type { EnvOverrideEntry } from '../../../types'; interface EnvVarHintProps { meta?: EnvOverrideEntry; } export default function EnvVarHint({ meta }: EnvVarHintProps) { if (!meta?.env_var) { return null; } return ( {meta.env_var} ); } ================================================ FILE: frontend/src/components/config/shared/Field.tsx ================================================ import type { ReactNode } from 'react'; import type { EnvOverrideEntry } from '../../../types'; import EnvVarHint from './EnvVarHint'; interface FieldProps { label: string; children: ReactNode; envMeta?: EnvOverrideEntry; labelWidth?: string; hint?: string; } export default function Field({ label, children, envMeta, labelWidth = 'w-60', hint, }: FieldProps) { return ( ); } ================================================ FILE: frontend/src/components/config/shared/SaveButton.tsx ================================================ interface SaveButtonProps { onSave: () => void; isPending: boolean; className?: string; } export default function SaveButton({ onSave, isPending, className = '' }: SaveButtonProps) { return (
); } ================================================ FILE: frontend/src/components/config/shared/Section.tsx ================================================ import type { ReactNode } from 'react'; interface SectionProps { title: string; children: ReactNode; className?: string; } export default function Section({ title, children, className = '' }: SectionProps) { return (

{title}

{children}
); } ================================================ FILE: frontend/src/components/config/shared/TestButton.tsx ================================================ interface TestButtonProps { onClick: () => void; label: string; className?: string; } export default function TestButton({ onClick, label, className = '' }: TestButtonProps) { return (
); } ================================================ FILE: frontend/src/components/config/shared/constants.ts ================================================ export const ENV_FIELD_LABELS: Record = { 'groq.api_key': 'Groq API Key', 'llm.llm_api_key': 'LLM API Key', 'llm.llm_model': 'LLM Model', 'llm.openai_base_url': 'LLM Base URL', 'whisper.whisper_type': 'Whisper Mode', 'whisper.api_key': 'Whisper API Key', 'whisper.model': 'Whisper Model', 'whisper.base_url': 'Whisper Base URL', 'whisper.timeout_sec': 'Whisper Timeout (sec)', 'whisper.chunksize_mb': 'Whisper Chunk Size (MB)', 'whisper.max_retries': 'Whisper Max Retries', }; ================================================ FILE: frontend/src/components/config/shared/index.ts ================================================ export { default as Section } from './Section'; export { default as Field } from './Field'; export { default as EnvVarHint } from './EnvVarHint'; export { default as EnvOverrideWarningModal } from './EnvOverrideWarningModal'; export { default as ConnectionStatusCard } from './ConnectionStatusCard'; export { default as SaveButton } from './SaveButton'; export { default as TestButton } from './TestButton'; export { ENV_FIELD_LABELS } from './constants'; ================================================ FILE: frontend/src/components/config/tabs/AdvancedTab.tsx ================================================ import { useConfigContext, type AdvancedSubtab } from '../ConfigContext'; import { LLMSection, WhisperSection, ProcessingSection, OutputSection, AppSection, } from '../sections'; const SUBTABS: { id: AdvancedSubtab; label: string }[] = [ { id: 'llm', label: 'LLM' }, { id: 'whisper', label: 'Whisper' }, { id: 'processing', label: 'Processing' }, { id: 'output', label: 'Output' }, { id: 'app', label: 'App' }, ]; export default function AdvancedTab() { const { activeSubtab, setActiveSubtab } = useConfigContext(); return (
{/* Subtab Navigation */}
{SUBTABS.map((subtab) => ( ))}
{/* Subtab Content */}
{activeSubtab === 'llm' && } {activeSubtab === 'whisper' && } {activeSubtab === 'processing' && } {activeSubtab === 'output' && } {activeSubtab === 'app' && }
); } ================================================ FILE: frontend/src/components/config/tabs/DefaultTab.tsx ================================================ import { useState } from 'react'; import { useConfigContext } from '../ConfigContext'; import { Section, Field, ConnectionStatusCard } from '../shared'; import type { WhisperConfig, LLMConfig } from '../../../types'; export default function DefaultTab() { const { pending, updatePending, llmStatus, whisperStatus, probeConnections, getEnvHint, getWhisperApiKey, groqRecommendedModel, groqRecommendedWhisper, applyGroqKey, } = useConfigContext(); const [showGroqHelp, setShowGroqHelp] = useState(false); const [showGroqPricing, setShowGroqPricing] = useState(false); if (!pending) return null; const handleGroqKeyChange = (val: string) => { updatePending((prevConfig) => { return { ...prevConfig, llm: { ...(prevConfig.llm as LLMConfig), llm_api_key: val, llm_model: groqRecommendedModel, }, whisper: { whisper_type: 'groq', api_key: val, model: groqRecommendedWhisper, language: 'en', max_retries: 3, } as WhisperConfig, }; }); }; const handleGroqKeyApply = (key: string) => { if (!key.trim()) return; void applyGroqKey(key.trim()); }; const currentGroqKey = pending?.whisper?.whisper_type === 'groq' ? getWhisperApiKey(pending?.whisper) : pending?.llm?.llm_api_key || ''; const groqKeyPlaceholder = pending?.whisper?.whisper_type === 'groq' ? pending?.whisper?.api_key_preview || '' : pending?.llm?.llm_api_key_preview || ''; return (
void probeConnections()} /> void probeConnections()} />
Enter your Groq API key to use the recommended setup.
{showGroqHelp && } {showGroqPricing && }
handleGroqKeyChange(e.target.value)} onBlur={(e) => handleGroqKeyApply(e.target.value)} onPaste={(e) => { const text = e.clipboardData.getData('text').trim(); if (text) handleGroqKeyApply(text); }} />
{/* Input styling */}
); } function GroqHelpBox() { return (
  1. Visit the{' '} Groq Console {' '} and sign in or create an account.
  2. Open the Keys page and click "Create API Key".
  3. Copy the key (it starts with gsk_) and paste it below.
  4. Recommended: Set a billing limit at{' '} Settings → Billing → Limits {' '} to control costs and receive usage alerts.
); } function GroqPricingBox() { return (

Groq Pricing Guide

Based on the recommended models: whisper-large-v3-turbo and{' '} llama-3.3-70b-versatile

Whisper (Transcription)
  • whisper-large-v3-turbo: $0.04/hour
  • • Speed: 216x real-time
  • • Minimum charge: 10 seconds per request
LLM (Ad Detection)
  • llama-3.3-70b-versatile:
  • • Input: $0.59/1M tokens
  • • Output: $0.79/1M tokens
  • • ~1M tokens ≈ 750,000 words
Estimated Monthly Cost (6 podcasts, 6 hours/week)
Transcription:
24 hours/month × $0.04 = $0.96/month
Ad Detection:
~2M tokens × $0.69 avg = $1.38/month
Total Estimate:
~$2.34/month

* Actual costs may vary based on podcast length, complexity, and token usage. Consider setting a $5-10/month billing limit for safety.

); } ================================================ FILE: frontend/src/components/config/tabs/DiscordTab.tsx ================================================ import { useState, useEffect } from 'react'; import { useQuery, useMutation, useQueryClient } from '@tanstack/react-query'; import { toast } from 'react-hot-toast'; import { discordApi } from '../../../services/api'; import { Section } from '../shared'; export default function DiscordTab() { const queryClient = useQueryClient(); const { data, isLoading, error } = useQuery({ queryKey: ['discord-config'], queryFn: discordApi.getConfig, }); const [form, setForm] = useState({ client_id: '', client_secret: '', redirect_uri: '', guild_ids: '', allow_registration: true, }); const [hasSecretChange, setHasSecretChange] = useState(false); // Initialize form when data loads useEffect(() => { if (data?.config) { setForm({ client_id: data.config.client_id || '', client_secret: '', // Don't prefill secret redirect_uri: data.config.redirect_uri || '', guild_ids: data.config.guild_ids || '', allow_registration: data.config.allow_registration, }); setHasSecretChange(false); } }, [data]); const mutation = useMutation({ mutationFn: discordApi.updateConfig, onSuccess: () => { toast.success('Discord settings saved'); queryClient.invalidateQueries({ queryKey: ['discord-config'] }); queryClient.invalidateQueries({ queryKey: ['discord-status'] }); setHasSecretChange(false); }, onError: (err: Error) => { toast.error(`Failed to save: ${err.message}`); }, }); const handleSubmit = (e: React.FormEvent) => { e.preventDefault(); const payload: Record = { client_id: form.client_id, redirect_uri: form.redirect_uri, guild_ids: form.guild_ids, allow_registration: form.allow_registration, }; // Only include secret if it was changed if (hasSecretChange && form.client_secret) { payload.client_secret = form.client_secret; } mutation.mutate(payload); }; const envOverrides = data?.env_overrides || {}; if (isLoading) { return
Loading Discord configuration...
; } if (error) { return
Failed to load Discord configuration
; } return (
setForm({ ...form, client_id: e.target.value })} placeholder="Your Discord application Client ID" disabled={!!envOverrides.client_id} />
{ setForm({ ...form, client_secret: e.target.value }); setHasSecretChange(true); }} placeholder={data?.config.client_secret_preview ? '••••••••' : 'Your Discord application Client Secret'} disabled={!!envOverrides.client_secret} />
setForm({ ...form, redirect_uri: e.target.value })} placeholder="https://your-domain.com/api/auth/discord/callback" disabled={!!envOverrides.redirect_uri} />

Must match the URI configured in Discord Developer Portal

setForm({ ...form, guild_ids: e.target.value })} placeholder="123456789,987654321" disabled={!!envOverrides.guild_ids} />

Comma-separated Discord server IDs to restrict access

{envOverrides.allow_registration && (

Overridden by {envOverrides.allow_registration.env_var}

)}
); } function StatusIndicator({ enabled }: { enabled: boolean }) { return (
{enabled ? 'Discord SSO is enabled' : 'Discord SSO is not configured'}
); } function SetupInstructions() { return (

Discord Developer Portal Setup

  1. Go to{' '} Discord Developer Portal
  2. Create a new application or select an existing one
  3. Navigate to OAuth2 → General
  4. Copy the Client ID and Client Secret
  5. Add your redirect URI to the list of allowed redirects
  6. The redirect URI should be: https://your-domain/api/auth/discord/callback

Note: Environment variables (DISCORD_CLIENT_ID, DISCORD_CLIENT_SECRET, etc.) take precedence over values configured here.

); } ================================================ FILE: frontend/src/components/config/tabs/UserManagementTab.tsx ================================================ import { useMemo, useState } from 'react'; import type { FormEvent } from 'react'; import { useQuery } from '@tanstack/react-query'; import { toast } from 'react-hot-toast'; import { authApi } from '../../../services/api'; import { useAuth } from '../../../contexts/AuthContext'; import { useConfigContext } from '../ConfigContext'; import { Section, Field, SaveButton } from '../shared'; import type { ManagedUser } from '../../../types'; export default function UserManagementTab() { const { changePassword, refreshUser, user, logout } = useAuth(); const { pending, setField, handleSave, isSaving } = useConfigContext(); const { data: managedUsers, isLoading: usersLoading, refetch: refetchUsers, } = useQuery({ queryKey: ['auth-users'], queryFn: async () => { const response = await authApi.listUsers(); return response.users; }, }); const totalUsers = useMemo(() => managedUsers?.length ?? 0, [managedUsers]); const limitValue = pending?.app?.user_limit_total ?? null; return (
{pending && ( setField( ['app', 'user_limit_total'], value === '' ? null : Number(value) ) } onSave={handleSave} isSaving={isSaving} isLoadingUsers={usersLoading} /> )}
); } // --- Account Security Section --- interface AccountSecurityProps { changePassword: (current: string, next: string) => Promise; refreshUser: () => Promise; } function AccountSecuritySection({ changePassword, refreshUser }: AccountSecurityProps) { const [passwordForm, setPasswordForm] = useState({ current: '', next: '', confirm: '' }); const [passwordSubmitting, setPasswordSubmitting] = useState(false); const handlePasswordSubmit = async (event: FormEvent) => { event.preventDefault(); if (passwordForm.next !== passwordForm.confirm) { toast.error('New passwords do not match.'); return; } setPasswordSubmitting(true); try { await changePassword(passwordForm.current, passwordForm.next); toast.success('Password updated. Update PODLY_ADMIN_PASSWORD to match.'); setPasswordForm({ current: '', next: '', confirm: '' }); await refreshUser(); } catch (error) { toast.error(getErrorMessage(error, 'Failed to update password.')); } finally { setPasswordSubmitting(false); } }; return (
setPasswordForm((prev) => ({ ...prev, current: event.target.value })) } required /> setPasswordForm((prev) => ({ ...prev, next: event.target.value })) } required /> setPasswordForm((prev) => ({ ...prev, confirm: event.target.value })) } required />

After updating, rotate PODLY_ADMIN_PASSWORD to match.

); } // --- User Limit Section --- interface UserLimitSectionProps { currentUsers: number; userLimit: number | null; onChangeLimit: (value: string) => void; onSave: () => void; isSaving: boolean; isLoadingUsers: boolean; } function UserLimitSection({ currentUsers, userLimit, onChangeLimit, onSave, isSaving, isLoadingUsers }: UserLimitSectionProps) { return (
onChangeLimit(event.target.value)} placeholder="Unlimited" />

Leave blank for unlimited; set to 0 to block new user creation. Applies only when authentication is enabled.

Current users
{isLoadingUsers ? 'Loading…' : currentUsers}
{userLimit !== null && userLimit > 0 && currentUsers >= userLimit ? (
Limit reached. New users are blocked until the total drops below {userLimit}.
) : (
New user creation is blocked once the limit is reached.
)}
); } // --- User Management Section --- interface UserManagementProps { currentUser: { id: number; username: string; role: string } | null; refreshUser: () => Promise; logout: () => void; managedUsers: ManagedUser[] | undefined; usersLoading: boolean; refetchUsers: () => Promise; } function UserManagementSection({ currentUser, refreshUser, logout, managedUsers, usersLoading, refetchUsers }: UserManagementProps) { const [newUser, setNewUser] = useState({ username: '', password: '', confirm: '', role: 'user' }); const [activeResetUser, setActiveResetUser] = useState(null); const [resetPassword, setResetPassword] = useState(''); const [resetConfirm, setResetConfirm] = useState(''); const sortedUsers = useMemo(() => { if (!managedUsers) { return []; } return [...managedUsers].sort( (a, b) => new Date(b.created_at).getTime() - new Date(a.created_at).getTime() ); }, [managedUsers]); const adminCount = useMemo( () => sortedUsers.filter((u) => u.role === 'admin').length, [sortedUsers] ); const handleCreateUser = async (event: FormEvent) => { event.preventDefault(); const username = newUser.username.trim(); if (!username) { toast.error('Username is required.'); return; } if (newUser.password !== newUser.confirm) { toast.error('Passwords do not match.'); return; } try { await authApi.createUser({ username, password: newUser.password, role: newUser.role, }); toast.success(`User '${username}' created.`); setNewUser({ username: '', password: '', confirm: '', role: 'user' }); await refetchUsers(); } catch (error) { toast.error(getErrorMessage(error, 'Failed to create user.')); } }; const handleRoleChange = async (username: string, role: string) => { try { await authApi.updateUser(username, { role }); toast.success(`Updated role for ${username}.`); await refetchUsers(); if (currentUser && currentUser.username === username) { await refreshUser(); } } catch (error) { toast.error(getErrorMessage(error, 'Failed to update role.')); } }; const handleAllowanceChange = async (username: 
string, allowance: string) => { const val = allowance === '' ? null : parseInt(allowance, 10); if (val !== null && isNaN(val)) return; try { await authApi.updateUser(username, { manual_feed_allowance: val }); toast.success(`Updated allowance for ${username}.`); await refetchUsers(); } catch (error) { toast.error(getErrorMessage(error, 'Failed to update allowance.')); } }; const handleResetPassword = async (event: FormEvent) => { event.preventDefault(); if (!activeResetUser) { return; } if (resetPassword !== resetConfirm) { toast.error('Passwords do not match.'); return; } try { await authApi.updateUser(activeResetUser, { password: resetPassword }); toast.success(`Password updated for ${activeResetUser}.`); setActiveResetUser(null); setResetPassword(''); setResetConfirm(''); await refetchUsers(); } catch (error) { toast.error(getErrorMessage(error, 'Failed to update password.')); } }; const handleDeleteUser = async (username: string) => { const confirmed = window.confirm(`Delete user '${username}'? This action cannot be undone.`); if (!confirmed) { return; } try { await authApi.deleteUser(username); toast.success(`Deleted user '${username}'.`); await refetchUsers(); if (currentUser && currentUser.username === username) { logout(); } } catch (error) { toast.error(getErrorMessage(error, 'Failed to delete user.')); } }; return (
{/* Create User Form */}
setNewUser((prev) => ({ ...prev, username: event.target.value }))} placeholder="new_user" required />
setNewUser((prev) => ({ ...prev, password: event.target.value }))} required />
setNewUser((prev) => ({ ...prev, confirm: event.target.value }))} required />
{/* User List */}
{usersLoading &&
Loading users…
} {!usersLoading && (!managedUsers || managedUsers.length === 0) && (
No additional users configured.
)} {!usersLoading && managedUsers && managedUsers.length > 0 && (
{sortedUsers.map((managed) => { const disableDemotion = managed.role === 'admin' && adminCount <= 1; const disableDelete = disableDemotion; const isActive = activeResetUser === managed.username; const allowance = managed.feed_allowance ?? 0; const subscriptionStatus = managed.feed_subscription_status ?? 'inactive'; return (
{managed.username}
Added {new Date(managed.created_at).toLocaleString()} • Role {managed.role} • Feeds {allowance} • Status {subscriptionStatus} {managed.last_active && ( <> • Last Active {new Date(managed.last_active).toLocaleString()} )}
Feed Allowance Override: { const val = e.target.value; const current = managed.manual_feed_allowance?.toString() ?? ''; if (val !== current) { void handleAllowanceChange(managed.username, val); } }} onKeyDown={(e) => { if (e.key === 'Enter') { e.currentTarget.blur(); } }} />
{isActive && (
setResetPassword(event.target.value)} required />
setResetConfirm(event.target.value)} required />

Share new credentials securely.

)}
); })}
)}
); } // Helper function function getErrorMessage(error: unknown, fallback = 'Request failed.') { if (error && typeof error === 'object') { const err = error as { response?: { data?: { error?: string; message?: string } }; message?: string; }; return err.response?.data?.error || err.response?.data?.message || err.message || fallback; } if (error instanceof Error) { return error.message; } return fallback; } ================================================ FILE: frontend/src/components/config/tabs/index.ts ================================================ export { default as DefaultTab } from './DefaultTab'; export { default as AdvancedTab } from './AdvancedTab'; export { default as UserManagementTab } from './UserManagementTab'; export { default as DiscordTab } from './DiscordTab'; ================================================ FILE: frontend/src/contexts/AudioPlayerContext.tsx ================================================ import React, { createContext, useContext, useReducer, useRef, useEffect, useCallback } from 'react'; import type { Episode } from '../types'; import { feedsApi } from '../services/api'; interface AudioPlayerState { currentEpisode: Episode | null; isPlaying: boolean; currentTime: number; duration: number; volume: number; isLoading: boolean; error: string | null; } interface AudioPlayerContextType extends AudioPlayerState { playEpisode: (episode: Episode) => void; togglePlayPause: () => void; seekTo: (time: number) => void; setVolume: (volume: number) => void; audioRef: React.RefObject; } type AudioPlayerAction = | { type: 'SET_EPISODE'; payload: Episode } | { type: 'SET_PLAYING'; payload: boolean } | { type: 'SET_CURRENT_TIME'; payload: number } | { type: 'SET_DURATION'; payload: number } | { type: 'SET_VOLUME'; payload: number } | { type: 'SET_LOADING'; payload: boolean } | { type: 'SET_ERROR'; payload: string | null }; const initialState: AudioPlayerState = { currentEpisode: null, isPlaying: false, currentTime: 0, duration: 0, volume: 1, 
isLoading: false, error: null, }; function audioPlayerReducer(state: AudioPlayerState, action: AudioPlayerAction): AudioPlayerState { switch (action.type) { case 'SET_EPISODE': return { ...state, currentEpisode: action.payload, currentTime: 0, error: null }; case 'SET_PLAYING': return { ...state, isPlaying: action.payload }; case 'SET_CURRENT_TIME': return { ...state, currentTime: action.payload }; case 'SET_DURATION': return { ...state, duration: action.payload }; case 'SET_VOLUME': return { ...state, volume: action.payload }; case 'SET_LOADING': return { ...state, isLoading: action.payload }; case 'SET_ERROR': return { ...state, error: action.payload, isLoading: false }; default: return state; } } const AudioPlayerContext = createContext(undefined); export function AudioPlayerProvider({ children }: { children: React.ReactNode }) { const [state, dispatch] = useReducer(audioPlayerReducer, initialState); const audioRef = useRef(null); const playEpisode = (episode: Episode) => { console.log('playEpisode called with:', episode); console.log('Episode audio flags:', { has_processed_audio: episode.has_processed_audio, has_unprocessed_audio: episode.has_unprocessed_audio, download_url: episode.download_url }); if (!episode.has_processed_audio) { console.log('No processed audio available for episode'); dispatch({ type: 'SET_ERROR', payload: 'Post needs to be processed first' }); return; } console.log('Setting episode and loading state'); dispatch({ type: 'SET_EPISODE', payload: episode }); dispatch({ type: 'SET_LOADING', payload: true }); if (audioRef.current) { // Use the new API endpoint for audio const audioUrl = feedsApi.getPostAudioUrl(episode.guid); console.log('Using API audio URL:', audioUrl); audioRef.current.src = audioUrl; audioRef.current.load(); } else { console.log('audioRef.current is null'); } }; const togglePlayPause = useCallback(() => { if (!audioRef.current || !state.currentEpisode) return; if (state.isPlaying) { audioRef.current.pause(); } else { 
audioRef.current.play().catch((error) => { dispatch({ type: 'SET_ERROR', payload: 'Failed to play audio' }); console.error('Audio play error:', error); }); } }, [state.isPlaying, state.currentEpisode]); const seekTo = useCallback((time: number) => { if (audioRef.current) { audioRef.current.currentTime = time; dispatch({ type: 'SET_CURRENT_TIME', payload: time }); } }, []); const setVolume = useCallback((volume: number) => { if (audioRef.current) { audioRef.current.volume = volume; dispatch({ type: 'SET_VOLUME', payload: volume }); } }, []); // Audio event handlers useEffect(() => { const audio = audioRef.current; if (!audio) return; const handleLoadedData = () => { dispatch({ type: 'SET_DURATION', payload: audio.duration }); dispatch({ type: 'SET_LOADING', payload: false }); }; const handleTimeUpdate = () => { dispatch({ type: 'SET_CURRENT_TIME', payload: audio.currentTime }); }; const handlePlay = () => { dispatch({ type: 'SET_PLAYING', payload: true }); }; const handlePause = () => { dispatch({ type: 'SET_PLAYING', payload: false }); }; const handleEnded = () => { dispatch({ type: 'SET_PLAYING', payload: false }); dispatch({ type: 'SET_CURRENT_TIME', payload: 0 }); }; const handleError = () => { const audio = audioRef.current; if (!audio) return; // Get more specific error information let errorMessage = 'Failed to load audio'; if (audio.error) { switch (audio.error.code) { case MediaError.MEDIA_ERR_ABORTED: errorMessage = 'Audio loading was aborted'; break; case MediaError.MEDIA_ERR_NETWORK: errorMessage = 'Network error while loading audio'; break; case MediaError.MEDIA_ERR_DECODE: errorMessage = 'Audio file is corrupted or unsupported'; break; case MediaError.MEDIA_ERR_SRC_NOT_SUPPORTED: errorMessage = 'Audio format not supported or file not found'; break; default: errorMessage = 'Unknown audio error'; } } // Check if it's a network error that might indicate specific HTTP status if (audio.error?.code === MediaError.MEDIA_ERR_NETWORK || audio.error?.code === 
MediaError.MEDIA_ERR_SRC_NOT_SUPPORTED) { // For network errors, provide more helpful messages if (state.currentEpisode) { if (!state.currentEpisode.has_processed_audio) { errorMessage = 'Post needs to be processed first'; } else if (!state.currentEpisode.whitelisted) { errorMessage = 'Post is not whitelisted'; } else { errorMessage = 'Audio file not available - try processing the post again'; } } } console.error('Audio error:', audio.error, 'Message:', errorMessage); dispatch({ type: 'SET_ERROR', payload: errorMessage }); }; const handleCanPlay = () => { dispatch({ type: 'SET_LOADING', payload: false }); }; audio.addEventListener('loadeddata', handleLoadedData); audio.addEventListener('timeupdate', handleTimeUpdate); audio.addEventListener('play', handlePlay); audio.addEventListener('pause', handlePause); audio.addEventListener('ended', handleEnded); audio.addEventListener('error', handleError); audio.addEventListener('canplay', handleCanPlay); return () => { audio.removeEventListener('loadeddata', handleLoadedData); audio.removeEventListener('timeupdate', handleTimeUpdate); audio.removeEventListener('play', handlePlay); audio.removeEventListener('pause', handlePause); audio.removeEventListener('ended', handleEnded); audio.removeEventListener('error', handleError); audio.removeEventListener('canplay', handleCanPlay); }; }, []); // Keyboard shortcuts useEffect(() => { const handleKeyDown = (event: KeyboardEvent) => { // Only handle shortcuts when there's a current episode and not typing in an input if (!state.currentEpisode || event.target instanceof HTMLInputElement || event.target instanceof HTMLTextAreaElement) { return; } switch (event.code) { case 'Space': event.preventDefault(); togglePlayPause(); break; case 'ArrowLeft': event.preventDefault(); seekTo(Math.max(0, state.currentTime - 10)); // Seek back 10 seconds break; case 'ArrowRight': event.preventDefault(); seekTo(Math.min(state.duration, state.currentTime + 10)); // Seek forward 10 seconds break; case 
'ArrowUp': event.preventDefault(); setVolume(Math.min(1, state.volume + 0.1)); // Volume up break; case 'ArrowDown': event.preventDefault(); setVolume(Math.max(0, state.volume - 0.1)); // Volume down break; } }; document.addEventListener('keydown', handleKeyDown); return () => document.removeEventListener('keydown', handleKeyDown); }, [state.currentEpisode, state.currentTime, state.duration, state.volume, togglePlayPause, seekTo, setVolume]); const contextValue: AudioPlayerContextType = { ...state, playEpisode, togglePlayPause, seekTo, setVolume, audioRef, }; return ( {children} ); } export function useAudioPlayer() { const context = useContext(AudioPlayerContext); if (context === undefined) { throw new Error('useAudioPlayer must be used within an AudioPlayerProvider'); } return context; } ================================================ FILE: frontend/src/contexts/AuthContext.tsx ================================================ import { createContext, useCallback, useContext, useEffect, useMemo, useState } from 'react'; import type { ReactNode } from 'react'; import { authApi } from '../services/api'; import type { AuthUser } from '../types'; type AuthStatus = 'loading' | 'ready'; interface AuthContextValue { status: AuthStatus; requireAuth: boolean; isAuthenticated: boolean; user: AuthUser | null; landingPageEnabled: boolean; login: (username: string, password: string) => Promise; logout: () => void; changePassword: (currentPassword: string, newPassword: string) => Promise; refreshUser: () => Promise; } const AuthContext = createContext(undefined); interface InternalState { status: AuthStatus; requireAuth: boolean; user: AuthUser | null; landingPageEnabled: boolean; } export function AuthProvider({ children }: { children: ReactNode }) { const [state, setState] = useState({ status: 'loading', requireAuth: false, user: null, landingPageEnabled: false, }); const bootstrapAuth = useCallback(async () => { try { const statusResponse = await authApi.getStatus(); const 
requireAuth = Boolean(statusResponse.require_auth); const landingPageEnabled = Boolean(statusResponse.landing_page_enabled); if (!requireAuth) { setState({ status: 'ready', requireAuth: false, user: null, landingPageEnabled, }); return; } try { const me = await authApi.getCurrentUser(); setState({ status: 'ready', requireAuth: true, user: me.user, landingPageEnabled, }); } catch (error) { setState({ status: 'ready', requireAuth: true, user: null, landingPageEnabled, }); } } catch (error) { console.error('Failed to initialize auth state', error); setState({ status: 'ready', requireAuth: false, user: null, landingPageEnabled: false, }); } }, []); useEffect(() => { void bootstrapAuth(); }, [bootstrapAuth]); const login = useCallback(async (username: string, password: string) => { const trimmedUsername = username.trim(); if (!trimmedUsername) { throw new Error('Username is required.'); } const response = await authApi.login(trimmedUsername, password); setState((prev) => ({ status: 'ready', requireAuth: true, user: response.user, landingPageEnabled: prev.landingPageEnabled, })); }, []); const logout = useCallback(() => { void authApi.logout().catch((error) => { console.warn('Failed to log out cleanly', error); }); setState((prev) => ({ status: 'ready', requireAuth: prev.requireAuth, user: prev.requireAuth ? 
null : prev.user, landingPageEnabled: prev.landingPageEnabled, })); }, []); const changePassword = useCallback( async (currentPassword: string, newPassword: string) => { await authApi.changePassword({ current_password: currentPassword, new_password: newPassword, }); }, [], ); const refreshUser = useCallback(async () => { if (!state.requireAuth) { return; } try { const me = await authApi.getCurrentUser(); setState((prev) => ({ ...prev, user: me.user, })); } catch (error) { console.warn('Session expired while refreshing user', error); setState((prev) => ({ ...prev, user: null, })); } }, [state.requireAuth]); const value = useMemo(() => { const isAuthenticated = !state.requireAuth || Boolean(state.user); return { status: state.status, requireAuth: state.requireAuth, isAuthenticated, user: state.user, landingPageEnabled: state.landingPageEnabled, login, logout, changePassword, refreshUser, }; }, [changePassword, login, logout, refreshUser, state.requireAuth, state.status, state.user]); return {children}; } export const useAuth = (): AuthContextValue => { const context = useContext(AuthContext); if (!context) { throw new Error('useAuth must be used within an AuthProvider'); } return context; }; ================================================ FILE: frontend/src/contexts/DiagnosticsContext.tsx ================================================ /* eslint-disable react-refresh/only-export-components */ import { createContext, useCallback, useContext, useEffect, useMemo, useRef, useState, type ReactNode } from 'react'; import { DIAGNOSTIC_ERROR_EVENT, diagnostics, type DiagnosticErrorPayload, type DiagnosticsEntry } from '../utils/diagnostics'; export type DiagnosticsContextValue = { isOpen: boolean; open: (payload?: DiagnosticErrorPayload) => void; close: () => void; clear: () => void; getEntries: () => DiagnosticsEntry[]; currentError: DiagnosticErrorPayload | null; }; const DiagnosticsContext = createContext(null); const signatureFor = (payload: DiagnosticErrorPayload): 
string => { const base = { title: payload.title, message: payload.message, kind: payload.kind, }; try { return JSON.stringify(base); } catch { return `${payload.title}:${payload.message}`; } }; export function DiagnosticsProvider({ children }: { children: ReactNode }) { const [isOpen, setIsOpen] = useState(false); const [currentError, setCurrentError] = useState(null); const lastShownRef = useRef<{ sig: string; ts: number } | null>(null); const open = useCallback((payload?: DiagnosticErrorPayload) => { if (payload) { setCurrentError(payload); } else { setCurrentError(null); } setIsOpen(true); }, []); const close = useCallback(() => { setIsOpen(false); }, []); const clear = useCallback(() => { diagnostics.clear(); }, []); const getEntries = useCallback(() => diagnostics.getEntries(), []); useEffect(() => { const handler = (event: Event) => { const detail = (event as CustomEvent).detail as DiagnosticErrorPayload | undefined; if (!detail) return; // Deduplicate noisy errors (same signature within 5s) const sig = signatureFor(detail); const now = Date.now(); const last = lastShownRef.current; if (last && last.sig === sig && now - last.ts < 5000) { return; } lastShownRef.current = { sig, ts: now }; setCurrentError(detail); setIsOpen(true); }; window.addEventListener(DIAGNOSTIC_ERROR_EVENT, handler as EventListener); return () => window.removeEventListener(DIAGNOSTIC_ERROR_EVENT, handler as EventListener); }, []); const value = useMemo( () => ({ isOpen, open, close, clear, getEntries, currentError, }), [close, clear, currentError, getEntries, isOpen, open] ); return {children}; } export const useDiagnostics = (): DiagnosticsContextValue => { const ctx = useContext(DiagnosticsContext); if (!ctx) { throw new Error('useDiagnostics must be used within DiagnosticsProvider'); } return ctx; }; ================================================ FILE: frontend/src/hooks/useConfigState.ts ================================================ import { useCallback, useEffect, useMemo, 
useRef, useState } from 'react'; import { useMutation, useQuery } from '@tanstack/react-query'; import { configApi } from '../services/api'; import { toast } from 'react-hot-toast'; import type { CombinedConfig, ConfigResponse, EnvOverrideEntry, EnvOverrideMap, LLMConfig, WhisperConfig, } from '../types'; const DEFAULT_ENV_HINTS: Record = { 'groq.api_key': { env_var: 'GROQ_API_KEY' }, 'llm.llm_api_key': { env_var: 'LLM_API_KEY' }, 'llm.llm_model': { env_var: 'LLM_MODEL' }, 'llm.openai_base_url': { env_var: 'OPENAI_BASE_URL' }, 'whisper.whisper_type': { env_var: 'WHISPER_TYPE' }, 'whisper.api_key': { env_var: 'WHISPER_REMOTE_API_KEY' }, 'whisper.base_url': { env_var: 'WHISPER_REMOTE_BASE_URL' }, 'whisper.model': { env_var: 'WHISPER_REMOTE_MODEL' }, 'whisper.timeout_sec': { env_var: 'WHISPER_REMOTE_TIMEOUT_SEC' }, 'whisper.chunksize_mb': { env_var: 'WHISPER_REMOTE_CHUNKSIZE_MB' }, 'whisper.max_retries': { env_var: 'GROQ_MAX_RETRIES' }, }; const getValueAtPath = (obj: unknown, path: string): unknown => { if (!obj || typeof obj !== 'object') { return undefined; } return path.split('.').reduce((acc, key) => { if (!acc || typeof acc !== 'object') { return undefined; } return (acc as Record)[key]; }, obj); }; const valuesDiffer = (a: unknown, b: unknown): boolean => { if (a === b) { return false; } const aEmpty = a === null || a === undefined || a === ''; const bEmpty = b === null || b === undefined || b === ''; if (aEmpty && bEmpty) { return false; } return true; }; export interface ConnectionStatus { status: 'loading' | 'ok' | 'error'; message: string; error: string; } export interface UseConfigStateReturn { // Data pending: CombinedConfig | null; configData: CombinedConfig | undefined; envOverrides: EnvOverrideMap; isLoading: boolean; // Status llmStatus: ConnectionStatus; whisperStatus: ConnectionStatus; hasEdits: boolean; localWhisperAvailable: boolean | null; isSaving: boolean; // Actions setField: (path: string[], value: unknown) => void; updatePending: ( 
transform: (prevConfig: CombinedConfig) => CombinedConfig, markDirty?: boolean ) => void; probeConnections: () => Promise; handleSave: () => void; refetch: () => void; setHasEdits: (value: boolean) => void; // Helpers getEnvHint: (path: string, fallback?: EnvOverrideEntry) => EnvOverrideEntry | undefined; getWhisperApiKey: (w: WhisperConfig | undefined) => string; // Recommended defaults groqRecommendedModel: string; groqRecommendedWhisper: string; // Env warning modal envWarningPaths: string[]; showEnvWarning: boolean; handleConfirmEnvWarning: () => void; handleDismissEnvWarning: () => void; // Whisper type change handler handleWhisperTypeChange: (nextType: 'local' | 'remote' | 'groq') => void; // Groq quick setup mutation applyGroqKey: (key: string) => Promise; isApplyingGroqKey: boolean; } export function useConfigState(): UseConfigStateReturn { const { data, isLoading, refetch } = useQuery({ queryKey: ['config'], queryFn: configApi.getConfig, staleTime: Infinity, refetchOnWindowFocus: false, refetchOnReconnect: false, }); const configData = data?.config; const envOverrides = useMemo(() => data?.env_overrides ?? {}, [data]); const getEnvHint = useCallback( (path: string, fallback?: EnvOverrideEntry) => envOverrides[path] ?? fallback ?? 
DEFAULT_ENV_HINTS[path], [envOverrides] ); const [pending, setPending] = useState(null); const [hasEdits, setHasEdits] = useState(false); const [localWhisperAvailable, setLocalWhisperAvailable] = useState(null); // Connection statuses const [llmStatus, setLlmStatus] = useState({ status: 'loading', message: '', error: '', }); const [whisperStatus, setWhisperStatus] = useState({ status: 'loading', message: '', error: '', }); // Env warning modal state const [envWarningPaths, setEnvWarningPaths] = useState([]); const [showEnvWarning, setShowEnvWarning] = useState(false); const initialProbeDone = useRef(false); const groqRecommendedModel = useMemo(() => 'groq/openai/gpt-oss-120b', []); const groqRecommendedWhisper = useMemo(() => 'whisper-large-v3-turbo', []); const getWhisperApiKey = (w: WhisperConfig | undefined): string => { if (!w) return ''; if (w.whisper_type === 'remote') return w.api_key ?? ''; if (w.whisper_type === 'groq') return w.api_key ?? ''; return ''; }; const updatePending = useCallback( (transform: (prevConfig: CombinedConfig) => CombinedConfig, markDirty: boolean = true) => { let updated = false; setPending((prevConfig) => { if (!prevConfig) { return prevConfig; } const nextConfig = transform(prevConfig); if (nextConfig === prevConfig) { return prevConfig; } updated = true; return nextConfig; }); if (updated && markDirty) { setHasEdits(true); } }, [] ); const setField = useCallback( (path: string[], value: unknown) => { updatePending((prevConfig) => { const prevRecord = prevConfig as unknown as Record; const lastIndex = path.length - 1; let existingParent: Record | null = prevRecord; for (let i = 0; i < lastIndex; i++) { const key = path[i]; const rawNext: unknown = existingParent?.[key]; const nextParent: Record | null = rawNext && typeof rawNext === 'object' ? 
(rawNext as Record) : null; if (!nextParent) { existingParent = null; break; } existingParent = nextParent; } if (existingParent) { const currentValue = existingParent[path[lastIndex]]; if (Object.is(currentValue, value)) { return prevConfig; } } const next: Record = { ...prevRecord }; let cursor: Record = next; let sourceCursor: Record = prevRecord; for (let i = 0; i < lastIndex; i++) { const key = path[i]; const currentSource = (sourceCursor?.[key] as Record) ?? {}; const clonedChild: Record = { ...currentSource }; cursor[key] = clonedChild; cursor = clonedChild; sourceCursor = currentSource; } cursor[path[lastIndex]] = value; return next as unknown as CombinedConfig; }); }, [updatePending] ); // Initialize pending from config data useEffect(() => { if (!configData) { return; } setPending((prev) => { if (prev === null) { return configData; } if (hasEdits) { return prev; } return configData; }); }, [configData, hasEdits]); // Probe connections const probeConnections = async () => { if (!pending) return; setLlmStatus({ status: 'loading', message: '', error: '' }); setWhisperStatus({ status: 'loading', message: '', error: '' }); try { const [llmRes, whisperRes] = await Promise.all([ configApi.testLLM({ llm: pending.llm as LLMConfig }), configApi.testWhisper({ whisper: pending.whisper as WhisperConfig }), ]); if (llmRes?.ok) { setLlmStatus({ status: 'ok', message: llmRes.message || 'LLM connection OK', error: '', }); } else { setLlmStatus({ status: 'error', message: '', error: llmRes?.error || 'LLM connection failed', }); } if (whisperRes?.ok) { setWhisperStatus({ status: 'ok', message: whisperRes.message || 'Whisper connection OK', error: '', }); } else { setWhisperStatus({ status: 'error', message: '', error: whisperRes?.error || 'Whisper test failed', }); } } catch (err: unknown) { const e = err as { response?: { data?: { error?: string; message?: string } }; message?: string; }; const msg = e?.response?.data?.error || e?.response?.data?.message || e?.message || 
'Connection test failed'; setLlmStatus({ status: 'error', message: '', error: msg }); setWhisperStatus({ status: 'error', message: '', error: msg }); } }; // Initial probe useEffect(() => { if (!pending || initialProbeDone.current) return; initialProbeDone.current = true; void probeConnections(); // eslint-disable-next-line react-hooks/exhaustive-deps }, [pending]); // Probe whisper capabilities useEffect(() => { let cancelled = false; configApi .getWhisperCapabilities() .then((res) => { if (!cancelled) setLocalWhisperAvailable(!!res.local_available); }) .catch(() => { if (!cancelled) setLocalWhisperAvailable(false); }); return () => { cancelled = true; }; }, []); // If local is unavailable but selected, switch to safe default useEffect(() => { if (!pending || localWhisperAvailable !== false) return; const currentType = pending.whisper.whisper_type; if (currentType === 'local') { setField(['whisper', 'whisper_type'], 'remote'); } }, [localWhisperAvailable, pending, setField]); // Save mutation const saveMutation = useMutation({ mutationFn: async () => { return configApi.updateConfig((pending ?? 
{}) as Partial); }, onSuccess: () => { setHasEdits(false); refetch(); }, }); const saveToastMessages = { loading: 'Saving changes...', success: 'Configuration saved', error: (err: unknown) => { if (typeof err === 'object' && err !== null) { const e = err as { response?: { data?: { error?: string; details?: string; message?: string } }; message?: string; }; return ( e.response?.data?.message || e.response?.data?.error || e.response?.data?.details || e.message || 'Failed to save configuration' ); } return 'Failed to save configuration'; }, } as const; const getEnvManagedConflicts = (): string[] => { if (!pending || !configData) { return []; } return Object.keys(envOverrides).filter((path) => { const baseline = getValueAtPath(configData, path); const current = getValueAtPath(pending, path); return valuesDiffer(current, baseline); }); }; const triggerSaveMutation = () => { toast.promise(saveMutation.mutateAsync(), saveToastMessages); }; const handleSave = () => { if (saveMutation.isPending) { return; } const envConflicts = getEnvManagedConflicts(); if (envConflicts.length > 0) { setEnvWarningPaths(envConflicts); setShowEnvWarning(true); return; } triggerSaveMutation(); }; const handleConfirmEnvWarning = () => { setShowEnvWarning(false); triggerSaveMutation(); }; const handleDismissEnvWarning = () => { setShowEnvWarning(false); setEnvWarningPaths([]); }; // Whisper type change handler const handleWhisperTypeChange = (nextType: 'local' | 'remote' | 'groq') => { updatePending((prevConfig) => { const prevWhisper = { ...(prevConfig.whisper as unknown as Record), }; const prevModelRaw = (prevWhisper?.model as string | undefined) ?? 
''; const prevModel = String(prevModelRaw).toLowerCase(); const isNonGroqDefault = prevModel === 'base' || prevModel === 'base.en' || prevModel === 'whisper-1'; const isDeprecatedGroq = prevModel === 'distil-whisper-large-v3-en'; let nextModel: string | undefined = prevWhisper?.model as string | undefined; if (nextType === 'groq') { if (!nextModel || isNonGroqDefault || isDeprecatedGroq) { nextModel = 'whisper-large-v3-turbo'; } } else if (nextType === 'remote') { if (!nextModel || prevModel === 'base' || prevModel === 'base.en') { nextModel = 'whisper-1'; } } else if (nextType === 'local') { if (!nextModel || prevModel === 'whisper-1' || prevModel.startsWith('whisper-large')) { nextModel = 'base.en'; } } const nextWhisper: Record = { ...prevWhisper, whisper_type: nextType, }; if (nextType === 'groq') { nextWhisper.model = nextModel ?? 'whisper-large-v3-turbo'; nextWhisper.language = (prevWhisper.language as string | undefined) || 'en'; delete nextWhisper.base_url; delete nextWhisper.timeout_sec; delete nextWhisper.chunksize_mb; } else if (nextType === 'remote') { nextWhisper.model = nextModel ?? 'whisper-1'; nextWhisper.language = (prevWhisper.language as string | undefined) || 'en'; } else if (nextType === 'local') { nextWhisper.model = nextModel ?? 
'base.en'; delete nextWhisper.api_key; } else if (nextType === 'test') { delete nextWhisper.model; delete nextWhisper.api_key; } return { ...prevConfig, whisper: nextWhisper as unknown as WhisperConfig, } as CombinedConfig; }); }; // Groq key mutation const applyGroqKeyMutation = useMutation({ mutationFn: async (key: string) => { const next = { llm: { ...(pending?.llm as LLMConfig), llm_api_key: key, llm_model: groqRecommendedModel, }, whisper: { whisper_type: 'groq', api_key: key, model: groqRecommendedWhisper, language: 'en', max_retries: 3, }, } as Partial; updatePending((prevConfig) => ({ ...prevConfig, llm: next.llm as LLMConfig, whisper: next.whisper as WhisperConfig, })); const [llmRes, whisperRes] = await Promise.all([ configApi.testLLM({ llm: next.llm as LLMConfig }), configApi.testWhisper({ whisper: next.whisper as WhisperConfig }), ]); if (!llmRes?.ok) throw new Error(llmRes?.error || 'LLM test failed'); if (!whisperRes?.ok) throw new Error(whisperRes?.error || 'Whisper test failed'); return await configApi.updateConfig(next); }, onSuccess: () => { setHasEdits(false); refetch(); toast.success('Groq key verified and saved. 
Defaults applied.'); setLlmStatus({ status: 'ok', message: 'LLM connection OK', error: '' }); setWhisperStatus({ status: 'ok', message: 'Whisper connection OK', error: '' }); }, }); const applyGroqKey = async (key: string) => { await toast.promise(applyGroqKeyMutation.mutateAsync(key), { loading: 'Verifying Groq key and applying defaults...', success: 'Groq configured successfully', error: (err: unknown) => { const e = err as { response?: { data?: { error?: string; message?: string } }; message?: string; }; return ( e?.response?.data?.error || e?.response?.data?.message || e?.message || 'Failed to configure Groq' ); }, }); }; return { // Data pending, configData, envOverrides, isLoading, // Status llmStatus, whisperStatus, hasEdits, localWhisperAvailable, isSaving: saveMutation.isPending, // Actions setField, updatePending, probeConnections, handleSave, refetch, setHasEdits, // Helpers getEnvHint, getWhisperApiKey, // Recommended defaults groqRecommendedModel, groqRecommendedWhisper, // Env warning modal envWarningPaths, showEnvWarning, handleConfirmEnvWarning, handleDismissEnvWarning, // Whisper type change handleWhisperTypeChange, // Groq quick setup applyGroqKey, isApplyingGroqKey: applyGroqKeyMutation.isPending, }; } export default useConfigState; ================================================ FILE: frontend/src/hooks/useEpisodeStatus.ts ================================================ import { useQuery, useQueryClient } from '@tanstack/react-query'; import { useEffect } from 'react'; import { feedsApi } from '../services/api'; export function useEpisodeStatus(episodeGuid: string, isWhitelisted: boolean, hasProcessedAudio: boolean, feedId?: number) { const queryClient = useQueryClient(); const query = useQuery({ queryKey: ['episode-status', episodeGuid], queryFn: () => feedsApi.getPostStatus(episodeGuid), enabled: isWhitelisted && !hasProcessedAudio, refetchOnWindowFocus: false, refetchInterval: (query) => { const status = query.state.data?.status; if (status 
=== 'pending' || status === 'running' || status === 'starting' || status === 'processing') { return 3000; } return false; }, }); useEffect(() => { if (query.data?.status === 'completed' && feedId) { // Invalidate episodes list to refresh UI (show Play button) queryClient.invalidateQueries({ queryKey: ['episodes', feedId] }); } }, [query.data?.status, feedId, queryClient]); return query; } ================================================ FILE: frontend/src/index.css ================================================ @tailwind base; @tailwind components; @tailwind utilities; ================================================ FILE: frontend/src/main.tsx ================================================ import { StrictMode } from 'react' import { createRoot } from 'react-dom/client' import './index.css' import './App.css' import App from './App.tsx' import { initFrontendDiagnostics } from './utils/diagnostics' initFrontendDiagnostics() createRoot(document.getElementById('root')!).render( , ) ================================================ FILE: frontend/src/pages/BillingPage.tsx ================================================ import { useEffect, useState } from 'react'; import { useQuery, useMutation } from '@tanstack/react-query'; import { billingApi } from '../services/api'; import { toast } from 'react-hot-toast'; import { useAuth } from '../contexts/AuthContext'; import { Navigate } from 'react-router-dom'; export default function BillingPage() { const { user } = useAuth(); if (user?.role === 'admin') { return ; } const { data, refetch, isLoading } = useQuery({ queryKey: ['billing', 'summary'], queryFn: billingApi.getSummary, }); // Amount in dollars const [amount, setAmount] = useState(5); useEffect(() => { if (data?.current_amount) { setAmount(data.current_amount / 100); } }, [data]); const updateSubscription = useMutation({ mutationFn: (amt: number) => billingApi.updateSubscription(Math.round(amt * 100), { subscriptionId: data?.stripe_subscription_id ?? 
null, }), onSuccess: (res) => { if (res.checkout_url) { window.location.href = res.checkout_url; return; } toast.success('Plan updated'); if (res.current_amount) { setAmount(res.current_amount / 100); } refetch(); }, onError: (err) => { console.error('Failed to update plan', err); toast.error('Could not update plan'); }, }); const portalSession = useMutation({ mutationFn: () => billingApi.createPortalSession(), onSuccess: (res) => { if (res.url) { window.location.href = res.url; } }, onError: (err) => { console.error('Failed to open billing portal', err); toast.error('Unable to open billing portal'); }, }); if (isLoading || !data) { return (
Loading billing…
); } const isSubscribed = data.subscription_status === 'active' || data.subscription_status === 'trialing'; const currentAmountDollars = data.current_amount ? data.current_amount / 100 : 0; const atCurrentAmount = amount === currentAmountDollars && isSubscribed; const planLimitInfo = `${data.feeds_in_use}/${data.feed_allowance} feeds active`; const minAmountCents = data.min_amount_cents ?? 100; const minAmountDollars = minAmountCents / 100; return (

Billing

Pay what you want for the Starter Bundle (10 feeds).

Current plan
{isSubscribed ? 'Starter Bundle (10 Feeds)' : 'Free Tier'}
{planLimitInfo}
Monthly payment
{isSubscribed ? `$${currentAmountDollars.toFixed(2)}` : '$0.00'}
Subscription status: {data.subscription_status || 'inactive'}
{isSubscribed ? 'Update your price' : 'Subscribe to Starter Bundle'}

Get 10 feeds for a monthly price of your choice (min ${minAmountDollars.toFixed(2)}).

Note: We suggest paying ~$1 per feed you use. If revenue doesn't cover server costs, we may have to shut down the service.
$
setAmount(Math.max(0, Number(e.target.value)))} className="block w-full rounded-md border-gray-300 pl-7 pr-3 py-2 focus:border-blue-500 focus:ring-blue-500 sm:text-sm border" placeholder="5.00" />
Suggested: {[3, 5, 10, 15].map((preset) => ( ))}
{amount < minAmountDollars && ( Minimum amount is ${minAmountDollars.toFixed(2)} )}
Payments are securely processed by Stripe. You can cancel anytime.
); } ================================================ FILE: frontend/src/pages/ConfigPage.tsx ================================================ import ConfigTabs from '../components/config/ConfigTabs'; export default function ConfigPage() { return ; } ================================================ FILE: frontend/src/pages/HomePage.tsx ================================================ import { useMutation, useQuery } from '@tanstack/react-query'; import { useEffect, useState } from 'react'; import { feedsApi, configApi, billingApi } from '../services/api'; import FeedList from '../components/FeedList'; import FeedDetail from '../components/FeedDetail'; import AddFeedForm from '../components/AddFeedForm'; import type { Feed, ConfigResponse } from '../types'; import { toast } from 'react-hot-toast'; import { useAuth } from '../contexts/AuthContext'; import { useNavigate } from 'react-router-dom'; import { copyToClipboard } from '../utils/clipboard'; import { emitDiagnosticError } from '../utils/diagnostics'; import { getHttpErrorInfo } from '../utils/httpError'; export default function HomePage() { const navigate = useNavigate(); const [showAddForm, setShowAddForm] = useState(false); const [selectedFeed, setSelectedFeed] = useState(null); const { requireAuth, user } = useAuth(); const { data: feeds, isLoading, error, refetch } = useQuery({ queryKey: ['feeds'], queryFn: feedsApi.getFeeds, }); const { data: billingSummary, refetch: refetchBilling } = useQuery({ queryKey: ['billing', 'summary'], queryFn: billingApi.getSummary, enabled: requireAuth && !!user, }); useQuery({ queryKey: ['config'], queryFn: configApi.getConfig, enabled: !requireAuth || user?.role === 'admin', }); const canRefreshAll = !requireAuth || user?.role === 'admin'; const refreshAllMutation = useMutation({ mutationFn: () => feedsApi.refreshAllFeeds(), onSuccess: (data) => { toast.success( `Refreshed ${data.feeds_refreshed} feeds and enqueued ${data.jobs_enqueued} jobs` ); refetch(); }, onError: (err) 
=> { console.error('Failed to refresh all feeds', err); const { status, data, message } = getHttpErrorInfo(err); emitDiagnosticError({ title: 'Failed to refresh all feeds', message, kind: status ? 'http' : 'network', details: { status, response: data, }, }); }, }); useEffect(() => { if (!showAddForm || typeof document === 'undefined') { return; } const originalOverflow = document.body.style.overflow; document.body.style.overflow = 'hidden'; return () => { document.body.style.overflow = originalOverflow; }; }, [showAddForm]); if (isLoading) { return (
); } if (error) { return (

Error loading feeds. Please try again.

); } const planLimitReached = !!billingSummary && billingSummary.feeds_in_use >= billingSummary.feed_allowance && user?.role !== 'admin'; const handleChangePlan = () => { navigate('/billing'); }; const handleCopyAggregateLink = async () => { try { const { url } = await feedsApi.getAggregateFeedLink(); await copyToClipboard(url, 'Copy the Aggregate RSS URL:', 'Aggregate feed URL copied to clipboard!'); } catch (err) { console.error('Failed to get aggregate link', err); toast.error('Failed to get aggregate feed link'); } }; return (
{/* Left Panel - Feed List (hidden on mobile when feed is selected) */}

Podcast Feeds

{canRefreshAll && ( )}
{/* Right Panel - Feed Detail */} {selectedFeed && (
setSelectedFeed(null)} onFeedDeleted={() => { setSelectedFeed(null); refetch(); }} />
)} {/* Empty State for Desktop */} {!selectedFeed && (

No podcast selected

Select a podcast from the list to view details and episodes.

)} {showAddForm && (
setShowAddForm(false)} >
event.stopPropagation()} >

Add a Podcast Feed

Paste an RSS URL or search the catalog to find shows to follow.

{ setShowAddForm(false); refetch(); refetchBilling(); }} onUpgradePlan={handleChangePlan} planLimitReached={planLimitReached} />
)}
); } ================================================ FILE: frontend/src/pages/JobsPage.tsx ================================================ import { useCallback, useEffect, useRef, useState } from 'react'; import { jobsApi } from '../services/api'; import type { CleanupPreview, Job, JobManagerRun, JobManagerStatus } from '../types'; function getStatusColor(status: string) { switch (status) { case 'running': return 'bg-green-100 text-green-800'; case 'pending': return 'bg-yellow-100 text-yellow-800'; case 'failed': return 'bg-red-100 text-red-800'; case 'completed': return 'bg-blue-100 text-blue-800'; case 'skipped': return 'bg-purple-100 text-purple-800'; case 'cancelled': return 'bg-gray-100 text-gray-800'; default: return 'bg-gray-100 text-gray-800'; } } function StatusBadge({ status }: { status: string }) { const color = getStatusColor(status); return ( {status} ); } function ProgressBar({ value }: { value: number }) { const clamped = Math.max(0, Math.min(100, Math.round(value))); return (
); } function RunStat({ label, value }: { label: string; value: number }) { return (
{label}
{value}
); } function formatDateTime(value: string | null): string { if (!value) { return '—'; } try { return new Date(value).toLocaleString(); } catch (err) { console.error('Failed to format date', err); return value; } } export default function JobsPage() { const [jobs, setJobs] = useState([]); const [managerStatus, setManagerStatus] = useState(null); const [statusError, setStatusError] = useState(null); const [loading, setLoading] = useState(false); const [error, setError] = useState(null); const [mode, setMode] = useState<'active' | 'all'>('active'); const [cancellingJobs, setCancellingJobs] = useState>(new Set()); const previousHasActiveWork = useRef(false); const [cleanupPreview, setCleanupPreview] = useState(null); const [cleanupLoading, setCleanupLoading] = useState(false); const [cleanupError, setCleanupError] = useState(null); const [cleanupRunning, setCleanupRunning] = useState(false); const [cleanupMessage, setCleanupMessage] = useState(null); const loadStatus = useCallback(async () => { try { const data = await jobsApi.getJobManagerStatus(); setManagerStatus(data); setStatusError(null); } catch (e) { console.error('Failed to load job manager status:', e); setStatusError('Failed to load manager status'); } }, []); const loadActive = useCallback(async () => { setLoading(true); setError(null); try { const data = await jobsApi.getActiveJobs(100); setJobs(data); } catch (e) { console.error('Failed to load active jobs:', e); setError('Failed to load jobs'); } finally { setLoading(false); } }, []); const loadAll = useCallback(async () => { setLoading(true); setError(null); try { const data = await jobsApi.getAllJobs(200); setJobs(data); } catch (e) { console.error('Failed to load all jobs:', e); setError('Failed to load jobs'); } finally { setLoading(false); } }, []); const loadCleanupPreview = useCallback(async () => { setCleanupLoading(true); try { const data = await jobsApi.getCleanupPreview(); setCleanupPreview(data); setCleanupError(null); } catch (e) { 
console.error('Failed to load cleanup preview:', e); setCleanupError('Failed to load cleanup preview'); } finally { setCleanupLoading(false); } }, []); const refresh = useCallback(async () => { await loadStatus(); if (mode === 'active') { await loadActive(); } else { await loadAll(); } await loadCleanupPreview(); }, [mode, loadActive, loadAll, loadStatus, loadCleanupPreview]); const cancelJob = useCallback( async (jobId: string) => { setCancellingJobs(prev => new Set(prev).add(jobId)); try { await jobsApi.cancelJob(jobId); await refresh(); } catch (e) { setError(`Failed to cancel job: ${e instanceof Error ? e.message : 'Unknown error'}`); } finally { setCancellingJobs(prev => { const newSet = new Set(prev); newSet.delete(jobId); return newSet; }); } }, [refresh] ); const runCleanupNow = useCallback(async () => { setCleanupRunning(true); setCleanupError(null); setCleanupMessage(null); try { const result = await jobsApi.runCleanupJob(); if (result.status === 'disabled') { setCleanupMessage(result.message ?? 'Cleanup is disabled.'); return; } if (result.status !== 'ok') { setCleanupError(result.message ?? 'Cleanup job failed'); return; } const removed = result.removed_posts ?? 0; const remaining = result.remaining_candidates ?? 0; const removedText = `Cleanup removed ${removed} episode${removed === 1 ? '' : 's'}.`; const remainingText = remaining > 0 ? ` ${remaining} episode${remaining === 1 ? '' : 's'} still eligible.` : ''; setCleanupMessage(`${removedText}${remainingText}`); await refresh(); } catch (e) { console.error('Failed to run cleanup job:', e); setCleanupError('Failed to run cleanup job'); } finally { setCleanupRunning(false); } }, [refresh]); useEffect(() => { void loadStatus(); void loadActive(); void loadCleanupPreview(); }, [loadActive, loadStatus, loadCleanupPreview]); useEffect(() => { const queued = managerStatus?.run?.queued_jobs ?? 0; const running = managerStatus?.run?.running_jobs ?? 
0; const hasActiveWork = queued + running > 0; if (!hasActiveWork) { return undefined; } // Poll every 15 seconds when jobs are active to reduce database contention const interval = setInterval(() => { void loadStatus(); }, 15000); return () => clearInterval(interval); }, [managerStatus?.run?.queued_jobs, managerStatus?.run?.running_jobs, loadStatus]); useEffect(() => { const queued = managerStatus?.run?.queued_jobs ?? 0; const running = managerStatus?.run?.running_jobs ?? 0; const hasActiveWork = queued + running > 0; if (!hasActiveWork && previousHasActiveWork.current) { void refresh(); } previousHasActiveWork.current = hasActiveWork; }, [managerStatus?.run?.queued_jobs, managerStatus?.run?.running_jobs, refresh]); const run: JobManagerRun | null = managerStatus?.run ?? null; const hasActiveWork = run ? run.queued_jobs + run.running_jobs > 0 : false; const retentionDays = cleanupPreview?.retention_days ?? null; const cleanupDisabled = retentionDays === null || retentionDays <= 0; const cleanupEligibleCount = cleanupPreview?.count ?? 0; return (

Jobs Manager

{run ? hasActiveWork ? `Processing · Last update ${formatDateTime(run.updated_at)}` : `Idle · Last activity ${formatDateTime(run.updated_at)}` : 'Jobs Manager has not started yet.'}

{run ? ( ) : ( idle )}
{statusError && (
{statusError}
)} {run ? ( <>
{run.completed_jobs} completed · {run.skipped_jobs} skipped · {run.failed_jobs} failed of {run.total_jobs} jobs
Trigger: {run.trigger}
{run.counters_reset_at ? (
Stats since {formatDateTime(run.counters_reset_at)}
) : null} ) : null}

Post Cleanup

{cleanupDisabled ? 'Cleanup is disabled while retention days are unset or zero.' : `Episodes older than ${retentionDays} day${retentionDays === 1 ? '' : 's'} will be removed.`}

Eligible
{cleanupLoading ? '…' : cleanupEligibleCount}
{cleanupError && (
{cleanupError}
)} {cleanupMessage && (
{cleanupMessage}
)}
Retention
{cleanupDisabled ? 'Disabled' : `${retentionDays} day${retentionDays === 1 ? '' : 's'}`}
Eligible episodes
{cleanupLoading ? 'Loading…' : cleanupEligibleCount}
Cutoff date
{cleanupPreview?.cutoff_utc ? formatDateTime(cleanupPreview.cutoff_utc) : '—'}
Includes completed jobs and non-whitelisted episodes with release dates older than the retention window.

{mode === 'active' ? 'Active Jobs' : 'All Jobs'}

{mode === 'active' ? 'Queued and running jobs, ordered by priority.' : 'All jobs ordered by priority (running/pending first).'}

{mode === 'active' ? ( ) : ( )}
{error && (
{error}
)} {jobs.length === 0 && !loading ? (
No jobs to display.
) : null}
{jobs.map((job) => (
{job.post_title || 'Untitled episode'}
{job.feed_title || 'Unknown feed'}
Priority {job.priority}
Step {job.step}/{job.total_steps} {job.step_name ? `· ${job.step_name}` : ''}
Progress {Math.round(job.progress_percentage)}%
Job ID
{job.job_id}
Post GUID
{job.post_guid}
Created
{job.created_at ? formatDateTime(job.created_at) : '—'}
Started
{job.started_at ? formatDateTime(job.started_at) : '—'}
{job.error_message ? (
Message
{job.error_message}
) : null}
{(job.status === 'pending' || job.status === 'running') && (
)}
))}
); } ================================================ FILE: frontend/src/pages/LandingPage.tsx ================================================ import { Link } from 'react-router-dom';
import { useQuery } from '@tanstack/react-query';
import { landingApi } from '../services/api';

// Public landing page: shows the live tester count and signup information.
export default function LandingPage() {
  // Poll the public landing status so the tester count stays current.
  const { data: status } = useQuery({
    queryKey: ['landing-status'],
    queryFn: landingApi.getStatus,
    refetchInterval: 30000, // refresh every 30s
  });
  // Derived display values; all tolerate a not-yet-loaded status.
  const userCount = status?.user_count ?? 0;
  const userLimit = status?.user_limit_total;
  const slotsRemaining = status?.slots_remaining;
  return (
{/* Header */}
{/* Hero */}

Join the Podly test group

We're testing a self-hosted podcast ad removal system. Podly transcribes episodes, detects sponsor reads with an LLM, and generates clean RSS feeds that work in any podcast app.

{/* Live user count */}
{userLimit !== null && userLimit !== undefined && userLimit > 0 ? ( <> {userCount} / {userLimit} testers {slotsRemaining !== null && slotsRemaining !== undefined && slotsRemaining > 0 && ( ({slotsRemaining} {slotsRemaining === 1 ? 'slot' : 'slots'} remaining) )} ) : ( <> {userCount} active testers )}
{slotsRemaining !== null && slotsRemaining === 0 && (
Test group full. Join the Discord to hear when more slots open up.
)}
{/* How it works */}

How it works

Podly grabs the feed, finds sponsorship blocks, and gives you a private RSS link so your own players stream the ad-free version.

Listen anywhere

  • Apple Podcasts: Library → Edit → Add Show by URL → paste the Podly link.
  • Overcast: Tap + → Add URL → paste → done.
  • Pocket Casts: Discover → Paste RSS Link → Subscribe.
  • Other players: Podcast Addict, AntennaPod, Castro, etc. all support "add via URL."
Spotify blocks custom RSS feeds, so switch to any other podcast app when you use Podly links.

Getting started

  1. Sign up, choose number of podcasts ($1/pod/month)
  2. Search for a podcast and add it to your personal feed list.
  3. Copy your unique Podly RSS link for that feed.
  4. Paste the link into your podcast app to start listening ad-free.
  5. Need help? Ask questions in{' '} Discord .
{/* CTA */}
{/* Footer */}
Podly Podly

Open source podcast ad remover.

); } ================================================ FILE: frontend/src/pages/LoginPage.tsx ================================================ import type { FormEvent } from 'react';
import { useState, useEffect } from 'react';
import axios from 'axios';
import { Link } from 'react-router-dom';
import { useAuth } from '../contexts/AuthContext';
import { discordApi } from '../services/api';

/**
 * Login page supporting Discord SSO (when enabled) and a username/password
 * form fallback. OAuth callback errors arrive via the `error` query param.
 */
export default function LoginPage() {
  const { login, landingPageEnabled } = useAuth();
  const [username, setUsername] = useState('');
  const [password, setPassword] = useState('');
  const [submitting, setSubmitting] = useState(false);
  // FIX: restored the stripped type argument (was a bare `useState(null)`).
  const [error, setError] = useState<string | null>(null);
  const [discordEnabled, setDiscordEnabled] = useState(false);
  const [discordLoading, setDiscordLoading] = useState(false);
  const [showPasswordLogin, setShowPasswordLogin] = useState(false);

  // Check for OAuth callback errors in URL
  useEffect(() => {
    const params = new URLSearchParams(window.location.search);
    const urlError = params.get('error');
    if (urlError) {
      // FIX: restored the stripped `Record` type arguments — a bare
      // `Record` is a TypeScript error.
      const messages: Record<string, string> = {
        'guild_requirement_not_met': 'You must be a member of the required Discord server.',
        'registration_disabled': 'Self-registration is currently disabled.',
        'auth_failed': 'Discord authentication failed. Please try again.',
        'invalid_state': 'Invalid session state. Please try again.',
        'access_denied': 'Discord access was denied.',
        'discord_not_configured': 'Discord SSO is not configured.',
        'missing_code': 'Missing authorization code from Discord.',
      };
      setError(messages[urlError] || 'An error occurred during login.');
      // Clean URL
      window.history.replaceState({}, '', window.location.pathname);
    }
  }, []);

  // Check if Discord SSO is enabled
  useEffect(() => {
    discordApi.getStatus()
      .then((status) => {
        setDiscordEnabled(status.enabled);
        // Password form is hidden by default whenever SSO is available.
        setShowPasswordLogin(!status.enabled);
      })
      .catch(() => {
        setDiscordEnabled(false);
        setShowPasswordLogin(true);
      });
  }, []);

  // Username/password form submit handler.
  const handleSubmit = async (event: FormEvent) => {
    event.preventDefault();
    setError(null);
    setSubmitting(true);
    try {
      await login(username, password);
      setUsername('');
      setPassword('');
    } catch (err) {
      if (axios.isAxiosError(err)) {
        const message = err.response?.data?.error ?? 'Invalid username or password.';
        setError(message);
      } else if (err instanceof Error) {
        setError(err.message);
      } else {
        setError('Login failed. Please try again.');
      }
    } finally {
      setSubmitting(false);
    }
  };

  // Start the Discord OAuth flow by redirecting to the authorization URL.
  const handleDiscordLogin = async () => {
    setError(null);
    setDiscordLoading(true);
    try {
      const { authorization_url } = await discordApi.getLoginUrl();
      // On success we navigate away; discordLoading is deliberately left
      // set so the button stays disabled during the redirect.
      window.location.href = authorization_url;
    } catch {
      setError('Failed to start Discord login. Please try again.');
      setDiscordLoading(false);
    }
  };

  return (
Podly

Sign in to Podly

{error && (
{error}
)} {discordEnabled && (
{!showPasswordLogin && ( )}
)} {(!discordEnabled || showPasswordLogin) && (
setUsername(event.target.value)} className="mt-1 block w-full rounded-md border border-gray-300 px-3 py-2 shadow-sm focus:border-blue-500 focus:outline-none focus:ring-1 focus:ring-blue-500" disabled={submitting} required />
setPassword(event.target.value)} className="mt-1 block w-full rounded-md border border-gray-300 px-3 py-2 shadow-sm focus:border-blue-500 focus:outline-none focus:ring-1 focus:ring-blue-500" disabled={submitting} required />
)}
Discord {landingPageEnabled && ( ← Back to home )}
); } ================================================ FILE: frontend/src/services/api.ts ================================================ import axios from 'axios'; import { diagnostics } from '../utils/diagnostics'; import type { Feed, Episode, Job, JobManagerStatus, CleanupPreview, CleanupRunResult, CombinedConfig, LLMConfig, WhisperConfig, PodcastSearchResult, ConfigResponse, BillingSummary, LandingStatus, PagedResult, } from '../types'; const API_BASE_URL = ''; const api = axios.create({ baseURL: API_BASE_URL, withCredentials: true, }); api.interceptors.response.use( (response) => response, (error) => { try { const cfg = error?.config; const method = (cfg?.method ?? 'GET').toUpperCase(); const url = cfg?.url ?? '(unknown url)'; const status = error?.response?.status as number | undefined; const responseData = error?.response?.data; const details = { method, url, status, response: responseData, }; diagnostics.add('error', `HTTP error ${status ?? 'NETWORK'} ${method} ${url}`, details); } catch { // ignore } return Promise.reject(error); } ); const buildAbsoluteUrl = (path: string): string => { if (/^https?:\/\//i.test(path)) { return path; } const origin = API_BASE_URL || window.location.origin; if (path.startsWith('/')) { return `${origin}${path}`; } return `${origin}/${path}`; }; export const feedsApi = { getFeeds: async (): Promise => { const response = await api.get('/feeds'); return response.data; }, getFeedPosts: async ( feedId: number, options?: { page?: number; pageSize?: number; whitelistedOnly?: boolean } ): Promise> => { const response = await api.get(`/api/feeds/${feedId}/posts`, { params: { page: options?.page, page_size: options?.pageSize, whitelisted_only: options?.whitelistedOnly, }, }); return response.data; }, addFeed: async (url: string): Promise => { const formData = new FormData(); formData.append('url', url); await api.post('/feed', formData); }, deleteFeed: async (feedId: number): Promise => { await api.delete(`/feed/${feedId}`); }, 
refreshFeed: async ( feedId: number ): Promise<{ status: string; message?: string }> => { const response = await api.post(`/api/feeds/${feedId}/refresh`); return response.data; }, refreshAllFeeds: async (): Promise<{ status: string; feeds_refreshed: number; jobs_enqueued: number; }> => { const response = await api.post('/api/feeds/refresh-all'); return response.data; }, togglePostWhitelist: async ( guid: string, whitelisted: boolean, triggerProcessing = false ): Promise<{ processing_job?: { status: string; job_id?: string; message?: string } }> => { const response = await api.post(`/api/posts/${guid}/whitelist`, { whitelisted, trigger_processing: triggerProcessing, }); return response.data; }, toggleAllPostsWhitelist: async (feedId: number): Promise<{ message: string; whitelisted_count: number; total_count: number; all_whitelisted: boolean }> => { const response = await api.post(`/api/feeds/${feedId}/toggle-whitelist-all`); return response.data; }, joinFeed: async (feedId: number): Promise => { const response = await api.post(`/api/feeds/${feedId}/join`); return response.data; }, exitFeed: async (feedId: number): Promise => { const response = await api.post(`/api/feeds/${feedId}/exit`); return response.data; }, leaveFeed: async (feedId: number): Promise<{ status: string; feed_id: number }> => { const response = await api.post(`/api/feeds/${feedId}/leave`); return response.data; }, updateFeedSettings: async ( feedId: number, settings: { auto_whitelist_new_episodes_override: boolean | null } ): Promise => { const response = await api.patch(`/api/feeds/${feedId}/settings`, settings); return response.data; }, getProcessingEstimate: async (guid: string): Promise<{ post_guid: string; estimated_minutes: number; can_process: boolean; reason: string | null; }> => { const response = await api.get(`/api/posts/${guid}/processing-estimate`); return response.data; }, searchFeeds: async ( term: string ): Promise<{ results: PodcastSearchResult[]; total: number; }> => { const 
response = await api.get('/api/feeds/search', { params: { term }, }); return response.data; }, // New post processing methods processPost: async (guid: string): Promise<{ status: string; job_id?: string; message: string; download_url?: string }> => { const response = await api.post(`/api/posts/${guid}/process`); return response.data; }, reprocessPost: async (guid: string): Promise<{ status: string; job_id?: string; message: string; download_url?: string }> => { const response = await api.post(`/api/posts/${guid}/reprocess`); return response.data; }, getPostStatus: async (guid: string): Promise<{ status: string; step: number; step_name: string; total_steps: number; message: string; download_url?: string; error?: string; }> => { const response = await api.get(`/api/posts/${guid}/status`); return response.data; }, // Get audio URL for post getPostAudioUrl: (guid: string): string => { return buildAbsoluteUrl(`/api/posts/${guid}/audio`); }, // Get download URL for processed post getPostDownloadUrl: (guid: string): string => { return buildAbsoluteUrl(`/api/posts/${guid}/download`); }, // Get download URL for original post getPostOriginalDownloadUrl: (guid: string): string => { return buildAbsoluteUrl(`/api/posts/${guid}/download/original`); }, // Download processed post downloadPost: async (guid: string): Promise => { const response = await api.get(`/api/posts/${guid}/download`, { responseType: 'blob', }); const blob = new Blob([response.data], { type: 'audio/mpeg' }); const url = window.URL.createObjectURL(blob); const link = document.createElement('a'); link.href = url; link.download = `${guid}.mp3`; document.body.appendChild(link); link.click(); document.body.removeChild(link); window.URL.revokeObjectURL(url); }, // Download original post downloadOriginalPost: async (guid: string): Promise => { const response = await api.get(`/api/posts/${guid}/download/original`, { responseType: 'blob', }); const blob = new Blob([response.data], { type: 'audio/mpeg' }); const url = 
window.URL.createObjectURL(blob); const link = document.createElement('a'); link.href = url; link.download = `${guid}_original.mp3`; document.body.appendChild(link); link.click(); document.body.removeChild(link); window.URL.revokeObjectURL(url); }, createProtectedFeedShareLink: async ( feedId: number ): Promise<{ url: string; feed_token: string; feed_secret: string; feed_id: number }> => { const response = await api.post(`/api/feeds/${feedId}/share-link`); return response.data; }, // Get processing stats for post getPostStats: async (guid: string): Promise<{ post: { guid: string; title: string; duration: number | null; release_date: string | null; whitelisted: boolean; has_processed_audio: boolean; }; processing_stats: { total_segments: number; total_model_calls: number; total_identifications: number; content_segments: number; ad_segments_count: number; ad_percentage: number; estimated_ad_time_seconds: number; model_call_statuses: Record; model_types: Record; }; model_calls: Array<{ id: number; model_name: string; status: string; segment_range: string; first_segment_sequence_num: number; last_segment_sequence_num: number; timestamp: string | null; retry_attempts: number; error_message: string | null; prompt: string | null; response: string | null; }>; transcript_segments: Array<{ id: number; sequence_num: number; start_time: number; end_time: number; text: string; primary_label: 'ad' | 'content'; mixed: boolean; identifications: Array<{ id: number; label: string; confidence: number | null; model_call_id: number; }>; }>; identifications: Array<{ id: number; transcript_segment_id: number; label: string; confidence: number | null; model_call_id: number; segment_sequence_num: number; segment_start_time: number; segment_end_time: number; segment_text: string; mixed: boolean; }>; }> => { const response = await api.get(`/api/posts/${guid}/stats`); return response.data; }, // Legacy aliases for backward compatibility getFeedEpisodes: async ( feedId: number, options?: { 
page?: number; pageSize?: number; whitelistedOnly?: boolean } ): Promise> => { return feedsApi.getFeedPosts(feedId, options); }, toggleEpisodeWhitelist: async (guid: string, whitelisted: boolean): Promise<{ processing_job?: { status: string; job_id?: string; message?: string } }> => { return feedsApi.togglePostWhitelist(guid, whitelisted); }, toggleAllEpisodesWhitelist: async (feedId: number): Promise<{ message: string; whitelisted_count: number; total_count: number; all_whitelisted: boolean }> => { return feedsApi.toggleAllPostsWhitelist(feedId); }, processEpisode: async (guid: string): Promise<{ status: string; job_id?: string; message: string; download_url?: string }> => { return feedsApi.processPost(guid); }, getEpisodeStatus: async (guid: string): Promise<{ status: string; step: number; step_name: string; total_steps: number; message: string; download_url?: string; error?: string; }> => { return feedsApi.getPostStatus(guid); }, getEpisodeAudioUrl: (guid: string): string => { return feedsApi.getPostAudioUrl(guid); }, getEpisodeStats: async (guid: string): Promise<{ post: { guid: string; title: string; duration: number | null; release_date: string | null; whitelisted: boolean; has_processed_audio: boolean; }; processing_stats: { total_segments: number; total_model_calls: number; total_identifications: number; content_segments: number; ad_segments_count: number; ad_percentage: number; estimated_ad_time_seconds: number; model_call_statuses: Record; model_types: Record; }; model_calls: Array<{ id: number; model_name: string; status: string; segment_range: string; first_segment_sequence_num: number; last_segment_sequence_num: number; timestamp: string | null; retry_attempts: number; error_message: string | null; prompt: string | null; response: string | null; }>; transcript_segments: Array<{ id: number; sequence_num: number; start_time: number; end_time: number; text: string; primary_label: 'ad' | 'content'; mixed: boolean; identifications: Array<{ id: number; 
label: string; confidence: number | null; model_call_id: number; }>; }>; identifications: Array<{ id: number; transcript_segment_id: number; label: string; confidence: number | null; model_call_id: number; segment_sequence_num: number; segment_start_time: number; segment_end_time: number; segment_text: string; mixed: boolean; }>; }> => { return feedsApi.getPostStats(guid); }, // Legacy download aliases downloadEpisode: async (guid: string): Promise => { return feedsApi.downloadPost(guid); }, downloadOriginalEpisode: async (guid: string): Promise => { return feedsApi.downloadOriginalPost(guid); }, getEpisodeDownloadUrl: (guid: string): string => { return feedsApi.getPostDownloadUrl(guid); }, getEpisodeOriginalDownloadUrl: (guid: string): string => { return feedsApi.getPostOriginalDownloadUrl(guid); }, getAggregateFeedLink: async (): Promise<{ url: string }> => { const response = await api.post('/api/user/aggregate-link'); return response.data; }, }; export const authApi = { getStatus: async (): Promise<{ require_auth: boolean; landing_page_enabled?: boolean }> => { const response = await api.get('/api/auth/status'); return response.data; }, login: async (username: string, password: string): Promise<{ user: { id: number; username: string; role: string } }> => { const response = await api.post('/api/auth/login', { username, password }); return response.data; }, logout: async (): Promise => { await api.post('/api/auth/logout'); }, getCurrentUser: async (): Promise<{ user: { id: number; username: string; role: string } }> => { const response = await api.get('/api/auth/me'); return response.data; }, changePassword: async (payload: { current_password: string; new_password: string }): Promise<{ status: string }> => { const response = await api.post('/api/auth/change-password', payload); return response.data; }, listUsers: async (): Promise<{ users: Array<{ id: number; username: string; role: string; created_at: string; updated_at: string; last_active?: string | null; 
feed_allowance?: number; feed_subscription_status?: string; manual_feed_allowance?: number | null }> }> => { const response = await api.get('/api/auth/users'); return response.data; }, createUser: async (payload: { username: string; password: string; role: string }): Promise<{ user: { id: number; username: string; role: string; created_at: string; updated_at: string } }> => { const response = await api.post('/api/auth/users', payload); return response.data; }, updateUser: async (username: string, payload: { password?: string; role?: string; manual_feed_allowance?: number | null }): Promise<{ status: string }> => { const response = await api.patch(`/api/auth/users/${username}`, payload); return response.data; }, deleteUser: async (username: string): Promise<{ status: string }> => { const response = await api.delete(`/api/auth/users/${username}`); return response.data; }, }; export const landingApi = { getStatus: async (): Promise => { const response = await api.get('/api/landing/status'); return response.data; }, }; export const discordApi = { getStatus: async (): Promise<{ enabled: boolean }> => { const response = await api.get('/api/auth/discord/status'); return response.data; }, getLoginUrl: async (): Promise<{ authorization_url: string }> => { const response = await api.get('/api/auth/discord/login'); return response.data; }, getConfig: async (): Promise<{ config: { enabled: boolean; client_id: string | null; client_secret_preview: string | null; redirect_uri: string | null; guild_ids: string; allow_registration: boolean; }; env_overrides: Record; }> => { const response = await api.get('/api/auth/discord/config'); return response.data; }, updateConfig: async (payload: { client_id?: string; client_secret?: string; redirect_uri?: string; guild_ids?: string; allow_registration?: boolean; }): Promise<{ status: string; config: { enabled: boolean; client_id: string | null; client_secret_preview: string | null; redirect_uri: string | null; guild_ids: string; 
allow_registration: boolean; }; }> => { const response = await api.put('/api/auth/discord/config', payload); return response.data; }, }; export const configApi = { getConfig: async (): Promise => { const response = await api.get('/api/config'); return response.data; }, isConfigured: async (): Promise<{ configured: boolean }> => { const response = await api.get('/api/config/api_configured_check'); return { configured: !!response.data?.configured }; }, updateConfig: async (payload: Partial): Promise => { const response = await api.put('/api/config', payload); return response.data; }, testLLM: async ( payload: Partial<{ llm: LLMConfig }> ): Promise<{ ok: boolean; message?: string; error?: string }> => { const response = await api.post('/api/config/test-llm', payload ?? {}); return response.data; }, testWhisper: async ( payload: Partial<{ whisper: WhisperConfig }> ): Promise<{ ok: boolean; message?: string; error?: string }> => { const response = await api.post('/api/config/test-whisper', payload ?? 
{}); return response.data; }, getWhisperCapabilities: async (): Promise<{ local_available: boolean }> => { const response = await api.get('/api/config/whisper-capabilities'); const local_available = !!response.data?.local_available; return { local_available }; }, }; export const billingApi = { getSummary: async (): Promise => { const response = await api.get('/api/billing/summary'); return response.data; }, updateSubscription: async ( amount: number, options?: { subscriptionId?: string | null } ): Promise< BillingSummary & { message?: string; checkout_url?: string; requires_stripe_checkout?: boolean; } > => { const response = await api.post('/api/billing/subscription', { amount, subscription_id: options?.subscriptionId, }); return response.data; }, createPortalSession: async (): Promise<{ url: string }> => { const response = await api.post('/api/billing/portal-session'); return response.data; }, }; export const jobsApi = { getActiveJobs: async (limit: number = 100): Promise => { const response = await api.get('/api/jobs/active', { params: { limit } }); return response.data; }, getAllJobs: async (limit: number = 200): Promise => { const response = await api.get('/api/jobs/all', { params: { limit } }); return response.data; }, cancelJob: async (jobId: string): Promise<{ status: string; job_id: string; message: string }> => { const response = await api.post(`/api/jobs/${jobId}/cancel`); return response.data; }, getJobManagerStatus: async (): Promise => { const response = await api.get('/api/job-manager/status'); return response.data; }, getCleanupPreview: async (): Promise => { const response = await api.get('/api/jobs/cleanup/preview'); return response.data; }, runCleanupJob: async (): Promise => { const response = await api.post('/api/jobs/cleanup/run'); return response.data; } }; ================================================ FILE: frontend/src/types/index.ts ================================================ export interface Feed { id: number; rss_url: string; 
title: string; description?: string; author?: string; image_url?: string; posts_count: number; member_count?: number; is_member?: boolean; is_active_subscription?: boolean; auto_whitelist_new_episodes_override?: boolean | null; } export interface Episode { id: number; guid: string; title: string; description: string; release_date: string | null; duration: number | null; whitelisted: boolean; has_processed_audio: boolean; has_unprocessed_audio: boolean; download_url: string; image_url: string | null; download_count: number; } export interface PagedResult { items: T[]; total: number; page: number; page_size: number; total_pages?: number; whitelisted_total?: number; } export interface Job { job_id: string; post_guid: string; post_title: string | null; feed_title: string | null; status: 'pending' | 'running' | 'completed' | 'failed' | 'cancelled' | 'skipped' | string; priority: number; step: number; step_name: string | null; total_steps: number; progress_percentage: number; created_at: string | null; started_at: string | null; completed_at: string | null; error_message: string | null; } export interface JobManagerRun { id: string; status: 'pending' | 'running' | 'completed' | 'failed' | string; trigger: string; started_at: string | null; completed_at: string | null; updated_at: string | null; total_jobs: number; queued_jobs: number; running_jobs: number; completed_jobs: number; failed_jobs: number; skipped_jobs: number; context?: Record | null; counters_reset_at: string | null; progress_percentage: number; } export interface JobManagerStatus { run: JobManagerRun | null; } export interface CleanupPreview { count: number; retention_days: number | null; cutoff_utc: string | null; } export interface CleanupRunResult { status: 'ok' | 'disabled' | 'error' | string; removed_posts?: number; remaining_candidates?: number; retention_days?: number | null; cutoff_utc?: string | null; message?: string; } // ----- Configuration Types ----- export interface LLMConfig { llm_api_key?: 
string | null; llm_api_key_preview?: string | null; llm_model: string; openai_base_url?: string | null; openai_timeout: number; openai_max_tokens: number; llm_max_concurrent_calls: number; llm_max_retry_attempts: number; llm_max_input_tokens_per_call?: number | null; llm_enable_token_rate_limiting: boolean; llm_max_input_tokens_per_minute?: number | null; enable_boundary_refinement: boolean; enable_word_level_boundary_refinder?: boolean; } export type WhisperConfig = | { whisper_type: 'local'; model: string } | { whisper_type: 'remote'; model: string; api_key?: string | null; api_key_preview?: string | null; base_url?: string; language: string; timeout_sec: number; chunksize_mb: number; } | { whisper_type: 'groq'; api_key?: string | null; api_key_preview?: string | null; model: string; language: string; max_retries: number; } | { whisper_type: 'test' }; export interface ProcessingConfigUI { num_segments_to_input_to_prompt: number; } export interface OutputConfigUI { fade_ms: number; // Note the intentional spelling to match backend min_ad_segement_separation_seconds: number; min_ad_segment_length_seconds: number; min_confidence: number; } export interface AppConfigUI { background_update_interval_minute: number | null; automatically_whitelist_new_episodes: boolean; post_cleanup_retention_days: number | null; number_of_episodes_to_whitelist_from_archive_of_new_feed: number; enable_public_landing_page: boolean; user_limit_total: number | null; autoprocess_on_download: boolean; } export interface CombinedConfig { llm: LLMConfig; whisper: WhisperConfig; processing: ProcessingConfigUI; output: OutputConfigUI; app: AppConfigUI; } export interface EnvOverrideEntry { env_var: string; value?: string; value_preview?: string | null; is_secret?: boolean; } export type EnvOverrideMap = Record; export interface ConfigResponse { config: CombinedConfig; env_overrides?: EnvOverrideMap; } export interface PodcastSearchResult { title: string; author: string; feedUrl: string; 
artworkUrl: string; description: string; genres: string[]; } export interface AuthUser { id: number; username: string; role: 'admin' | 'user' | string; feed_allowance?: number; feed_subscription_status?: string; manual_feed_allowance?: number | null; } export interface ManagedUser extends AuthUser { created_at: string; updated_at: string; last_active?: string | null; } export interface DiscordStatus { enabled: boolean; } export interface BillingSummary { feed_allowance: number; feeds_in_use: number; remaining: number; current_amount?: number; min_amount_cents?: number; subscription_status: string; stripe_subscription_id?: string | null; stripe_customer_id?: string | null; product_id?: string | null; message?: string; } export interface LandingStatus { require_auth: boolean; landing_page_enabled: boolean; user_count: number; user_limit_total: number | null; slots_remaining: number | null; } ================================================ FILE: frontend/src/utils/clipboard.ts ================================================ import { toast } from 'react-hot-toast'; export async function copyToClipboard(text: string, promptMessage: string = 'Copy to clipboard:', successMessage?: string): Promise { // Try Clipboard API first if (navigator.clipboard && navigator.clipboard.writeText) { try { await navigator.clipboard.writeText(text); if (successMessage) toast.success(successMessage); return true; } catch (err) { console.warn('Clipboard API failed, trying fallback', err); } } // Fallback for non-secure contexts or if Clipboard API fails try { const textArea = document.createElement('textarea'); textArea.value = text; // Ensure it's not visible but part of the DOM textArea.style.position = 'fixed'; textArea.style.left = '-9999px'; textArea.style.top = '0'; document.body.appendChild(textArea); textArea.focus(); textArea.select(); const successful = document.execCommand('copy'); document.body.removeChild(textArea); if (successful) { if (successMessage) 
toast.success(successMessage); return true; } } catch (err) { console.error('Fallback copy failed', err); } // If all else fails, prompt the user window.prompt(promptMessage, text); return false; } ================================================ FILE: frontend/src/utils/diagnostics.ts ================================================ export type DiagnosticsLevel = 'debug' | 'info' | 'warn' | 'error'; export type DiagnosticsEntry = { ts: number; level: DiagnosticsLevel; message: string; data?: unknown; }; export type DiagnosticsState = { v: 1; entries: DiagnosticsEntry[]; }; export type DiagnosticErrorPayload = { title: string; message: string; kind?: 'network' | 'http' | 'app' | 'unknown'; details?: unknown; }; const STORAGE_KEY = 'podly.diagnostics.v1'; const MAX_ENTRIES = 200; const MAX_ENTRY_MESSAGE_CHARS = 500; const MAX_JSON_CHARS = 120_000; const SENSITIVE_KEY_RE = /(authorization|cookie|set-cookie|token|access[_-]?token|refresh[_-]?token|id[_-]?token|api[_-]?key|secret|password|session)/i; const SENSITIVE_VALUE_REPLACEMENT = '[REDACTED]'; const redactString = (value: string): string => { let v = value; // Authorization headers / bearer tokens v = v.replace(/\bBearer\s+([A-Za-z0-9\-._~+/]+=*)/gi, 'Bearer [REDACTED]'); v = v.replace(/\bBasic\s+([A-Za-z0-9+/=]+)\b/gi, 'Basic [REDACTED]'); // Common query params v = v.replace(/([?&](?:token|access_token|refresh_token|id_token|api_key|key|password)=)([^&#]+)/gi, '$1[REDACTED]'); // JSON-ish fields in strings v = v.replace(/("(?:access_token|refresh_token|id_token|token|api_key|password)"\s*:\s*")([^"]+)(")/gi, '$1[REDACTED]$3'); return v; }; const sanitize = (input: unknown, depth = 0): unknown => { if (depth > 6) return '[Truncated]'; if (input == null) return input; if (typeof input === 'string') return redactString(input); if (typeof input === 'number' || typeof input === 'boolean') return input; if (Array.isArray(input)) { return input.slice(0, 50).map((v) => sanitize(v, depth + 1)); } if (typeof input === 
'object') { const obj = input as Record; const out: Record = {}; const keys = Object.keys(obj).slice(0, 50); for (const key of keys) { const value = obj[key]; if (SENSITIVE_KEY_RE.test(key)) { out[key] = SENSITIVE_VALUE_REPLACEMENT; } else { out[key] = sanitize(value, depth + 1); } } return out; } return String(input); }; const safeJsonStringify = (value: unknown): string => { try { const json = JSON.stringify(value); if (json.length <= MAX_JSON_CHARS) return json; return json.slice(0, MAX_JSON_CHARS) + '\n...[truncated]'; } catch { return '[Unserializable]'; } }; const loadState = (): DiagnosticsState => { try { const raw = sessionStorage.getItem(STORAGE_KEY); if (!raw) return { v: 1, entries: [] }; const parsed = JSON.parse(raw) as DiagnosticsState; if (parsed?.v !== 1 || !Array.isArray(parsed.entries)) { return { v: 1, entries: [] }; } return parsed; } catch { return { v: 1, entries: [] }; } }; const saveState = (state: DiagnosticsState) => { try { const raw = safeJsonStringify(state); // Prevent sessionStorage bloat if (raw.length > MAX_JSON_CHARS) { const trimmed = { v: 1 as const, entries: state.entries.slice(-Math.floor(MAX_ENTRIES / 2)) }; sessionStorage.setItem(STORAGE_KEY, safeJsonStringify(trimmed)); return; } sessionStorage.setItem(STORAGE_KEY, raw); } catch { // ignore } }; export const DIAGNOSTIC_UPDATED_EVENT = 'podly:diagnostic-updated'; export const diagnostics = { add: (level: DiagnosticsLevel, message: string, data?: unknown) => { const sanitizedMessage = redactString(message).slice(0, MAX_ENTRY_MESSAGE_CHARS); const entry: DiagnosticsEntry = { ts: Date.now(), level, message: sanitizedMessage, data: data === undefined ? 
undefined : sanitize(data), }; const state = loadState(); const next = [...state.entries, entry].slice(-MAX_ENTRIES); saveState({ v: 1, entries: next }); try { if (typeof window !== 'undefined') { window.dispatchEvent(new Event(DIAGNOSTIC_UPDATED_EVENT)); } } catch { // ignore } }, getEntries: (): DiagnosticsEntry[] => { return loadState().entries; }, clear: () => { try { sessionStorage.removeItem(STORAGE_KEY); } catch { // ignore } }, sanitize, }; export const DIAGNOSTIC_ERROR_EVENT = 'podly:diagnostic-error'; export const emitDiagnosticError = (payload: DiagnosticErrorPayload) => { const safePayload = diagnostics.sanitize(payload) as DiagnosticErrorPayload; diagnostics.add('error', safePayload.title + ': ' + safePayload.message, safePayload); try { window.dispatchEvent(new CustomEvent(DIAGNOSTIC_ERROR_EVENT, { detail: safePayload })); } catch { // ignore } }; let consoleWrapped = false; export const initFrontendDiagnostics = () => { if (typeof window === 'undefined') return; if (!consoleWrapped) { consoleWrapped = true; const wrap = (level: DiagnosticsLevel, original: (...args: unknown[]) => void) => (...args: unknown[]) => { try { const msg = args .map((a) => (typeof a === 'string' ? 
a : safeJsonStringify(diagnostics.sanitize(a)))) .join(' '); diagnostics.add(level, msg); } catch { // ignore } original(...args); }; console.log = wrap('info', console.log.bind(console)); console.info = wrap('info', console.info.bind(console)); console.warn = wrap('warn', console.warn.bind(console)); console.error = wrap('error', console.error.bind(console)); } window.addEventListener('error', (event) => { emitDiagnosticError({ title: 'Unhandled error', message: event.message || 'Unknown error', kind: 'app', details: { filename: event.filename, lineno: event.lineno, colno: event.colno, }, }); }); window.addEventListener('unhandledrejection', (event) => { const reason = (event as PromiseRejectionEvent).reason; emitDiagnosticError({ title: 'Unhandled promise rejection', message: typeof reason === 'string' ? reason : 'Promise rejected', kind: 'app', details: reason, }); }); }; ================================================ FILE: frontend/src/utils/httpError.ts ================================================ import type { AxiosError } from 'axios'; export type ApiErrorData = { message?: unknown; error?: unknown; [key: string]: unknown; }; export type HttpErrorInfo = { status?: number; message: string; data?: unknown; }; const asString = (v: unknown): string | null => (typeof v === 'string' ? v : null); export const getHttpErrorInfo = (err: unknown): HttpErrorInfo => { const axiosErr = err as AxiosError; const status = axiosErr?.response?.status; const data = axiosErr?.response?.data; const messageFromData = data && typeof data === 'object' ? asString((data as ApiErrorData).message) ?? asString((data as ApiErrorData).error) : null; return { status, data, message: messageFromData ?? asString((axiosErr as unknown as { message?: unknown })?.message) ?? 
'Request failed', }; }; ================================================ FILE: frontend/src/vite-env.d.ts ================================================ /// ================================================ FILE: frontend/tailwind.config.js ================================================ /** @type {import('tailwindcss').Config} */ module.exports = { content: ["./index.html", "./src/**/*.{js,ts,jsx,tsx}"], theme: { extend: {}, }, plugins: [], }; ================================================ FILE: frontend/tsconfig.app.json ================================================ { "compilerOptions": { "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo", "target": "ES2020", "useDefineForClassFields": true, "lib": ["ES2020", "DOM", "DOM.Iterable"], "module": "ESNext", "skipLibCheck": true, /* Bundler mode */ "moduleResolution": "bundler", "allowImportingTsExtensions": true, "verbatimModuleSyntax": true, "moduleDetection": "force", "noEmit": true, "jsx": "react-jsx", /* Linting */ "strict": true, "noUnusedLocals": true, "noUnusedParameters": true, "erasableSyntaxOnly": true, "noFallthroughCasesInSwitch": true, "noUncheckedSideEffectImports": true }, "include": ["src"], "exclude": ["src/contexts/diagnosticsContext.ts"] } ================================================ FILE: frontend/tsconfig.json ================================================ { "files": [], "references": [ { "path": "./tsconfig.app.json" }, { "path": "./tsconfig.node.json" } ] } ================================================ FILE: frontend/tsconfig.node.json ================================================ { "compilerOptions": { "tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo", "target": "ES2022", "lib": ["ES2023"], "module": "ESNext", "skipLibCheck": true, /* Bundler mode */ "moduleResolution": "bundler", "allowImportingTsExtensions": true, "verbatimModuleSyntax": true, "moduleDetection": "force", "noEmit": true, /* Linting */ "strict": true, "noUnusedLocals": true, 
"noUnusedParameters": true, "erasableSyntaxOnly": true, "noFallthroughCasesInSwitch": true, "noUncheckedSideEffectImports": true }, "include": ["vite.config.ts"] } ================================================ FILE: frontend/vite.config.ts ================================================ import { defineConfig } from 'vite' import react from '@vitejs/plugin-react' // For development, the frontend development server will proxy to the backend // The backend port should match the configured application port // This will work with the new port configuration const BACKEND_TARGET = 'http://localhost:5001' // https://vite.dev/config/ export default defineConfig({ plugins: [react()], server: { port: 5173, host: true, allowedHosts: true, proxy: { '/api': { target: BACKEND_TARGET, changeOrigin: true, secure: false }, // Proxy feed endpoints for backwards compatibility '/feed': { target: BACKEND_TARGET, changeOrigin: true, secure: false }, // Proxy legacy post endpoints for backwards compatibility '/post': { target: BACKEND_TARGET, changeOrigin: true, secure: false } } }, build: { outDir: 'dist', sourcemap: false } }) ================================================ FILE: pyproject.toml ================================================ [tool.pylint] init-hook = 'import sys; sys.path.append("./src")' disable = [ "logging-fstring-interpolation", "missing-class-docstring", "missing-function-docstring", "missing-module-docstring", "too-few-public-methods", "too-many-arguments", "too-many-locals", "unspecified-encoding", "line-too-long", "too-many-return-statements" ] [tool.mypy] warn_unused_ignores = true strict = true mypy_path = "src" [tool.pytest.ini_options] pythonpath = ["src"] [tool.black] line-length = 88 [tool.isort] profile = "black" line_length = 88 float_to_top = true ================================================ FILE: run_podly_docker.sh ================================================ #!/bin/bash # Colors for output YELLOW='\033[1;33m' RED='\033[0;31m' 
GREEN='\033[0;32m' NC='\033[0m' # No Color # Central configuration defaults CUDA_VERSION="12.4.1" ROCM_VERSION="6.4" CPU_BASE_IMAGE="python:3.11-slim" GPU_NVIDIA_BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04" GPU_ROCM_BASE_IMAGE="rocm/dev-ubuntu-22.04:${ROCM_VERSION}-complete" # Read server URL from config.yml if it exists SERVER_URL="" if [ -f "config/config.yml" ]; then SERVER_URL=$(grep "^server:" config/config.yml | cut -d' ' -f2- | tr -d ' ') if [ -n "$SERVER_URL" ]; then # Remove http:// or https:// prefix to get just the hostname CLEAN_URL=$(echo "$SERVER_URL" | sed 's|^https\?://||') export VITE_API_URL="http://${CLEAN_URL}:5001" echo -e "${GREEN}Using server URL from config.yml: ${VITE_API_URL}${NC}" fi fi # Check dependencies echo -e "${YELLOW}Checking dependencies...${NC}" if ! command -v docker &> /dev/null; then echo -e "${RED}Docker not found. Please install Docker first.${NC}" exit 1 fi if ! docker compose version &> /dev/null; then echo -e "${RED}Docker Compose not found. 
Please install Docker Compose V2.${NC}" exit 1 fi # Default values BUILD_ONLY=false TEST_BUILD=false FORCE_CPU=false FORCE_GPU=false DETACHED=false PRODUCTION_MODE=true REBUILD=false BRANCH_SUFFIX="main" LITE_BUILD=false # Detect NVIDIA GPU NVIDIA_GPU_AVAILABLE=false if command -v nvidia-smi &> /dev/null && nvidia-smi &> /dev/null; then NVIDIA_GPU_AVAILABLE=true echo -e "${GREEN}NVIDIA GPU detected.${NC}" fi # Detect ROCM GPU AMD_GPU_AVAILABLE=false if command -v rocm-smi &> /dev/null && rocm-smi &> /dev/null; then AMD_GPU_AVAILABLE=true echo -e "${GREEN}ROCM GPU detected.${NC}" fi # Parse command line arguments while [[ $# -gt 0 ]]; do case "$1" in --build) BUILD_ONLY=true ;; --test-build) TEST_BUILD=true ;; --gpu) FORCE_GPU=true ;; --cpu) FORCE_CPU=true ;; --cuda=*) CUDA_VERSION="${1#*=}" GPU_NVIDIA_BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04" ;; --rocm=*) ROCM_VERSION="${1#*=}" GPU_ROCM_BASE_IMAGE="rocm/dev-ubuntu-22.04:${ROCM_VERSION}-complete" ;; -d|--detach|-b|--background) DETACHED=true ;; --dev) REBUILD=true PRODUCTION_MODE=false ;; --rebuild) REBUILD=true ;; --production) PRODUCTION_MODE=true ;; --branch=*) BRANCH_NAME="${1#*=}" BRANCH_SUFFIX="${BRANCH_NAME}" ;; --lite) LITE_BUILD=true ;; -h|--help) echo "Usage: $0 [OPTIONS]" echo "" echo "Options:" echo " --build Build containers only (don't start)" echo " --test-build Test build with no cache" echo " --gpu Force GPU mode" echo " --cpu Force CPU mode" echo " --cuda=VERSION Specify CUDA version" echo " --rocm=VERSION Specify ROCM version" echo " -d, --detach Run in detached/background mode" echo " -b, --background Alias for --detach" echo " --dev Development mode (rebuild containers)" echo " --rebuild Rebuild containers before starting" echo " --production Use published images (default)" echo " --branch=BRANCH Use specific branch images" echo " --lite Build without Whisper (smaller image, remote transcription only)" echo " -h, --help Show this help message" exit 0 ;; *) echo "Unknown 
argument: $1" echo "Usage: $0 [--build] [--test-build] [--gpu] [--cpu] [--cuda=VERSION] [--rocm=VERSION] [-d|--detach] [-b|--background] [--dev] [--rebuild] [--production] [--branch=BRANCH_NAME] [--lite] [-h|--help]" exit 1 ;; esac shift done # Determine if GPU should be used based on availability and flags USE_GPU=false USE_GPU_NVIDIA=false USE_GPU_AMD=false if [ "$FORCE_CPU" = true ]; then USE_GPU=false echo -e "${YELLOW}Forcing CPU mode${NC}" elif [ "$FORCE_GPU" = true ]; then if [ "$NVIDIA_GPU_AVAILABLE" = true ]; then USE_GPU=true USE_GPU_NVIDIA=true echo -e "${YELLOW}Forcing GPU mode (NVIDIA detected)${NC}" elif [ "$AMD_GPU_AVAILABLE" = true ]; then USE_GPU=true USE_GPU_AMD=true echo -e "${YELLOW}Forcing GPU mode (AMD detected)${NC}" else echo -e "${RED}Error: GPU requested but no compatible GPU detected. Please install NVIDIA or AMD GPU drivers.${NC}" exit 1 fi elif [ "$NVIDIA_GPU_AVAILABLE" = true ]; then USE_GPU=true USE_GPU_NVIDIA=true echo -e "${YELLOW}Using GPU mode (auto-detected)${NC}" elif [ "${AMD_GPU_AVAILABLE}" = true ]; then USE_GPU=true USE_GPU_AMD=true echo -e "${YELLOW}Using GPU mode (auto-detected)${NC}" else echo -e "${YELLOW}Using CPU mode (no GPU detected)${NC}" fi # Set base image and CUDA environment if [ "$USE_GPU_NVIDIA" = true ]; then BASE_IMAGE="$GPU_NVIDIA_BASE_IMAGE" CUDA_VISIBLE_DEVICES=0 elif [ "${USE_GPU_AMD}" = true ]; then BASE_IMAGE="${GPU_ROCM_BASE_IMAGE}" CUDA_VISIBLE_DEVICES=0 else BASE_IMAGE="$CPU_BASE_IMAGE" CUDA_VISIBLE_DEVICES=-1 fi # Get current user's UID and GID export PUID=$(id -u) export PGID=$(id -g) export BASE_IMAGE export CUDA_VERSION export ROCM_VERSION export CUDA_VISIBLE_DEVICES export USE_GPU export USE_GPU_NVIDIA export USE_GPU_AMD export LITE_BUILD # Surface authentication/session configuration warnings REQUIRE_AUTH_LOWER=$(printf '%s' "${REQUIRE_AUTH:-false}" | tr '[:upper:]' '[:lower:]') if [ "$REQUIRE_AUTH_LOWER" = "true" ]; then if [ -z "${PODLY_SECRET_KEY}" ]; then echo -e "${YELLOW}Warning: 
REQUIRE_AUTH is true but PODLY_SECRET_KEY is not set. Sessions will be reset on every restart.${NC}" fi fi # Setup Docker Compose configuration if [ "$PRODUCTION_MODE" = true ]; then COMPOSE_FILES="-f compose.yml" # Set branch tag based on GPU detection and branch if [ "$LITE_BUILD" = true ] && [ "$USE_GPU" = true ]; then echo -e "${RED}Error: --lite cannot be combined with GPU builds. Use --cpu or drop --lite.${NC}" exit 1 fi if [ "$LITE_BUILD" = true ]; then BRANCH="${BRANCH_SUFFIX}-lite" elif [ "$USE_GPU_NVIDIA" = true ]; then BRANCH="${BRANCH_SUFFIX}-gpu-nvidia" elif [ "$USE_GPU_AMD" = true ]; then BRANCH="${BRANCH_SUFFIX}-gpu-amd" else BRANCH="${BRANCH_SUFFIX}-latest" fi export BRANCH echo -e "${YELLOW}Production mode - using published images${NC}" echo -e "${YELLOW} Branch tag: ${BRANCH}${NC}" if [ "$BRANCH_SUFFIX" != "main" ]; then echo -e "${GREEN}Using custom branch: ${BRANCH_SUFFIX}${NC}" fi else export DEVELOPER_MODE=true COMPOSE_FILES="-f compose.dev.cpu.yml" if [ "$USE_GPU_NVIDIA" = true ]; then COMPOSE_FILES="$COMPOSE_FILES -f compose.dev.nvidia.yml" fi if [ "$USE_GPU_AMD" = true ]; then COMPOSE_FILES="$COMPOSE_FILES -f compose.dev.rocm.yml" fi if [ "$REBUILD" = true ]; then echo -e "${YELLOW}Rebuild mode - will rebuild containers before starting${NC}" fi if [ "$LITE_BUILD" = true ]; then echo -e "${YELLOW}Lite mode - building without Whisper (remote transcription only)${NC}" fi fi # Execute appropriate Docker Compose command if [ "$BUILD_ONLY" = true ]; then echo -e "${YELLOW}Building containers only...${NC}" if ! docker compose $COMPOSE_FILES build; then echo -e "${RED}Build failed! Please fix the errors above and try again.${NC}" exit 1 fi echo -e "${GREEN}Build completed successfully.${NC}" elif [ "$TEST_BUILD" = true ]; then echo -e "${YELLOW}Testing build with no cache...${NC}" if ! docker compose $COMPOSE_FILES build --no-cache; then echo -e "${RED}Build failed! 
Please fix the errors above and try again.${NC}" exit 1 fi echo -e "${GREEN}Test build completed successfully.${NC}" else # Handle development rebuild if [ "$REBUILD" = true ]; then echo -e "${YELLOW}Rebuilding containers...${NC}" if ! docker compose $COMPOSE_FILES build; then echo -e "${RED}Build failed! Please fix the errors above and try again.${NC}" exit 1 fi fi if [ "$DETACHED" = true ]; then echo -e "${YELLOW}Starting Podly in detached mode...${NC}" docker compose $COMPOSE_FILES up -d echo -e "${GREEN}Podly is running in the background.${NC}" echo -e "${GREEN}Application: http://localhost:5001${NC}" else echo -e "${YELLOW}Starting Podly...${NC}" echo -e "${GREEN}Application will be available at: http://localhost:5001${NC}" docker compose $COMPOSE_FILES up fi fi ================================================ FILE: scripts/ci.sh ================================================ #!/bin/bash # format echo '=============================================================' echo "Running 'pipenv run black .'" echo '=============================================================' pipenv run black . echo '=============================================================' echo "Running 'pipenv run isort .'" echo '=============================================================' pipenv run isort . # lint and type check echo '=============================================================' echo "Running 'pipenv run mypy . --install-types --non-interactive'" echo '=============================================================' pipenv run mypy . 
\ --install-types \ --non-interactive \ --explicit-package-bases \ --exclude 'migrations' \ --exclude 'build' \ --exclude 'scripts' \ --exclude 'src/tests' \ --exclude 'src/tests/test_routes.py' \ --exclude 'src/app/routes.py' echo '=============================================================' echo "Running 'pipenv run pylint src/ --ignore=migrations,tests'" echo '=============================================================' pipenv run pylint src/ --ignore=migrations,tests # run tests echo '=============================================================' echo "Running 'pipenv run pytest --disable-warnings'" echo '=============================================================' pipenv run pytest --disable-warnings ================================================ FILE: scripts/create_migration.sh ================================================ #!/usr/bin/env bash set -euo pipefail # Usage: ./scripts/create_migration.sh "message" # Creates migrations using the project's local instance directory so the app # doesn't attempt to mkdir /app on macOS dev machines. SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) REPO_ROOT=$(cd "$SCRIPT_DIR/.." && pwd) MIGRATION_MSG=${1:-"migration"} # Prefer using repo-local src/instance to avoid writing to /app export PODLY_INSTANCE_DIR="$REPO_ROOT/src/instance" echo "Using PODLY_INSTANCE_DIR=$PODLY_INSTANCE_DIR" # Ensure instance and data directories exist mkdir -p "$PODLY_INSTANCE_DIR" mkdir -p "$PODLY_INSTANCE_DIR/data/in" mkdir -p "$PODLY_INSTANCE_DIR/data/srv" echo "Running flask db migrate with message: $MIGRATION_MSG" export PYTHONPATH="$REPO_ROOT/src" pipenv run flask --app app db migrate -m "$MIGRATION_MSG" echo "Applying migration (upgrade)" read -r -p "Apply migration now? [y/N]: " response case "$response" in [yY][eE][sS]|[yY]) echo "Applying migration..." pipenv run flask --app app db upgrade echo "Migration applied." ;; *) echo "Upgrade cancelled. Migration files created but not applied." 
;; esac ================================================ FILE: scripts/downgrade_db.sh ================================================ #!/usr/bin/env bash SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) REPO_ROOT=$(cd "$SCRIPT_DIR/.." && pwd) export PODLY_INSTANCE_DIR="$REPO_ROOT/src/instance" export PYTHONPATH="$REPO_ROOT/src" # Default to downgrading one revision if not specified REVISION=${1:-"-1"} pipenv run flask --app app db downgrade "$REVISION" ================================================ FILE: scripts/generate_lockfiles.sh ================================================ #!/bin/bash set -e # Generate lock file for the regular Pipfile echo "Locking Pipfile..." pipenv lock # Temporarily move Pipfiles to lock Pipfile.lite echo "Preparing to lock Pipfile.lite..." mv Pipfile Pipfile.tmp mv Pipfile.lite Pipfile # Generate lock file for Pipfile.lite echo "Locking Pipfile.lite..." pipenv lock # Rename the new lock file to Pipfile.lite.lock echo "Renaming lockfile for lite version..." mv Pipfile.lock Pipfile.lite.lock # Restore original Pipfile names echo "Restoring original Pipfile names..." mv Pipfile Pipfile.lite mv Pipfile.tmp Pipfile echo "Lockfiles generated successfully!" 
echo "- Pipfile.lock" echo "- Pipfile.lite.lock" ================================================ FILE: scripts/manual_publish.sh ================================================ #!/bin/bash set -euo pipefail # Branch name becomes part of a manual tag (slashes replaced) BRANCH=$(git rev-parse --abbrev-ref HEAD | tr '/' '_') # Allow overriding image/owner/builder via env vars IMAGE=${IMAGE:-ghcr.io/podly-pure-podcasts/podly-pure-podcasts} BUILDER=${BUILDER:-podly_builder} # Ensure a docker-container buildx builder for multi-arch builds docker buildx create --name "${BUILDER}" --driver docker-container --use >/dev/null 2>&1 || docker buildx use "${BUILDER}" # Ensure binfmt handlers for cross-compilation are installed (no-op if already present) docker run --privileged --rm tonistiigi/binfmt --install all >/dev/null 2>&1 || true # Optional GHCR login (requires GHCR_TOKEN and optionally OWNER) if [[ -n "${GHCR_TOKEN:-}" ]]; then OWNER=${OWNER:-$(echo "${IMAGE}" | sed -E 's#^ghcr.io/([^/]+)/.*$#\1#')} echo "${GHCR_TOKEN}" | docker login ghcr.io -u "${OWNER}" --password-stdin fi # Build and push multi-arch CPU image (lite) docker buildx build \ --platform linux/amd64,linux/arm64 \ -t "${IMAGE}:${BRANCH}-lite" \ --build-arg BASE_IMAGE=python:3.11-slim \ --build-arg USE_GPU=false \ --build-arg USE_GPU_NVIDIA=false \ --build-arg USE_GPU_AMD=false \ --build-arg LITE_BUILD=true \ --push . ================================================ FILE: scripts/new_worktree.sh ================================================ #!/usr/bin/env bash set -euo pipefail usage() { echo "Usage: $0 []" >&2 exit 1 } if [[ ${1-} == "" ]]; then usage fi BRANCH_NAME="$1" START_POINT="${2-}" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "$SCRIPT_DIR/.." 
&& pwd)" WORKTREES_ROOT="$REPO_ROOT/.worktrees" WORKTREE_PATH="$WORKTREES_ROOT/$BRANCH_NAME" if git worktree list --porcelain | grep -q "^worktree $WORKTREE_PATH$"; then echo "Worktree already exists at $WORKTREE_PATH" >&2 exit 1 fi mkdir -p "$(dirname "$WORKTREE_PATH")" if [[ -d "$WORKTREE_PATH" ]]; then echo "Target path $WORKTREE_PATH already exists. Remove it first." >&2 exit 1 fi echo "Creating worktree at $WORKTREE_PATH" >&2 if git rev-parse --verify --quiet "$BRANCH_NAME" >/dev/null; then git worktree add "$WORKTREE_PATH" "$BRANCH_NAME" else if [[ -n "$START_POINT" ]]; then git worktree add -b "$BRANCH_NAME" "$WORKTREE_PATH" "$START_POINT" else git worktree add -b "$BRANCH_NAME" "$WORKTREE_PATH" fi fi pushd "$WORKTREE_PATH" >/dev/null if command -v pipenv >/dev/null; then echo "Installing dependencies via pipenv" >&2 pipenv install --dev else echo "pipenv not found on PATH; skipping dependency installation" >&2 fi ENV_SOURCE="" if [[ -f "$REPO_ROOT/.env" ]]; then ENV_SOURCE="$REPO_ROOT/.env" elif [[ -f "$REPO_ROOT/.env.local" ]]; then ENV_SOURCE="$REPO_ROOT/.env.local" fi if [[ -n "$ENV_SOURCE" ]]; then if [[ -f .env ]]; then echo "Worktree already has a .env file; leaving existing file in place" >&2 else echo "Copying $(basename "$ENV_SOURCE") into worktree" >&2 cp "$ENV_SOURCE" ./.env fi else echo "No .env or .env.local found in repository root; nothing copied" >&2 fi if command -v code >/dev/null; then echo "Opening worktree in VS Code" >&2 code "$WORKTREE_PATH" else echo "VS Code command-line tool 'code' not found; skipping auto-open" >&2 fi popd >/dev/null ================================================ FILE: scripts/start_services.sh ================================================ #!/bin/bash set -e # 1. Start Writer Service in background echo "Starting Writer Service..." export PYTHONPATH="/app/src${PYTHONPATH:+:$PYTHONPATH}" python3 -u -m app.writer & WRITER_PID=$! 
# Wait for writer IPC to be ready echo "Waiting for writer IPC on 127.0.0.1:50001..." READY=0 for i in {1..120}; do if python3 - <<'PY' import socket s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.settimeout(0.2) try: s.connect(("127.0.0.1", 50001)) raise SystemExit(0) except OSError: raise SystemExit(1) finally: try: s.close() except Exception: pass PY then READY=1 break fi sleep 0.25 done if [ $READY -ne 1 ]; then echo "Writer IPC did not become ready in time; exiting." exit 1 fi # 2. Start Main App (Waitress) echo "Starting Main Application..." python3 -u src/main.py & APP_PID=$! # 3. Monitor processes # 'wait -n' waits for the first process to exit. # If writer dies, we want to exit so Docker restarts us. wait -n # Exit with status of process that exited first exit $? ================================================ FILE: scripts/test_full_workflow.py ================================================ import json import sys import time import requests BASE_URL = "http://localhost:5001" def log(msg): print(f"[TEST] {msg}") def check_health(): try: # Assuming there's a health check or just checking root # If no explicit health check, we can try listing feeds response = requests.get(f"{BASE_URL}/feeds") if response.status_code == 200: log("Server is up and running.") return True except requests.exceptions.ConnectionError: pass return False def add_feed(url): log(f"Adding feed: {url}") response = requests.post(f"{BASE_URL}/feed", data={"url": url}) if response.status_code == 302: # Redirects to index on success log("Feed added successfully (redirected).") return True elif response.status_code == 200: log("Feed added successfully.") return True else: log( f"Failed to add feed. 
# (Preceding dump line holds log()/check_health()/add_feed(); the add_feed
#  failure log's opening text sits on that line.)


def get_feeds():
    """Return the list of feeds from the server, or [] on failure."""
    log("Fetching feeds...")
    response = requests.get(f"{BASE_URL}/feeds")
    if response.status_code != 200:
        log(f"Failed to fetch feeds. Status: {response.status_code}")
        return []
    feeds = response.json()
    log(f"Found {len(feeds)} feeds.")
    return feeds


def get_posts(feed_id):
    """Return the posts for one feed, or [] on failure."""
    log(f"Fetching posts for feed {feed_id}...")
    response = requests.get(f"{BASE_URL}/api/feeds/{feed_id}/posts")
    if response.status_code != 200:
        log(f"Failed to fetch posts. Status: {response.status_code}")
        return []
    posts = response.json()
    log(f"Found {len(posts)} posts.")
    return posts


def whitelist_post(guid):
    """Whitelist a post and ask the server to start processing it."""
    log(f"Whitelisting post {guid}...")
    # Assuming admin auth is not strictly enforced for localhost/dev mode or we need to handle it.
    # The code checks for current_user. If auth is disabled, it might pass.
    # If auth is enabled, we might need to login first.
    # For now, let's try without auth headers, assuming dev environment.
    response = requests.post(
        f"{BASE_URL}/api/posts/{guid}/whitelist",
        json={"whitelisted": True, "trigger_processing": True},
    )
    if response.status_code == 200:
        log("Post whitelisted and processing triggered.")
        return True
    log(
        f"Failed to whitelist post. Status: {response.status_code}, Body: {response.text}"
    )
    return False


def check_status(guid):
    """Return the processing-status JSON for a post, or None on failure."""
    response = requests.get(f"{BASE_URL}/api/posts/{guid}/status")
    if response.status_code == 200:
        return response.json()
    return None


def wait_for_processing(guid, timeout=300):
    """Poll processing status every 5s until completed/failed or timeout."""
    log(f"Waiting for processing of {guid}...")
    start_time = time.time()
    while time.time() - start_time < timeout:
        status_data = check_status(guid)
        if status_data:
            status = status_data.get("status")
            progress = status_data.get("progress_percentage", 0)
            step = status_data.get("step_name", "unknown")
            log(f"Status: {status}, Step: {step}, Progress: {progress}%")
            if status == "completed":
                log("Processing completed successfully!")
                return True
            if status == "failed":
                log(f"Processing failed: {status_data.get('error_message')}")
                return False
            if status == "error":
                log(f"Processing error: {status_data.get('message')}")
                return False
        time.sleep(5)
    log("Timeout waiting for processing.")
    return False


def main():
    """End-to-end smoke test: add feed, whitelist a post, wait, verify output."""
    if not check_health():
        log("Server is not reachable. Please start the server first.")
        sys.exit(1)

    # 1. Add a test feed
    test_feed_url = "http://test-feed/1"  # Developer mode test feed

    # Check if feed already exists
    feeds = get_feeds()
    target_feed = None
    for feed in feeds:
        if feed["rss_url"] == test_feed_url:
            target_feed = feed
            break

    if not target_feed and add_feed(test_feed_url):
        # Fetch feeds again to get the ID
        feeds = get_feeds()
        for feed in feeds:
            if feed["rss_url"] == test_feed_url:
                target_feed = feed
                break

    if not target_feed:
        log("Could not find or add the test feed.")
        sys.exit(1)

    log(f"Working with feed: {target_feed['title']} (ID: {target_feed['id']})")

    # 2. Get posts
    posts = get_posts(target_feed["id"])
    if not posts:
        log("No posts found.")
        sys.exit(1)

    # 3. Pick the latest post (posts are usually sorted by release date desc)
    target_post = posts[0]
    log(f"Selected post: {target_post['title']} (GUID: {target_post['guid']})")

    # 4. Trigger processing (Whitelist + Trigger)
    if not target_post["whitelisted"]:
        if not whitelist_post(target_post["guid"]):
            log("Failed to trigger processing.")
            sys.exit(1)
    else:
        log("Post already whitelisted. Checking status...")
        # If already whitelisted, trigger processing explicitly when no
        # processed audio exists yet.
        if not target_post["has_processed_audio"]:
            response = requests.post(
                f"{BASE_URL}/api/posts/{target_post['guid']}/process"
            )
            log(f"Trigger process response: {response.status_code}")

    # 5. Wait for completion, then 6. verify the produced artifacts.
    if wait_for_processing(target_post["guid"]):
        log("Verifying output...")
        # Check if we can get the audio link
        response = requests.get(
            f"{BASE_URL}/api/posts/{target_post['guid']}/audio", stream=True
        )
        if response.status_code == 200:
            log("Audio file is accessible.")
        else:
            log(f"Failed to access audio file. Status: {response.status_code}")
        # Check JSON details
        response = requests.get(f"{BASE_URL}/post/{target_post['guid']}/json")
        if response.status_code == 200:
            data = response.json()
            log(
                f"Post JSON retrieved. Transcript segments: {data.get('transcript_segment_count')}"
            )
        else:
            log("Failed to retrieve post JSON.")


if __name__ == "__main__":
    main()

# (Following dump line continues with scripts/upgrade_db.sh.)
&& pwd) export PODLY_INSTANCE_DIR="$REPO_ROOT/src/instance" export PYTHONPATH="$REPO_ROOT/src" pipenv run flask --app app db upgrade ================================================ FILE: src/app/__init__.py ================================================ import importlib import json import logging import os import secrets import sys from pathlib import Path from typing import Any from flask import Flask, current_app, g, has_app_context, request from flask_cors import CORS from flask_migrate import upgrade from sqlalchemy import event from sqlalchemy.engine import Engine from app import models from app.auth import AuthSettings, load_auth_settings from app.auth.bootstrap import bootstrap_admin_user from app.auth.discord_settings import load_discord_settings from app.auth.middleware import init_auth_middleware from app.background import add_background_job, schedule_cleanup_job from app.config_store import ( ensure_defaults_and_hydrate, hydrate_runtime_config_inplace, ) from app.extensions import db, migrate, scheduler from app.jobs_manager import ( get_jobs_manager, ) from app.logger import setup_logger from app.processor import ( ProcessorSingleton, ) from app.routes import register_routes from app.runtime_config import config, is_test from app.writer.client import writer_client from shared.processing_paths import get_in_root, get_srv_root setup_logger("global_logger", "src/instance/logs/app.log") logger = logging.getLogger("global_logger") def _env_bool(name: str, default: bool = False) -> bool: raw = os.environ.get(name) if raw is None: return default return raw.strip().lower() in {"1", "true", "yes", "on"} def _get_sqlite_busy_timeout_ms() -> int: # Longer timeout to allow large batch deletes/updates to finish before giving up return 90000 def setup_dirs() -> None: """Create data directories. 
class SchedulerConfig:
    # Flask-APScheduler configuration: a single-worker thread pool backed by a
    # SQLite jobstore under /tmp so persisted jobs survive process restarts.
    SCHEDULER_JOBSTORES = {
        "default": {
            "type": "sqlalchemy",
            "url": "sqlite:////tmp/jobs.sqlite",
        }
    }
    SCHEDULER_EXECUTORS = {"default": {"type": "threadpool", "max_workers": 1}}
    SCHEDULER_JOB_DEFAULTS = {"coalesce": False, "max_instances": 1}


@event.listens_for(Engine, "connect", once=False)
def _set_sqlite_pragmas(dbapi_connection: Any, connection_record: Any) -> None:
    """Apply concurrency-friendly PRAGMAs to every new SQLite connection.

    Registered globally on the SQLAlchemy Engine "connect" event; non-SQLite
    drivers are detected by module name and left untouched.
    """
    module = getattr(dbapi_connection.__class__, "__module__", "")
    if not module.startswith(("sqlite3", "pysqlite2")):
        return
    cursor = dbapi_connection.cursor()
    busy_timeout_ms = _get_sqlite_busy_timeout_ms()
    try:
        # WAL allows concurrent readers while one writer holds the lock.
        cursor.execute("PRAGMA journal_mode=WAL;")
        cursor.execute("PRAGMA synchronous=NORMAL;")
        cursor.execute(f"PRAGMA busy_timeout={busy_timeout_ms};")
        # Limit WAL file size to prevent checkpoint starvation
        cursor.execute("PRAGMA wal_autocheckpoint=1000;")
    finally:
        cursor.close()


def setup_scheduler(app: Flask) -> None:
    """Initialize and start the scheduler."""
    # Skipped entirely under tests to keep them deterministic.
    if not is_test:
        scheduler.init_app(app)
        scheduler.start()


def create_app() -> Flask:
    """Default app factory: a web-role app with optional scheduler/startup,
    both controllable via PODLY_DISABLE_SCHEDULER / PODLY_RUN_STARTUP."""
    disable_scheduler = _env_bool("PODLY_DISABLE_SCHEDULER", default=False)
    run_startup = _env_bool("PODLY_RUN_STARTUP", default=True)
    return _create_configured_app(
        app_role="web",
        run_startup=run_startup,
        start_scheduler=not disable_scheduler,
    )
""" return _create_configured_app( app_role="web", run_startup=False, start_scheduler=True, ) def create_writer_app() -> Flask: """Create the writer Flask app. This app owns startup migrations/bootstrapping. """ return _create_configured_app( app_role="writer", run_startup=True, start_scheduler=False, ) def _create_configured_app( *, app_role: str, run_startup: bool, start_scheduler: bool, ) -> Flask: # Setup directories early but only when actually creating the app (not during migrations) if not is_test: setup_dirs() app = _create_flask_app() app.config["PODLY_APP_ROLE"] = app_role auth_settings = _load_auth_settings() _apply_auth_settings(app, auth_settings) _configure_session(app, auth_settings) _configure_cors(app) _configure_scheduler(app) _configure_database(app) _configure_external_loggers() _initialize_extensions(app) _register_routes_and_middleware(app) app.config["developer_mode"] = config.developer_mode with app.app_context(): if run_startup: _run_app_startup(auth_settings) else: _hydrate_web_config() discord_settings = load_discord_settings() app.config["DISCORD_SETTINGS"] = discord_settings app.config["AUTH_SETTINGS"] = auth_settings.without_password() if app.config["DISCORD_SETTINGS"].enabled: logger.info( "Discord SSO enabled (guild restriction: %s)", "yes" if app.config["DISCORD_SETTINGS"].guild_ids else "no", ) _validate_env_key_conflicts() if start_scheduler: _start_scheduler_and_jobs(app) return app def _clear_scheduler_jobstore() -> None: """Remove persisted APScheduler jobs so startup adds a clean schedule.""" jobstore_config = SchedulerConfig.SCHEDULER_JOBSTORES.get("default") if not isinstance(jobstore_config, dict): return url = jobstore_config.get("url") if not isinstance(url, str): return prefix = "sqlite:///" if not url.startswith(prefix): return relative_path = url[len(prefix) :] project_root = Path(__file__).resolve().parents[2] jobstore_path = (project_root / Path(relative_path)).resolve() jobstore_path.parent.mkdir(parents=True, 
exist_ok=True) sidecars = [ jobstore_path, jobstore_path.with_name(jobstore_path.name + "-wal"), jobstore_path.with_name(jobstore_path.name + "-shm"), ] try: cleared_any = False for path in sidecars: if path.exists(): path.unlink() cleared_any = True if cleared_any: logger.info( "Startup: cleared persisted APScheduler jobs at %s", jobstore_path ) except OSError as exc: logger.warning( "Startup: failed to clear APScheduler jobs at %s: %s", jobstore_path, exc ) def _validate_env_key_conflicts() -> None: """Validate that environment API key variables are not conflicting. Rules: - If both LLM_API_KEY and GROQ_API_KEY are set and differ -> error """ llm_key = os.environ.get("LLM_API_KEY") groq_key = os.environ.get("GROQ_API_KEY") conflicts: list[str] = [] if llm_key and groq_key and llm_key != groq_key: conflicts.append( "LLM_API_KEY and GROQ_API_KEY are both set but have different values" ) if conflicts: details = "; ".join(conflicts) message = ( "Configuration error: Conflicting environment API keys detected. " f"{details}. To use Groq, prefer setting GROQ_API_KEY only; " "alternatively, set the variables to the same value." 
) # Crash the process so Docker start fails clearly raise SystemExit(message) def _create_flask_app() -> Flask: static_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "static")) return Flask(__name__, static_folder=static_folder) def _load_auth_settings() -> AuthSettings: try: return load_auth_settings() except RuntimeError as exc: logger.critical("Authentication configuration error: %s", exc) raise def _apply_auth_settings(app: Flask, auth_settings: AuthSettings) -> None: app.config["AUTH_SETTINGS"] = auth_settings app.config["REQUIRE_AUTH"] = auth_settings.require_auth app.config["AUTH_ADMIN_USERNAME"] = auth_settings.admin_username def _configure_session(app: Flask, auth_settings: AuthSettings) -> None: secret_key = os.environ.get("PODLY_SECRET_KEY") if not secret_key: try: secret_key = secrets.token_urlsafe(64) except Exception as exc: # pylint: disable=broad-except raise RuntimeError("Failed to generate session secret key.") from exc if auth_settings.require_auth: logger.warning( "Generated ephemeral session secret key because PODLY_SECRET_KEY is not set; " "all sessions will be invalidated on restart." ) app.config["SECRET_KEY"] = secret_key app.config["SESSION_COOKIE_NAME"] = os.environ.get( "PODLY_SESSION_COOKIE_NAME", "podly_session" ) app.config["SESSION_COOKIE_HTTPONLY"] = True app.config["SESSION_COOKIE_SAMESITE"] = "Lax" # We always allow HTTP cookies so self-hosted installs work behind simple HTTP reverse proxies. 
app.config["SESSION_COOKIE_SECURE"] = False def _configure_cors(app: Flask) -> None: default_cors = [ "http://localhost:5173", "http://127.0.0.1:5173", ] cors_origins_env = os.environ.get("CORS_ORIGINS") if cors_origins_env: cors_origins = [ origin.strip() for origin in cors_origins_env.split(",") if origin.strip() ] else: cors_origins = default_cors CORS( app, resources={r"/*": {"origins": cors_origins}}, allow_headers=["Content-Type", "Authorization", "Range"], methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"], supports_credentials=True, ) def _configure_scheduler(app: Flask) -> None: app.config.from_object(SchedulerConfig()) def _configure_database(app: Flask) -> None: def _get_sqlite_connect_timeout() -> int: return 60 uri_scheme = "sqlite" connect_timeout = _get_sqlite_connect_timeout() app.config["SQLALCHEMY_DATABASE_URI"] = ( f"{uri_scheme}:///sqlite3.db?timeout={connect_timeout}" ) engine_options: dict[str, Any] = { "connect_args": { "timeout": connect_timeout, }, # Keep pool small to reduce concurrent SQLite writers "pool_size": 5, "max_overflow": 5, } app.config["SQLALCHEMY_ENGINE_OPTIONS"] = engine_options app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False def _configure_external_loggers() -> None: groq_logger = logging.getLogger("groq") groq_logger.setLevel(logging.INFO) def _configure_readonly_sessions(app: Flask) -> None: """ Configure SQLAlchemy sessions to be read-only for the web/API app. This prevents Flask from acquiring write locks on the database, which can cause deadlocks with the writer service. Only the writer service should perform database writes. 
def _configure_readonly_sessions(app: Flask) -> None:
    """
    Configure SQLAlchemy sessions to be read-only for the web/API app.

    This prevents Flask from acquiring write locks on the database,
    which can cause deadlocks with the writer service.
    Only the writer service should perform database writes.
    """
    from sqlalchemy.orm import Session

    @event.listens_for(Session, "after_begin", once=False)
    def receive_after_begin(
        session: Session, transaction: Any, connection: Any
    ) -> None:
        """Set new transactions to read-only by default."""
        # Only apply to sessions created within this app context
        try:
            if not has_app_context():
                return
            if current_app.config.get("PODLY_APP_ROLE") != "web":
                return
        except Exception:  # pylint: disable=broad-except
            return
        # Set isolation level to prevent write locks
        # For SQLite, this prevents RESERVED/EXCLUSIVE locks
        connection.connection.isolation_level = "DEFERRED"
        # Disable autoflush to prevent accidental writes
        session.autoflush = False
        # Mark session as read-only to prevent any writes
        session.info["readonly"] = True

    @event.listens_for(Session, "before_flush", once=False)
    def receive_before_flush(
        session: Session, flush_context: Any, instances: Any
    ) -> None:
        """Prevent accidental writes in read-only sessions."""
        try:
            if not has_app_context():
                return
            if current_app.config.get("PODLY_APP_ROLE") != "web":
                return
        except Exception:  # pylint: disable=broad-except
            return
        if session.info.get("readonly"):
            raise RuntimeError(
                "Attempted to flush changes in read-only session. "
                "All database writes must go through the writer service."
            )


def _initialize_extensions(app: Flask) -> None:
    """Bind db/migrate to the app; web role additionally gets the read-only guard."""
    db.init_app(app)
    migrate.init_app(app, db)
    # Configure read-only mode for web/API Flask app to prevent database locks
    # Only the writer service should acquire write locks
    if app.config.get("PODLY_APP_ROLE") == "web":
        _configure_readonly_sessions(app)


def _register_routes_and_middleware(app: Flask) -> None:
    """Install routes, then the auth guard, then API request logging."""
    register_routes(app)
    init_auth_middleware(app)
    _register_api_logging(app)


def _register_api_logging(app: Flask) -> None:
    """Log every /api/* response (method, path, status, user, content type)."""

    @app.after_request
    def _log_api_request(response: Any) -> Any:
        try:
            path = request.path
        except Exception:  # pragma: no cover # pylint: disable=broad-except
            return response
        if not path.startswith("/api/"):
            return response
        method = request.method
        status = getattr(response, "status_code", None)
        user = getattr(g, "current_user", None)
        user_id = getattr(user, "id", None)
        logger.info(
            "[API] %s %s status=%s user_id=%s content_type=%s",
            method,
            path,
            status,
            user_id,
            getattr(response, "content_type", None),
        )
        return response


def _run_app_startup(auth_settings: AuthSettings) -> None:
    """Writer-role startup: run migrations, seed the admin, hydrate settings."""
    upgrade()
    bootstrap_admin_user(auth_settings)
    try:
        ensure_defaults_and_hydrate()
        ProcessorSingleton.reset_instance()
    except Exception as exc:  # pylint: disable=broad-except
        # Settings hydration failure is logged but does not abort startup.
        logger.error(f"Failed to initialize settings: {exc}")


def _hydrate_web_config() -> None:
    """Hydrate runtime config for web app (read-only)."""
    hydrate_runtime_config_inplace()
    ProcessorSingleton.reset_instance()


def _start_scheduler_and_jobs(app: Flask) -> None:
    """Clear any persisted jobs, start the scheduler, and (re)register jobs."""
    _clear_scheduler_jobstore()
    setup_scheduler(app)
    jobs_manager = get_jobs_manager()
    clear_result = jobs_manager.clear_all_jobs()
    if clear_result["status"] == "success":
        logger.info(f"Startup: {clear_result['message']}")
    else:
        logger.warning(f"Startup job clearing failed: {clear_result['message']}")
    # Default polling interval is 10 minutes when not configured.
    add_background_job(
        10
        if config.background_update_interval_minute is None
        else int(config.background_update_interval_minute)
    )
    schedule_cleanup_job(getattr(config, "post_cleanup_retention_days", None))
def bootstrap_admin_user(auth_settings: AuthSettings) -> None:
    """Ensure an administrator user exists when auth is required."""
    logger.info("Bootstrapping admin user...")
    if not auth_settings.require_auth:
        return
    # Avoid seeding if users already exist.
    current_admin = db.session.query(User.id).limit(1).first()
    if current_admin is not None:
        logger.info("Admin user already exists; skipping bootstrap.")
        return
    password = auth_settings.admin_password
    if not password:
        logger.error(
            "REQUIRE_AUTH=true but PODLY_ADMIN_PASSWORD is missing during bootstrap."
        )
        raise RuntimeError(
            "Authentication bootstrap failed: PODLY_ADMIN_PASSWORD is required."
        )
    username = auth_settings.admin_username
    role = current_app.config.get("PODLY_APP_ROLE")
    if role == "writer":
        # Writer role owns DB writes: create the admin directly.
        user = User(username=username, role="admin")
        user.set_password(password)
        db.session.add(user)
        safe_commit(
            db.session,
            must_succeed=True,
            context="bootstrap_admin_user",
            logger_obj=logger,
        )
    else:
        # Web role delegates the write to the writer service.
        res = writer_client.action(
            "create_user",
            {"username": username, "password": password, "role": "admin"},
            wait=True,
        )
        if not res or not res.success:
            # If another process created the admin concurrently, treat as success.
            if "already exists" not in str(getattr(res, "error", "")):
                raise RuntimeError(
                    getattr(res, "error", "Failed to bootstrap admin user")
                )
    logger.info(
        "Bootstrapped initial admin user '%s'. Ensure environment secrets are stored securely.",
        username,
    )
    # Clear the password from the Flask config if it was set to avoid lingering plaintext.
    current_app.config.pop("PODLY_ADMIN_PASSWORD", None)
parameter for OAuth2.""" return secrets.token_urlsafe(32) def build_authorization_url( settings: DiscordSettings, state: str, prompt: str = "none" ) -> str: """Build the Discord OAuth2 authorization URL.""" scopes = ["identify"] if settings.guild_ids: scopes.append("guilds") params = { "client_id": settings.client_id, "redirect_uri": settings.redirect_uri, "response_type": "code", "scope": " ".join(scopes), "state": state, } if prompt: params["prompt"] = prompt return f"{DISCORD_OAUTH2_AUTHORIZE}?{urlencode(params)}" def exchange_code_for_token(settings: DiscordSettings, code: str) -> dict[str, Any]: """Exchange an authorization code for an access token (synchronous).""" with httpx.Client() as client: response = client.post( DISCORD_OAUTH2_TOKEN, data={ "client_id": settings.client_id, "client_secret": settings.client_secret, "grant_type": "authorization_code", "code": code, "redirect_uri": settings.redirect_uri, }, headers={"Content-Type": "application/x-www-form-urlencoded"}, ) response.raise_for_status() result: dict[str, Any] = response.json() return result def get_discord_user(access_token: str) -> DiscordUser: """Fetch Discord user info using an access token (synchronous).""" with httpx.Client() as client: response = client.get( f"{DISCORD_API_BASE}/users/@me", headers={"Authorization": f"Bearer {access_token}"}, ) response.raise_for_status() data = response.json() return DiscordUser( id=data["id"], username=data["username"], ) def check_guild_membership(access_token: str, settings: DiscordSettings) -> bool: """Check if user is in any of the required guilds (synchronous).""" if not settings.guild_ids: return True with httpx.Client() as client: response = client.get( f"{DISCORD_API_BASE}/users/@me/guilds", headers={"Authorization": f"Bearer {access_token}"}, ) response.raise_for_status() user_guilds = {g["id"] for g in response.json()} return any(gid in user_guilds for gid in settings.guild_ids) def find_or_create_user_from_discord( discord_user: DiscordUser, 
settings: DiscordSettings, ) -> User: """Find an existing user by Discord ID or create a new one.""" result = writer_client.action( "upsert_discord_user", { "discord_id": discord_user.id, "discord_username": discord_user.username, "allow_registration": settings.allow_registration, }, wait=True, ) if not result or not result.success or not isinstance(result.data, dict): err = getattr(result, "error", "Failed to upsert Discord user") if "disabled" in str(err).lower(): raise DiscordRegistrationDisabledError(str(err)) raise DiscordAuthError(str(err)) user_id = int(result.data["user_id"]) user = db.session.get(User, user_id) if user is None: raise DiscordAuthError("Discord user upserted but not found") return user ================================================ FILE: src/app/auth/discord_settings.py ================================================ from __future__ import annotations import os from dataclasses import dataclass from typing import TYPE_CHECKING if TYPE_CHECKING: from flask import Flask from app.models import DiscordSettings as DiscordSettingsModel @dataclass(slots=True, frozen=True) class DiscordSettings: enabled: bool client_id: str | None client_secret: str | None redirect_uri: str | None guild_ids: list[str] allow_registration: bool def load_discord_settings() -> DiscordSettings: """Load Discord OAuth2 settings from environment variables and database. Environment variables take precedence over database values. 
""" # Try to load from database first db_settings = _load_from_database() # Environment variables override database values client_id = os.environ.get("DISCORD_CLIENT_ID") or ( db_settings.client_id if db_settings else None ) client_secret = os.environ.get("DISCORD_CLIENT_SECRET") or ( db_settings.client_secret if db_settings else None ) redirect_uri = os.environ.get("DISCORD_REDIRECT_URI") or ( db_settings.redirect_uri if db_settings else None ) enabled = bool(client_id and client_secret and redirect_uri) # Guild IDs: env var takes precedence guild_ids_env = os.environ.get("DISCORD_GUILD_IDS", "") if guild_ids_env: guild_ids = [g.strip() for g in guild_ids_env.split(",") if g.strip()] elif db_settings and db_settings.guild_ids: guild_ids = [g.strip() for g in db_settings.guild_ids.split(",") if g.strip()] else: guild_ids = [] # Allow registration: env var takes precedence allow_reg_env = os.environ.get("DISCORD_ALLOW_REGISTRATION") if allow_reg_env is not None: allow_registration = allow_reg_env.lower() in ("true", "1", "yes") elif db_settings is not None: allow_registration = db_settings.allow_registration else: allow_registration = True return DiscordSettings( enabled=enabled, client_id=client_id, client_secret=client_secret, redirect_uri=redirect_uri, guild_ids=guild_ids, allow_registration=allow_registration, ) def _load_from_database() -> "DiscordSettingsModel | None": """Load Discord settings from database, returns None if not available.""" try: from app.extensions import db from app.models import DiscordSettings as DiscordSettingsModel return db.session.get(DiscordSettingsModel, 1) except Exception: # Database not initialized or table doesn't exist yet return None def reload_discord_settings(app: "Flask") -> DiscordSettings: """Reload Discord settings and update app config.""" settings = load_discord_settings() app.config["DISCORD_SETTINGS"] = settings return settings ================================================ FILE: src/app/auth/feed_tokens.py 
================================================ from __future__ import annotations import hashlib import logging import secrets from dataclasses import dataclass from typing import Optional from app.auth.service import AuthenticatedUser from app.extensions import db from app.models import Feed, FeedAccessToken, Post, User, UserFeed from app.writer.client import writer_client logger = logging.getLogger("global_logger") def _hash_token(secret_value: str) -> str: return hashlib.sha256(secret_value.encode("utf-8")).hexdigest() @dataclass(slots=True) class FeedTokenAuthResult: user: AuthenticatedUser feed_id: int | None token: FeedAccessToken def _validate_token_access(token: FeedAccessToken, user: User, path: str) -> bool: # Handle Aggregate Token (feed_id is None) if token.feed_id is None: # 1. If accessing the aggregate feed itself (/feed/user/) # Validate that the token belongs to the requested user requested_user_id = _resolve_user_id_from_feed_path(path) if requested_user_id is not None: return bool(requested_user_id == user.id) # 2. If accessing a specific resource (audio/post), verify subscription resource_feed_id = _resolve_feed_id(path) if resource_feed_id is not None: return _verify_subscription(user, resource_feed_id) # If we can't resolve a feed ID but it's not the aggregate feed path, # we might be in a generic context or invalid path. # For safety, if we can't verify context, we might deny, # but let's allow if it's just a token check not tied to a specific resource yet. 
return True # Handle Specific Feed Token feed_id = _resolve_feed_id(path) if feed_id is None or feed_id != token.feed_id: return False return _verify_subscription(user, token.feed_id) def create_feed_access_token(user: User, feed: Feed | None) -> tuple[str, str]: feed_id = feed.id if feed else None result = writer_client.action( "create_feed_access_token", {"user_id": user.id, "feed_id": feed_id}, wait=True, ) if not result or not result.success or not isinstance(result.data, dict): raise RuntimeError(getattr(result, "error", "Failed to create feed token")) return str(result.data["token_id"]), str(result.data["secret"]) def authenticate_feed_token( token_id: str, secret: str, path: str ) -> Optional[FeedTokenAuthResult]: if not token_id: return None token = FeedAccessToken.query.filter_by(token_id=token_id, revoked=False).first() if token is None: return None expected_hash = _hash_token(secret) if not secrets.compare_digest(token.token_hash, expected_hash): return None user = db.session.get(User, token.user_id) if user is None: return None if not _validate_token_access(token, user, path): return None writer_client.action( "touch_feed_access_token", {"token_id": token_id, "secret": secret}, wait=False, ) return FeedTokenAuthResult( user=AuthenticatedUser(id=user.id, username=user.username, role=user.role), feed_id=token.feed_id, token=token, ) def _verify_subscription(user: User, feed_id: int) -> bool: if user.role == "admin": return True # Hack: Always allow Feed 1 if feed_id == 1: return True membership = UserFeed.query.filter_by(user_id=user.id, feed_id=feed_id).first() if not membership: logger.warning( "Access denied: User %s has valid token but no active subscription for feed %s", user.id, feed_id, ) return False return True def _resolve_user_id_from_feed_path(path: str) -> Optional[int]: if path.startswith("/feed/user/"): remainder = path[len("/feed/user/") :] try: return int(remainder.split("/", 1)[0]) except ValueError: return None return None def 
_resolve_feed_id(path: str) -> Optional[int]: if path.startswith("/feed/"): remainder = path[len("/feed/") :] try: return int(remainder.split("/", 1)[0]) except ValueError: return None if path.startswith("/api/posts/"): parts = path.split("/") if len(parts) < 4: return None guid = parts[3] post = Post.query.filter_by(guid=guid).first() return post.feed_id if post else None if path.startswith("/post/"): remainder = path[len("/post/") :] guid = remainder.split("/", 1)[0] guid = guid.split(".", 1)[0] post = Post.query.filter_by(guid=guid).first() return post.feed_id if post else None return None ================================================ FILE: src/app/auth/guards.py ================================================ """Authorization guard utilities for admin and authenticated user checks.""" from typing import TYPE_CHECKING, Tuple import flask from flask import current_app, g, jsonify from app.extensions import db if TYPE_CHECKING: from app.models import User def require_admin( action: str = "perform this action", ) -> Tuple["User | None", flask.Response | None]: """Ensure the current user is an admin when auth is enabled. When auth is disabled (AUTH_SETTINGS.require_auth == False), returns (None, None) to allow the operation. When auth is enabled: - Returns (user, None) if user is authenticated and is admin - Returns (None, error_response) if not authenticated or not admin Args: action: Description of the action for error messages. Returns: (user, error_response) tuple where only one is non-None. 
""" settings = current_app.config.get("AUTH_SETTINGS") if not settings or not settings.require_auth: return None, None current = getattr(g, "current_user", None) if current is None: return None, flask.make_response( jsonify({"error": "Authentication required."}), 401 ) from app.models import User user: User | None = db.session.get(User, current.id) if user is None: return None, flask.make_response(jsonify({"error": "User not found."}), 404) if user.role != "admin": return None, flask.make_response( jsonify({"error": f"Only admins can {action}."}), 403 ) return user, None def is_auth_enabled() -> bool: """Check if authentication is enabled.""" settings = current_app.config.get("AUTH_SETTINGS") return bool(settings and settings.require_auth) ================================================ FILE: src/app/auth/middleware.py ================================================ from __future__ import annotations import re from typing import Any from flask import Response, current_app, g, jsonify, request, session from app.auth.feed_tokens import FeedTokenAuthResult, authenticate_feed_token from app.auth.service import AuthenticatedUser from app.auth.state import failure_rate_limiter from app.extensions import db from app.models import User SESSION_USER_KEY = "user_id" # Paths that remain public even when auth is required. _PUBLIC_PATHS: set[str] = { "/", "/health", "/robots.txt", "/manifest.json", "/favicon.ico", "/api/auth/login", "/api/auth/status", "/api/auth/discord/status", "/api/auth/discord/login", "/api/auth/discord/callback", "/api/landing/status", # Stripe webhooks must bypass auth to allow Stripe to deliver events "/api/billing/stripe-webhook", } _PUBLIC_PREFIXES: tuple[str, ...] = ( "/static/", "/assets/", "/images/", "/fonts/", "/.well-known/", ) _PUBLIC_EXTENSIONS: tuple[str, ...] = ( ".js", ".css", ".map", ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp", ".txt", ) _TOKEN_PROTECTED_PATTERNS: tuple[re.Pattern[str], ...] 
= ( re.compile(r"^/feed/[^/]+$"), re.compile(r"^/feed/user/[^/]+$"), re.compile(r"^/api/posts/[^/]+/(audio|download(?:/original)?)$"), re.compile(r"^/post/[^/]+(?:\\.mp3|/original\\.mp3)$"), ) def init_auth_middleware(app: Any) -> None: """Attach the authentication guard to the Flask app.""" @app.before_request # type: ignore[untyped-decorator] def enforce_authentication() -> Response | None: # pylint: disable=too-many-return-statements if request.method == "OPTIONS": return None settings = current_app.config.get("AUTH_SETTINGS") if not settings or not settings.require_auth: return None if _is_public_request(request.path): return None client_identifier = request.remote_addr or "unknown" session_user = _load_session_user() if session_user is not None: g.current_user = session_user g.feed_token = None failure_rate_limiter.register_success(client_identifier) return None if _is_token_protected_endpoint(request.path): retry_after = failure_rate_limiter.retry_after(client_identifier) if retry_after: return _too_many_requests(retry_after) token_result = _authenticate_feed_token_from_query() if token_result is None: backoff = failure_rate_limiter.register_failure(client_identifier) response = _token_unauthorized() if backoff: response.headers["Retry-After"] = str(backoff) return response failure_rate_limiter.register_success(client_identifier) g.current_user = token_result.user g.feed_token = token_result return None return _json_unauthorized() def _load_session_user() -> AuthenticatedUser | None: raw_user_id = session.get(SESSION_USER_KEY) if isinstance(raw_user_id, str) and raw_user_id.isdigit(): user_id = int(raw_user_id) elif isinstance(raw_user_id, int): user_id = raw_user_id else: return None user = db.session.get(User, user_id) if user is None: session.pop(SESSION_USER_KEY, None) return None return AuthenticatedUser(id=user.id, username=user.username, role=user.role) def _is_token_protected_endpoint(path: str) -> bool: return any(pattern.match(path) for pattern in 
def _authenticate_feed_token_from_query() -> FeedTokenAuthResult | None:
    """Authenticate via ?feed_token=...&feed_secret=... query parameters."""
    token_id = request.args.get("feed_token")
    secret = request.args.get("feed_secret")
    if token_id and secret:
        return authenticate_feed_token(token_id, secret, request.path)
    return None


def _is_public_request(path: str) -> bool:
    """True when *path* is exempt from authentication."""
    if path in _PUBLIC_PATHS:
        return True
    # startswith/endswith accept a tuple of alternatives directly.
    if path.startswith(_PUBLIC_PREFIXES):
        return True
    return path.endswith(_PUBLIC_EXTENSIONS)


def _json_unauthorized(message: str = "Authentication required.") -> Response:
    """401 with a JSON error body, for API clients."""
    response = jsonify({"error": message})
    response.status_code = 401
    return response


def _token_unauthorized() -> Response:
    """401 plain-text response for failed feed-token attempts."""
    return Response("Invalid or missing feed token", status=401)


def _too_many_requests(retry_after: int) -> Response:
    """429 with a Retry-After hint in seconds."""
    response = Response("Too Many Authentication Attempts", status=429)
    response.headers["Retry-After"] = str(retry_after)
    return response


def hash_password(password: str, *, rounds: int = 12) -> str:
    """Hash a password using bcrypt with the provided work factor."""
    salt = bcrypt.gensalt(rounds)
    return bcrypt.hashpw(password.encode("utf-8"), salt).decode("utf-8")


def verify_password(password: str, password_hash: str) -> bool:
    """Verify the provided password against the stored bcrypt hash."""
    try:
        return bcrypt.checkpw(
            password.encode("utf-8"),
            password_hash.encode("utf-8"),
        )
    except ValueError:
        # A malformed stored hash counts as a failed verification.
        return False
int blocked_until: datetime | None last_attempt: datetime class FailureRateLimiter: """Simple in-memory exponential backoff tracker for authentication failures.""" def __init__( self, *, storage: MutableMapping[str, FailureState] | None = None, max_backoff_seconds: int = 300, warm_up_attempts: int = 3, ) -> None: self._storage = storage if storage is not None else {} self._max_backoff_seconds = max_backoff_seconds self._warm_up_attempts = warm_up_attempts def register_failure(self, key: str) -> int: now = datetime.utcnow() state = self._storage.get(key) if state is None: state = FailureState(attempts=1, blocked_until=None, last_attempt=now) else: state.attempts += 1 state.last_attempt = now backoff_seconds = 0 if state.attempts > self._warm_up_attempts: exponent = state.attempts - self._warm_up_attempts backoff_seconds = min(2**exponent, self._max_backoff_seconds) state.blocked_until = now + timedelta(seconds=backoff_seconds) else: state.blocked_until = None self._storage[key] = state self._prune_stale(now) return backoff_seconds def register_success(self, key: str) -> None: if key in self._storage: del self._storage[key] def retry_after(self, key: str) -> int | None: state = self._storage.get(key) if state is None or state.blocked_until is None: return None now = datetime.utcnow() if state.blocked_until <= now: del self._storage[key] return None remaining = int((state.blocked_until - now).total_seconds()) if remaining <= 0: del self._storage[key] return None return remaining def _prune_stale(self, now: datetime) -> None: stale_keys: list[str] = [] for key, state in self._storage.items(): if now - state.last_attempt > timedelta(hours=1): stale_keys.append(key) for key in stale_keys: del self._storage[key] ================================================ FILE: src/app/auth/service.py ================================================ from __future__ import annotations import logging from dataclasses import dataclass from typing import Sequence, cast from 
app.extensions import db from app.models import User from app.runtime_config import config as runtime_config from app.writer.client import writer_client logger = logging.getLogger("global_logger") class AuthServiceError(Exception): """Base class for authentication domain errors.""" class InvalidCredentialsError(AuthServiceError): """Raised when provided credentials are invalid.""" class PasswordValidationError(AuthServiceError): """Raised when a password fails strength validation.""" class DuplicateUserError(AuthServiceError): """Raised when attempting to create a user with an existing username.""" class LastAdminRemovalError(AuthServiceError): """Raised when deleting or demoting the final admin user.""" class UserLimitExceededError(AuthServiceError): """Raised when creating a user would exceed the configured limit.""" ALLOWED_ROLES: set[str] = {"admin", "user"} @dataclass(slots=True) class AuthenticatedUser: id: int username: str role: str def _normalize_username(username: str) -> str: return username.strip().lower() def authenticate(username: str, password: str) -> AuthenticatedUser | None: user = User.query.filter_by(username=_normalize_username(username)).first() if user is None: return None if not user.verify_password(password): return None return AuthenticatedUser(id=user.id, username=user.username, role=user.role) def list_users() -> Sequence[User]: return cast( Sequence[User], User.query.order_by(User.created_at.desc(), User.id.desc()).all(), ) def create_user(username: str, password: str, role: str = "user") -> User: normalized_username = _normalize_username(username) if not normalized_username: raise AuthServiceError("Username is required.") if role not in ALLOWED_ROLES: raise AuthServiceError(f"Role must be one of {sorted(ALLOWED_ROLES)}.") if User.query.filter_by(username=normalized_username).first(): raise DuplicateUserError("A user with that username already exists.") _enforce_user_limit() result = writer_client.action( "create_user", {"username": 
def _writer_action_or_raise(action: str, payload: dict, failure_message: str):
    """Run a writer action synchronously; raise AuthServiceError on failure."""
    result = writer_client.action(action, payload, wait=True)
    if not result or not result.success:
        raise AuthServiceError(getattr(result, "error", failure_message))
    return result


def change_password(user: User, current_password: str, new_password: str) -> None:
    """Rotate a user's password after verifying the current one."""
    if not user.verify_password(current_password):
        raise InvalidCredentialsError("Current password is incorrect.")
    update_password(user, new_password)


def update_password(user: User, new_password: str) -> None:
    """Persist a new password via the writer process and refresh the ORM row."""
    _writer_action_or_raise(
        "update_user_password",
        {"user_id": user.id, "new_password": new_password},
        "Failed to update password",
    )
    # The writer mutated the row out-of-session; force a reload on next access.
    db.session.expire(user)


def delete_user(user: User) -> None:
    """Delete a user, refusing to remove the final admin."""
    if user.role == "admin" and _count_admins() <= 1:
        raise LastAdminRemovalError("Cannot remove the last admin user.")
    _writer_action_or_raise(
        "delete_user", {"user_id": user.id}, "Failed to delete user"
    )


def set_role(user: User, role: str) -> None:
    """Change a user's role, refusing to demote the final admin."""
    if role not in ALLOWED_ROLES:
        raise AuthServiceError(f"Role must be one of {sorted(ALLOWED_ROLES)}.")
    if user.role == "admin" and role != "admin" and _count_admins() <= 1:
        raise LastAdminRemovalError("Cannot demote the last admin user.")
    _writer_action_or_raise(
        "set_user_role", {"user_id": user.id, "role": role}, "Failed to set role"
    )
    db.session.expire(user)
result.success: raise AuthServiceError(getattr(result, "error", "Failed to set allowance")) db.session.expire(user) def update_user_last_active(user_id: int) -> None: """Update the last_active timestamp for a user.""" writer_client.action( "update_user_last_active", {"user_id": user_id}, wait=False, ) def _count_admins() -> int: return cast(int, User.query.filter_by(role="admin").count()) def _enforce_user_limit() -> None: """Prevent creating users beyond the configured total limit. Limit applies only when authentication is enabled; a non-positive or missing limit means unlimited users. """ try: limit = getattr(runtime_config, "user_limit_total", None) except Exception: # pragma: no cover - defensive limit = None if limit is None: return try: limit_int = int(limit) except Exception: return if limit_int < 0: return current_total = cast(int, User.query.count()) if limit_int == 0 or current_total >= limit_int: raise UserLimitExceededError( f"User limit reached ({current_total}/{limit_int}). Delete a user or increase the limit." 
) ================================================ FILE: src/app/auth/settings.py ================================================ from __future__ import annotations import os from dataclasses import dataclass, replace def _str_to_bool(value: str | None, default: bool = False) -> bool: if value is None: return default lowered = value.strip().lower() return lowered in {"1", "true", "t", "yes", "y", "on"} @dataclass(slots=True, frozen=True) class AuthSettings: """Runtime authentication configuration derived from environment variables.""" require_auth: bool admin_username: str admin_password: str | None @property def admin_password_required(self) -> bool: return self.require_auth def without_password(self) -> "AuthSettings": """Return a copy with the password removed to avoid retaining plaintext.""" return replace(self, admin_password=None) def load_auth_settings() -> AuthSettings: """Load authentication settings from environment variables.""" require_auth = _str_to_bool(os.environ.get("REQUIRE_AUTH"), default=False) admin_username = os.environ.get("PODLY_ADMIN_USERNAME", "podly_admin").strip() admin_password = os.environ.get("PODLY_ADMIN_PASSWORD") if require_auth: if not admin_username: raise RuntimeError( "PODLY_ADMIN_USERNAME must be set to a non-empty value when " "REQUIRE_AUTH=true." ) if admin_password is None: raise RuntimeError( "PODLY_ADMIN_PASSWORD must be provided when REQUIRE_AUTH=true." 
# ===== FILE: src/app/auth/state.py =====

from .rate_limiter import FailureRateLimiter

# Process-wide singleton shared by the auth middleware to track failed logins.
failure_rate_limiter = FailureRateLimiter()


# ===== FILE: src/app/background.py =====

from datetime import datetime, timedelta
from typing import Optional

from app.extensions import scheduler
from app.jobs_manager import (
    scheduled_refresh_all_feeds,
)
from app.post_cleanup import scheduled_cleanup_processed_posts


def add_background_job(minutes: int) -> None:
    """Add the recurring background job for refreshing feeds.

    minutes: interval in minutes; must be a positive integer.
    """
    # replace_existing avoids a duplicate-job error when the app re-registers
    # the job after a restart or a settings change.
    scheduler.add_job(
        id="refresh_all_feeds",
        func=scheduled_refresh_all_feeds,
        trigger="interval",
        minutes=minutes,
        replace_existing=True,
    )
scheduler.add_job( id=job_id, func=scheduled_cleanup_processed_posts, trigger="interval", hours=24, next_run_time=datetime.utcnow() + timedelta(minutes=15), replace_existing=True, ) ================================================ FILE: src/app/config_store.py ================================================ from __future__ import annotations import hashlib import logging import os from typing import Any, Dict, Optional, Tuple from flask import current_app from app.db_commit import safe_commit from app.extensions import db, scheduler from app.models import ( AppSettings, LLMSettings, OutputSettings, ProcessingSettings, WhisperSettings, ) from app.runtime_config import config as runtime_config from shared import defaults as DEFAULTS from shared.config import Config as PydanticConfig from shared.config import ( GroqWhisperConfig, LocalWhisperConfig, RemoteWhisperConfig, TestWhisperConfig, ) # pylint: disable=too-many-lines logger = logging.getLogger("global_logger") def _is_empty(value: Any) -> bool: return value is None or value == "" def _parse_int(val: Any) -> Optional[int]: try: return int(val) if val is not None else None except Exception: return None def _parse_bool(val: Any) -> Optional[bool]: if val is None: return None s = str(val).strip().lower() if s in {"1", "true", "yes", "on"}: return True if s in {"0", "false", "no", "off"}: return False return None def _set_if_empty(obj: Any, attr: str, new_val: Any) -> bool: if _is_empty(new_val): return False if _is_empty(getattr(obj, attr)): setattr(obj, attr, new_val) return True return False def _set_if_default(obj: Any, attr: str, new_val: Any, default_val: Any) -> bool: if new_val is None: return False if getattr(obj, attr) == default_val: setattr(obj, attr, new_val) return True return False def _ensure_row(model: type, defaults: Dict[str, Any]) -> Any: row = db.session.get(model, 1) if row is None: role = None try: role = current_app.config.get("PODLY_APP_ROLE") except Exception: # pylint: disable=broad-except 
def ensure_defaults() -> None:
    """Ensure every singleton settings row (id=1) exists, seeding ship defaults.

    Each `_ensure_row` call is a no-op when the row already exists, so this is
    safe to call on every boot.
    """
    _ensure_row(
        LLMSettings,
        {
            "llm_model": DEFAULTS.LLM_DEFAULT_MODEL,
            "openai_timeout": DEFAULTS.OPENAI_DEFAULT_TIMEOUT_SEC,
            "openai_max_tokens": DEFAULTS.OPENAI_DEFAULT_MAX_TOKENS,
            "llm_max_concurrent_calls": DEFAULTS.LLM_DEFAULT_MAX_CONCURRENT_CALLS,
            "llm_max_retry_attempts": DEFAULTS.LLM_DEFAULT_MAX_RETRY_ATTEMPTS,
            "llm_enable_token_rate_limiting": DEFAULTS.LLM_ENABLE_TOKEN_RATE_LIMITING,
            "enable_boundary_refinement": DEFAULTS.ENABLE_BOUNDARY_REFINEMENT,
            "enable_word_level_boundary_refinder": DEFAULTS.ENABLE_WORD_LEVEL_BOUNDARY_REFINDER,
        },
    )
    _ensure_row(
        WhisperSettings,
        {
            "whisper_type": DEFAULTS.WHISPER_DEFAULT_TYPE,
            "local_model": DEFAULTS.WHISPER_LOCAL_MODEL,
            "remote_model": DEFAULTS.WHISPER_REMOTE_MODEL,
            "remote_base_url": DEFAULTS.WHISPER_REMOTE_BASE_URL,
            "remote_language": DEFAULTS.WHISPER_REMOTE_LANGUAGE,
            "remote_timeout_sec": DEFAULTS.WHISPER_REMOTE_TIMEOUT_SEC,
            "remote_chunksize_mb": DEFAULTS.WHISPER_REMOTE_CHUNKSIZE_MB,
            "groq_model": DEFAULTS.WHISPER_GROQ_MODEL,
            "groq_language": DEFAULTS.WHISPER_GROQ_LANGUAGE,
            "groq_max_retries": DEFAULTS.WHISPER_GROQ_MAX_RETRIES,
        },
    )
    _ensure_row(
        ProcessingSettings,
        {
            "num_segments_to_input_to_prompt": DEFAULTS.PROCESSING_NUM_SEGMENTS_TO_INPUT_TO_PROMPT,
        },
    )
    _ensure_row(
        OutputSettings,
        {
            # NOTE: "segement" misspelling matches the model column name; do
            # not "fix" it here without a migration.
            "min_ad_segement_separation_seconds": DEFAULTS.OUTPUT_MIN_AD_SEGMENT_SEPARATION_SECONDS,
            "fade_ms": DEFAULTS.OUTPUT_FADE_MS,
            "min_ad_segment_length_seconds": DEFAULTS.OUTPUT_MIN_AD_SEGMENT_LENGTH_SECONDS,
            "min_confidence": DEFAULTS.OUTPUT_MIN_CONFIDENCE,
        },
    )
    _ensure_row(
        AppSettings,
        {
            "background_update_interval_minute": DEFAULTS.APP_BACKGROUND_UPDATE_INTERVAL_MINUTE,
            "automatically_whitelist_new_episodes": DEFAULTS.APP_AUTOMATICALLY_WHITELIST_NEW_EPISODES,
            "post_cleanup_retention_days": DEFAULTS.APP_POST_CLEANUP_RETENTION_DAYS,
            "number_of_episodes_to_whitelist_from_archive_of_new_feed": DEFAULTS.APP_NUM_EPISODES_TO_WHITELIST_FROM_ARCHIVE_OF_NEW_FEED,
            "enable_public_landing_page": DEFAULTS.APP_ENABLE_PUBLIC_LANDING_PAGE,
            "user_limit_total": DEFAULTS.APP_USER_LIMIT_TOTAL,
            "autoprocess_on_download": DEFAULTS.APP_AUTOPROCESS_ON_DOWNLOAD,
        },
    )
_set_if_default( llm, "llm_max_retry_attempts", env_llm_max_retries, DEFAULTS.LLM_DEFAULT_MAX_RETRY_ATTEMPTS, ) or changed ) env_llm_enable_token_rl = _parse_bool( os.environ.get("LLM_ENABLE_TOKEN_RATE_LIMITING") ) if ( llm.llm_enable_token_rate_limiting == DEFAULTS.LLM_ENABLE_TOKEN_RATE_LIMITING and env_llm_enable_token_rl is not None ): llm.llm_enable_token_rate_limiting = bool(env_llm_enable_token_rl) changed = True env_llm_max_input_tokens_per_call = _parse_int( os.environ.get("LLM_MAX_INPUT_TOKENS_PER_CALL") ) if ( llm.llm_max_input_tokens_per_call is None and env_llm_max_input_tokens_per_call is not None ): llm.llm_max_input_tokens_per_call = env_llm_max_input_tokens_per_call changed = True env_llm_max_input_tokens_per_minute = _parse_int( os.environ.get("LLM_MAX_INPUT_TOKENS_PER_MINUTE") ) if ( llm.llm_max_input_tokens_per_minute is None and env_llm_max_input_tokens_per_minute is not None ): llm.llm_max_input_tokens_per_minute = env_llm_max_input_tokens_per_minute changed = True return changed def _apply_whisper_env_overrides_to_db(whisper: Any) -> bool: """Apply Whisper-related environment variable overrides to database settings. Returns True if any settings were changed. 
""" changed = False # Respect explicit whisper type env if still default env_whisper_type = os.environ.get("WHISPER_TYPE") if env_whisper_type and isinstance(env_whisper_type, str): env_whisper_type_norm = env_whisper_type.strip().lower() if env_whisper_type_norm in {"local", "remote", "groq"}: changed = ( _set_if_default( whisper, "whisper_type", env_whisper_type_norm, DEFAULTS.WHISPER_DEFAULT_TYPE, ) or changed ) # If GROQ_API_KEY is provided, seed both LLM key and Groq whisper key if empty groq_key = os.environ.get("GROQ_API_KEY") changed = _set_if_empty(whisper, "groq_api_key", groq_key) or changed if whisper.whisper_type == "remote": remote_key = os.environ.get("WHISPER_REMOTE_API_KEY") or os.environ.get( "OPENAI_API_KEY" ) changed = _set_if_empty(whisper, "remote_api_key", remote_key) or changed remote_base = os.environ.get("WHISPER_REMOTE_BASE_URL") or os.environ.get( "OPENAI_BASE_URL" ) changed = ( _set_if_default( whisper, "remote_base_url", remote_base, DEFAULTS.WHISPER_REMOTE_BASE_URL, ) or changed ) remote_model = os.environ.get("WHISPER_REMOTE_MODEL") changed = ( _set_if_default( whisper, "remote_model", remote_model, DEFAULTS.WHISPER_REMOTE_MODEL ) or changed ) remote_timeout = _parse_int(os.environ.get("WHISPER_REMOTE_TIMEOUT_SEC")) changed = ( _set_if_default( whisper, "remote_timeout_sec", remote_timeout, DEFAULTS.WHISPER_REMOTE_TIMEOUT_SEC, ) or changed ) remote_chunksize = _parse_int(os.environ.get("WHISPER_REMOTE_CHUNKSIZE_MB")) changed = ( _set_if_default( whisper, "remote_chunksize_mb", remote_chunksize, DEFAULTS.WHISPER_REMOTE_CHUNKSIZE_MB, ) or changed ) elif whisper.whisper_type == "groq": groq_model_env = os.environ.get("GROQ_WHISPER_MODEL") or os.environ.get( "WHISPER_GROQ_MODEL" ) changed = ( _set_if_default( whisper, "groq_model", groq_model_env, DEFAULTS.WHISPER_GROQ_MODEL ) or changed ) groq_max_retries_env = _parse_int(os.environ.get("GROQ_MAX_RETRIES")) changed = ( _set_if_default( whisper, "groq_max_retries", groq_max_retries_env, 
def _apply_env_overrides_to_db_first_boot() -> None:
    """Persist environment-provided overrides into the DB on first boot.

    Only updates fields that are at default/empty values so we don't clobber
    user-changed settings after first start.
    """
    llm = LLMSettings.query.get(1)
    whisper = WhisperSettings.query.get(1)
    processing = ProcessingSettings.query.get(1)
    output = OutputSettings.query.get(1)
    app_s = AppSettings.query.get(1)
    # ensure_defaults() must have been called before this; all rows must exist.
    assert llm and whisper and processing and output and app_s
    changed = False
    changed = _apply_llm_env_overrides_to_db(llm) or changed
    changed = _apply_whisper_env_overrides_to_db(whisper) or changed
    # Future: add processing/output/app env-to-db seeding if envs defined
    if changed:
        # Commit once for all sections; must_succeed makes a failed seed fatal.
        safe_commit(
            db.session,
            must_succeed=True,
            context="env_overrides_to_db",
            logger_obj=logger,
        )
"max_retries": whisper.groq_max_retries, } ) elif whisper.whisper_type == "test": whisper_payload.update({}) return { "llm": { "llm_api_key": llm.llm_api_key, "llm_model": llm.llm_model, "openai_base_url": llm.openai_base_url, "openai_timeout": llm.openai_timeout, "openai_max_tokens": llm.openai_max_tokens, "llm_max_concurrent_calls": llm.llm_max_concurrent_calls, "llm_max_retry_attempts": llm.llm_max_retry_attempts, "llm_max_input_tokens_per_call": llm.llm_max_input_tokens_per_call, "llm_enable_token_rate_limiting": llm.llm_enable_token_rate_limiting, "llm_max_input_tokens_per_minute": llm.llm_max_input_tokens_per_minute, "enable_boundary_refinement": llm.enable_boundary_refinement, "enable_word_level_boundary_refinder": llm.enable_word_level_boundary_refinder, }, "whisper": whisper_payload, "processing": { "num_segments_to_input_to_prompt": processing.num_segments_to_input_to_prompt, }, "output": { "fade_ms": output.fade_ms, "min_ad_segement_separation_seconds": output.min_ad_segement_separation_seconds, "min_ad_segment_length_seconds": output.min_ad_segment_length_seconds, "min_confidence": output.min_confidence, }, "app": { "background_update_interval_minute": app_s.background_update_interval_minute, "automatically_whitelist_new_episodes": app_s.automatically_whitelist_new_episodes, "post_cleanup_retention_days": app_s.post_cleanup_retention_days, "number_of_episodes_to_whitelist_from_archive_of_new_feed": app_s.number_of_episodes_to_whitelist_from_archive_of_new_feed, "enable_public_landing_page": app_s.enable_public_landing_page, "user_limit_total": app_s.user_limit_total, "autoprocess_on_download": app_s.autoprocess_on_download, }, } def _update_section_llm(data: Dict[str, Any]) -> None: row = LLMSettings.query.get(1) assert row is not None for key in [ "llm_api_key", "llm_model", "openai_base_url", "openai_timeout", "openai_max_tokens", "llm_max_concurrent_calls", "llm_max_retry_attempts", "llm_max_input_tokens_per_call", "llm_enable_token_rate_limiting", 
"llm_max_input_tokens_per_minute", "enable_boundary_refinement", "enable_word_level_boundary_refinder", ]: if key in data: new_val = data[key] if key == "llm_api_key" and _is_empty(new_val): continue setattr(row, key, new_val) safe_commit( db.session, must_succeed=True, context="update_llm_settings", logger_obj=logger, ) def _update_section_whisper(data: Dict[str, Any]) -> None: row = WhisperSettings.query.get(1) assert row is not None if "whisper_type" in data and data["whisper_type"] in { "local", "remote", "groq", "test", }: row.whisper_type = data["whisper_type"] if row.whisper_type == "local": if "model" in data: row.local_model = data["model"] elif row.whisper_type == "remote": for key_map in [ ("model", "remote_model"), ("api_key", "remote_api_key"), ("base_url", "remote_base_url"), ("language", "remote_language"), ("timeout_sec", "remote_timeout_sec"), ("chunksize_mb", "remote_chunksize_mb"), ]: src, dst = key_map if src in data: new_val = data[src] if src == "api_key" and _is_empty(new_val): continue setattr(row, dst, new_val) elif row.whisper_type == "groq": for key_map in [ ("api_key", "groq_api_key"), ("model", "groq_model"), ("language", "groq_language"), ("max_retries", "groq_max_retries"), ]: src, dst = key_map if src in data: new_val = data[src] if src == "api_key" and _is_empty(new_val): continue setattr(row, dst, new_val) else: # test type has no extra fields pass safe_commit( db.session, must_succeed=True, context="update_whisper_settings", logger_obj=logger, ) def _update_section_processing(data: Dict[str, Any]) -> None: row = ProcessingSettings.query.get(1) assert row is not None for key in [ "num_segments_to_input_to_prompt", ]: if key in data: setattr(row, key, data[key]) safe_commit( db.session, must_succeed=True, context="update_processing_settings", logger_obj=logger, ) def _update_section_output(data: Dict[str, Any]) -> None: row = OutputSettings.query.get(1) assert row is not None for key in [ "fade_ms", 
"min_ad_segement_separation_seconds", "min_ad_segment_length_seconds", "min_confidence", ]: if key in data: setattr(row, key, data[key]) safe_commit( db.session, must_succeed=True, context="update_output_settings", logger_obj=logger, ) def _update_section_app(data: Dict[str, Any]) -> Tuple[Optional[int], Optional[int]]: row = AppSettings.query.get(1) assert row is not None old_interval: Optional[int] = row.background_update_interval_minute old_retention: Optional[int] = row.post_cleanup_retention_days for key in [ "background_update_interval_minute", "automatically_whitelist_new_episodes", "post_cleanup_retention_days", "number_of_episodes_to_whitelist_from_archive_of_new_feed", "enable_public_landing_page", "user_limit_total", "autoprocess_on_download", ]: if key in data: setattr(row, key, data[key]) safe_commit( db.session, must_succeed=True, context="update_app_settings", logger_obj=logger, ) return old_interval, old_retention def _maybe_reschedule_refresh_job( old_interval: Optional[int], new_interval: Optional[int] ) -> None: if old_interval == new_interval: return job_id = "refresh_all_feeds" job = scheduler.get_job(job_id) if new_interval is None: if job: try: scheduler.remove_job(job_id) except Exception: pass return if not job: return # Avoid importing app.background here (it creates a cycle for pylint). # Use best-effort rescheduling on the underlying APScheduler instance. 
def _maybe_disable_cleanup_job(
    old_retention: Optional[int], new_retention: Optional[int]
) -> None:
    """Remove the periodic cleanup job when retention was switched off."""
    if old_retention == new_retention:
        return
    if new_retention is not None and new_retention > 0:
        # Retention still enabled; the job stays as-is.
        return
    job_id = "cleanup_processed_posts"
    if not scheduler.get_job(job_id):
        return
    try:
        scheduler.remove_job(job_id)
    except Exception:
        # Job may already be gone; removal is best-effort.
        pass


def update_combined(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Apply a partial settings payload section-by-section.

    The "app" section is handled last so refresh/cleanup background jobs can
    be rescheduled from the old-vs-new interval and retention values.
    Returns the full merged settings view after the update.
    """
    section_handlers = (
        ("llm", _update_section_llm),
        ("whisper", _update_section_whisper),
        ("processing", _update_section_processing),
        ("output", _update_section_output),
    )
    for section, handler in section_handlers:
        if section in payload:
            handler(payload[section] or {})
    if "app" in payload:
        old_interval, old_retention = _update_section_app(payload["app"] or {})
        app_s = AppSettings.query.get(1)
        if app_s:
            _maybe_reschedule_refresh_job(
                old_interval, app_s.background_update_interval_minute
            )
            _maybe_disable_cleanup_job(old_retention, app_s.post_cleanup_retention_days)
    return read_combined()
GroqWhisperConfig( # Allow boot without a Groq API key so the UI can be used to set it api_key=w.get("api_key") or "", model=w.get("model", DEFAULTS.WHISPER_GROQ_MODEL), language=w.get("language", "en"), max_retries=w.get("max_retries", 3), ) elif wtype == "test": whisper_obj = TestWhisperConfig() return PydanticConfig( llm_api_key=data["llm"].get("llm_api_key"), llm_model=data["llm"].get("llm_model", DEFAULTS.LLM_DEFAULT_MODEL), openai_base_url=data["llm"].get("openai_base_url"), openai_max_tokens=int( data["llm"].get("openai_max_tokens", DEFAULTS.OPENAI_DEFAULT_MAX_TOKENS) or DEFAULTS.OPENAI_DEFAULT_MAX_TOKENS ), openai_timeout=int( data["llm"].get("openai_timeout", DEFAULTS.OPENAI_DEFAULT_TIMEOUT_SEC) or DEFAULTS.OPENAI_DEFAULT_TIMEOUT_SEC ), llm_max_concurrent_calls=int( data["llm"].get( "llm_max_concurrent_calls", DEFAULTS.LLM_DEFAULT_MAX_CONCURRENT_CALLS ) or DEFAULTS.LLM_DEFAULT_MAX_CONCURRENT_CALLS ), llm_max_retry_attempts=int( data["llm"].get( "llm_max_retry_attempts", DEFAULTS.LLM_DEFAULT_MAX_RETRY_ATTEMPTS ) or DEFAULTS.LLM_DEFAULT_MAX_RETRY_ATTEMPTS ), llm_max_input_tokens_per_call=data["llm"].get("llm_max_input_tokens_per_call"), llm_enable_token_rate_limiting=bool( data["llm"].get( "llm_enable_token_rate_limiting", DEFAULTS.LLM_ENABLE_TOKEN_RATE_LIMITING, ) ), llm_max_input_tokens_per_minute=data["llm"].get( "llm_max_input_tokens_per_minute" ), enable_boundary_refinement=bool( data["llm"].get( "enable_boundary_refinement", DEFAULTS.ENABLE_BOUNDARY_REFINEMENT, ) ), enable_word_level_boundary_refinder=bool( data["llm"].get( "enable_word_level_boundary_refinder", DEFAULTS.ENABLE_WORD_LEVEL_BOUNDARY_REFINDER, ) ), output=data["output"], processing=data["processing"], background_update_interval_minute=data["app"].get( "background_update_interval_minute" ), post_cleanup_retention_days=data["app"].get("post_cleanup_retention_days"), whisper=whisper_obj, automatically_whitelist_new_episodes=bool( data["app"].get( "automatically_whitelist_new_episodes", 
def hydrate_runtime_config_inplace(db_config: Optional[PydanticConfig] = None) -> None:
    """Hydrate the in-process runtime config from DB-backed settings in-place.

    Preserves the identity of the `app.config` Pydantic instance so any modules
    that imported it by value continue to see updated fields.

    Order matters: DB values load first, then environment overrides layer on
    top (top-level, whisper-specific, model, whisper-type), and only then is
    the result committed into the shared runtime config object.
    """
    cfg = db_config or to_pydantic_config()
    _log_initial_snapshot(cfg)
    _apply_top_level_env_overrides(cfg)
    _apply_whisper_env_overrides(cfg)
    _apply_llm_model_override(cfg)
    _apply_whisper_type_override(cfg)
    _commit_runtime_config(cfg)
    _log_final_snapshot()


def _log_initial_snapshot(cfg: PydanticConfig) -> None:
    """Log the pre-override config state (secrets reduced to set/unset flags)."""
    # Nested getattr guards keep this safe when cfg.whisper is None.
    logger.info(
        "Config hydration: starting with DB values | whisper_type=%s llm_model=%s openai_base_url=%s llm_api_key_set=%s whisper_api_key_set=%s",
        getattr(getattr(cfg, "whisper", None), "whisper_type", None),
        getattr(cfg, "llm_model", None),
        getattr(cfg, "openai_base_url", None),
        bool(getattr(cfg, "llm_api_key", None)),
        bool(getattr(getattr(cfg, "whisper", None), "api_key", None)),
    )
env_openai_base_url def _apply_whisper_env_overrides(cfg: PydanticConfig) -> None: if cfg.whisper is None: return wtype = getattr(cfg.whisper, "whisper_type", None) if wtype == "remote": remote_key = os.environ.get("WHISPER_REMOTE_API_KEY") or os.environ.get( "OPENAI_API_KEY" ) remote_base = os.environ.get("WHISPER_REMOTE_BASE_URL") or os.environ.get( "OPENAI_BASE_URL" ) remote_model = os.environ.get("WHISPER_REMOTE_MODEL") if isinstance(cfg.whisper, RemoteWhisperConfig): if remote_key: cfg.whisper.api_key = remote_key if remote_base: cfg.whisper.base_url = remote_base if remote_model: cfg.whisper.model = remote_model elif wtype == "groq": groq_key = os.environ.get("GROQ_API_KEY") groq_model = os.environ.get("GROQ_WHISPER_MODEL") or os.environ.get( "WHISPER_GROQ_MODEL" ) if isinstance(cfg.whisper, GroqWhisperConfig): if groq_key: cfg.whisper.api_key = groq_key if groq_model: cfg.whisper.model = groq_model elif wtype == "local": loc_model = os.environ.get("WHISPER_LOCAL_MODEL") if isinstance(cfg.whisper, LocalWhisperConfig) and loc_model: cfg.whisper.model = loc_model def _apply_llm_model_override(cfg: PydanticConfig) -> None: env_llm_model = os.environ.get("LLM_MODEL") if env_llm_model: cfg.llm_model = env_llm_model def _configure_local_whisper(cfg: PydanticConfig) -> None: """Configure local whisper type.""" # Validate that local whisper is available try: import whisper as _ # type: ignore[import-untyped] # noqa: F401 except ImportError as e: error_msg = ( f"WHISPER_TYPE is set to 'local' but whisper library is not available. " f"Either install whisper with 'pip install openai-whisper' or set WHISPER_TYPE to 'remote' or 'groq'. 
" f"Import error: {e}" ) logger.error(error_msg) raise RuntimeError(error_msg) from e existing_model_any = getattr(cfg.whisper, "model", "base.en") existing_model = ( existing_model_any if isinstance(existing_model_any, str) else "base.en" ) loc_model_env = os.environ.get("WHISPER_LOCAL_MODEL") loc_model: str = ( loc_model_env if isinstance(loc_model_env, str) and loc_model_env else existing_model ) cfg.whisper = LocalWhisperConfig(model=loc_model) def _configure_remote_whisper(cfg: PydanticConfig) -> None: """Configure remote whisper type.""" existing_model_any = getattr(cfg.whisper, "model", "whisper-1") existing_model = ( existing_model_any if isinstance(existing_model_any, str) else "whisper-1" ) rem_model_env = os.environ.get("WHISPER_REMOTE_MODEL") rem_model: str = ( rem_model_env if isinstance(rem_model_env, str) and rem_model_env else existing_model ) existing_key_any = getattr(cfg.whisper, "api_key", "") existing_key = existing_key_any if isinstance(existing_key_any, str) else "" rem_api_key_env = os.environ.get("WHISPER_REMOTE_API_KEY") or os.environ.get( "OPENAI_API_KEY" ) rem_api_key: str = ( rem_api_key_env if isinstance(rem_api_key_env, str) and rem_api_key_env else existing_key ) existing_base_any = getattr(cfg.whisper, "base_url", "https://api.openai.com/v1") existing_base = ( existing_base_any if isinstance(existing_base_any, str) else "https://api.openai.com/v1" ) rem_base_env = os.environ.get("WHISPER_REMOTE_BASE_URL") or os.environ.get( "OPENAI_BASE_URL" ) rem_base_url: str = ( rem_base_env if isinstance(rem_base_env, str) and rem_base_env else existing_base ) existing_lang_any = getattr(cfg.whisper, "language", "en") lang: str = existing_lang_any if isinstance(existing_lang_any, str) else "en" timeout_sec: int = int( os.environ.get( "WHISPER_REMOTE_TIMEOUT_SEC", str(getattr(cfg.whisper, "timeout_sec", 600)), ) ) chunksize_mb: int = int( os.environ.get( "WHISPER_REMOTE_CHUNKSIZE_MB", str(getattr(cfg.whisper, "chunksize_mb", 24)), ) ) cfg.whisper 
= RemoteWhisperConfig( model=rem_model, api_key=rem_api_key, base_url=rem_base_url, language=lang, timeout_sec=timeout_sec, chunksize_mb=chunksize_mb, ) def _configure_groq_whisper(cfg: PydanticConfig) -> None: """Configure groq whisper type.""" existing_key_any = getattr(cfg.whisper, "api_key", "") existing_key = existing_key_any if isinstance(existing_key_any, str) else "" groq_key_env = os.environ.get("GROQ_API_KEY") groq_api_key: str = ( groq_key_env if isinstance(groq_key_env, str) and groq_key_env else existing_key ) existing_model_any = getattr(cfg.whisper, "model", DEFAULTS.WHISPER_GROQ_MODEL) existing_model = ( existing_model_any if isinstance(existing_model_any, str) else DEFAULTS.WHISPER_GROQ_MODEL ) groq_model_env = os.environ.get("GROQ_WHISPER_MODEL") or os.environ.get( "WHISPER_GROQ_MODEL" ) groq_model_val: str = ( groq_model_env if isinstance(groq_model_env, str) and groq_model_env else existing_model ) existing_lang_any = getattr(cfg.whisper, "language", "en") groq_lang: str = existing_lang_any if isinstance(existing_lang_any, str) else "en" max_retries: int = int( os.environ.get("GROQ_MAX_RETRIES", str(getattr(cfg.whisper, "max_retries", 3))) ) cfg.whisper = GroqWhisperConfig( api_key=groq_api_key, model=groq_model_val, language=groq_lang, max_retries=max_retries, ) def _apply_whisper_type_override(cfg: PydanticConfig) -> None: env_whisper_type = os.environ.get("WHISPER_TYPE") # Auto-detect whisper type from API key environment variables if not explicitly set if not env_whisper_type: if os.environ.get("WHISPER_REMOTE_API_KEY"): env_whisper_type = "remote" logger.info( "Auto-detected WHISPER_TYPE=remote from WHISPER_REMOTE_API_KEY environment variable" ) elif os.environ.get("GROQ_API_KEY") and not os.environ.get("LLM_API_KEY"): # Only auto-detect groq for whisper if LLM_API_KEY is not set # (to avoid confusion when GROQ_API_KEY is only meant for LLM) env_whisper_type = "groq" logger.info( "Auto-detected WHISPER_TYPE=groq from GROQ_API_KEY 
environment variable" ) if not env_whisper_type: return wtype = env_whisper_type.strip().lower() if wtype == "local": _configure_local_whisper(cfg) elif wtype == "remote": _configure_remote_whisper(cfg) elif wtype == "groq": _configure_groq_whisper(cfg) elif wtype == "test": cfg.whisper = TestWhisperConfig() def _commit_runtime_config(cfg: PydanticConfig) -> None: logger.info( "Config hydration: after env overrides | whisper_type=%s llm_model=%s openai_base_url=%s llm_api_key_set=%s whisper_api_key_set=%s", getattr(getattr(cfg, "whisper", None), "whisper_type", None), getattr(cfg, "llm_model", None), getattr(cfg, "openai_base_url", None), bool(getattr(cfg, "llm_api_key", None)), bool(getattr(getattr(cfg, "whisper", None), "api_key", None)), ) # Copy values from cfg to runtime_config, preserving Pydantic model instances for key in cfg.model_fields.keys(): setattr(runtime_config, key, getattr(cfg, key)) def _log_final_snapshot() -> None: logger.info( "Config hydration: runtime set | whisper_type=%s llm_model=%s openai_base_url=%s", getattr(getattr(runtime_config, "whisper", None), "whisper_type", None), getattr(runtime_config, "llm_model", None), getattr(runtime_config, "openai_base_url", None), ) def ensure_defaults_and_hydrate() -> None: """Ensure default rows exist, then hydrate the runtime config from DB.""" ensure_defaults() # Check if environment variables have changed since last boot _check_and_apply_env_changes() _apply_env_overrides_to_db_first_boot() hydrate_runtime_config_inplace() def _calculate_env_hash() -> str: """Calculate a hash of all configuration-related environment variables.""" keys = [ # LLM "LLM_API_KEY", "OPENAI_API_KEY", "GROQ_API_KEY", "LLM_MODEL", "OPENAI_BASE_URL", "OPENAI_TIMEOUT", "OPENAI_MAX_TOKENS", "LLM_MAX_CONCURRENT_CALLS", "LLM_MAX_RETRY_ATTEMPTS", "LLM_ENABLE_TOKEN_RATE_LIMITING", "LLM_MAX_INPUT_TOKENS_PER_CALL", "LLM_MAX_INPUT_TOKENS_PER_MINUTE", # Whisper "WHISPER_TYPE", "WHISPER_LOCAL_MODEL", "WHISPER_REMOTE_API_KEY", 
"WHISPER_REMOTE_BASE_URL", "WHISPER_REMOTE_MODEL", "WHISPER_REMOTE_TIMEOUT_SEC", "WHISPER_REMOTE_CHUNKSIZE_MB", "GROQ_WHISPER_MODEL", "WHISPER_GROQ_MODEL", "GROQ_MAX_RETRIES", # App "PODLY_APP_ROLE", "DEVELOPER_MODE", ] # Sort keys to ensure stable hash keys.sort() hasher = hashlib.sha256() for key in keys: val = os.environ.get(key, "") hasher.update(f"{key}={val}".encode("utf-8")) return hasher.hexdigest() def _check_and_apply_env_changes() -> None: """Check if env hash changed and force-apply overrides if so.""" try: app_s = AppSettings.query.get(1) if not app_s: return # Check if column exists (handle pre-migration state gracefully) if not hasattr(app_s, "env_config_hash"): return current_hash = _calculate_env_hash() stored_hash = app_s.env_config_hash if stored_hash != current_hash: logger.info( "Environment configuration changed (hash mismatch). " "Applying environment overrides to database settings." ) _apply_env_overrides_to_db_force() app_s.env_config_hash = current_hash safe_commit( db.session, must_succeed=True, context="update_env_hash", logger_obj=logger, ) except Exception as e: logger.warning(f"Failed to check/update environment hash: {e}") def _apply_llm_env_overrides(llm: LLMSettings) -> bool: """Apply environment overrides to LLM settings.""" changed = False env_llm_key = ( os.environ.get("LLM_API_KEY") or os.environ.get("OPENAI_API_KEY") or os.environ.get("GROQ_API_KEY") ) if env_llm_key: llm.llm_api_key = env_llm_key changed = True env_llm_model = os.environ.get("LLM_MODEL") if env_llm_model: llm.llm_model = env_llm_model changed = True env_openai_base_url = os.environ.get("OPENAI_BASE_URL") if env_openai_base_url: llm.openai_base_url = env_openai_base_url changed = True env_openai_timeout = _parse_int(os.environ.get("OPENAI_TIMEOUT")) if env_openai_timeout is not None: llm.openai_timeout = env_openai_timeout changed = True env_openai_max_tokens = _parse_int(os.environ.get("OPENAI_MAX_TOKENS")) if env_openai_max_tokens is not None: 
def _apply_whisper_remote_overrides(whisper: WhisperSettings) -> bool:
    """Apply environment overrides for Remote Whisper settings.

    Returns True when any field was changed.
    """
    changed = False
    remote_key = os.environ.get("WHISPER_REMOTE_API_KEY") or os.environ.get(
        "OPENAI_API_KEY"
    )
    if remote_key:
        whisper.remote_api_key = remote_key
        changed = True
    remote_base = os.environ.get("WHISPER_REMOTE_BASE_URL") or os.environ.get(
        "OPENAI_BASE_URL"
    )
    if remote_base:
        whisper.remote_base_url = remote_base
        changed = True
    remote_model = os.environ.get("WHISPER_REMOTE_MODEL")
    if remote_model:
        whisper.remote_model = remote_model
        changed = True
    remote_timeout = _parse_int(os.environ.get("WHISPER_REMOTE_TIMEOUT_SEC"))
    if remote_timeout is not None:
        whisper.remote_timeout_sec = remote_timeout
        changed = True
    remote_chunksize = _parse_int(os.environ.get("WHISPER_REMOTE_CHUNKSIZE_MB"))
    if remote_chunksize is not None:
        whisper.remote_chunksize_mb = remote_chunksize
        changed = True
    return changed


def _apply_whisper_groq_overrides(whisper: WhisperSettings) -> bool:
    """Apply environment overrides for Groq Whisper settings.

    Returns True when any field was changed.
    """
    changed = False
    groq_model_env = os.environ.get("GROQ_WHISPER_MODEL") or os.environ.get(
        "WHISPER_GROQ_MODEL"
    )
    if groq_model_env:
        whisper.groq_model = groq_model_env
        changed = True
    groq_max_retries_env = _parse_int(os.environ.get("GROQ_MAX_RETRIES"))
    if groq_max_retries_env is not None:
        whisper.groq_max_retries = groq_max_retries_env
        changed = True
    return changed


def _apply_whisper_env_overrides_force(whisper: WhisperSettings) -> bool:
    """Apply environment overrides to Whisper settings."""
    changed = False
    env_whisper_type = os.environ.get("WHISPER_TYPE")
    if env_whisper_type:
        wtype = env_whisper_type.strip().lower()
        # "test" is deliberately excluded from DB persistence here.
        if wtype in {"local", "remote", "groq"}:
            whisper.whisper_type = wtype
            changed = True
    # Always update Groq API key if present in env
    groq_key = os.environ.get("GROQ_API_KEY")
    if groq_key:
        whisper.groq_api_key = groq_key
        changed = True
    # Then apply the overrides specific to the (possibly just-updated) type.
    if whisper.whisper_type == "remote":
        if _apply_whisper_remote_overrides(whisper):
            changed = True
    elif whisper.whisper_type == "groq":
        if _apply_whisper_groq_overrides(whisper):
            changed = True
    elif whisper.whisper_type == "local":
        local_model_env = os.environ.get("WHISPER_LOCAL_MODEL")
        if local_model_env:
            whisper.local_model = local_model_env
            changed = True
    return changed


def _apply_env_overrides_to_db_force() -> None:
    """Force-apply environment overrides to DB, overwriting existing values."""
    llm = LLMSettings.query.get(1)
    whisper = WhisperSettings.query.get(1)
    if not llm or not whisper:
        return
    llm_changed = _apply_llm_env_overrides(llm)
    whisper_changed = _apply_whisper_env_overrides_force(whisper)
    # Commit once, only when something actually changed.
    if llm_changed or whisper_changed:
        safe_commit(
            db.session,
            must_succeed=True,
            context="force_env_overrides",
            logger_obj=logger,
        )
================================================
FILE: src/app/db_commit.py
================================================
from __future__ import annotations

import logging
from typing import Any


def safe_commit(
    session: Any,
    *,
    context: str,
    logger_obj: logging.Logger | None = None,
    must_succeed: bool = True,
) -> None:
    """Commit the current transaction and rollback on failure.

    This is a minimal replacement for the old SQLite concurrency helpers.

    Args:
        session: SQLAlchemy session (typed Any to avoid a hard dependency).
        context: Short label used in log messages to identify the caller.
        logger_obj: Logger to use; falls back to the "global_logger" logger.
        must_succeed: When True, the original commit error is re-raised after
            rollback; when False, failures are logged and swallowed.
    """
    log = logger_obj or logging.getLogger("global_logger")
    try:
        session.commit()
    except Exception as exc:  # pylint: disable=broad-except
        log.error("Commit failed in %s, rolling back: %s", context, exc, exc_info=True)
        # Rollback is itself best-effort: a second failure is logged, not raised.
        try:
            session.rollback()
        except Exception as rb_exc:  # pylint: disable=broad-except
            log.error("Rollback also failed in %s: %s", context, rb_exc, exc_info=True)
        if must_succeed:
            raise


================================================
FILE: src/app/db_guard.py
================================================
"""Shared helpers to protect long-lived sessions in background threads."""

from __future__ import annotations

import logging
from contextlib import contextmanager
from typing import Any, Iterator

from sqlalchemy.exc import OperationalError, PendingRollbackError
from sqlalchemy.orm import Session, scoped_session

# Either a plain Session or a thread-scoped session registry.
SessionType = Session | scoped_session[Any]


def reset_session(
    session: SessionType,
    logger: logging.Logger,
    context: str,
    exc: Exception | None = None,
) -> None:
    """
    Roll back and remove a session after a failure to avoid leaving it in a bad state.
    Safe to call even if the session is already closed/invalid.
    """
    if exc:
        logger.warning(
            "[SESSION_RESET] context=%s exc=%s; rolling back and removing session",
            context,
            exc,
        )
    try:
        session.rollback()
    except Exception as rb_exc:  # pylint: disable=broad-except
        logger.warning(
            "[SESSION_RESET] rollback failed in context=%s: %s", context, rb_exc
        )
    # `remove` only exists on scoped_session; duck-type so plain Sessions work.
    try:
        remove_fn = getattr(session, "remove", None)
        if callable(remove_fn):
            remove_fn()
    except Exception as rm_exc:  # pylint: disable=broad-except
        logger.warning(
            "[SESSION_RESET] remove failed in context=%s: %s", context, rm_exc
        )


@contextmanager
def db_guard(
    context: str, session: SessionType, logger: logging.Logger
) -> Iterator[None]:
    """
    Guard a block of DB work so lock/rollback errors always clean the
    session before propagating.
    """
    try:
        yield
    except (OperationalError, PendingRollbackError) as exc:
        reset_session(session, logger, context, exc)
        raise


================================================
FILE: src/app/extensions.py
================================================
import os

from flask_apscheduler import APScheduler  # type: ignore
from flask_migrate import Migrate
from flask_sqlalchemy import SQLAlchemy

# Unbound singletons; initialized in app factory
db = SQLAlchemy()
scheduler = APScheduler()

# Migrations live alongside the src tree, two levels above this module.
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
migrations_dir = os.path.join(base_dir, "migrations")
migrate = Migrate(directory=migrations_dir)


================================================
FILE: src/app/feeds.py
================================================
import datetime
import logging
import uuid
from email.utils import format_datetime, parsedate_to_datetime
from typing import Any, Iterable, Optional, cast
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

import feedparser  # type: ignore[import-untyped]
import PyRSS2Gen  # type: ignore[import-untyped]
from flask import current_app, g, request

from app.extensions import db
from app.models import Feed, Post, User, UserFeed
from app.runtime_config import config
from app.writer.client import writer_client
from podcast_processor.podcast_downloader import find_audio_link

logger = logging.getLogger("global_logger")


def is_feed_active_for_user(feed_id: int, user: User) -> bool:
    """Check if the feed is within the user's allowance based on subscription date."""
    if user.role == "admin":
        return True
    # Hack: Always treat Feed 1 as active
    if feed_id == 1:
        return True
    # Use manual allowance if set, otherwise fall back to plan allowance
    manual_allowance = user.manual_feed_allowance
    if manual_allowance is not None:
        allowance = int(manual_allowance)
    else:
        allowance = int(getattr(user, "feed_allowance", 0))
    # Sort user's feeds by creation date to determine priority
    user_feeds = sorted(user.user_feeds, key=lambda uf: uf.created_at)
    for i, uf in enumerate(user_feeds):
        if uf.feed_id == feed_id:
            # Active when this feed is among the first `allowance` subscriptions.
            return i < allowance
    return False


def _should_auto_whitelist_new_posts(feed: Feed, post: Optional[Post] = None) -> bool:
    """Return True when new posts should default to whitelisted for this feed."""
    # Per-feed override beats the global setting.
    override = getattr(feed, "auto_whitelist_new_episodes_override", None)
    if override is not None:
        return bool(override)
    if not getattr(config, "automatically_whitelist_new_episodes", False):
        return False
    from app.auth import is_auth_enabled

    # If auth is disabled, we should auto-whitelist if the global setting is on.
    if not is_auth_enabled():
        return True
    memberships = getattr(feed, "user_feeds", None) or []
    if not memberships:
        # No memberships for this feed. If there are no users in the database at all,
        # still whitelist. This handles fresh installs where no account exists yet.
        if db.session.query(User.id).first() is None:
            return True
        return False
    # Check if at least one member has this feed in their "active" list (within allowance)
    for membership in memberships:
        user = membership.user
        if not user:
            continue
        if is_feed_active_for_user(feed.id, user):
            return True
    return False


def _get_base_url() -> str:
    """Best-effort reconstruction of the externally visible base URL.

    Falls back to http://localhost:5001 outside a request context.
    """
    try:
        # Check various ways HTTP/2 pseudo-headers might be available
        http2_scheme = (
            request.headers.get(":scheme")
            or request.headers.get("scheme")
            or request.environ.get("HTTP2_SCHEME")
        )
        http2_authority = (
            request.headers.get(":authority")
            or request.headers.get("authority")
            or request.environ.get("HTTP2_AUTHORITY")
        )
        host = request.headers.get("Host")
        if http2_scheme and http2_authority:
            return f"{http2_scheme}://{http2_authority}"
        # Fall back to Host header with scheme detection
        if host:
            # Check multiple indicators for HTTPS
            is_https = (
                request.is_secure
                or request.headers.get("X-Forwarded-Proto") == "https"
                or request.headers.get("Strict-Transport-Security") is not None
                or request.headers.get("X-Forwarded-Ssl") == "on"
                or request.environ.get("HTTPS") == "on"
                or request.scheme == "https"
            )
            scheme = "https" if is_https else "http"
            return f"{scheme}://{host}"
    except RuntimeError:
        # Working outside of request context
        pass
    # Use localhost with main app port
    return "http://localhost:5001"


def fetch_feed(url: str) -> feedparser.FeedParserDict:
    """Fetch and parse the RSS feed, normalizing every entry id via get_guid."""
    logger.info(f"Fetching feed from URL: {url}")
    feed_data = feedparser.parse(url)
    for entry in feed_data.entries:
        entry.id = get_guid(entry)
    return feed_data


def refresh_feed(feed: Feed) -> None:
    """Re-fetch a stored feed, collecting image updates and new posts, then
    persist them through the writer process in a single action."""
    logger.info(f"Refreshing feed with ID: {feed.id}")
    feed_data = fetch_feed(feed.rss_url)
    updates = {}
    image_info = feed_data.feed.get("image")
    if image_info and "href" in image_info:
        new_image_url = image_info["href"]
        if feed.image_url != new_image_url:
            updates["image_url"] = new_image_url
    existing_posts = {post.guid for post in feed.posts}  # type: ignore[attr-defined]
    # Oldest known post (by release date) marks the boundary of the backcatalog.
    oldest_post = min(
        (post for post in feed.posts if post.release_date),  # type: ignore[attr-defined]
        key=lambda p: p.release_date,
        default=None,
    )
    new_posts = []
    for entry in feed_data.entries:
        if entry.id not in existing_posts:
            logger.debug("found new podcast: %s", entry.title)
            p = make_post(feed, entry)
            # do not allow automatic download of any backcatalog added to the feed
            if (
                oldest_post is not None
                and p.release_date
                and oldest_post.release_date
                and p.release_date.date() < oldest_post.release_date.date()
            ):
                p.whitelisted = False
                logger.debug(
                    f"skipping post from archive due to \
number_of_episodes_to_whitelist_from_archive_of_new_feed setting: {entry.title}"
                )
            else:
                p.whitelisted = _should_auto_whitelist_new_posts(feed, p)
            post_data = {
                "guid": p.guid,
                "title": p.title,
                "description": p.description,
                "download_url": p.download_url,
                "release_date": p.release_date.isoformat() if p.release_date else None,
                "duration": p.duration,
                "image_url": p.image_url,
                "whitelisted": p.whitelisted,
                "feed_id": feed.id,
            }
            new_posts.append(post_data)
    if updates or new_posts:
        writer_client.action(
            "refresh_feed",
            {"feed_id": feed.id, "updates": updates, "new_posts": new_posts},
            wait=True,
        )
    logger.info(f"Feed with ID: {feed.id} refreshed")
def add_or_refresh_feed(url: str) -> Feed:
    """Return the Feed for *url*, refreshing it if known or adding it if new.

    Raises ValueError when the URL does not parse as a feed with a title.
    """
    feed_data = fetch_feed(url)
    if "title" not in feed_data.feed:
        logger.error("Invalid feed URL")
        raise ValueError(f"Invalid feed URL: {url}")
    feed = Feed.query.filter_by(rss_url=url).first()
    if feed:
        refresh_feed(feed)
    else:
        feed = add_feed(feed_data)
    return feed  # type: ignore[no-any-return]


def add_feed(feed_data: feedparser.FeedParserDict) -> Feed:
    """Persist a newly parsed feed (and its episodes) via the writer process.

    Only the newest N episodes are whitelisted, per
    `number_of_episodes_to_whitelist_from_archive_of_new_feed`.
    """
    logger.info(f"Storing feed: {feed_data.feed.title}")
    try:
        feed_dict = {
            "title": feed_data.feed.title,
            "description": feed_data.feed.get("description", ""),
            "author": feed_data.feed.get("author", ""),
            "rss_url": feed_data.href,
            "image_url": feed_data.feed.image.href,
        }
        # Create a temporary feed object to use make_post helper
        temp_feed = Feed(**feed_dict)
        temp_feed.id = 0  # Dummy ID
        posts_data = []
        num_posts_added = 0
        for entry in feed_data.entries:
            p = make_post(temp_feed, entry)
            if (
                config.number_of_episodes_to_whitelist_from_archive_of_new_feed
                is not None
                and num_posts_added
                >= config.number_of_episodes_to_whitelist_from_archive_of_new_feed
            ):
                p.whitelisted = False
            else:
                num_posts_added += 1
                p.whitelisted = config.automatically_whitelist_new_episodes
            post_data = {
                "guid": p.guid,
                "title": p.title,
                "description": p.description,
                "download_url": p.download_url,
                "release_date": p.release_date.isoformat() if p.release_date else None,
                "duration": p.duration,
                "image_url": p.image_url,
                "whitelisted": p.whitelisted,
            }
            posts_data.append(post_data)
        result = writer_client.action(
            "add_feed", {"feed": feed_dict, "posts": posts_data}, wait=True
        )
        if result is None or result.data is None:
            raise RuntimeError("Failed to get result from writer action")
        feed_id = result.data["feed_id"]
        logger.info(f"Feed stored with ID: {feed_id}")
        # Return the feed object
        feed = db.session.get(Feed, feed_id)
        if feed is None:
            raise RuntimeError(f"Feed {feed_id} not found after creation")
        return feed
    except Exception as e:
        logger.error(f"Failed to store feed: {e}")
        raise e


class ItunesRSSItem(PyRSS2Gen.RSSItem):  # type: ignore[misc]
    """RSSItem subclass that additionally emits an <itunes:image> element."""

    def __init__(
        self,
        *,
        title: str,
        enclosure: PyRSS2Gen.Enclosure,
        description: str,
        guid: str,
        pubDate: Optional[str],
        image_url: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        self.image_url = image_url
        super().__init__(
            title=title,
            enclosure=enclosure,
            description=description,
            guid=guid,
            pubDate=pubDate,
            **kwargs,
        )

    def publish_extensions(self, handler: Any) -> None:
        # Emit the episode artwork before any base-class extensions.
        if self.image_url:
            handler.startElement("itunes:image", {"href": self.image_url})
            handler.endElement("itunes:image")
        super().publish_extensions(handler)


def feed_item(post: Post, prepend_feed_title: bool = False) -> PyRSS2Gen.RSSItem:
    """
    Given a post, return the corresponding RSS item.

    Reference: https://github.com/Podcast-Standards-Project/PSP-1-Podcast-RSS-Specification?tab=readme-ov-file#required-item-elements
    """
    base_url = _get_base_url()
    # Generate URLs that will be proxied by the frontend to the backend
    audio_url = _append_feed_token_params(f"{base_url}/api/posts/{post.guid}/download")
    post_details_url = _append_feed_token_params(f"{base_url}/api/posts/{post.guid}")
    # NOTE(review): the HTML markup inside this description string was lost in
    # extraction; only the text "Podly Post Page" survives.  Reconstructed as a
    # link to `post_details_url` (otherwise unused) — confirm against the
    # original file.
    description = (
        f'{post.description}\n'
        f'<br/><br/><a href="{post_details_url}">Podly Post Page</a><br/><br/>'
    )
    title = post.title
    if prepend_feed_title and post.feed:
        title = f"[{post.feed.title}] {title}"
    item = ItunesRSSItem(
        title=title,
        enclosure=PyRSS2Gen.Enclosure(
            url=audio_url,
            type="audio/mpeg",
            length=post.audio_len_bytes(),
        ),
        description=description,
        guid=post.guid,
        pubDate=_format_pub_date(post.release_date),
        image_url=post.image_url,
    )
    return item


def generate_feed_xml(feed: Feed) -> Any:
    """Render the feed's RSS XML (utf-8 bytes)."""
    logger.info(f"Generating XML for feed with ID: {feed.id}")
    # When autoprocess is off, only expose episodes that already have
    # processed audio so clients never download unprocessed files.
    include_unprocessed = getattr(config, "autoprocess_on_download", True)
    if include_unprocessed:
        posts = list(cast(Iterable[Post], feed.posts))
    else:
        posts = (
            Post.query.filter(
                Post.feed_id == feed.id,
                Post.whitelisted.is_(True),
                Post.processed_audio_path.isnot(None),
            )
            .order_by(Post.release_date.desc().nullslast(), Post.id.desc())
            .all()
        )
    items = [feed_item(post) for post in posts]
    base_url = _get_base_url()
    link = _append_feed_token_params(f"{base_url}/feed/{feed.id}")
    last_build_date = format_datetime(datetime.datetime.now(datetime.timezone.utc))
    rss_feed = PyRSS2Gen.RSS2(
        title="[podly] " + feed.title,
        link=link,
        description=feed.description,
        lastBuildDate=last_build_date,
        image=PyRSS2Gen.Image(url=feed.image_url, title=feed.title, link=link),
        items=items,
    )
    rss_feed.rss_attrs["xmlns:itunes"] = "http://www.itunes.com/dtds/podcast-1.0.dtd"
    rss_feed.rss_attrs["xmlns:content"] = "http://purl.org/rss/1.0/modules/content/"
    logger.info(f"XML generated for feed with ID: {feed.id}")
    return rss_feed.to_xml("utf-8")


def generate_aggregate_feed_xml(user: Optional[User]) -> Any:
    """Generate RSS XML for a user's aggregate feed (last 3 processed posts per feed)."""
    username = user.username if user else "Public"
    user_id = user.id if user else 0
    logger.info(f"Generating aggregate feed XML for: {username}")
    posts = get_user_aggregate_posts(user_id)
    items = [feed_item(post, prepend_feed_title=True) for post in posts]
    base_url = _get_base_url()
    link = _append_feed_token_params(f"{base_url}/feed/user/{user_id}")
    last_build_date = format_datetime(datetime.datetime.now(datetime.timezone.utc))
    if current_app.config.get("REQUIRE_AUTH") and user:
        feed_title = f"Podly Podcasts - {user.username}"
        feed_description = f"Aggregate feed for {user.username} - Last 3 processed episodes from each subscribed feed."
    else:
        feed_title = "Podly Podcasts"
        feed_description = (
            "Aggregate feed - Last 3 processed episodes from each subscribed feed."
        )
    rss_feed = PyRSS2Gen.RSS2(
        title=feed_title,
        link=link,
        description=feed_description,
        lastBuildDate=last_build_date,
        items=items,
        image=PyRSS2Gen.Image(
            url=f"{base_url}/static/images/logos/manifest-icon-512.maskable.png",
            title=feed_title,
            link=link,
        ),
    )
    rss_feed.rss_attrs["xmlns:itunes"] = "http://www.itunes.com/dtds/podcast-1.0.dtd"
    rss_feed.rss_attrs["xmlns:content"] = "http://purl.org/rss/1.0/modules/content/"
    logger.info(f"Aggregate XML generated for: {username}")
    return rss_feed.to_xml("utf-8")
def get_user_aggregate_posts(user_id: int, limit_per_feed: int = 3) -> list[Post]:
    """Fetch last N processed posts from each of the user's subscribed feeds."""
    # Without auth (or for the pseudo-user 0) aggregate across every feed.
    if not current_app.config.get("REQUIRE_AUTH") or user_id == 0:
        feed_ids = [r[0] for r in Feed.query.with_entities(Feed.id).all()]
    else:
        user_feeds = UserFeed.query.filter_by(user_id=user_id).all()
        feed_ids = [uf.feed_id for uf in user_feeds]
    all_posts = []
    for feed_id in feed_ids:
        # Fetch last N processed posts for this feed
        posts = (
            Post.query.filter(
                Post.feed_id == feed_id,
                Post.whitelisted.is_(True),
                Post.processed_audio_path.isnot(None),
            )
            .order_by(Post.release_date.desc().nullslast(), Post.id.desc())
            .limit(limit_per_feed)
            .all()
        )
        all_posts.extend(posts)
    # Sort all posts by release date descending
    all_posts.sort(key=lambda p: p.release_date or datetime.datetime.min, reverse=True)
    return all_posts


def _append_feed_token_params(url: str) -> str:
    """Append feed_token/feed_secret query params to *url* when auth is on.

    Credentials come from the current request's query string, falling back to
    the token attached to `g.feed_token`; the URL is returned unchanged when
    neither is available or outside a request context.
    """
    if not current_app.config.get("REQUIRE_AUTH"):
        return url
    try:
        token_result = getattr(g, "feed_token", None)
        token_id = request.args.get("feed_token")
        secret = request.args.get("feed_secret")
    except RuntimeError:
        # Outside a request context.
        return url
    if token_result is not None:
        token_id = token_id or token_result.token.token_id
        secret = secret or token_result.token.token_secret
    if not token_id or not secret:
        return url
    parsed = urlparse(url)
    query_params = dict(parse_qsl(parsed.query, keep_blank_values=True))
    query_params["feed_token"] = token_id
    query_params["feed_secret"] = secret
    new_query = urlencode(query_params)
    return urlunparse(parsed._replace(query=new_query))


def make_post(feed: Feed, entry: feedparser.FeedParserDict) -> Post:
    """Build a (not yet persisted) Post from a parsed feed entry."""
    # Extract episode image URL, fallback to feed image
    episode_image_url = None
    # Try to get episode-specific image from various RSS fields
    if hasattr(entry, "image") and entry.image:
        if isinstance(entry.image, dict) and "href" in entry.image:
            episode_image_url = entry.image["href"]
        elif isinstance(entry.image, str):
            episode_image_url = entry.image
    # Try iTunes image tag
    if not episode_image_url and hasattr(entry, "itunes_image"):
        if isinstance(entry.itunes_image, dict) and "href" in entry.itunes_image:
            episode_image_url = entry.itunes_image["href"]
        elif isinstance(entry.itunes_image, str):
            episode_image_url = entry.itunes_image
    # Try media:thumbnail or media:content
    if not episode_image_url and hasattr(entry, "media_thumbnail"):
        if entry.media_thumbnail and len(entry.media_thumbnail) > 0:
            episode_image_url = entry.media_thumbnail[0].get("url")
    # Fallback to feed image if no episode-specific image found
    if not episode_image_url:
        episode_image_url = feed.image_url
    # Try multiple description fields in order of preference
    description = entry.get("description", "")
    if not description:
        description = entry.get("summary", "")
    if not description and hasattr(entry, "content") and entry.content:
        description = entry.content[0].get("value", "")
    if not description:
        description = entry.get("subtitle", "")
    return Post(
        feed_id=feed.id,
        guid=get_guid(entry),
        download_url=find_audio_link(entry),
        title=entry.title,
        description=description,
        release_date=_parse_release_date(entry),
        duration=get_duration(entry),
        image_url=episode_image_url,
    )
description=description, release_date=_parse_release_date(entry), duration=get_duration(entry), image_url=episode_image_url, ) def _get_entry_field(entry: feedparser.FeedParserDict, field: str) -> Optional[Any]: value = getattr(entry, field, None) return value if value is not None else entry.get(field) def _parse_datetime_string( value: Optional[str], field: str ) -> Optional[datetime.datetime]: if not value: return None try: return parsedate_to_datetime(value) except (TypeError, ValueError): logger.debug("Failed to parse %s string for release date", field) return None def _parse_struct_time(value: Optional[Any], field: str) -> Optional[datetime.datetime]: if not value: return None try: dt = datetime.datetime(*value[:6]) except (TypeError, ValueError): logger.debug("Failed to parse %s for release date", field) return None gmtoff = getattr(value, "tm_gmtoff", None) if gmtoff is not None: dt = dt.replace(tzinfo=datetime.timezone(datetime.timedelta(seconds=gmtoff))) return dt def _normalize_to_utc(dt: Optional[datetime.datetime]) -> Optional[datetime.datetime]: if dt is None: return None if dt.tzinfo is None: dt = dt.replace(tzinfo=datetime.timezone.utc) return dt.astimezone(datetime.timezone.utc) def _parse_release_date( entry: feedparser.FeedParserDict, ) -> Optional[datetime.datetime]: """Parse a release datetime from a feed entry and normalize to UTC.""" for field in ("published", "updated"): dt = _parse_datetime_string(_get_entry_field(entry, field), field) normalized = _normalize_to_utc(dt) if normalized: return normalized for field in ("published_parsed", "updated_parsed"): dt = _parse_struct_time(_get_entry_field(entry, field), field) normalized = _normalize_to_utc(dt) if normalized: return normalized return None def _format_pub_date(release_date: Optional[datetime.datetime]) -> Optional[str]: if not release_date: return None normalized = release_date if normalized.tzinfo is None: normalized = normalized.replace(tzinfo=datetime.timezone.utc) return 
# sometimes feed entry ids are the post url or something else
def get_guid(entry: "feedparser.FeedParserDict") -> str:
    """Return a stable GUID for the entry.

    Entry ids that are already UUID strings are used verbatim; anything else
    (post URLs, arbitrary ids) is mapped to a deterministic UUID5 derived
    from the entry's audio link.
    """
    try:
        uuid.UUID(entry.id)
        return str(entry.id)
    except ValueError:
        dlurl = find_audio_link(entry)
        return str(uuid.uuid5(uuid.NAMESPACE_URL, dlurl))


def get_duration(entry: "feedparser.FeedParserDict") -> "Optional[int]":
    """Return the episode duration in whole seconds, or None if unavailable.

    Bug fix: the failure path previously emitted the identical error message
    twice; it now logs once and includes the triggering exception. The except
    clause is narrowed to the errors the lookup and int() can actually raise.
    """
    try:
        return int(entry["itunes_duration"])
    except (KeyError, TypeError, ValueError) as err:
        # itunes_duration may be absent or in "HH:MM:SS" form, which int() rejects.
        logger.error("Failed to get duration: %s", err)
        return None


# ================================================
# FILE: src/app/ipc.py
# ================================================
import multiprocessing
import os
from multiprocessing.managers import BaseManager
from queue import Queue
from typing import Any


class QueueManager(BaseManager):
    """BaseManager subclass used for localhost IPC between processes."""


# Define the queue globally so it can be registered
_command_queue: Queue[Any] = Queue()


def _get_default_authkey() -> bytes:
    """Return the shared IPC authkey from the environment (with a default)."""
    # This key is only used for localhost IPC between the web and writer processes.
    # It must be identical across processes, otherwise Manager proxy calls can fail
    # with AuthenticationError ('digest sent was rejected').
    raw = os.environ.get("PODLY_IPC_AUTHKEY", "podly_secret")
    return raw.encode("utf-8")


def _ensure_process_authkey(authkey: bytes) -> None:
    """Best-effort: align the current process authkey with the manager's."""
    try:
        multiprocessing.current_process().authkey = authkey
    except Exception:  # pylint: disable=broad-except
        # Best-effort: if we can't set it, the explicit authkey passed to the
        # manager will still be used for direct manager connections.
        pass
def get_queue() -> Queue[Any]:
    """Return the module-level command queue shared with manager clients."""
    return _command_queue


def make_server_manager(
    address: tuple[str, int] = ("127.0.0.1", 50001),
    authkey: bytes | None = None,
) -> QueueManager:
    """Create (but do not start) a QueueManager serving the command queue."""
    if authkey is None:
        authkey = _get_default_authkey()
    _ensure_process_authkey(authkey)
    QueueManager.register("get_command_queue", callable=get_queue)
    # Register Queue so we can pass it around for replies
    QueueManager.register("Queue", callable=Queue)
    return QueueManager(address=address, authkey=authkey)


def make_client_manager(
    address: tuple[str, int] = ("127.0.0.1", 50001),
    authkey: bytes | None = None,
) -> QueueManager:
    """Create a QueueManager connected to an already-running server."""
    if authkey is None:
        authkey = _get_default_authkey()
    _ensure_process_authkey(authkey)
    QueueManager.register("get_command_queue")
    QueueManager.register("Queue")
    client = QueueManager(address=address, authkey=authkey)
    client.connect()
    return client


# ================================================
# FILE: src/app/job_manager.py
# ================================================
import logging
import os
from typing import Any, Dict, Optional, Tuple

from app.extensions import db as _db
from app.models import Post, ProcessingJob
from podcast_processor.processing_status_manager import ProcessingStatusManager


class JobManager:
    """Manage the lifecycle guarantees for a single `ProcessingJob` record."""

    # Statuses that count as "live" work for this post.
    ACTIVE_STATUSES = {"pending", "running"}

    def __init__(
        self,
        post_guid: str,
        status_manager: ProcessingStatusManager,
        logger_obj: logging.Logger,
        run_id: Optional[str],
        *,
        requested_by_user_id: Optional[int] = None,
        billing_user_id: Optional[int] = None,
    ) -> None:
        self.post_guid = post_guid
        self._status_manager = status_manager
        self._logger = logger_obj
        self._run_id = run_id
        self._requested_by_user_id = requested_by_user_id
        self._billing_user_id = billing_user_id
        # Lazily loaded/created job record for this post.
        self.job: Optional[ProcessingJob] = None

    @property
    def job_id(self) -> Optional[str]:
        """Id of the tracked job, or None when no job is loaded."""
        if not self.job:
            return None
        return getattr(self.job, "id", None)

    def _reload_job(self) -> Optional[ProcessingJob]:
        """Re-query the newest job row for this post and cache it."""
        latest = (
            ProcessingJob.query.filter_by(post_guid=self.post_guid)
            .order_by(ProcessingJob.created_at.desc())
            .first()
        )
        self.job = latest
        return latest

    def get_active_job(self) -> Optional[ProcessingJob]:
        """Return the cached (or freshly loaded) job if it is pending/running."""
        candidate = self.job or self._reload_job()
        if candidate is None:
            return None
        return candidate if candidate.status in self.ACTIVE_STATUSES else None

    def ensure_job(self) -> ProcessingJob:
        """Return the active job, refreshing its metadata, or create a new one."""
        existing = self.get_active_job()
        if existing:
            dirty = False
            if self._run_id and existing.jobs_manager_run_id != self._run_id:
                existing.jobs_manager_run_id = self._run_id
                dirty = True
            if self._requested_by_user_id and existing.requested_by_user_id is None:
                existing.requested_by_user_id = self._requested_by_user_id
                dirty = True
            if self._billing_user_id is not None and (
                existing.billing_user_id != self._billing_user_id
            ):
                existing.billing_user_id = self._billing_user_id
                dirty = True
            if dirty:
                self._status_manager.db_session.flush()
            return existing

        new_job = self._status_manager.create_job(
            self.post_guid,
            self._status_manager.generate_job_id(),
            self._run_id,
            requested_by_user_id=self._requested_by_user_id,
            billing_user_id=self._billing_user_id,
        )
        self.job = new_job
        return new_job

    def fail(self, message: str, step: int = 0, progress: float = 0.0) -> ProcessingJob:
        """Mark the job failed, defaulting step/progress to its current values."""
        job = self.ensure_job()
        effective_step = step or job.current_step or 0
        effective_progress = progress or job.progress_percentage or 0.0
        self._status_manager.update_job_status(
            job, "failed", effective_step, message, effective_progress
        )
        return job

    def complete(self, message: str = "Processing complete") -> ProcessingJob:
        """Mark the job completed at 100% on its final step."""
        job = self.ensure_job()
        final_step = job.total_steps or 4
        self._status_manager.update_job_status(
            job, "completed", final_step, message, 100.0
        )
        return job

    def skip(
        self,
        message: str = "Processing skipped",
        step: Optional[int] = None,
        progress: Optional[float] = None,
    ) -> ProcessingJob:
        """Mark the job skipped, clearing any stale error message."""
        job = self.ensure_job()
        fallback_step = job.total_steps or 4
        job.error_message = None
        self._status_manager.update_job_status(
            job,
            "skipped",
            step if step is not None else fallback_step,
            message,
            progress if progress is not None else 100.0,
        )
        return job

    def _load_and_validate_post(
        self,
    ) -> Tuple[Optional[Post], Optional[Dict[str, Any]]]:
        """Load the post and perform lifecycle validations."""
        post = Post.query.filter_by(guid=self.post_guid).first()

        if not post:
            job = self._mark_job_skipped("Post no longer exists")
            error = {
                "status": "error",
                "error_code": "NOT_FOUND",
                "message": "Post not found",
                "job_id": getattr(job, "id", None),
            }
            return None, error

        if not post.whitelisted:
            job = self._mark_job_skipped("Post not whitelisted")
            error = {
                "status": "error",
                "error_code": "NOT_WHITELISTED",
                "message": "Post not whitelisted",
                "job_id": getattr(job, "id", None),
            }
            return None, error

        if not post.download_url:
            self._logger.warning(
                "Post %s (%s) is whitelisted but missing download_url; marking job as failed",
                post.guid,
                post.title,
            )
            job = self.fail("Download URL missing")
            error = {
                "status": "error",
                "error_code": "MISSING_DOWNLOAD_URL",
                "message": "Post is missing a download URL",
                "job_id": job.id,
            }
            return None, error

        if post.processed_audio_path and os.path.exists(post.processed_audio_path):
            # Already-processed episodes short-circuit without re-queuing work.
            try:
                job = self.skip("Post already processed")
            except Exception as err:  # pylint: disable=broad-exception-caught
                self._logger.error(
                    "Failed to mark job as completed during short-circuit for %s: %s",
                    self.post_guid,
                    err,
                )
                job = None
            skipped = {
                "status": "skipped",
                "message": "Post already processed",
                "job_id": getattr(job, "id", None),
                "download_url": f"/api/posts/{self.post_guid}/download",
            }
            return None, skipped

        return post, None

    def _mark_job_skipped(self, reason: str) -> Optional[ProcessingJob]:
        """Transition an active job to 'skipped'; fall back to self.skip()."""
        active = self.get_active_job()
        if active and active.status in {"pending", "running"}:
            active.error_message = None
            steps = active.total_steps or active.current_step or 4
            self._status_manager.update_job_status(
                active,
                "skipped",
                steps,
                reason,
                100.0,
            )
            return active
        try:
            return self.skip(reason)
        except Exception as err:  # pylint: disable=broad-exception-caught
            self._logger.error(
                "Failed to mark job as skipped for %s: %s", self.post_guid, err
            )
            return active

    def start_processing(self, priority: str) -> Dict[str, Any]:
        """
        Handle the end-to-end lifecycle for a single post processing request.
        Ensures a job exists and is marked ready for the worker thread.
        """
        _, early_result = self._load_and_validate_post()
        if early_result:
            return early_result

        _db.session.expire_all()
        job = self.ensure_job()

        if job.status == "running":
            return {
                "status": "running",
                "message": "Another processing job is already running for this episode",
                "job_id": job.id,
            }

        self._status_manager.update_job_status(
            job,
            "pending",
            0,
            f"Queued for processing (priority={priority})",
            0.0,
        )
        return {
            "status": "started",
            "message": "Job queued for processing",
            "job_id": job.id,
        }


# ================================================
# FILE: src/app/jobs_manager.py
# ================================================
import logging
import os
from datetime import datetime, timedelta
from threading import Event, Lock, Thread
from typing import Any, Dict, List, Optional, Tuple, cast

from sqlalchemy import case

from app.db_guard import db_guard, reset_session
from app.extensions import db as _db
from app.extensions import scheduler
from app.feeds import refresh_feed
from app.job_manager import JobManager as SingleJobManager
from app.models import Feed, JobsManagerRun, Post, ProcessingJob
from app.processor import get_processor
from app.writer.client import writer_client
from podcast_processor.podcast_processor import ProcessorException
from podcast_processor.processing_status_manager import ProcessingStatusManager

logger = logging.getLogger("global_logger")


class JobsManager:
    """
    Centralized manager for starting, tracking, listing, and cancelling
    podcast processing jobs. Owns a shared worker pool and coordinates with
    ProcessingStatusManager.
    """
""" # Class-level lock to ensure only one job processes at a time across ALL instances _global_processing_lock = Lock() def __init__(self) -> None: # Status manager for DB interactions self._status_manager = ProcessingStatusManager( db_session=_db.session, logger=logger ) # Track the singleton run id with thread-safe access self._run_lock = Lock() self._run_id: Optional[str] = None # Persistent worker thread coordination self._stop_event = Event() self._work_event = Event() self._worker_thread = Thread( target=self._worker_loop, name="jobs-manager-worker", daemon=True ) self._worker_thread.start() # Initialize run via writer with scheduler.app.app_context(): try: result = writer_client.action( "ensure_active_run", {"trigger": "startup", "context": {"source": "init"}}, wait=True, ) if result and result.success and result.data: self._set_run_id(result.data["run_id"]) except Exception as e: logger.error(f"Failed to initialize run: {e}") def _set_run_id(self, run_id: Optional[str]) -> None: with self._run_lock: self._run_id = run_id def _get_run_id(self) -> Optional[str]: with self._run_lock: return self._run_id def _wake_worker(self) -> None: self._work_event.set() def _wait_for_work(self, timeout: float = 5.0) -> None: triggered = self._work_event.wait(timeout) if triggered: self._work_event.clear() # ------------------------ Public API ------------------------ def start_post_processing( self, post_guid: str, priority: str = "interactive", *, requested_by_user_id: Optional[int] = None, billing_user_id: Optional[int] = None, ) -> Dict[str, Any]: """ Idempotently start processing for a post. If an active job exists, return it. 
""" with scheduler.app.app_context(): ensure_result = writer_client.action( "ensure_active_run", { "trigger": "interactive_start", "context": {"post_guid": post_guid, "priority": priority}, }, wait=True, ) run_id = None if ensure_result and ensure_result.success and ensure_result.data: run_id = ensure_result.data.get("run_id") self._set_run_id(run_id) start_result = SingleJobManager( post_guid, self._status_manager, logger, run_id, requested_by_user_id=requested_by_user_id, billing_user_id=billing_user_id, ).start_processing(priority) if start_result.get("status") in {"started", "running"}: self._wake_worker() return start_result def enqueue_pending_jobs( self, trigger: str = "system", context: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: """ Ensure all posts have job records and enqueue pending work. Returns basic stats for logging/monitoring. """ with scheduler.app.app_context(): result = writer_client.action( "ensure_active_run", {"trigger": trigger, "context": context}, wait=True ) run_id = None if result and result.success and result.data: run_id = result.data["run_id"] self._set_run_id(run_id) active_run = _db.session.get(JobsManagerRun, run_id) if run_id else None created_count, pending_count = self._cleanup_and_process_new_posts( active_run ) response = { "status": "ok", "created": created_count, "pending": pending_count, "enqueued": pending_count, "run_id": run_id, } if pending_count: self._wake_worker() return response def _ensure_jobs_for_all_posts(self, run_id: Optional[str]) -> int: """Ensure every post has an associated ProcessingJob record.""" posts_without_jobs = ( Post.query.outerjoin(ProcessingJob, ProcessingJob.post_guid == Post.guid) .filter(ProcessingJob.id.is_(None)) .all() ) created = 0 for post in posts_without_jobs: if post.whitelisted: SingleJobManager( post.guid, self._status_manager, logger, run_id, ).ensure_job() created += 1 return created def get_post_status(self, post_guid: str) -> Dict[str, Any]: with 
scheduler.app.app_context(): post = Post.query.filter_by(guid=post_guid).first() if not post: return { "status": "error", "error_code": "NOT_FOUND", "message": "Post not found", } job = ( ProcessingJob.query.filter_by(post_guid=post_guid) .order_by(ProcessingJob.created_at.desc()) .first() ) if not job: if post.processed_audio_path and os.path.exists( post.processed_audio_path ): return { "status": "skipped", "step": 4, "step_name": "Processing skipped", "total_steps": 4, "progress_percentage": 100.0, "message": "Post already processed", "download_url": f"/api/posts/{post_guid}/download", } return { "status": "not_started", "step": 0, "step_name": "Not started", "total_steps": 4, "progress_percentage": 0.0, "message": "No processing job found", } response = { "status": job.status, "step": job.current_step, "step_name": job.step_name or "Unknown", "total_steps": job.total_steps, "progress_percentage": job.progress_percentage, "message": job.step_name or f"Step {job.current_step} of {job.total_steps}", } if job.started_at: response["started_at"] = job.started_at.isoformat() if ( job.status in {"completed", "skipped"} and post.processed_audio_path and os.path.exists(post.processed_audio_path) ): response["download_url"] = f"/api/posts/{post_guid}/download" if job.status == "failed" and job.error_message: response["error"] = job.error_message if job.status == "cancelled" and job.error_message: response["message"] = job.error_message return response def get_job_status(self, job_id: str) -> Dict[str, Any]: with scheduler.app.app_context(): job = _db.session.get(ProcessingJob, job_id) if not job: return { "status": "error", "error_code": "NOT_FOUND", "message": "Job not found", } return { "job_id": job.id, "post_guid": job.post_guid, "status": job.status, "step": job.current_step, "step_name": job.step_name, "total_steps": job.total_steps, "progress_percentage": job.progress_percentage, "started_at": job.started_at.isoformat() if job.started_at else None, "completed_at": 
( job.completed_at.isoformat() if job.completed_at else None ), "error": job.error_message, } def list_active_jobs(self, limit: int = 100) -> List[Dict[str, Any]]: with scheduler.app.app_context(): # Derive a simple priority from status: running > pending priority_order = case( (ProcessingJob.status == "running", 2), (ProcessingJob.status == "pending", 1), else_=0, ).label("priority") rows = ( _db.session.query(ProcessingJob, Post, priority_order) .outerjoin(Post, ProcessingJob.post_guid == Post.guid) .filter(ProcessingJob.status.in_(["pending", "running"])) .order_by(priority_order.desc(), ProcessingJob.created_at.desc()) .limit(limit) .all() ) results: List[Dict[str, Any]] = [] for job, post, prio in rows: results.append( { "job_id": job.id, "post_guid": job.post_guid, "post_title": post.title if post else None, "feed_title": post.feed.title if post and post.feed else None, "status": job.status, "priority": int(prio) if prio is not None else 0, "step": job.current_step, "step_name": job.step_name, "total_steps": job.total_steps, "progress_percentage": job.progress_percentage, "created_at": ( job.created_at.isoformat() if job.created_at else None ), "started_at": ( job.started_at.isoformat() if job.started_at else None ), "completed_at": ( job.completed_at.isoformat() if job.completed_at else None ), "error_message": job.error_message, } ) return results def list_all_jobs_detailed(self, limit: int = 200) -> List[Dict[str, Any]]: with scheduler.app.app_context(): # Priority by status, others ranked lowest priority_order = case( (ProcessingJob.status == "running", 2), (ProcessingJob.status == "pending", 1), else_=0, ).label("priority") rows = ( _db.session.query(ProcessingJob, Post, priority_order) .outerjoin(Post, ProcessingJob.post_guid == Post.guid) .order_by(priority_order.desc(), ProcessingJob.created_at.desc()) .limit(limit) .all() ) results: List[Dict[str, Any]] = [] for job, post, prio in rows: results.append( { "job_id": job.id, "post_guid": job.post_guid, 
"post_title": post.title if post else None, "feed_title": post.feed.title if post and post.feed else None, "status": job.status, "priority": int(prio) if prio is not None else 0, "step": job.current_step, "step_name": job.step_name, "total_steps": job.total_steps, "progress_percentage": job.progress_percentage, "created_at": ( job.created_at.isoformat() if job.created_at else None ), "started_at": ( job.started_at.isoformat() if job.started_at else None ), "completed_at": ( job.completed_at.isoformat() if job.completed_at else None ), "error_message": job.error_message, } ) return results def cancel_job(self, job_id: str) -> Dict[str, Any]: with scheduler.app.app_context(): job = _db.session.get(ProcessingJob, job_id) if not job: return { "status": "error", "error_code": "NOT_FOUND", "message": "Job not found", } if job.status in ["completed", "failed", "cancelled", "skipped"]: return { "status": "error", "error_code": "ALREADY_FINISHED", "message": f"Job already {job.status}", } # Mark job as cancelled in database self._status_manager.mark_cancelled(job_id, "Cancelled by user request") return { "status": "cancelled", "job_id": job_id, "message": "Job cancelled", } def cancel_post_jobs(self, post_guid: str) -> Dict[str, Any]: with scheduler.app.app_context(): # Find active jobs for this post in database active_jobs = ( ProcessingJob.query.filter_by(post_guid=post_guid) .filter(ProcessingJob.status.in_(["pending", "running"])) .all() ) job_ids = [job.id for job in active_jobs] for job in active_jobs: self._status_manager.mark_cancelled(job.id, "Cancelled by user request") return { "status": "cancelled", "post_guid": post_guid, "job_ids": job_ids, "message": f"Cancelled {len(job_ids)} jobs", } def cleanup_stale_jobs(self, older_than: timedelta) -> int: try: result = writer_client.action( "cleanup_stale_jobs", {"older_than_seconds": older_than.total_seconds()}, wait=True, ) if result and result.success and result.data: return cast(int, result.data.get("count", 0)) 
    def cleanup_stuck_pending_jobs(self, stuck_threshold_minutes: int = 10) -> int:
        """
        Clean up jobs that have been stuck in 'pending' status for too long.
        This indicates they were never picked up by the thread pool.
        """
        # Anything created before this cutoff and still pending is considered stuck.
        cutoff = datetime.utcnow() - timedelta(minutes=stuck_threshold_minutes)
        with scheduler.app.app_context():
            stuck_jobs = ProcessingJob.query.filter(
                ProcessingJob.status == "pending", ProcessingJob.created_at < cutoff
            ).all()
            count = len(stuck_jobs)
            for job in stuck_jobs:
                # Each job is updated independently so one failure doesn't stop the sweep.
                try:
                    logger.warning(
                        f"Marking stuck pending job {job.id} as failed (created at {job.created_at})"
                    )
                    self._status_manager.update_job_status(
                        job,
                        "failed",
                        job.current_step,
                        f"Job was stuck in pending status for over {stuck_threshold_minutes} minutes",
                    )
                except Exception as e:  # pylint: disable=broad-except
                    logger.error(f"Failed to update stuck job {job.id}: {e}")
            return count

    def clear_all_jobs(self) -> Dict[str, Any]:
        """
        Clear all processing jobs from the database.
        This is typically called during application startup to ensure a clean state.
        """
        try:
            # Delegated to the writer process, which owns DB writes.
            result = writer_client.action("clear_all_jobs", {}, wait=True)
            count = result.data if result and result.success else 0
            logger.info(f"Cleared {count} processing jobs on startup")
            return {
                "status": "success",
                "cleared_jobs": count,
                "message": f"Cleared {count} jobs from database",
            }
        except Exception as e:
            logger.error(f"Error clearing all jobs: {e}")
            return {"status": "error", "message": f"Failed to clear jobs: {str(e)}"}

    def start_refresh_all_feeds(
        self,
        trigger: str = "scheduled",
        context: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """
        Refresh feeds and enqueue per-post processing into internal worker pool.
        """
        with scheduler.app.app_context():
            feeds = Feed.query.all()
            for feed in feeds:
                refresh_feed(feed)

            # Clean up posts with missing audio files
            self._cleanup_inconsistent_posts()

            # Process new posts
            return self.enqueue_pending_jobs(trigger=trigger, context=context)

    # ------------------------ Helpers ------------------------

    def _cleanup_inconsistent_posts(self) -> None:
        """Clean up posts with missing audio files."""
        try:
            writer_client.action("cleanup_missing_audio_paths", {}, wait=True)
        except Exception as e:
            logger.error(
                f"Failed to cleanup inconsistent posts: {e}",
                exc_info=True,
            )

    def _cleanup_and_process_new_posts(
        self, active_run: Optional[JobsManagerRun]
    ) -> Tuple[int, int]:
        """Ensure all posts have jobs and return counts for monitoring."""
        run_id = active_run.id if active_run else None
        created_jobs = self._ensure_jobs_for_all_posts(run_id)
        pending_jobs = (
            ProcessingJob.query.filter(ProcessingJob.status == "pending")
            .order_by(ProcessingJob.created_at.asc())
            .all()
        )
        if active_run and pending_jobs:
            # Re-attach any stray pending jobs to the current run via the writer.
            try:
                writer_client.action(
                    "reassign_pending_jobs", {"run_id": run_id}, wait=True
                )
            except Exception as e:  # pylint: disable=broad-except
                logger.error("Failed to reassign pending jobs: %s", e)
        if created_jobs:
            logger.info("Created %s new job records", created_jobs)
        logger.info(
            "Pending jobs ready for worker: count=%s run_id=%s",
            len(pending_jobs),
            run_id,
        )
        return created_jobs, len(pending_jobs)

    # Removed _get_active_job_for_guid - now using direct database queries

    # ------------------------ Internal helpers ------------------------

    def _dequeue_next_job(self) -> Optional[Tuple[str, str]]:
        """Return the next pending job id and post guid, or None if idle.

        CRITICAL: This method atomically marks the job as "running" when
        dequeuing to prevent race conditions where multiple jobs could be
        dequeued before any is marked as running.
        """
        try:
            run_id = self._get_run_id()
            result = writer_client.action("dequeue_job", {"run_id": run_id}, wait=True)
            if result and result.success and result.data:
                job_id = result.data["job_id"]
                post_guid = result.data["post_guid"]
                logger.info(
                    "[JOB_DEQUEUE] Successfully dequeued and marked running: job_id=%s post_guid=%s",
                    job_id,
                    post_guid,
                )
                return job_id, post_guid
            return None
        except Exception as e:
            logger.error(f"Error dequeuing job: {e}")
            return None

    def _worker_loop(self) -> None:
        """Background loop that continuously processes pending jobs.

        CRITICAL: This runs in a single dedicated daemon thread. Combined with
        the _global_processing_lock in _process_job, this ensures truly
        sequential job execution with no parallelism.
        """
        import threading

        logger.info(
            "[WORKER_LOOP] Started single worker thread: thread_name=%s thread_id=%s",
            threading.current_thread().name,
            threading.current_thread().ident,
        )
        while not self._stop_event.is_set():
            try:
                job_details = self._dequeue_next_job()
                if not job_details:
                    # Idle: sleep until woken (or the poll timeout elapses).
                    self._wait_for_work()
                    continue
                job_id, post_guid = job_details
                self._process_job(job_id, post_guid)
            except Exception as exc:  # pylint: disable=broad-except
                logger.error("Worker loop error: %s", exc, exc_info=True)
                reset_session(_db.session, logger, "worker_loop_exception", exc)

    def _process_job(self, job_id: str, post_guid: str) -> None:
        """Execute a single job using the processor.

        Uses a global processing lock to absolutely guarantee single-job execution.
        """
        # Acquire global lock to ensure only one job runs at a time
        logger.info(
            "[JOB_PROCESS] Waiting for processing lock: job_id=%s post_guid=%s",
            job_id,
            post_guid,
        )
        with JobsManager._global_processing_lock:
            logger.info(
                "[JOB_PROCESS] Acquired processing lock: job_id=%s post_guid=%s",
                job_id,
                post_guid,
            )
            with scheduler.app.app_context():
                with db_guard("process_job", _db.session, logger):
                    try:
                        # Clear any failed transaction state from prior work on this session.
                        try:
                            _db.session.rollback()
                        except Exception:  # pylint: disable=broad-except
                            pass
                        # Expire all cached objects to ensure fresh reads
                        _db.session.expire_all()
                        logger.debug(
                            "Worker starting job_id=%s post_guid=%s", job_id, post_guid
                        )
                        worker_post = Post.query.filter_by(guid=post_guid).first()
                        if not worker_post:
                            logger.error(
                                "Post with GUID %s not found; failing job %s",
                                post_guid,
                                job_id,
                            )
                            job = _db.session.get(ProcessingJob, job_id)
                            if job:
                                self._status_manager.update_job_status(
                                    job,
                                    "failed",
                                    job.current_step or 0,
                                    "Post not found",
                                    0.0,
                                )
                            return

                        def _cancelled() -> bool:
                            # Expire the job before re-querying to get fresh state
                            _db.session.expire_all()
                            current_job = _db.session.get(ProcessingJob, job_id)
                            return (
                                current_job is None
                                or current_job.status == "cancelled"
                            )

                        get_processor().process(
                            worker_post, job_id=job_id, cancel_callback=_cancelled
                        )
                    except ProcessorException as exc:
                        # Processor-raised failures are expected; status already updated.
                        logger.info(
                            "Job %s finished with processor exception: %s", job_id, exc
                        )
                    except Exception as exc:  # pylint: disable=broad-except
                        logger.error(
                            "Unexpected error in job %s: %s", job_id, exc, exc_info=True
                        )
                        # Best-effort: mark the job failed unless it already terminated.
                        try:
                            _db.session.expire_all()
                            failed_job = _db.session.get(ProcessingJob, job_id)
                            if failed_job and failed_job.status not in [
                                "completed",
                                "cancelled",
                                "failed",
                            ]:
                                self._status_manager.update_job_status(
                                    failed_job,
                                    "failed",
                                    failed_job.current_step or 0,
                                    f"Job execution failed: {exc}",
                                    failed_job.progress_percentage or 0.0,
                                )
                        except (
                            Exception
                        ) as cleanup_error:  # pylint: disable=broad-except
                            logger.error(
                                "Failed to update job status after error: %s",
                                cleanup_error,
                                exc_info=True,
                            )
                    finally:
                        # Always clean up session state after job processing to release any locks
                        try:
                            _db.session.rollback()
                        except Exception:  # pylint: disable=broad-except
                            pass
                        try:
                            _db.session.remove()
                        except Exception as exc:  # pylint: disable=broad-except
                            logger.warning(
                                "Failed to remove session after job: %s", exc
                            )
            logger.info(
                "[JOB_PROCESS] Released processing lock: job_id=%s post_guid=%s",
                job_id,
                post_guid,
            )
try: _db.session.rollback() except Exception: # pylint: disable=broad-except pass # Expire all cached objects to ensure fresh reads _db.session.expire_all() logger.debug( "Worker starting job_id=%s post_guid=%s", job_id, post_guid ) worker_post = Post.query.filter_by(guid=post_guid).first() if not worker_post: logger.error( "Post with GUID %s not found; failing job %s", post_guid, job_id, ) job = _db.session.get(ProcessingJob, job_id) if job: self._status_manager.update_job_status( job, "failed", job.current_step or 0, "Post not found", 0.0, ) return def _cancelled() -> bool: # Expire the job before re-querying to get fresh state _db.session.expire_all() current_job = _db.session.get(ProcessingJob, job_id) return ( current_job is None or current_job.status == "cancelled" ) get_processor().process( worker_post, job_id=job_id, cancel_callback=_cancelled ) except ProcessorException as exc: logger.info( "Job %s finished with processor exception: %s", job_id, exc ) except Exception as exc: # pylint: disable=broad-except logger.error( "Unexpected error in job %s: %s", job_id, exc, exc_info=True ) try: _db.session.expire_all() failed_job = _db.session.get(ProcessingJob, job_id) if failed_job and failed_job.status not in [ "completed", "cancelled", "failed", ]: self._status_manager.update_job_status( failed_job, "failed", failed_job.current_step or 0, f"Job execution failed: {exc}", failed_job.progress_percentage or 0.0, ) except ( Exception ) as cleanup_error: # pylint: disable=broad-except logger.error( "Failed to update job status after error: %s", cleanup_error, exc_info=True, ) finally: # Always clean up session state after job processing to release any locks try: _db.session.rollback() except Exception: # pylint: disable=broad-except pass try: _db.session.remove() except Exception as exc: # pylint: disable=broad-except logger.warning( "Failed to remove session after job: %s", exc ) logger.info( "[JOB_PROCESS] Released processing lock: job_id=%s post_guid=%s", job_id, 
post_guid, ) # Singleton accessor def get_jobs_manager() -> JobsManager: if not hasattr(get_jobs_manager, "_instance"): get_jobs_manager._instance = JobsManager() # type: ignore[attr-defined] return get_jobs_manager._instance # type: ignore[attr-defined, no-any-return] def scheduled_refresh_all_feeds() -> None: """Top-level function for APScheduler to invoke periodically.""" try: get_jobs_manager().start_refresh_all_feeds(trigger="scheduled") except Exception as e: # pylint: disable=broad-except logger.error(f"Scheduled refresh failed: {e}") ================================================ FILE: src/app/jobs_manager_run_service.py ================================================ """Helpers for managing the singleton JobsManagerRun row.""" from __future__ import annotations import logging from datetime import datetime from typing import Any, Dict, Optional, cast from sqlalchemy import func from app.models import JobsManagerRun, ProcessingJob logger = logging.getLogger("writer") SINGLETON_RUN_ID = "jobs-manager-singleton" def _session_get(session: Any, ident: str) -> Optional[JobsManagerRun]: """Get a JobsManagerRun by id from a session-like object. Accepts both modern Session objects that implement .get(model, id) and older SQLAlchemy session objects where .query(...).get(id) is used. Returns None if not found. 
""" getter = getattr(session, "get", None) if callable(getter): return cast(Optional[JobsManagerRun], getter(JobsManagerRun, ident)) # Fallback for older SQLAlchemy versions return cast(Optional[JobsManagerRun], session.query(JobsManagerRun).get(ident)) def _build_context_payload( trigger: str, context: Optional[Dict[str, object]], updated_at: datetime ) -> Dict[str, object]: payload: Dict[str, object] = {} if context: payload.update(context) payload["last_trigger"] = trigger payload["last_trigger_at"] = updated_at.isoformat() return payload def get_or_create_singleton_run( session: Any, trigger: str, context: Optional[Dict[str, object]] = None ) -> JobsManagerRun: """Return the singleton run, creating it if necessary.""" now = datetime.utcnow() run = _session_get(session, SINGLETON_RUN_ID) if run: run.trigger = trigger run.context_json = _build_context_payload(trigger, context, now) run.updated_at = now if not run.started_at: run.started_at = now if not run.counters_reset_at: run.counters_reset_at = run.started_at or now session.flush() return run run = JobsManagerRun( id=SINGLETON_RUN_ID, status="running", trigger=trigger, started_at=now, counters_reset_at=now, created_at=now, updated_at=now, context_json=_build_context_payload(trigger, context, now), ) session.add(run) session.flush() return run def ensure_active_run( session: Any, trigger: str, context: Optional[Dict[str, object]] = None ) -> JobsManagerRun: """Return the singleton run, ensuring it exists and is up to date.""" return get_or_create_singleton_run(session, trigger, context) def get_active_run(session: Any) -> Optional[JobsManagerRun]: """Return the singleton run if it exists.""" return _session_get(session, SINGLETON_RUN_ID) def recalculate_run_counts(session: Any) -> Optional[JobsManagerRun]: """ Recompute aggregate counters for the singleton run. When no jobs remain in the system the counters are reset to zero so the UI reflects an idle manager. 
""" run = get_active_run(session) if not run: return None cutoff = run.counters_reset_at # The linter incorrectly flags func.count as not callable. query = session.query( ProcessingJob.status, func.count(ProcessingJob.id), # pylint: disable=not-callable ).filter(ProcessingJob.jobs_manager_run_id == run.id) if cutoff: query = query.filter(ProcessingJob.created_at >= cutoff) counts = dict(query.group_by(ProcessingJob.status).all()) logger.debug( "[WRITER] recalculate_run_counts: run_id=%s counts=%s", getattr(run, "id", None), counts, ) now = datetime.utcnow() queued = counts.get("pending", 0) + counts.get("queued", 0) running = counts.get("running", 0) completed = counts.get("completed", 0) failed = counts.get("failed", 0) + counts.get("cancelled", 0) skipped = counts.get("skipped", 0) total_jobs = sum(counts.values()) has_active_work = (queued + running) > 0 if has_active_work: run.total_jobs = total_jobs run.queued_jobs = queued run.running_jobs = running run.completed_jobs = completed run.failed_jobs = failed if hasattr(run, "skipped_jobs"): run.skipped_jobs = skipped run.updated_at = now if run.running_jobs > 0: run.status = "running" else: run.status = "pending" if not run.started_at: run.started_at = now if not run.counters_reset_at: run.counters_reset_at = run.started_at or now run.completed_at = None else: run.status = "pending" run.completed_at = now run.started_at = None run.total_jobs = 0 run.queued_jobs = 0 run.running_jobs = 0 run.completed_jobs = 0 run.failed_jobs = 0 if hasattr(run, "skipped_jobs"): run.skipped_jobs = 0 run.updated_at = now run.counters_reset_at = now session.flush() return run def serialize_run(run: JobsManagerRun) -> Dict[str, object]: """Return a JSON-serialisable representation of a run.""" progress_denom = max(run.total_jobs or 0, 1) progress_percentage = ( ((run.completed_jobs + getattr(run, "skipped_jobs", 0)) / progress_denom) * 100.0 if run.total_jobs else 0.0 ) return { "id": run.id, "status": run.status, "trigger": 
def build_run_status_snapshot(session: Any) -> Optional[Dict[str, object]]:
    """
    Return a fresh, non-persisted snapshot of the current run counters.

    This mirrors recalculate_run_counts but does not mutate or flush the
    JobsManagerRun row, making it safe for high-frequency polling without
    competing for SQLite write locks.
    """
    run = get_active_run(session)
    if not run:
        return None
    cutoff = run.counters_reset_at
    query = session.query(
        ProcessingJob.status,
        func.count(ProcessingJob.id),  # pylint: disable=not-callable
    ).filter(ProcessingJob.jobs_manager_run_id == run.id)
    if cutoff:
        query = query.filter(ProcessingJob.created_at >= cutoff)
    counts = dict(query.group_by(ProcessingJob.status).all())
    queued = counts.get("pending", 0) + counts.get("queued", 0)
    running = counts.get("running", 0)
    completed = counts.get("completed", 0)
    failed = counts.get("failed", 0) + counts.get("cancelled", 0)
    skipped = counts.get("skipped", 0)
    total_jobs = sum(counts.values())
    has_active_work = (queued + running) > 0
    status = run.status
    if has_active_work:
        status = "running" if running > 0 else "pending"
    else:
        status = "pending"
    # Guard against division by zero when no jobs are counted.
    progress_denom = max(total_jobs or 0, 1)
    progress_percentage = (
        ((completed + skipped) / progress_denom) * 100.0 if total_jobs else 0.0
    )
    return {
        "id": run.id,
        "status": status,
        "trigger": run.trigger,
        "started_at": run.started_at.isoformat() if run.started_at else None,
        "completed_at": run.completed_at.isoformat() if run.completed_at else None,
        "updated_at": run.updated_at.isoformat() if run.updated_at else None,
        "total_jobs": total_jobs,
        "queued_jobs": queued,
        "running_jobs": running,
        "completed_jobs": completed,
        "failed_jobs": failed,
        "skipped_jobs": skipped,
        "context": run.context_json,
        "counters_reset_at": (
            run.counters_reset_at.isoformat() if run.counters_reset_at else None
        ),
        "progress_percentage": round(progress_percentage, 2),
    }


# ================================================
# FILE: src/app/logger.py
# ================================================
import json
import logging
import os


class ExtraFormatter(logging.Formatter):
    """Formatter that appends structured extras to log lines.

    Any LogRecord attributes not in the standard set are captured into a
    JSON object and appended as ``extra={...}`` so contextual fields are
    visible in plain-text logs.
    """

    # Attributes that logging itself sets on every LogRecord; anything else is
    # treated as a caller-supplied "extra" and surfaced in the formatted line.
    _standard_attrs = {
        "name",
        "msg",
        "args",
        "levelname",
        "levelno",
        "pathname",
        "filename",
        "module",
        "exc_info",
        "exc_text",
        "stack_info",
        "lineno",
        "funcName",
        "created",
        "msecs",
        "relativeCreated",
        "thread",
        "threadName",
        "processName",
        "process",
        "message",
        "asctime",
        # Bug fix: LogRecord gained "taskName" in Python 3.12; without it every
        # log line on 3.12+ carried a spurious extra={"taskName": null}.
        "taskName",
    }

    def format(self, record: logging.LogRecord) -> str:
        """Format the record, appending any non-standard attributes as JSON."""
        base = super().format(record)
        extras = {
            k: v for k, v in record.__dict__.items() if k not in self._standard_attrs
        }
        if extras:
            try:
                # default=str keeps non-JSON-serialisable extras from dropping the line.
                extras_json = json.dumps(extras, ensure_ascii=True, default=str)
            except Exception:  # pylint: disable=broad-except
                # Best-effort fallback: plain repr rather than losing the extras.
                extras_json = str(extras)
            return f"{base} | extra={extras_json}"
        return base


def setup_logger(
    name: str, log_file: str, level: int = logging.DEBUG
) -> logging.Logger:
    """Create or return a configured logger.

    - Writes to the specified log_file
    - Emits to console exactly once (no duplicates)
    - Disables propagation to avoid duplicate root handling
    - Guards against adding duplicate handlers across repeated calls
    """
    file_formatter = ExtraFormatter("%(asctime)s %(levelname)s %(message)s")
    console_formatter = ExtraFormatter("%(levelname)s [%(name)s] %(message)s")
    logger = logging.getLogger(name)
    logger.setLevel(level)
    # Prevent records from also bubbling up to root logger handlers (which can cause duplicates)
    logger.propagate = False
    # Ensure directory exists for log file
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    # Add file handler if not already present for this file
    abs_log_file = os.path.abspath(log_file)
    has_file_handler = any(
        isinstance(h, logging.FileHandler)
        and getattr(h, "baseFilename", None) == abs_log_file
        for h in logger.handlers
    )
    if not has_file_handler:
        file_handler = logging.FileHandler(abs_log_file)
        file_handler.setFormatter(file_formatter)
        logger.addHandler(file_handler)
    # Add a single console handler if not already present
    has_stream_handler = any(
        isinstance(h, logging.StreamHandler) for h in logger.handlers
    )
    if not has_stream_handler:
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(console_formatter)
        logger.addHandler(stream_handler)
    return logger
- Writes to the specified log_file - Emits to console exactly once (no duplicates) - Disables propagation to avoid duplicate root handling - Guards against adding duplicate handlers across repeated calls """ file_formatter = ExtraFormatter("%(asctime)s %(levelname)s %(message)s") console_formatter = ExtraFormatter("%(levelname)s [%(name)s] %(message)s") logger = logging.getLogger(name) logger.setLevel(level) # Prevent records from also bubbling up to root logger handlers (which can cause duplicates) logger.propagate = False # Ensure directory exists for log file log_dir = os.path.dirname(log_file) if log_dir: os.makedirs(log_dir, exist_ok=True) # Add file handler if not already present for this file abs_log_file = os.path.abspath(log_file) has_file_handler = any( isinstance(h, logging.FileHandler) and getattr(h, "baseFilename", None) == abs_log_file for h in logger.handlers ) if not has_file_handler: file_handler = logging.FileHandler(abs_log_file) file_handler.setFormatter(file_formatter) logger.addHandler(file_handler) # Add a single console handler if not already present has_stream_handler = any( isinstance(h, logging.StreamHandler) for h in logger.handlers ) if not has_stream_handler: stream_handler = logging.StreamHandler() stream_handler.setFormatter(console_formatter) logger.addHandler(stream_handler) return logger ================================================ FILE: src/app/models.py ================================================ import os import uuid from datetime import datetime from sqlalchemy.orm import validates from app.auth.passwords import hash_password, verify_password from app.extensions import db from shared import defaults as DEFAULTS def generate_uuid() -> str: """Generate a UUID4 string.""" return str(uuid.uuid4()) def generate_job_id() -> str: """Generate a unique job ID.""" return generate_uuid() # mypy typing issue https://github.com/python/mypy/issues/17918 class Feed(db.Model): # type: ignore[name-defined, misc] id = 
db.Column(db.Integer, primary_key=True, autoincrement=True) alt_id = db.Column( db.Text, nullable=True ) # used for backwards compatibility with legacy YAML-based feed definitions title = db.Column(db.Text, nullable=False) description = db.Column(db.Text) author = db.Column(db.Text) rss_url = db.Column(db.Text, unique=True, nullable=False) image_url = db.Column(db.Text) auto_whitelist_new_episodes_override = db.Column(db.Boolean, nullable=True) posts = db.relationship( "Post", backref="feed", lazy=True, order_by="Post.release_date.desc()" ) user_feeds = db.relationship( "UserFeed", back_populates="feed", cascade="all, delete-orphan", ) def __repr__(self) -> str: return f"" class FeedAccessToken(db.Model): # type: ignore[name-defined, misc] __tablename__ = "feed_access_token" id = db.Column(db.Integer, primary_key=True, autoincrement=True) token_id = db.Column(db.String(32), unique=True, nullable=False, index=True) token_hash = db.Column(db.String(64), nullable=False) token_secret = db.Column(db.String(128), nullable=True) feed_id = db.Column(db.Integer, db.ForeignKey("feed.id"), nullable=True) user_id = db.Column(db.Integer, db.ForeignKey("users.id"), nullable=False) created_at = db.Column(db.DateTime, default=datetime.utcnow, nullable=False) last_used_at = db.Column(db.DateTime, nullable=True) revoked = db.Column(db.Boolean, default=False, nullable=False) feed = db.relationship("Feed", backref=db.backref("access_tokens", lazy="dynamic")) user = db.relationship( "User", backref=db.backref("feed_access_tokens", lazy="dynamic") ) def __repr__(self) -> str: return ( f"" ) class Post(db.Model): # type: ignore[name-defined, misc] feed_id = db.Column(db.Integer, db.ForeignKey("feed.id"), nullable=False) id = db.Column(db.Integer, primary_key=True, autoincrement=True) guid = db.Column(db.Text, unique=True, nullable=False) download_url = db.Column( db.Text, unique=True, nullable=False ) # remote download URL, not podly url title = db.Column(db.Text, nullable=False) 
unprocessed_audio_path = db.Column(db.Text) processed_audio_path = db.Column(db.Text) description = db.Column(db.Text) release_date = db.Column(db.DateTime(timezone=True)) duration = db.Column(db.Integer) whitelisted = db.Column(db.Boolean, default=False, nullable=False) image_url = db.Column(db.Text) # Episode thumbnail URL download_count = db.Column(db.Integer, nullable=True, default=0) # Latest (most recent) refined ad cut windows for this post. # This is written by the ad classifier boundary refinement step and read by the # audio processor to cut ads using refined (intra-segment) timestamps. refined_ad_boundaries = db.Column(db.JSON, nullable=True) refined_ad_boundaries_updated_at = db.Column(db.DateTime, nullable=True) segments = db.relationship( "TranscriptSegment", backref="post", lazy="dynamic", order_by="TranscriptSegment.sequence_num", ) def audio_len_bytes(self) -> int: audio_len_bytes = 0 if self.processed_audio_path is not None and os.path.isfile( self.processed_audio_path ): audio_len_bytes = os.path.getsize(self.processed_audio_path) return audio_len_bytes class TranscriptSegment(db.Model): # type: ignore[name-defined, misc] __tablename__ = "transcript_segment" id = db.Column(db.Integer, primary_key=True, autoincrement=True) post_id = db.Column(db.Integer, db.ForeignKey("post.id"), nullable=False) sequence_num = db.Column(db.Integer, nullable=False) start_time = db.Column(db.Float, nullable=False) end_time = db.Column(db.Float, nullable=False) text = db.Column(db.Text, nullable=False) identifications = db.relationship( "Identification", backref="transcript_segment", lazy="dynamic" ) __table_args__ = ( db.Index( "ix_transcript_segment_post_id_sequence_num", "post_id", "sequence_num", unique=True, ), ) def __repr__(self) -> str: return f"" class User(db.Model): # type: ignore[name-defined, misc] __tablename__ = "users" id = db.Column(db.Integer, primary_key=True, autoincrement=True) username = db.Column(db.String(255), unique=True, nullable=False, 
index=True) password_hash = db.Column(db.String(255), nullable=False) role = db.Column(db.String(50), nullable=False, default="user") feed_allowance = db.Column(db.Integer, nullable=False, default=0) feed_subscription_status = db.Column( db.String(32), nullable=False, default="inactive" ) stripe_customer_id = db.Column(db.String(64), nullable=True) stripe_subscription_id = db.Column(db.String(64), nullable=True) created_at = db.Column(db.DateTime, default=datetime.utcnow, nullable=False) updated_at = db.Column( db.DateTime, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False ) # Discord SSO fields discord_id = db.Column(db.String(32), unique=True, nullable=True, index=True) discord_username = db.Column(db.String(100), nullable=True) last_active = db.Column(db.DateTime, nullable=True) # Admin override for feed allowance (if set, overrides plan-based allowance) manual_feed_allowance = db.Column(db.Integer, nullable=True) user_feeds = db.relationship( "UserFeed", back_populates="user", cascade="all, delete-orphan", ) @validates("username") def _normalize_username(self, key: str, value: str) -> str: del key return value.strip().lower() def set_password(self, password: str) -> None: self.password_hash = hash_password(password) def verify_password(self, password: str) -> bool: return verify_password(password, self.password_hash) def __repr__(self) -> str: return f"" class ModelCall(db.Model): # type: ignore[name-defined, misc] __tablename__ = "model_call" id = db.Column(db.Integer, primary_key=True, autoincrement=True) post_id = db.Column(db.Integer, db.ForeignKey("post.id"), nullable=False) first_segment_sequence_num = db.Column(db.Integer, nullable=False) last_segment_sequence_num = db.Column(db.Integer, nullable=False) model_name = db.Column(db.String, nullable=False) prompt = db.Column(db.Text, nullable=False) response = db.Column(db.Text, nullable=True) timestamp = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) status = 
db.Column(db.String, nullable=False, default="pending") error_message = db.Column(db.Text, nullable=True) retry_attempts = db.Column(db.Integer, nullable=False, default=0) identifications = db.relationship( "Identification", backref="model_call", lazy="dynamic" ) post = db.relationship("Post", backref=db.backref("model_calls", lazy="dynamic")) __table_args__ = ( db.Index( "ix_model_call_post_chunk_model", "post_id", "first_segment_sequence_num", "last_segment_sequence_num", "model_name", unique=True, ), ) def __repr__(self) -> str: return f"" class Identification(db.Model): # type: ignore[name-defined, misc] __tablename__ = "identification" id = db.Column(db.Integer, primary_key=True, autoincrement=True) transcript_segment_id = db.Column( db.Integer, db.ForeignKey("transcript_segment.id"), nullable=False ) model_call_id = db.Column( db.Integer, db.ForeignKey("model_call.id"), nullable=False ) confidence = db.Column(db.Float, nullable=True) label = db.Column(db.String, nullable=False) __table_args__ = ( db.Index( "ix_identification_segment_call_label", "transcript_segment_id", "model_call_id", "label", unique=True, ), ) def __repr__(self) -> str: # Ensure confidence is handled if None for f-string formatting confidence_str = ( f"{self.confidence:.2f}" if self.confidence is not None else "N/A" ) return f"" class JobsManagerRun(db.Model): # type: ignore[name-defined, misc] __tablename__ = "jobs_manager_run" id = db.Column(db.String(36), primary_key=True, default=generate_uuid) status = db.Column(db.String(50), nullable=False, default="pending", index=True) trigger = db.Column(db.String(100), nullable=False) started_at = db.Column(db.DateTime, nullable=True) completed_at = db.Column(db.DateTime, nullable=True) total_jobs = db.Column(db.Integer, nullable=False, default=0) queued_jobs = db.Column(db.Integer, nullable=False, default=0) running_jobs = db.Column(db.Integer, nullable=False, default=0) completed_jobs = db.Column(db.Integer, nullable=False, default=0) 
failed_jobs = db.Column(db.Integer, nullable=False, default=0) skipped_jobs = db.Column(db.Integer, nullable=False, default=0) context_json = db.Column(db.JSON, nullable=True) counters_reset_at = db.Column(db.DateTime, nullable=True) created_at = db.Column(db.DateTime, default=datetime.utcnow) updated_at = db.Column( db.DateTime, default=datetime.utcnow, onupdate=datetime.utcnow ) processing_jobs = db.relationship( "ProcessingJob", back_populates="run", lazy="dynamic" ) def __repr__(self) -> str: return ( f"" ) class ProcessingJob(db.Model): # type: ignore[name-defined, misc] __tablename__ = "processing_job" id = db.Column(db.String(36), primary_key=True, default=generate_job_id) jobs_manager_run_id = db.Column( db.String(36), db.ForeignKey("jobs_manager_run.id"), index=True ) post_guid = db.Column(db.String(255), nullable=False, index=True) status = db.Column( db.String(50), nullable=False ) # pending, running, completed, failed, cancelled, skipped current_step = db.Column(db.Integer, default=0) # 0-4 (0=not started, 4=completed) step_name = db.Column(db.String(100)) total_steps = db.Column(db.Integer, default=4) progress_percentage = db.Column(db.Float, default=0.0) started_at = db.Column(db.DateTime) completed_at = db.Column(db.DateTime) error_message = db.Column(db.Text) scheduler_job_id = db.Column(db.String(255)) # APScheduler job ID created_at = db.Column(db.DateTime, default=datetime.utcnow, index=True) requested_by_user_id = db.Column(db.Integer, db.ForeignKey("users.id")) billing_user_id = db.Column(db.Integer, db.ForeignKey("users.id")) # Relationships post = db.relationship( "Post", backref="processing_jobs", primaryjoin="ProcessingJob.post_guid == Post.guid", foreign_keys=[post_guid], ) run = db.relationship("JobsManagerRun", back_populates="processing_jobs") requested_by_user = db.relationship( "User", foreign_keys=[requested_by_user_id], backref=db.backref("requested_jobs", lazy="dynamic"), ) billing_user = db.relationship( "User", 
foreign_keys=[billing_user_id], backref=db.backref("billed_jobs", lazy="dynamic"), ) def __repr__(self) -> str: return f"" class UserFeed(db.Model): # type: ignore[name-defined, misc] __tablename__ = "feed_supporter" id = db.Column(db.Integer, primary_key=True, autoincrement=True) feed_id = db.Column(db.Integer, db.ForeignKey("feed.id"), nullable=False) user_id = db.Column(db.Integer, db.ForeignKey("users.id"), nullable=False) created_at = db.Column(db.DateTime, default=datetime.utcnow, nullable=False) __table_args__ = ( db.UniqueConstraint("feed_id", "user_id", name="uq_feed_supporter_feed_user"), ) feed = db.relationship("Feed", back_populates="user_feeds") user = db.relationship("User", back_populates="user_feeds") def __repr__(self) -> str: return f"" # ----- Application Settings (Singleton Tables) ----- class LLMSettings(db.Model): # type: ignore[name-defined, misc] __tablename__ = "llm_settings" id = db.Column(db.Integer, primary_key=True, default=1) llm_api_key = db.Column(db.Text, nullable=True) llm_model = db.Column(db.Text, nullable=False, default=DEFAULTS.LLM_DEFAULT_MODEL) openai_base_url = db.Column(db.Text, nullable=True) openai_timeout = db.Column( db.Integer, nullable=False, default=DEFAULTS.OPENAI_DEFAULT_TIMEOUT_SEC ) openai_max_tokens = db.Column( db.Integer, nullable=False, default=DEFAULTS.OPENAI_DEFAULT_MAX_TOKENS ) llm_max_concurrent_calls = db.Column( db.Integer, nullable=False, default=DEFAULTS.LLM_DEFAULT_MAX_CONCURRENT_CALLS ) llm_max_retry_attempts = db.Column( db.Integer, nullable=False, default=DEFAULTS.LLM_DEFAULT_MAX_RETRY_ATTEMPTS ) llm_max_input_tokens_per_call = db.Column(db.Integer, nullable=True) llm_enable_token_rate_limiting = db.Column( db.Boolean, nullable=False, default=DEFAULTS.LLM_ENABLE_TOKEN_RATE_LIMITING ) llm_max_input_tokens_per_minute = db.Column(db.Integer, nullable=True) enable_boundary_refinement = db.Column( db.Boolean, nullable=False, default=DEFAULTS.ENABLE_BOUNDARY_REFINEMENT ) 
enable_word_level_boundary_refinder = db.Column( db.Boolean, nullable=False, default=DEFAULTS.ENABLE_WORD_LEVEL_BOUNDARY_REFINDER, ) created_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) updated_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) class WhisperSettings(db.Model): # type: ignore[name-defined, misc] __tablename__ = "whisper_settings" id = db.Column(db.Integer, primary_key=True, default=1) whisper_type = db.Column( db.Text, nullable=False, default=DEFAULTS.WHISPER_DEFAULT_TYPE ) # local|remote|groq|test # Local local_model = db.Column( db.Text, nullable=False, default=DEFAULTS.WHISPER_LOCAL_MODEL ) # Remote remote_model = db.Column( db.Text, nullable=False, default=DEFAULTS.WHISPER_REMOTE_MODEL ) remote_api_key = db.Column(db.Text, nullable=True) remote_base_url = db.Column( db.Text, nullable=False, default=DEFAULTS.WHISPER_REMOTE_BASE_URL ) remote_language = db.Column( db.Text, nullable=False, default=DEFAULTS.WHISPER_REMOTE_LANGUAGE ) remote_timeout_sec = db.Column( db.Integer, nullable=False, default=DEFAULTS.WHISPER_REMOTE_TIMEOUT_SEC ) remote_chunksize_mb = db.Column( db.Integer, nullable=False, default=DEFAULTS.WHISPER_REMOTE_CHUNKSIZE_MB ) # Groq groq_api_key = db.Column(db.Text, nullable=True) groq_model = db.Column(db.Text, nullable=False, default=DEFAULTS.WHISPER_GROQ_MODEL) groq_language = db.Column( db.Text, nullable=False, default=DEFAULTS.WHISPER_GROQ_LANGUAGE ) groq_max_retries = db.Column( db.Integer, nullable=False, default=DEFAULTS.WHISPER_GROQ_MAX_RETRIES ) created_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) updated_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) class ProcessingSettings(db.Model): # type: ignore[name-defined, misc] __tablename__ = "processing_settings" id = db.Column(db.Integer, primary_key=True, default=1) # Deprecated: paths are now hardcoded; keep columns for migration compatibility system_prompt_path = db.Column( db.Text, 
nullable=False, default="src/system_prompt.txt" ) user_prompt_template_path = db.Column( db.Text, nullable=False, default="src/user_prompt.jinja" ) num_segments_to_input_to_prompt = db.Column( db.Integer, nullable=False, default=DEFAULTS.PROCESSING_NUM_SEGMENTS_TO_INPUT_TO_PROMPT, ) created_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) updated_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) class OutputSettings(db.Model): # type: ignore[name-defined, misc] __tablename__ = "output_settings" id = db.Column(db.Integer, primary_key=True, default=1) fade_ms = db.Column(db.Integer, nullable=False, default=DEFAULTS.OUTPUT_FADE_MS) min_ad_segement_separation_seconds = db.Column( db.Integer, nullable=False, default=DEFAULTS.OUTPUT_MIN_AD_SEGMENT_SEPARATION_SECONDS, ) min_ad_segment_length_seconds = db.Column( db.Integer, nullable=False, default=DEFAULTS.OUTPUT_MIN_AD_SEGMENT_LENGTH_SECONDS, ) min_confidence = db.Column( db.Float, nullable=False, default=DEFAULTS.OUTPUT_MIN_CONFIDENCE ) created_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) updated_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) class AppSettings(db.Model): # type: ignore[name-defined, misc] __tablename__ = "app_settings" id = db.Column(db.Integer, primary_key=True, default=1) background_update_interval_minute = db.Column( db.Integer, nullable=True ) # intentionally nullable; default applied in config store/runtime automatically_whitelist_new_episodes = db.Column( db.Boolean, nullable=False, default=DEFAULTS.APP_AUTOMATICALLY_WHITELIST_NEW_EPISODES, ) post_cleanup_retention_days = db.Column( db.Integer, nullable=True, default=DEFAULTS.APP_POST_CLEANUP_RETENTION_DAYS, ) number_of_episodes_to_whitelist_from_archive_of_new_feed = db.Column( db.Integer, nullable=False, default=DEFAULTS.APP_NUM_EPISODES_TO_WHITELIST_FROM_ARCHIVE_OF_NEW_FEED, ) enable_public_landing_page = db.Column( db.Boolean, nullable=False, 
default=DEFAULTS.APP_ENABLE_PUBLIC_LANDING_PAGE, ) user_limit_total = db.Column(db.Integer, nullable=True) autoprocess_on_download = db.Column( db.Boolean, nullable=False, default=DEFAULTS.APP_AUTOPROCESS_ON_DOWNLOAD, ) # Hash of the environment variables used to seed configuration. # Used to detect changes in environment variables between restarts. env_config_hash = db.Column(db.String(64), nullable=True) created_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) updated_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) class DiscordSettings(db.Model): # type: ignore[name-defined, misc] __tablename__ = "discord_settings" id = db.Column(db.Integer, primary_key=True, default=1) client_id = db.Column(db.Text, nullable=True) client_secret = db.Column(db.Text, nullable=True) redirect_uri = db.Column(db.Text, nullable=True) guild_ids = db.Column(db.Text, nullable=True) # Comma-separated list allow_registration = db.Column(db.Boolean, nullable=False, default=True) created_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) updated_at = db.Column(db.DateTime, nullable=False, default=datetime.utcnow) ================================================ FILE: src/app/post_cleanup.py ================================================ """Cleanup job for pruning processed posts and associated artifacts.""" from __future__ import annotations import logging from datetime import datetime, timedelta from pathlib import Path from typing import Dict, Optional, Sequence, Tuple from sqlalchemy import func from sqlalchemy.orm import Query from app.db_guard import db_guard, reset_session from app.extensions import db, scheduler from app.models import Post, ProcessingJob from app.runtime_config import config as runtime_config from app.writer.client import writer_client from shared import defaults as DEFAULTS logger = logging.getLogger("global_logger") def _build_cleanup_query( retention_days: Optional[int], ) -> 
Tuple[Optional[Query["Post"]], Optional[datetime]]: """Construct the base query for posts eligible for cleanup.""" if retention_days is None or retention_days <= 0: return None, None cutoff = datetime.utcnow() - timedelta(days=retention_days) active_jobs_exists = ( db.session.query(ProcessingJob.id) .filter(ProcessingJob.post_guid == Post.guid) .filter(ProcessingJob.status.in_(["pending", "running"])) .exists() ) posts_query = Post.query.filter(Post.processed_audio_path.isnot(None)).filter( ~active_jobs_exists ) return posts_query, cutoff def count_cleanup_candidates( retention_days: Optional[int], ) -> Tuple[int, Optional[datetime]]: """Return how many posts would currently be removed along with the cutoff.""" posts_query, cutoff = _build_cleanup_query(retention_days) if posts_query is None or cutoff is None: return 0, None posts = posts_query.all() latest_completed = _load_latest_completed_map([post.guid for post in posts]) count = sum( 1 for post in posts if _processed_timestamp_before_cutoff(post, cutoff, latest_completed) ) return count, cutoff def cleanup_processed_posts(retention_days: Optional[int]) -> int: """Prune processed posts older than the retention window. Posts qualify when their processed audio artifact (or, if missing, the latest completed job) is older than the retention window. Eligible posts are un-whitelisted, artifacts are removed, and dependent rows are deleted, but the post row is retained to prevent reprocessing. Returns the number of posts that were cleaned. Callers must ensure an application context is active. 
""" with db_guard("cleanup_processed_posts", db.session, logger): posts_query, cutoff = _build_cleanup_query(retention_days) if posts_query is None or cutoff is None: return 0 posts: Sequence[Post] = posts_query.all() latest_completed = _load_latest_completed_map([post.guid for post in posts]) if not posts: return 0 removed_posts = 0 for post in posts: if not _processed_timestamp_before_cutoff(post, cutoff, latest_completed): continue removed_posts += 1 logger.info( "Cleanup removing post '%s' (guid=%s) completed before %s", post.title, post.guid, cutoff.isoformat(), ) _remove_associated_files(post) try: writer_client.action( "cleanup_processed_post", {"post_id": post.id}, wait=True ) except Exception as exc: # pylint: disable=broad-except logger.error( "Cleanup failed for post %s (guid=%s): %s", post.id, post.guid, exc, exc_info=True, ) logger.info( "Cleanup job removed %s posts", removed_posts, ) return removed_posts def scheduled_cleanup_processed_posts() -> None: """Entry-point for APScheduler.""" retention = getattr( runtime_config, "post_cleanup_retention_days", DEFAULTS.APP_POST_CLEANUP_RETENTION_DAYS, ) if scheduler.app is None: logger.warning("Cleanup skipped: scheduler has no associated app.") return try: with scheduler.app.app_context(): cleanup_processed_posts(retention) except Exception as exc: # pylint: disable=broad-except logger.error("Scheduled cleanup failed: %s", exc, exc_info=True) reset_session(db.session, logger, "scheduled_cleanup_processed_posts", exc) def _remove_associated_files(post: Post) -> None: """Delete processed and unprocessed audio files for a post.""" for path_str in [post.unprocessed_audio_path, post.processed_audio_path]: if not path_str: continue try: file_path = Path(path_str) except Exception: # pylint: disable=broad-except logger.warning("Cleanup: invalid path for post %s: %s", post.guid, path_str) continue if not file_path.exists(): continue try: file_path.unlink() logger.info("Cleanup deleted file: %s", file_path) except 
OSError as exc:
            logger.warning("Cleanup unable to delete %s: %s", file_path, exc)


def _load_latest_completed_map(
    post_guids: Sequence[str],
) -> Dict[str, Optional[datetime]]:
    # Map each post GUID to the latest completed_at among its processing jobs.
    if not post_guids:
        return {}
    rows = (
        db.session.query(
            ProcessingJob.post_guid,
            func.max(ProcessingJob.completed_at),
        )
        .filter(ProcessingJob.post_guid.in_(post_guids))
        .group_by(ProcessingJob.post_guid)
        .all()
    )
    return dict(rows)


def _processed_timestamp_before_cutoff(
    post: Post, cutoff: datetime, latest_completed: Dict[str, Optional[datetime]]
) -> bool:
    # Decide eligibility from the processed-file mtime and/or the latest
    # completed job; when both exist the earlier of the two is used.
    file_timestamp = _get_processed_file_timestamp(post)
    job_timestamp = latest_completed.get(post.guid)
    candidate: Optional[datetime]
    if file_timestamp and job_timestamp:
        candidate = min(file_timestamp, job_timestamp)
    else:
        candidate = file_timestamp or job_timestamp
    return bool(candidate and candidate < cutoff)


def _get_processed_file_timestamp(post: Post) -> Optional[datetime]:
    # Best-effort mtime of the processed audio file as a naive UTC datetime;
    # returns None when the path is missing, invalid, or unreadable.
    if not post.processed_audio_path:
        return None
    try:
        file_path = Path(post.processed_audio_path)
    except Exception:  # pylint: disable=broad-except
        logger.warning(
            "Cleanup: invalid processed path for post %s: %s",
            post.guid,
            post.processed_audio_path,
        )
        return None
    if not file_path.exists():
        return None
    try:
        mtime = file_path.stat().st_mtime
    except OSError as exc:
        logger.warning("Cleanup: unable to stat processed file %s: %s", file_path, exc)
        return None
    return datetime.utcfromtimestamp(mtime)


================================================
FILE: src/app/posts.py
================================================
import logging
from pathlib import Path
from typing import List, Optional

from app.models import Post
from app.writer.client import writer_client
from podcast_processor.podcast_downloader import get_and_make_download_path

logger = logging.getLogger("global_logger")


def _collect_processed_paths(post: Post) -> List[Path]:
    """Collect all possible processed audio paths to check for a post."""
    # Local imports keep the heavier modules off the module import path.
    import re

    from podcast_processor.podcast_downloader import
sanitize_title
    from shared.processing_paths import get_srv_root, paths_from_unprocessed_path

    processed_paths_to_check: List[Path] = []

    # 1. Check database path first (most reliable if set)
    if post.processed_audio_path:
        processed_paths_to_check.append(Path(post.processed_audio_path))

    # 2. Compute path using paths_from_unprocessed_path (matches processor logic)
    if post.unprocessed_audio_path and post.feed and post.feed.title:
        processing_paths = paths_from_unprocessed_path(
            post.unprocessed_audio_path, post.feed.title
        )
        if processing_paths:
            processed_paths_to_check.append(processing_paths.post_processed_audio_path)

    # 3. Fallback: compute expected path from post/feed titles
    if post.feed and post.feed.title and post.title:
        safe_feed_title = sanitize_title(post.feed.title)
        safe_post_title = sanitize_title(post.title)
        processed_paths_to_check.append(
            get_srv_root() / safe_feed_title / f"{safe_post_title}.mp3"
        )

        # 4. Also check with underscore-style sanitization
        # NOTE(review): this step reuses safe_post_title from step 3, so it must
        # remain nested under the same guard — confirm against the original
        # file's indentation.
        sanitized_feed_title = re.sub(r"[^a-zA-Z0-9\s_.-]", "", post.feed.title).strip()
        sanitized_feed_title = sanitized_feed_title.rstrip(".")
        sanitized_feed_title = re.sub(r"\s+", "_", sanitized_feed_title)
        processed_paths_to_check.append(
            get_srv_root() / sanitized_feed_title / f"{safe_post_title}.mp3"
        )

    return processed_paths_to_check


def _dedupe_and_find_existing(paths: List[Path]) -> tuple[List[Path], Optional[Path]]:
    """Deduplicate paths and find the first existing one."""
    # Resolve before comparing so symlinked/relative duplicates collapse.
    seen: set[Path] = set()
    unique_paths: List[Path] = []
    for p in paths:
        resolved = p.resolve()
        if resolved not in seen:
            seen.add(resolved)
            unique_paths.append(resolved)
    existing_path: Optional[Path] = None
    for p in unique_paths:
        if p.exists():
            existing_path = p
            break
    return unique_paths, existing_path


def _remove_file_if_exists(path: Optional[Path], file_type: str, post_id: int) -> None:
    """Remove a file if it exists and log the result."""
    if not path:
        logger.debug(f"{file_type} path is None for post {post_id}.")
        return
    if not path.exists():
        logger.debug(f"No {file_type} file to remove for post {post_id}.")
        return
    try:
        path.unlink()
        logger.info(f"Removed {file_type} file: {path}")
    except OSError as e:
        logger.error(f"Failed to remove {file_type} file {path}: {e}")


def remove_associated_files(post: Post) -> None:
    """
    Remove unprocessed and processed audio files associated with a post.

    Computes paths from post/feed metadata to ensure files are found even if
    database paths are already cleared. We check multiple possible locations
    for processed audio because the path calculation has varied over time and
    between different code paths.
    """
    try:
        # Collect and find processed audio path
        processed_paths = _collect_processed_paths(post)
        unique_paths, processed_abs_path = _dedupe_and_find_existing(processed_paths)

        # Compute expected unprocessed audio path
        unprocessed_abs_path: Optional[Path] = None
        if post.title:
            unprocessed_path = get_and_make_download_path(post.title)
            if unprocessed_path:
                unprocessed_abs_path = Path(unprocessed_path).resolve()

        # Fallback: if we couldn't find a processed path, try using the stored path directly
        if processed_abs_path is None and post.processed_audio_path:
            processed_abs_path = Path(post.processed_audio_path).resolve()

        # Remove audio files
        _remove_file_if_exists(unprocessed_abs_path, "unprocessed audio", post.id)
        if processed_abs_path:
            _remove_file_if_exists(processed_abs_path, "processed audio", post.id)
        elif unique_paths:
            logger.debug(
                f"No processed audio file to remove for post {post.id}. "
                f"Checked paths: {[str(p) for p in unique_paths]}"
            )
        else:
            logger.debug(
                f"Could not determine processed audio path for post {post.id}."
) except Exception as e: # pylint: disable=broad-except logger.error( f"Unexpected error in remove_associated_files for post {post.id}: {e}", exc_info=True, ) def clear_post_processing_data(post: Post) -> None: """ Clear all processing data for a post including: - Audio files (unprocessed and processed) - Database entries (transcript segments, identifications, model calls, processing jobs) - Reset relevant post fields """ try: logger.info( f"Starting to clear processing data for post: {post.title} (ID: {post.id})" ) # Remove audio files first remove_associated_files(post) writer_client.action( "clear_post_processing_data", {"post_id": post.id}, wait=True ) logger.info( f"Successfully cleared all processing data for post: {post.title} (ID: {post.id})" ) except Exception as e: logger.error( f"Error clearing processing data for post {post.id}: {e}", exc_info=True, ) raise PostException(f"Failed to clear processing data: {str(e)}") from e class PostException(Exception): pass ================================================ FILE: src/app/processor.py ================================================ from app.runtime_config import config from podcast_processor.podcast_processor import PodcastProcessor class ProcessorSingleton: """Singleton class to manage the PodcastProcessor instance.""" _instance: PodcastProcessor | None = None @classmethod def get_instance(cls) -> PodcastProcessor: """Get or create the PodcastProcessor instance.""" if cls._instance is None: cls._instance = PodcastProcessor(config) return cls._instance @classmethod def reset_instance(cls) -> None: """Reset the singleton instance (useful for testing).""" cls._instance = None def get_processor() -> PodcastProcessor: """Get the PodcastProcessor instance.""" return ProcessorSingleton.get_instance() ================================================ FILE: src/app/routes/__init__.py ================================================ from flask import Flask from .auth_routes import auth_bp from .billing_routes 
import billing_bp
from .config_routes import config_bp
from .discord_routes import discord_bp
from .feed_routes import feed_bp
from .jobs_routes import jobs_bp
from .main_routes import main_bp
from .post_routes import post_bp


def register_routes(app: Flask) -> None:
    """Register all route blueprints with the Flask app."""
    app.register_blueprint(main_bp)
    app.register_blueprint(feed_bp)
    app.register_blueprint(post_bp)
    app.register_blueprint(config_bp)
    app.register_blueprint(jobs_bp)
    app.register_blueprint(auth_bp)
    app.register_blueprint(billing_bp)
    app.register_blueprint(discord_bp)


================================================
FILE: src/app/routes/auth_routes.py
================================================
from __future__ import annotations

import logging
from typing import cast

from flask import Blueprint, Response, current_app, g, jsonify, request, session

from app.auth.service import (
    AuthServiceError,
    DuplicateUserError,
    InvalidCredentialsError,
    LastAdminRemovalError,
    PasswordValidationError,
    UserLimitExceededError,
    authenticate,
    change_password,
    create_user,
    delete_user,
    list_users,
    set_manual_feed_allowance,
    set_role,
    update_password,
    update_user_last_active,
)
from app.auth.state import failure_rate_limiter
from app.extensions import db
from app.models import User
from app.runtime_config import config as runtime_config

logger = logging.getLogger("global_logger")

auth_bp = Blueprint("auth", __name__)

# Shape of every auth route's return: a response, optionally with a status
# code, optionally with extra headers.
RouteResult = Response | tuple[Response, int] | tuple[Response, int, dict[str, str]]

# Session key under which the authenticated user's id is stored.
SESSION_USER_KEY = "user_id"


def _auth_enabled() -> bool:
    """Return True when AUTH_SETTINGS exists and require_auth is set."""
    settings = current_app.config.get("AUTH_SETTINGS")
    return bool(settings and settings.require_auth)


@auth_bp.route("/api/auth/status", methods=["GET"])
def auth_status() -> Response:
    """Report whether auth is required and whether the public landing page is on."""
    landing_enabled = bool(getattr(runtime_config, "enable_public_landing_page", False))
    return jsonify(
        {"require_auth": _auth_enabled(), "landing_page_enabled": landing_enabled}
    )


@auth_bp.route("/api/auth/login", methods=["POST"])
def login() -> RouteResult:
    """Authenticate a user, applying per-client failure rate limiting.

    Returns 404 when auth is disabled, 400 on missing fields, 429 when the
    client is backing off, 401 on bad credentials, else the user payload.
    """
    if not _auth_enabled():
        return jsonify({"error": "Authentication is disabled."}), 404
    payload = request.get_json(silent=True) or {}
    username = (payload.get("username") or "").strip()
    password = payload.get("password") or ""
    if not username or not password:
        return jsonify({"error": "Username and password are required."}), 400
    # Rate limiting keys off the remote address.
    client_identifier = request.remote_addr or "unknown"
    retry_after = failure_rate_limiter.retry_after(client_identifier)
    if retry_after:
        return (
            jsonify({"error": "Too many failed attempts.", "retry_after": retry_after}),
            429,
            {"Retry-After": str(retry_after)},
        )
    authenticated = authenticate(username, password)
    if authenticated is None:
        # Record the failure; the limiter may hand back a backoff interval.
        backoff = failure_rate_limiter.register_failure(client_identifier)
        response_headers: dict[str, str] = {}
        if backoff:
            response_headers["Retry-After"] = str(backoff)
        response = jsonify({"error": "Invalid username or password."})
        if response_headers:
            return response, 401, response_headers
        return response, 401
    failure_rate_limiter.register_success(client_identifier)
    # Rotate the session on successful login.
    session.clear()
    session[SESSION_USER_KEY] = authenticated.id
    session.permanent = True
    update_user_last_active(authenticated.id)
    # Calculate effective allowance for frontend display
    allowance = getattr(authenticated, "manual_feed_allowance", None)
    if allowance is None:
        allowance = getattr(authenticated, "feed_allowance", 0)
    return jsonify(
        {
            "user": {
                "id": authenticated.id,
                "username": authenticated.username,
                "role": authenticated.role,
                "feed_allowance": allowance,
                "feed_subscription_status": getattr(
                    authenticated, "feed_subscription_status", "inactive"
                ),
            }
        }
    )


@auth_bp.route("/api/auth/logout", methods=["POST"])
def logout() -> RouteResult:
    """Clear the session; 401 if there is no current user, else 204."""
    if not _auth_enabled():
        return jsonify({"error": "Authentication is disabled."}), 404
    if getattr(g, "current_user", None) is None:
        session.clear()
        return jsonify({"error": "Authentication required."}), 401
    session.clear()
    return Response(status=204)


@auth_bp.route("/api/auth/me",
methods=["GET"]) def auth_me() -> RouteResult: if not _auth_enabled(): return jsonify({"error": "Authentication is disabled."}), 404 user = _require_authenticated_user() if user is None: return _unauthorized_response() # Calculate effective allowance for frontend display allowance = getattr(user, "manual_feed_allowance", None) if allowance is None: allowance = getattr(user, "feed_allowance", 0) return jsonify( { "user": { "id": user.id, "username": user.username, "role": user.role, "feed_allowance": allowance, "feed_subscription_status": getattr( user, "feed_subscription_status", "inactive" ), } } ) @auth_bp.route("/api/auth/change-password", methods=["POST"]) def change_password_route() -> RouteResult: if not _auth_enabled(): return jsonify({"error": "Authentication is disabled."}), 404 user = _require_authenticated_user() if user is None: return _unauthorized_response() payload = request.get_json(silent=True) or {} current_password = payload.get("current_password") or "" new_password = payload.get("new_password") or "" if not current_password or not new_password: return ( jsonify({"error": "Current and new passwords are required."}), 400, ) try: change_password(user, current_password, new_password) except InvalidCredentialsError as exc: return jsonify({"error": str(exc)}), 401 except PasswordValidationError as exc: return jsonify({"error": str(exc)}), 400 except AuthServiceError as exc: # fallback logger.error("Password change failed: %s", exc) return jsonify({"error": "Unable to change password."}), 500 return jsonify({"status": "ok"}) @auth_bp.route("/api/auth/users", methods=["GET"]) def list_users_route() -> RouteResult: if not _auth_enabled(): return jsonify({"error": "Authentication is disabled."}), 404 user = _require_authenticated_user() if user is None: return _unauthorized_response() if not user.role == "admin": return jsonify({"error": "Admin privileges required."}), 403 users = list_users() return jsonify( { "users": [ { "id": u.id, "username": 
u.username, "role": u.role, "created_at": u.created_at.isoformat(), "updated_at": u.updated_at.isoformat(), "last_active": u.last_active.isoformat() if u.last_active else None, "feed_allowance": getattr(u, "feed_allowance", 0), "manual_feed_allowance": getattr(u, "manual_feed_allowance", None), "feed_subscription_status": getattr( u, "feed_subscription_status", "inactive" ), } for u in users ] } ) @auth_bp.route("/api/auth/users", methods=["POST"]) def create_user_route() -> RouteResult: if not _auth_enabled(): return jsonify({"error": "Authentication is disabled."}), 404 user = _require_authenticated_user() if user is None: return _unauthorized_response() if user.role != "admin": return jsonify({"error": "Admin privileges required."}), 403 payload = request.get_json(silent=True) or {} username = (payload.get("username") or "").strip() password = payload.get("password") or "" role = (payload.get("role") or "user").strip() if not username or not password: return jsonify({"error": "Username and password are required."}), 400 try: new_user = create_user(username, password, role) except ( PasswordValidationError, DuplicateUserError, UserLimitExceededError, AuthServiceError, ) as exc: status = 409 if isinstance(exc, DuplicateUserError) else 400 return jsonify({"error": str(exc)}), status return ( jsonify( { "user": { "id": new_user.id, "username": new_user.username, "role": new_user.role, "created_at": new_user.created_at.isoformat(), "updated_at": new_user.updated_at.isoformat(), } } ), 201, ) @auth_bp.route("/api/auth/users/", methods=["PATCH"]) def update_user_route(username: str) -> RouteResult: if not _auth_enabled(): return jsonify({"error": "Authentication is disabled."}), 404 acting_user = _require_authenticated_user() if acting_user is None: return _unauthorized_response() if acting_user.role != "admin": return jsonify({"error": "Admin privileges required."}), 403 target = User.query.filter_by(username=username.lower()).first() if target is None: return 
jsonify({"error": "User not found."}), 404 payload = request.get_json(silent=True) or {} role = payload.get("role") new_password = payload.get("password") manual_feed_allowance = payload.get("manual_feed_allowance") try: if role is not None: set_role(target, role) if new_password: update_password(target, new_password) if "manual_feed_allowance" in payload: set_manual_feed_allowance(target, manual_feed_allowance) return jsonify({"status": "ok"}) except (PasswordValidationError, LastAdminRemovalError, AuthServiceError) as exc: status_code = 400 return jsonify({"error": str(exc)}), status_code @auth_bp.route("/api/auth/users/", methods=["DELETE"]) def delete_user_route(username: str) -> RouteResult: if not _auth_enabled(): return jsonify({"error": "Authentication is disabled."}), 404 acting_user = _require_authenticated_user() if acting_user is None: return _unauthorized_response() if acting_user.role != "admin": return jsonify({"error": "Admin privileges required."}), 403 target = User.query.filter_by(username=username.lower()).first() if target is None: return jsonify({"error": "User not found."}), 404 try: delete_user(target) except LastAdminRemovalError as exc: return jsonify({"error": str(exc)}), 400 return jsonify({"status": "ok"}) def _require_authenticated_user() -> User | None: if not _auth_enabled(): return None current = getattr(g, "current_user", None) if current is None: return None return cast(User | None, db.session.get(User, current.id)) def _unauthorized_response() -> RouteResult: if not _auth_enabled(): return jsonify({"error": "Authentication is disabled."}), 404 return jsonify({"error": "Authentication required."}), 401 ================================================ FILE: src/app/routes/billing_routes.py ================================================ import logging import os from typing import Any, Optional from flask import Blueprint, jsonify, request from app.extensions import db from app.models import User, UserFeed from app.writer.client 
import writer_client

from .auth_routes import _require_authenticated_user

logger = logging.getLogger("global_logger")

billing_bp = Blueprint("billing", __name__)


def _get_stripe_client() -> tuple[Optional[Any], Optional[str]]:
    """Return (stripe module, None) when configured, else (None, reason)."""
    secret = os.getenv("STRIPE_SECRET_KEY")
    if not secret:
        return None, "Stripe secret key missing"
    try:
        # Imported lazily so the app runs without the stripe package installed.
        import stripe
    except ImportError:
        return None, "Stripe library not installed"
    stripe.api_key = secret
    return stripe, None


def _product_id() -> Optional[str]:
    # Stripe product backing the pay-what-you-want subscription.
    return os.getenv("STRIPE_PRODUCT_ID")


def _min_subscription_amount_cents() -> int:
    """Minimum non-zero subscription amount in cents.

    Allow 0 to cancel, otherwise enforce this minimum. Configurable via
    STRIPE_MIN_SUBSCRIPTION_AMOUNT_CENTS.
    """
    raw = os.getenv("STRIPE_MIN_SUBSCRIPTION_AMOUNT_CENTS")
    if raw is None or raw == "":
        return 100
    try:
        value = int(raw)
    except ValueError:
        logger.warning(
            "Invalid STRIPE_MIN_SUBSCRIPTION_AMOUNT_CENTS=%r; defaulting to 100",
            raw,
        )
        return 100
    # Never return a negative minimum.
    return max(0, value)


def _user_feed_usage(user: User) -> dict[str, int]:
    """Compute feed allowance, feeds in use, and remaining slots for a user."""
    feeds_in_use = UserFeed.query.filter_by(user_id=user.id).count()
    # Manual allowance (admin override) takes precedence over the billed one.
    allowance = getattr(user, "manual_feed_allowance", None)
    if allowance is None:
        allowance = getattr(user, "feed_allowance", 0) or 0
    remaining = max(0, allowance - feeds_in_use)
    return {
        "feed_allowance": allowance,
        "feeds_in_use": feeds_in_use,
        "remaining": remaining,
    }


@billing_bp.route("/api/billing/summary", methods=["GET"])
def billing_summary() -> Any:
    """Return feed allowance and subscription state for the current user."""
    user = _require_authenticated_user()
    if user is None:
        logger.warning("Billing summary requested by unauthenticated user")
        return jsonify({"error": "Authentication required"}), 401
    logger.info("Billing summary requested for user %s", user.id)
    usage = _user_feed_usage(user)
    product_id = _product_id()
    stripe_client, _ = _get_stripe_client()
    current_amount = 0
    if (
        stripe_client is not None
        and user.stripe_customer_id
        and not user.stripe_subscription_id
    ):
        # Try to find an active subscription if we don't have one linked
        subs = stripe_client.Subscription.list(
            customer=user.stripe_customer_id, limit=1, status="active"
        )
        if subs and subs.get("data"):
            sub = subs["data"][0]
            items = sub.get("items", {}).get("data", [])
            # For PWYW bundle, allowance is 10 if active
            feed_allowance = 10 if items else 0
            writer_client.action(
                "set_user_billing_fields",
                {
                    "user_id": user.id,
                    "stripe_subscription_id": sub["id"],
                    "feed_subscription_status": sub["status"],
                    "feed_allowance": feed_allowance,
                },
                wait=True,
            )
            # Expire so the next attribute access reloads the writer's changes.
            db.session.expire(user)
            usage = _user_feed_usage(user)
    # Fetch current price amount if subscribed
    if (
        stripe_client is not None
        and user.stripe_subscription_id
        and user.feed_subscription_status == "active"
    ):
        try:
            sub = stripe_client.Subscription.retrieve(
                user.stripe_subscription_id, expand=["items.data.price"]
            )
            if sub and sub.get("items") and sub["items"]["data"]:
                price_item = sub["items"]["data"][0].get("price")
                if price_item:
                    current_amount = price_item.get("unit_amount", 0)
        except Exception as e:
            # Summary still renders without the current amount.
            logger.error("Error fetching subscription details: %s", e)
    return jsonify(
        {
            "feed_allowance": usage["feed_allowance"],
            "feeds_in_use": usage["feeds_in_use"],
            "remaining": usage["remaining"],
            "current_amount": current_amount,
            "min_amount_cents": _min_subscription_amount_cents(),
            "subscription_status": getattr(
                user, "feed_subscription_status", "inactive"
            ),
            "stripe_subscription_id": getattr(user, "stripe_subscription_id", None),
            "stripe_customer_id": getattr(user, "stripe_customer_id", None),
            "product_id": product_id,
        }
    )


def _build_return_urls() -> tuple[str, str]:
    """Build (success, cancel) URLs for Stripe checkout based on request host."""
    host = request.host_url.rstrip("/")
    success = f"{host}/billing?checkout=success"
    cancel = f"{host}/billing?checkout=cancel"
    return success, cancel


@billing_bp.route("/api/billing/subscription", methods=["POST"])
def update_subscription() -> Any:  # pylint: disable=too-many-statements
    """Update subscription amount or create new subscription."""
    user =
_require_authenticated_user()
    if user is None:
        logger.warning("Update subscription requested by unauthenticated user")
        return jsonify({"error": "Authentication required"}), 401
    payload = request.get_json(silent=True) or {}
    amount = int(payload.get("amount") or 0)
    logger.info("Update subscription for user %s: %s cents", user.id, amount)
    # Allow 0 to cancel, otherwise enforce configured minimum.
    min_amount_cents = _min_subscription_amount_cents()
    if 0 < amount < min_amount_cents:
        min_amount_dollars = min_amount_cents / 100.0
        return (
            jsonify({"error": f"Minimum amount is ${min_amount_dollars:.2f}"}),
            400,
        )
    stripe_client, stripe_err = _get_stripe_client()
    product_id = _product_id()
    if stripe_client is None or not product_id:
        logger.error("Stripe not configured. err=%s", stripe_err)
        return (
            jsonify(
                {
                    "error": "STRIPE_NOT_CONFIGURED",
                    "message": "Billing system is not configured.",
                }
            ),
            503,
        )
    try:
        requested_subscription_id = payload.get("subscription_id")
        if (
            requested_subscription_id
            and not user.stripe_subscription_id
            and stripe_client is not None
        ):
            # Attach known subscription id to the user if it belongs to their customer
            sub = stripe_client.Subscription.retrieve(requested_subscription_id)
            if sub and sub.get("customer") == user.stripe_customer_id:
                writer_client.action(
                    "set_user_billing_fields",
                    {"user_id": user.id, "stripe_subscription_id": sub["id"]},
                    wait=True,
                )
                db.session.expire(user)
        # Ensure customer exists
        if not user.stripe_customer_id:
            customer = stripe_client.Customer.create(
                name=user.username or f"user-{user.id}",
                metadata={"user_id": user.id},
            )
            writer_client.action(
                "set_user_billing_fields",
                {"user_id": user.id, "stripe_customer_id": customer["id"]},
                wait=True,
            )
            db.session.expire(user)
        # If subscription exists, update or cancel
        if user.stripe_subscription_id:
            if amount <= 0:
                # Zero amount with an existing subscription means "cancel".
                logger.info("Canceling subscription for user %s", user.id)
                stripe_client.Subscription.delete(user.stripe_subscription_id)
                writer_client.action(
                    "set_user_billing_fields",
                    {
                        "user_id": user.id,
                        "feed_allowance": 0,
                        "feed_subscription_status": "canceled",
                        "stripe_subscription_id": None,
                    },
                    wait=True,
                )
                db.session.expire(user)
                usage = _user_feed_usage(user)
                return jsonify(
                    {
                        "feed_allowance": usage["feed_allowance"],
                        "feeds_in_use": usage["feeds_in_use"],
                        "remaining": usage["remaining"],
                        "subscription_status": user.feed_subscription_status,
                        "requires_stripe_checkout": False,
                        "message": "Subscription canceled.",
                    }
                )
            # Update existing subscription with new price
            sub = stripe_client.Subscription.retrieve(
                user.stripe_subscription_id, expand=["items"]
            )
            items = sub["items"]["data"]
            if not items:
                return jsonify({"error": "Subscription has no items"}), 400
            item_id = items[0]["id"]
            updated = stripe_client.Subscription.modify(
                user.stripe_subscription_id,
                items=[
                    {
                        "id": item_id,
                        # Inline price_data swaps the price without a catalog entry.
                        "price_data": {
                            "currency": "usd",
                            "product": product_id,
                            "unit_amount": amount,
                            "recurring": {"interval": "month"},
                        },
                    }
                ],
                proration_behavior="none",
            )
            logger.info(
                "Updated subscription for user %s to amount %s", user.id, amount
            )
            status = updated["status"]
            writer_client.action(
                "set_user_billing_fields",
                {
                    "user_id": user.id,
                    "feed_allowance": 10,  # Fixed allowance for active sub
                    "feed_subscription_status": status,
                },
                wait=True,
            )
            db.session.expire(user)
            usage = _user_feed_usage(user)
            return jsonify(
                {
                    "feed_allowance": usage["feed_allowance"],
                    "feeds_in_use": usage["feeds_in_use"],
                    "remaining": usage["remaining"],
                    "subscription_status": status,
                    "requires_stripe_checkout": False,
                    "message": "Subscription updated.",
                }
            )
        # Otherwise, create checkout session for a new subscription
        if amount <= 0:
            # Zero amount with no subscription: just record the inactive state.
            writer_client.action(
                "set_user_billing_fields",
                {
                    "user_id": user.id,
                    "feed_allowance": 0,
                    "feed_subscription_status": "inactive",
                },
                wait=True,
            )
            db.session.expire(user)
            usage = _user_feed_usage(user)
            return jsonify(
                {
                    "feed_allowance": usage["feed_allowance"],
                    "feeds_in_use": usage["feeds_in_use"],
                    "remaining": usage["remaining"],
                    "subscription_status": user.feed_subscription_status,
                    "requires_stripe_checkout": False,
                    "message": "No subscription created for zero amount.",
                }
            )
        logger.info(
            "Creating checkout session for user %s with amount %s", user.id, amount
        )
        success_url, cancel_url = _build_return_urls()
        session = stripe_client.checkout.Session.create(
            mode="subscription",
            customer=user.stripe_customer_id,
            line_items=[
                {
                    "price_data": {
                        "currency": "usd",
                        "product": product_id,
                        "unit_amount": amount,
                        "recurring": {"interval": "month"},
                    },
                    "quantity": 1,
                }
            ],
            subscription_data={"metadata": {"user_id": user.id}},
            metadata={"user_id": user.id},
            success_url=payload.get("success_url") or success_url,
            cancel_url=payload.get("cancel_url") or cancel_url,
        )
        return jsonify(
            {
                "checkout_url": session["url"],
                "requires_stripe_checkout": True,
                "feed_allowance": user.feed_allowance,
                "feeds_in_use": _user_feed_usage(user)["feeds_in_use"],
                "subscription_status": user.feed_subscription_status,
            }
        )
    except Exception as exc:  # pylint: disable=broad-except
        logger.error("Stripe error updating subscription: %s", exc)
        return jsonify({"error": "STRIPE_ERROR", "message": str(exc)}), 502
    # NOTE(review): the block below is unreachable — every path in the
    # try/except above returns. Left in place; candidate for removal.
    usage = _user_feed_usage(user)
    return jsonify(
        {
            "feed_allowance": usage["feed_allowance"],
            "feeds_in_use": usage["feeds_in_use"],
            "remaining": usage["remaining"],
            "subscription_status": user.feed_subscription_status,
            "requires_stripe_checkout": True,
            "message": "Local update completed.",
        }
    )


@billing_bp.route("/api/billing/portal-session", methods=["POST"])
def billing_portal_session() -> Any:
    """Create a Stripe customer-portal session for the current user."""
    user = _require_authenticated_user()
    if user is None:
        logger.warning("Billing portal session requested by unauthenticated user")
        return jsonify({"error": "Authentication required"}), 401
    logger.info("Billing portal session requested for user %s", user.id)
    stripe_client, stripe_err = _get_stripe_client()
    if stripe_client is None:
        return jsonify({"error": "STRIPE_NOT_CONFIGURED", "message": stripe_err}), 400
    if not user.stripe_customer_id:
        return (
            jsonify(
                {
"error": "NO_STRIPE_CUSTOMER",
                    "message": "No Stripe customer on file.",
                }
            ),
            400,
        )
    return_url, _ = _build_return_urls()
    try:
        session = stripe_client.billing_portal.Session.create(
            customer=user.stripe_customer_id,
            return_url=return_url,
        )
        return jsonify({"url": session["url"]})
    except Exception as exc:  # pylint: disable=broad-except
        logger.error("Failed to create billing portal session: %s", exc)
        return jsonify({"error": "STRIPE_ERROR", "message": str(exc)}), 502


def _update_user_from_subscription(sub: Any) -> None:
    """Sync a user's billing fields from a Stripe subscription event object."""
    customer_id = sub.get("customer")
    if not customer_id:
        return
    user = User.query.filter_by(stripe_customer_id=customer_id).first()
    if not user:
        return
    status = sub.get("status") if isinstance(sub, dict) else sub["status"]
    # For PWYW bundle, allowance is 10 if active
    feed_allowance = 10 if status in ("active", "trialing", "past_due") else 0
    writer_client.action(
        "set_user_billing_by_customer_id",
        {
            "stripe_customer_id": customer_id,
            "feed_allowance": feed_allowance,
            "feed_subscription_status": status,
            "stripe_subscription_id": (
                sub.get("id") if isinstance(sub, dict) else sub["id"]
            ),
        },
        wait=True,
    )


@billing_bp.route("/api/billing/stripe-webhook", methods=["POST"])
def stripe_webhook() -> Any:
    """Handle Stripe webhook events; requires a configured signing secret."""
    stripe_client, stripe_err = _get_stripe_client()
    if stripe_client is None:
        return jsonify({"error": "STRIPE_NOT_CONFIGURED", "message": stripe_err}), 400
    payload = request.data
    sig_header = request.headers.get("Stripe-Signature")
    webhook_secret = os.getenv("STRIPE_WEBHOOK_SECRET")
    if not webhook_secret:
        # Refuse to process unsigned webhooks rather than trusting the payload.
        logger.error("Stripe webhook secret not configured; rejecting webhook request.")
        return (
            jsonify(
                {
                    "error": "WEBHOOK_SECRET_MISSING",
                    "message": "Stripe webhook secret is not configured.",
                }
            ),
            400,
        )
    try:
        # Verifies the signature against the raw request body.
        event = stripe_client.Webhook.construct_event(
            payload, sig_header, webhook_secret
        )
        logger.info("Stripe webhook received: %s", event["type"])
    except Exception as exc:  # pylint: disable=broad-except
        logger.error("Invalid Stripe webhook: %s", exc)
        return jsonify({"error": "INVALID_SIGNATURE"}), 400
    event_type = event["type"]
    data_object = event["data"]["object"]
    if event_type in (
        "customer.subscription.created",
        "customer.subscription.updated",
        "customer.subscription.deleted",
        "customer.subscription.paused",
    ):
        _update_user_from_subscription(data_object)
    elif event_type == "checkout.session.completed":
        sub_id = data_object.get("subscription")
        customer_id = data_object.get("customer")
        user_id = data_object.get("metadata", {}).get("user_id")
        user = None
        # Prefer lookup by customer id; fall back to the user_id in metadata.
        if customer_id:
            user = User.query.filter_by(stripe_customer_id=customer_id).first()
        if user is None and user_id:
            user = db.session.get(User, int(user_id))
        if user and customer_id:
            writer_client.action(
                "set_user_billing_fields",
                {"user_id": user.id, "stripe_customer_id": customer_id},
                wait=True,
            )
            db.session.expire(user)
        if user and sub_id:
            writer_client.action(
                "set_user_billing_fields",
                {"user_id": user.id, "stripe_subscription_id": sub_id},
                wait=True,
            )
            db.session.expire(user)
    else:
        logger.info("Unhandled Stripe event: %s", event_type)
    return jsonify({"status": "ok"})


================================================
FILE: src/app/routes/config_routes.py
================================================
import logging
import os
from typing import Any, Dict

import flask
import litellm
from flask import Blueprint, jsonify, request
from groq import Groq
from openai import OpenAI

from app.auth.guards import require_admin
from app.config_store import read_combined, to_pydantic_config
from app.processor import ProcessorSingleton
from app.runtime_config import config as runtime_config
from app.writer.client import writer_client
from shared.llm_utils import model_uses_max_completion_tokens

logger = logging.getLogger("global_logger")

config_bp = Blueprint("config", __name__)


def _mask_secret(value: Any | None) -> str | None:
    """Return a masked preview of a secret, or None when there is nothing to show."""
    if value is None:
        return None
    try:
        secret = str(value).strip()
    except Exception:  # pragma: no cover - defensive
        return None
    if not secret:
        return None
if len(secret) <= 8:
        # Too short to mask meaningfully; return as-is.
        return secret
    return f"{secret[:4]}...{secret[-4:]}"


def _sanitize_config_for_client(cfg: Dict[str, Any]) -> Dict[str, Any]:
    """Strip raw API keys from the config, replacing them with masked previews."""
    try:
        data: Dict[str, Any] = dict(cfg)
        llm: Dict[str, Any] = dict(data.get("llm", {}))
        whisper: Dict[str, Any] = dict(data.get("whisper", {}))
        llm_api_key = llm.pop("llm_api_key", None)
        if llm_api_key:
            llm["llm_api_key_preview"] = _mask_secret(llm_api_key)
        whisper_api_key = whisper.pop("api_key", None)
        if whisper_api_key:
            whisper["api_key_preview"] = _mask_secret(whisper_api_key)
        data["llm"] = llm
        data["whisper"] = whisper
        return data
    except Exception:
        # Never leak an unsanitized config on error.
        return {}


@config_bp.route("/api/config", methods=["GET"])
def api_get_config() -> flask.Response:
    """Return the sanitized combined config plus env-override metadata (admin only)."""
    _, error_response = require_admin()
    if error_response:
        return error_response
    try:
        data = read_combined()
        # Overlay live runtime values so the UI reflects effective settings.
        _hydrate_runtime_config(data)
        env_metadata = _build_env_override_metadata(data)
        return flask.jsonify(
            {
                "config": _sanitize_config_for_client(data),
                "env_overrides": env_metadata,
            }
        )
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f"Failed to read configuration: {e}")
        return flask.make_response(
            jsonify({"error": "Failed to read configuration"}), 500
        )


def _hydrate_runtime_config(data: Dict[str, Any]) -> None:
    """Overlay runtime-config values onto the stored config dict, in place."""
    _hydrate_llm_config(data)
    _hydrate_whisper_config(data)
    _hydrate_app_config(data)


def _hydrate_llm_config(data: Dict[str, Any]) -> None:
    """Overlay LLM runtime settings; stored values act as fallbacks."""
    data.setdefault("llm", {})
    llm = data["llm"]
    llm["llm_api_key"] = getattr(runtime_config, "llm_api_key", llm.get("llm_api_key"))
    llm["llm_model"] = getattr(runtime_config, "llm_model", llm.get("llm_model"))
    llm["openai_base_url"] = getattr(
        runtime_config, "openai_base_url", llm.get("openai_base_url")
    )
    llm["openai_timeout"] = getattr(
        runtime_config, "openai_timeout", llm.get("openai_timeout")
    )
    llm["openai_max_tokens"] = getattr(
        runtime_config, "openai_max_tokens", llm.get("openai_max_tokens")
    )
    llm["llm_max_concurrent_calls"] = getattr(
        runtime_config, "llm_max_concurrent_calls", llm.get("llm_max_concurrent_calls")
    )
    llm["llm_max_retry_attempts"] = getattr(
        runtime_config, "llm_max_retry_attempts", llm.get("llm_max_retry_attempts")
    )
    llm["llm_max_input_tokens_per_call"] = getattr(
        runtime_config,
        "llm_max_input_tokens_per_call",
        llm.get("llm_max_input_tokens_per_call"),
    )
    llm["llm_enable_token_rate_limiting"] = getattr(
        runtime_config,
        "llm_enable_token_rate_limiting",
        llm.get("llm_enable_token_rate_limiting"),
    )
    llm["llm_max_input_tokens_per_minute"] = getattr(
        runtime_config,
        "llm_max_input_tokens_per_minute",
        llm.get("llm_max_input_tokens_per_minute"),
    )


def _hydrate_whisper_config(data: Dict[str, Any]) -> None:
    """Overlay whisper runtime settings; handles dict and object-shaped configs."""
    data.setdefault("whisper", {})
    whisper = data["whisper"]
    rt_whisper = getattr(runtime_config, "whisper", None)
    if isinstance(rt_whisper, dict):
        _overlay_whisper_dict(whisper, rt_whisper)
        return
    if rt_whisper is not None and hasattr(rt_whisper, "whisper_type"):
        _overlay_whisper_object(whisper, rt_whisper)


def _overlay_whisper_dict(target: Dict[str, Any], source: Dict[str, Any]) -> None:
    """Overlay whisper fields from a dict source, per whisper_type."""
    wtype = source.get("whisper_type")
    target["whisper_type"] = wtype or target.get("whisper_type")
    if wtype == "local":
        target["model"] = source.get("model", target.get("model"))
    elif wtype == "remote":
        _overlay_remote_whisper_fields(target, source)
    elif wtype == "groq":
        _overlay_groq_whisper_fields(target, source)


def _overlay_whisper_object(target: Dict[str, Any], source: Any) -> None:
    """Overlay whisper fields from an attribute-style source, per whisper_type."""
    wtype = getattr(source, "whisper_type")
    target["whisper_type"] = wtype
    if wtype == "local":
        target["model"] = getattr(source, "model", target.get("model"))
    elif wtype == "remote":
        _overlay_remote_whisper_fields(target, source)
    elif wtype == "groq":
        _overlay_groq_whisper_fields(target, source)


def _overlay_remote_whisper_fields(target: Dict[str, Any], source: Any) -> None:
    """Copy remote-whisper fields from source (dict or object) onto target."""
    target["model"] = _get_attr_or_value(source, "model", target.get("model"))
    target["api_key"] = _get_attr_or_value(source, "api_key", target.get("api_key"))
    target["base_url"] = _get_attr_or_value(source, "base_url", target.get("base_url"))
    target["language"] = _get_attr_or_value(source, "language", target.get("language"))
    target["timeout_sec"] = _get_attr_or_value(
        source, "timeout_sec", target.get("timeout_sec")
    )
    target["chunksize_mb"] = _get_attr_or_value(
        source, "chunksize_mb", target.get("chunksize_mb")
    )


def _overlay_groq_whisper_fields(target: Dict[str, Any], source: Any) -> None:
    """Copy groq-whisper fields from source (dict or object) onto target."""
    target["api_key"] = _get_attr_or_value(source, "api_key", target.get("api_key"))
    target["model"] = _get_attr_or_value(source, "model", target.get("model"))
    target["language"] = _get_attr_or_value(source, "language", target.get("language"))
    target["max_retries"] = _get_attr_or_value(
        source, "max_retries", target.get("max_retries")
    )


def _get_attr_or_value(source: Any, key: str, default: Any) -> Any:
    """Uniform accessor: dict lookup or attribute access with a default."""
    if isinstance(source, dict):
        return source.get(key, default)
    return getattr(source, key, default)


def _hydrate_app_config(data: Dict[str, Any]) -> None:
    """Overlay app-level runtime settings; stored values act as fallbacks."""
    data.setdefault("app", {})
    app_cfg = data["app"]
    app_cfg["post_cleanup_retention_days"] = getattr(
        runtime_config,
        "post_cleanup_retention_days",
        app_cfg.get("post_cleanup_retention_days"),
    )
    app_cfg["enable_public_landing_page"] = getattr(
        runtime_config,
        "enable_public_landing_page",
        app_cfg.get("enable_public_landing_page"),
    )
    app_cfg["user_limit_total"] = getattr(
        runtime_config, "user_limit_total", app_cfg.get("user_limit_total")
    )
    app_cfg["autoprocess_on_download"] = getattr(
        runtime_config,
        "autoprocess_on_download",
        app_cfg.get("autoprocess_on_download"),
    )


def _first_env(env_names: list[str]) -> tuple[str | None, str | None]:
    """Return first found environment variable name and value."""
    for name in env_names:
        value = os.environ.get(name)
        if value is not None and value != "":
            return name, value
    return None, None


def _register_override(
    overrides: Dict[str, Any],
    path: str,
    env_var: str | None,
    value: Any | None,
    *,
    secret: bool = False,
) -> None:
    """Register an environment override in the metadata
dict."""
    if not env_var or value is None:
        return
    entry: Dict[str, Any] = {"env_var": env_var}
    if secret:
        # Secrets only ship a masked preview, never the raw value.
        entry["is_secret"] = True
        entry["value_preview"] = _mask_secret(value)
    else:
        entry["value"] = value
    overrides[path] = entry


def _register_llm_overrides(overrides: Dict[str, Any]) -> None:
    """Register LLM-related environment overrides."""
    # First matching env var wins, in priority order.
    env_var, env_value = _first_env(["LLM_API_KEY", "OPENAI_API_KEY", "GROQ_API_KEY"])
    _register_override(overrides, "llm.llm_api_key", env_var, env_value, secret=True)
    base_url = os.environ.get("OPENAI_BASE_URL")
    if base_url:
        _register_override(
            overrides, "llm.openai_base_url", "OPENAI_BASE_URL", base_url
        )
    llm_model = os.environ.get("LLM_MODEL")
    if llm_model:
        _register_override(overrides, "llm.llm_model", "LLM_MODEL", llm_model)


def _register_groq_shared_overrides(overrides: Dict[str, Any]) -> None:
    """Register shared Groq API key override metadata."""
    groq_key = os.environ.get("GROQ_API_KEY")
    if groq_key:
        _register_override(
            overrides, "groq.api_key", "GROQ_API_KEY", groq_key, secret=True
        )


def _register_remote_whisper_overrides(overrides: Dict[str, Any]) -> None:
    """Register remote whisper environment overrides."""
    remote_key = _first_env(["WHISPER_REMOTE_API_KEY", "OPENAI_API_KEY"])
    _register_override(
        overrides, "whisper.api_key", remote_key[0], remote_key[1], secret=True
    )
    remote_base = _first_env(["WHISPER_REMOTE_BASE_URL", "OPENAI_BASE_URL"])
    _register_override(overrides, "whisper.base_url", remote_base[0], remote_base[1])
    remote_model = os.environ.get("WHISPER_REMOTE_MODEL")
    if remote_model:
        _register_override(
            overrides, "whisper.model", "WHISPER_REMOTE_MODEL", remote_model
        )
    remote_timeout = os.environ.get("WHISPER_REMOTE_TIMEOUT_SEC")
    if remote_timeout:
        _register_override(
            overrides,
            "whisper.timeout_sec",
            "WHISPER_REMOTE_TIMEOUT_SEC",
            remote_timeout,
        )
    remote_chunksize = os.environ.get("WHISPER_REMOTE_CHUNKSIZE_MB")
    if remote_chunksize:
        _register_override(
            overrides,
            "whisper.chunksize_mb",
            "WHISPER_REMOTE_CHUNKSIZE_MB",
            remote_chunksize,
        )


def _register_groq_whisper_overrides(overrides: Dict[str, Any]) -> None:
    """Register groq whisper environment overrides."""
    groq_key = os.environ.get("GROQ_API_KEY")
    if groq_key:
        _register_override(
            overrides, "whisper.api_key", "GROQ_API_KEY", groq_key, secret=True
        )
    groq_model_env, groq_model_val = _first_env(
        ["GROQ_WHISPER_MODEL", "WHISPER_GROQ_MODEL"]
    )
    _register_override(overrides, "whisper.model", groq_model_env, groq_model_val)
    groq_retries = os.environ.get("GROQ_MAX_RETRIES")
    if groq_retries:
        _register_override(
            overrides, "whisper.max_retries", "GROQ_MAX_RETRIES", groq_retries
        )


def _register_local_whisper_overrides(overrides: Dict[str, Any]) -> None:
    """Register local whisper environment overrides."""
    local_model = os.environ.get("WHISPER_LOCAL_MODEL")
    if local_model:
        _register_override(
            overrides, "whisper.model", "WHISPER_LOCAL_MODEL", local_model
        )


def _determine_whisper_type_for_metadata(data: Dict[str, Any]) -> str | None:
    """Determine whisper type for environment metadata (with auto-detection)."""
    whisper_cfg = data.get("whisper", {}) or {}
    wtype = whisper_cfg.get("whisper_type")
    env_whisper_type = os.environ.get("WHISPER_TYPE")
    # Auto-detect whisper type from API key environment variables if not explicitly set
    # (matching the logic in config_store._apply_whisper_type_override)
    if not env_whisper_type:
        if os.environ.get("WHISPER_REMOTE_API_KEY"):
            env_whisper_type = "remote"
        elif os.environ.get("GROQ_API_KEY") and not os.environ.get("LLM_API_KEY"):
            env_whisper_type = "groq"
    if env_whisper_type:
        wtype = env_whisper_type.strip().lower()
    return wtype if isinstance(wtype, str) else None


def _build_env_override_metadata(data: Dict[str, Any]) -> Dict[str, Any]:
    """Collect all active environment overrides keyed by dotted config path."""
    overrides: Dict[str, Any] = {}
    _register_llm_overrides(overrides)
    _register_groq_shared_overrides(overrides)
    env_whisper_type = os.environ.get("WHISPER_TYPE")
    if env_whisper_type:
        _register_override(
            overrides,
            "whisper.whisper_type",
            "WHISPER_TYPE", env_whisper_type
        )
    # Only the overrides relevant to the effective whisper backend are reported.
    wtype = _determine_whisper_type_for_metadata(data)
    if wtype == "remote":
        _register_remote_whisper_overrides(overrides)
    elif wtype == "groq":
        _register_groq_whisper_overrides(overrides)
    elif wtype == "local":
        _register_local_whisper_overrides(overrides)
    return overrides


@config_bp.route("/api/config", methods=["PUT"])
def api_put_config() -> flask.Response:
    """Persist a combined config payload via the writer, then rehydrate runtime config."""
    _, error_response = require_admin()
    if error_response:
        return error_response
    payload = request.get_json(silent=True) or {}
    # Strip client-only preview fields so masked secrets are never persisted.
    llm_payload = payload.get("llm")
    if isinstance(llm_payload, dict):
        llm_payload.pop("llm_api_key_preview", None)
    whisper_payload = payload.get("whisper")
    if isinstance(whisper_payload, dict):
        whisper_payload.pop("api_key_preview", None)
    try:
        result = writer_client.action(
            "update_combined_config",
            {"payload": payload},
            wait=True,
        )
        if not result or not result.success:
            raise RuntimeError(getattr(result, "error", "Writer update failed"))
        data = result.data or {}
        try:
            db_cfg = to_pydantic_config()
        except Exception as hydrate_err:  # pylint: disable=broad-except
            logger.error(f"Post-update config hydration failed: {hydrate_err}")
            return flask.make_response(
                jsonify(
                    {"error": "Invalid configuration", "details": str(hydrate_err)}
                ),
                400,
            )
        # Copy every pydantic field onto the shared runtime_config object in place.
        for field_name in runtime_config.__class__.model_fields.keys():
            setattr(runtime_config, field_name, getattr(db_cfg, field_name))
        # Force the processor to be rebuilt with the new configuration.
        ProcessorSingleton.reset_instance()
        return flask.jsonify(_sanitize_config_for_client(data))
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f"Failed to update configuration: {e}")
        return flask.make_response(
            jsonify({"error": "Failed to update configuration", "details": str(e)}), 400
        )


@config_bp.route("/api/config/test-llm", methods=["POST"])
def api_test_llm() -> flask.Response:
    """Probe the configured LLM with a one-token completion to verify credentials."""
    _, error_response = require_admin()
    if error_response:
        return error_response
    payload: Dict[str, Any] = request.get_json(silent=True) or {}
    llm: Dict[str, Any] = dict(payload.get("llm", {}))
    # Request values win; fall back to the live runtime config.
    api_key: str | None = llm.get("llm_api_key") or getattr(
        runtime_config, "llm_api_key", None
    )
    model_val = llm.get("llm_model")
    model: str = (
        model_val
        if isinstance(model_val, str)
        else getattr(runtime_config, "llm_model", "gpt-4o")
    )
    base_url: str | None = llm.get("openai_base_url") or getattr(
        runtime_config, "openai_base_url", None
    )
    timeout_val = llm.get("openai_timeout")
    timeout: int = (
        int(timeout_val)
        if timeout_val is not None
        else int(getattr(runtime_config, "openai_timeout", 30))
    )
    if not api_key:
        return flask.make_response(
            jsonify({"ok": False, "error": "Missing llm_api_key"}), 400
        )
    try:
        # Configure litellm for this probe
        # NOTE(review): this mutates module-global litellm state, which could race
        # with concurrent completions — confirm acceptable for this app.
        litellm.api_key = api_key
        if base_url:
            litellm.api_base = base_url
        # Minimal completion to validate connectivity and credentials
        messages = [
            {"role": "system", "content": "You are a healthcheck probe."},
            {"role": "user", "content": "ping"},
        ]
        completion_kwargs: Dict[str, Any] = {
            "model": model,
            "messages": messages,
            "timeout": timeout,
        }
        # Newer models reject max_tokens in favor of max_completion_tokens.
        if model_uses_max_completion_tokens(model):
            completion_kwargs["max_completion_tokens"] = 1
        else:
            completion_kwargs["max_tokens"] = 1
        _ = litellm.completion(**completion_kwargs)
        return flask.jsonify(
            {
                "ok": True,
                "message": "LLM connection OK",
                "model": model,
                "base_url": base_url,
            }
        )
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f"LLM connection test failed: {e}")
        return flask.make_response(jsonify({"ok": False, "error": str(e)}), 400)


def _make_error_response(error_msg: str, status_code: int = 400) -> flask.Response:
    """Build a JSON error response with the standard {ok, error} shape."""
    return flask.make_response(jsonify({"ok": False, "error": error_msg}), status_code)


def _make_success_response(message: str, **extra_data: Any) -> flask.Response:
    """Build a JSON success response; extra keyword args are merged into the body."""
    response_data = {"ok": True, "message": message}
    response_data.update(extra_data)
    return flask.jsonify(response_data)


def _get_whisper_config_value(
    whisper_cfg: Dict[str, Any], key: str, default: Any | None = None
) -> Any | None:
    """Read a whisper setting from the request payload, then runtime config, then default."""
    value = whisper_cfg.get(key)
    if value is not None:
        return value
    try:
        runtime_whisper = getattr(runtime_config, "whisper", None)
        if runtime_whisper is not None:
            return getattr(runtime_whisper, key, default)
    except Exception:  # pragma: no cover - defensive
        pass
    return default


def _get_env_whisper_api_key(whisper_type: str) -> str | None:
    """Return the environment-supplied API key for the given whisper backend, if any."""
    if whisper_type == "remote":
        return os.environ.get("WHISPER_REMOTE_API_KEY") or os.environ.get(
            "OPENAI_API_KEY"
        )
    if whisper_type == "groq":
        return os.environ.get("GROQ_API_KEY")
    return None


def _determine_whisper_type(whisper_cfg: Dict[str, Any]) -> str | None:
    """Resolve the whisper backend type from the payload, else the runtime config."""
    wtype_any = whisper_cfg.get("whisper_type")
    if isinstance(wtype_any, str):
        return wtype_any
    try:
        runtime_whisper = getattr(runtime_config, "whisper", None)
        if runtime_whisper is not None and hasattr(runtime_whisper, "whisper_type"):
            rt_type = getattr(runtime_whisper, "whisper_type")
            return rt_type if isinstance(rt_type, str) else None
    except Exception:  # pragma: no cover - defensive
        pass
    return None


def _test_local_whisper(whisper_cfg: Dict[str, Any]) -> flask.Response:
    """Test local whisper configuration."""
    model_name = _get_whisper_config_value(whisper_cfg, "model", "base.en")
    try:
        import whisper  # type: ignore[import-untyped]
    except ImportError as e:
        return _make_error_response(f"whisper not installed: {e}")
    try:
        available = whisper.available_models()
    except Exception as e:  # pragma: no cover - library call
        # Best effort: an empty list makes the membership check below fail loudly.
        available = []
        logger.warning(f"Failed to list local whisper models: {e}")
    if model_name not in available:
        return flask.make_response(
            jsonify(
                {
                    "ok": False,
                    "error": f"Model '{model_name}' not available. Install or adjust model.",
                    "available_models": available,
                }
            ),
            400,
        )
    return _make_success_response(f"Local whisper OK (model {model_name})")


def _test_remote_whisper(whisper_cfg: Dict[str, Any]) -> flask.Response:
    """Test remote whisper configuration."""
    api_key_any = _get_whisper_config_value(whisper_cfg, "api_key")
    base_url_any = _get_whisper_config_value(
        whisper_cfg, "base_url", "https://api.openai.com/v1"
    )
    timeout_any = _get_whisper_config_value(whisper_cfg, "timeout_sec", 30)
    api_key: str | None = api_key_any if isinstance(api_key_any, str) else None
    base_url: str | None = base_url_any if isinstance(base_url_any, str) else None
    timeout: int = int(timeout_any) if timeout_any is not None else 30
    if not api_key:
        api_key = _get_env_whisper_api_key("remote")
    if not api_key:
        return _make_error_response("Missing whisper.api_key")
    # A models.list call is a cheap authenticated round-trip to the endpoint.
    _ = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout).models.list()
    return _make_success_response("Remote whisper connection OK", base_url=base_url)


def _test_groq_whisper(whisper_cfg: Dict[str, Any]) -> flask.Response:
    """Test groq whisper configuration."""
    groq_api_key_any = _get_whisper_config_value(whisper_cfg, "api_key")
    groq_api_key: str | None = (
        groq_api_key_any if isinstance(groq_api_key_any, str) else None
    )
    if not groq_api_key:
        groq_api_key = _get_env_whisper_api_key("groq")
    if not groq_api_key:
        return _make_error_response("Missing whisper.api_key")
    _ = Groq(api_key=groq_api_key).models.list()
    return _make_success_response("Groq whisper connection OK")


@config_bp.route("/api/config/test-whisper", methods=["POST"])
def api_test_whisper() -> flask.Response:
    """Test whisper configuration based on whisper_type."""
    # pylint: disable=too-many-return-statements
    _, error_response = require_admin()
    if error_response:
        return error_response
    payload: Dict[str, Any] = request.get_json(silent=True) or {}
    whisper_cfg: Dict[str, Any] = dict(payload.get("whisper", {}))
    wtype = _determine_whisper_type(whisper_cfg)
    if not wtype:
        return _make_error_response("Missing whisper_type")
    try:
        if wtype == "local":
            return _test_local_whisper(whisper_cfg)
        if wtype == "remote":
            return _test_remote_whisper(whisper_cfg)
        if wtype == "groq":
            return _test_groq_whisper(whisper_cfg)
        return _make_error_response(f"Unknown whisper_type '{wtype}'")
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f"Whisper connection test failed: {e}")
        return _make_error_response(str(e))


@config_bp.route("/api/config/whisper-capabilities", methods=["GET"])
def api_get_whisper_capabilities() -> flask.Response:
    """Report Whisper capabilities for the current runtime.

    Currently returns a boolean indicating whether local Whisper is importable.
    This enables the frontend to hide the 'local' option when unavailable.
    """
    _, error_response = require_admin()
    if error_response:
        return error_response
    local_available = False
    try:  # pragma: no cover - simple import feature check
        import whisper

        # If import succeeds, we consider local whisper available.
        # Optionally probe models list, but ignore failures here.
        try:
            _ = whisper.available_models()  # noqa: F841
        except Exception:
            pass
        local_available = True
    except Exception:
        local_available = False
    return flask.jsonify({"local_available": local_available})


@config_bp.route("/api/config/api_configured_check", methods=["GET"])
def api_configured_check() -> flask.Response:
    """Return whether the API configuration is sufficient to process.

    For our purposes, this means an LLM API key is present either in the
    persisted config or the runtime overlay.
    """
    _, error_response = require_admin()
    if error_response:
        return error_response
    try:
        data = read_combined()
        _hydrate_runtime_config(data)
        llm = data.get("llm", {}) if isinstance(data, dict) else {}
        api_key = llm.get("llm_api_key")
        configured = bool(api_key)
        return flask.jsonify({"configured": configured})
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f"Failed to check API configuration: {e}")
        # Be conservative: report not configured on error
        return flask.jsonify({"configured": False})


================================================
FILE: src/app/routes/discord_routes.py
================================================
from __future__ import annotations

import logging
import os
from typing import TYPE_CHECKING

from flask import (
    Blueprint,
    Response,
    current_app,
    jsonify,
    request,
    session,
)

from app.auth.discord_service import (
    DiscordAuthError,
    DiscordRegistrationDisabledError,
    build_authorization_url,
    check_guild_membership,
    exchange_code_for_token,
    find_or_create_user_from_discord,
    generate_oauth_state,
    get_discord_user,
)
from app.auth.discord_settings import reload_discord_settings
from app.auth.guards import require_admin
from app.writer.client import writer_client

if TYPE_CHECKING:
    from app.auth.discord_settings import DiscordSettings

logger = logging.getLogger("global_logger")

discord_bp = Blueprint("discord", __name__)

# Session keys used by the OAuth flow below.
SESSION_OAUTH_STATE_KEY = "discord_oauth_state"
SESSION_USER_KEY = "user_id"
SESSION_OAUTH_PROMPT_UPGRADED = "discord_prompt_upgraded"


def _get_discord_settings() -> DiscordSettings | None:
    """Fetch the DiscordSettings object stored on the Flask app, if any."""
    return current_app.config.get("DISCORD_SETTINGS")


def _mask_secret(value: str | None) -> str | None:
    """Mask a secret value for display."""
    if not value:
        return None
    if len(value) <= 8:
        return value
    return f"{value[:4]}...{value[-4:]}"


def _has_env_override(env_var: str) -> bool:
    """Check if an environment variable is set."""
    return bool(os.environ.get(env_var))


@discord_bp.route("/api/auth/discord/status", methods=["GET"])
def discord_status() -> Response:
    """Return whether Discord SSO is enabled."""
    settings = _get_discord_settings()
    return jsonify(
        {
            "enabled": settings.enabled if settings else False,
        }
    )


@discord_bp.route("/api/auth/discord/config", methods=["GET"])
def discord_config_get() -> Response | tuple[Response, int]:
    """Get Discord configuration (admin only)."""
    _, error_response = require_admin()
    if error_response:
        return error_response, error_response.status_code
    settings = _get_discord_settings()
    # Build env override info
    env_overrides: dict[str, dict[str, str]] = {}
    if _has_env_override("DISCORD_CLIENT_ID"):
        env_overrides["client_id"] = {"env_var": "DISCORD_CLIENT_ID"}
    if _has_env_override("DISCORD_CLIENT_SECRET"):
        env_overrides["client_secret"] = {
            "env_var": "DISCORD_CLIENT_SECRET",
            "is_secret": "true",
        }
    if _has_env_override("DISCORD_REDIRECT_URI"):
        env_overrides["redirect_uri"] = {
            "env_var": "DISCORD_REDIRECT_URI",
            "value": os.environ.get("DISCORD_REDIRECT_URI", ""),
        }
    if _has_env_override("DISCORD_GUILD_IDS"):
        env_overrides["guild_ids"] = {
            "env_var": "DISCORD_GUILD_IDS",
            "value": os.environ.get("DISCORD_GUILD_IDS", ""),
        }
    if _has_env_override("DISCORD_ALLOW_REGISTRATION"):
        env_overrides["allow_registration"] = {
            "env_var": "DISCORD_ALLOW_REGISTRATION",
            "value": os.environ.get("DISCORD_ALLOW_REGISTRATION", ""),
        }
    return jsonify(
        {
            "config": {
                "enabled": settings.enabled if settings else False,
                "client_id": settings.client_id if settings else None,
                "client_secret_preview": (
                    _mask_secret(settings.client_secret) if settings else None
                ),
                "redirect_uri": settings.redirect_uri if settings else None,
                "guild_ids": (
                    ",".join(settings.guild_ids)
                    if settings and settings.guild_ids
                    else ""
                ),
                "allow_registration": settings.allow_registration if settings else True,
            },
            "env_overrides": env_overrides,
        }
    )


@discord_bp.route("/api/auth/discord/config", methods=["PUT"])
def discord_config_put() -> Response | tuple[Response, int]:
    """Update Discord configuration (admin only)."""
    _,
    error_response = require_admin()
    if error_response:
        return error_response, error_response.status_code
    payload = request.get_json(silent=True) or {}
    try:
        # Env-var overrides always win: skip any field whose env var is set.
        update_params: dict[str, object] = {}
        if "client_id" in payload and not _has_env_override("DISCORD_CLIENT_ID"):
            update_params["client_id"] = payload["client_id"] or None
        if "client_secret" in payload and not _has_env_override(
            "DISCORD_CLIENT_SECRET"
        ):
            secret = payload["client_secret"]
            # A value ending in "..." is the masked preview, not a new secret.
            if secret and not str(secret).endswith("..."):
                update_params["client_secret"] = secret
        if "redirect_uri" in payload and not _has_env_override("DISCORD_REDIRECT_URI"):
            update_params["redirect_uri"] = payload["redirect_uri"] or None
        if "guild_ids" in payload and not _has_env_override("DISCORD_GUILD_IDS"):
            update_params["guild_ids"] = payload["guild_ids"] or None
        if "allow_registration" in payload and not _has_env_override(
            "DISCORD_ALLOW_REGISTRATION"
        ):
            update_params["allow_registration"] = bool(payload["allow_registration"])
        if update_params:
            result = writer_client.action(
                "update_discord_settings", update_params, wait=True
            )
            if not result or not result.success:
                raise RuntimeError(getattr(result, "error", "Writer update failed"))
        # Reload settings into app config
        new_settings = reload_discord_settings(current_app)
        logger.info("Discord settings updated (enabled=%s)", new_settings.enabled)
        return jsonify(
            {
                "status": "ok",
                "config": {
                    "enabled": new_settings.enabled,
                    "client_id": new_settings.client_id,
                    "client_secret_preview": _mask_secret(new_settings.client_secret),
                    "redirect_uri": new_settings.redirect_uri,
                    "guild_ids": (
                        ",".join(new_settings.guild_ids)
                        if new_settings.guild_ids
                        else ""
                    ),
                    "allow_registration": new_settings.allow_registration,
                },
            }
        )
    except Exception as e:
        logger.exception("Failed to update Discord settings: %s", e)
        return jsonify({"error": "Failed to update Discord settings"}), 500


@discord_bp.route("/api/auth/discord/login", methods=["GET"])
def discord_login() -> Response | tuple[Response, int]:
    """Start the Discord OAuth2 flow by returning the authorization URL."""
    settings = _get_discord_settings()
    if not settings or not settings.enabled:
        return jsonify({"error": "Discord SSO is not configured."}), 404
    prompt = request.args.get("prompt", "none")
    # Random state is stored in the session for CSRF verification at callback.
    state = generate_oauth_state()
    session[SESSION_OAUTH_STATE_KEY] = state
    session[SESSION_OAUTH_PROMPT_UPGRADED] = prompt == "consent"
    auth_url = build_authorization_url(settings, state, prompt=prompt)
    return jsonify({"authorization_url": auth_url})


@discord_bp.route("/api/auth/discord/callback", methods=["GET"])
def discord_callback() -> Response:
    """Handle the OAuth2 callback from Discord."""
    settings = _get_discord_settings()
    if not settings or not settings.enabled:
        return Response(
            response="",
            status=302,
            headers={"Location": "/?error=discord_not_configured"},
        )
    # Verify state to prevent CSRF
    state = request.args.get("state")
    expected_state = session.pop(SESSION_OAUTH_STATE_KEY, None)
    if not state or state != expected_state:
        return Response(
            response="", status=302, headers={"Location": "/?error=invalid_state"}
        )
    # Check for error from Discord (e.g., user denied access)
    error = request.args.get("error")
    if error:
        if error in {"interaction_required", "login_required", "consent_required"}:
            # Try again with an explicit consent prompt (only once) to avoid loops.
            # Silent-auth failed; retry once with prompt=consent, guarded by a
            # session flag so repeated failures cannot redirect-loop.
            if not session.get(SESSION_OAUTH_PROMPT_UPGRADED):
                new_state = generate_oauth_state()
                session[SESSION_OAUTH_STATE_KEY] = new_state
                session[SESSION_OAUTH_PROMPT_UPGRADED] = True
                auth_url = build_authorization_url(
                    settings, new_state, prompt="consent"
                )
                return Response(response="", status=302, headers={"Location": auth_url})
        return Response(
            response="", status=302, headers={"Location": f"/?error={error}"}
        )
    code = request.args.get("code")
    if not code:
        return Response(
            response="", status=302, headers={"Location": "/?error=missing_code"}
        )
    try:
        # Exchange code for token
        token_data = exchange_code_for_token(settings, code)
        access_token = token_data["access_token"]
        # Get Discord user info
        discord_user = get_discord_user(access_token)
        # Check guild requirements if configured
        if settings.guild_ids:
            is_allowed = check_guild_membership(access_token, settings)
            if not is_allowed:
                return Response(
                    response="",
                    status=302,
                    headers={"Location": "/?error=guild_requirement_not_met"},
                )
        # Find or create user
        user = find_or_create_user_from_discord(discord_user, settings)
        # Create session (clear first to avoid session fixation)
        session.clear()
        session[SESSION_USER_KEY] = user.id
        session.permanent = True
        session.pop(SESSION_OAUTH_PROMPT_UPGRADED, None)
        logger.info(
            "Discord SSO login successful for user %s (discord_id=%s)",
            user.username,
            discord_user.id,
        )
        return Response(response="", status=302, headers={"Location": "/"})
    except DiscordRegistrationDisabledError:
        return Response(
            response="",
            status=302,
            headers={"Location": "/?error=registration_disabled"},
        )
    except DiscordAuthError as e:
        logger.warning("Discord auth error: %s", e)
        return Response(
            response="", status=302, headers={"Location": "/?error=auth_failed"}
        )
    except Exception as e:
        logger.exception("Discord auth failed unexpectedly: %s", e)
        return Response(
            response="", status=302, headers={"Location": "/?error=auth_failed"}
        )


================================================
FILE: src/app/routes/feed_routes.py
================================================
import logging
import re
import secrets
from pathlib import Path
from threading import Thread
from typing import Any, Optional, cast

# pylint: disable=chained-comparison
from urllib.parse import urlencode, urlparse, urlunparse

import requests
import validators
from flask import (
    Blueprint,
    Flask,
    Response,
    current_app,
    g,
    jsonify,
    make_response,
    redirect,
    request,
    send_from_directory,
    url_for,
)
from flask.typing import ResponseReturnValue

from app.auth import is_auth_enabled
from app.auth.guards import require_admin
from app.auth.service import update_user_last_active
from app.extensions import db
from app.feeds import (
    add_or_refresh_feed,
    generate_aggregate_feed_xml,
    generate_feed_xml,
    is_feed_active_for_user,
    refresh_feed,
)
from app.jobs_manager import get_jobs_manager
from app.models import (
    Feed,
    Post,
    User,
    UserFeed,
)
from app.writer.client import writer_client
from podcast_processor.podcast_downloader import sanitize_title
from shared.processing_paths import get_in_root, get_srv_root

from .auth_routes import _require_authenticated_user as _auth_get_user

logger = logging.getLogger("global_logger")

feed_bp = Blueprint("feed", __name__)


def fix_url(url: str) -> str:
    """Normalize a user-supplied URL.

    Repairs a scheme whose "//" was collapsed to a single "/" and prepends
    https:// when no scheme is present.
    """
    url = re.sub(r"(http(s)?):/([^/])", r"\1://\3", url)
    if not url.startswith("http://") and not url.startswith("https://"):
        url = "https://" + url
    return url


def _user_feed_count(user_id: int) -> int:
    """Number of feed memberships held by the given user."""
    return int(UserFeed.query.filter_by(user_id=user_id).count())


def _get_latest_post(feed: Feed) -> Post | None:
    """Most recent post of a feed: release date desc (NULLs last), id as tiebreak."""
    return cast(
        Optional[Post],
        Post.query.filter_by(feed_id=feed.id)
        .order_by(Post.release_date.desc().nullslast(), Post.id.desc())
        .first(),
    )


def _ensure_user_feed_membership(feed: Feed, user_id: int | None) -> tuple[bool, int]:
    """Add a user↔feed link if missing.
    Returns (created, previous_feed_member_count)."""
    if not user_id:
        # No user (auth disabled): report only the current membership count.
        return False, UserFeed.query.filter_by(feed_id=feed.id).count()
    result = writer_client.action(
        "ensure_user_feed_membership",
        {"feed_id": feed.id, "user_id": int(user_id)},
        wait=True,
    )
    if not result or not result.success or not isinstance(result.data, dict):
        raise RuntimeError(getattr(result, "error", "Failed to join feed"))
    return bool(result.data.get("created")), int(result.data.get("previous_count") or 0)


def _whitelist_latest_for_first_member(
    feed: Feed, requested_by_user_id: int | None
) -> None:
    """When a feed goes from 0→1 members, whitelist and process the latest post."""
    try:
        result = writer_client.action(
            "whitelist_latest_post_for_feed", {"feed_id": feed.id}, wait=True
        )
        if not result or not result.success or not isinstance(result.data, dict):
            return
        post_guid = result.data.get("post_guid")
        updated = bool(result.data.get("updated"))
        if not updated or not post_guid:
            return
    except Exception:  # pylint: disable=broad-except
        # Best effort: whitelisting failures are silently skipped.
        return
    try:
        get_jobs_manager().start_post_processing(
            str(post_guid),
            priority="interactive",
            requested_by_user_id=requested_by_user_id,
            billing_user_id=requested_by_user_id,
        )
    except Exception as exc:  # pylint: disable=broad-except
        logger.error(
            "Failed to enqueue processing for latest post %s: %s", post_guid, exc
        )


def _handle_developer_mode_feed(url: str, user: Optional[User]) -> ResponseReturnValue:
    """Create a synthetic test feed for developer-mode URLs like http://test-feed/N."""
    try:
        # The trailing path segment is the test feed number.
        feed_id_str = url.split("/")[-1]
        feed_num = int(feed_id_str)
        result = writer_client.action(
            "create_dev_test_feed",
            {
                "rss_url": url,
                "title": f"Test Feed {feed_num}",
                "image_url": "https://via.placeholder.com/150",
                "description": "A test feed for development",
                "author": "Test Author",
                "post_count": 5,
                "guid_prefix": f"test-guid-{feed_num}",
                "download_url_prefix": f"http://test-feed/{feed_num}",
            },
            wait=True,
        )
        if not result or not result.success or not isinstance(result.data, dict):
            raise RuntimeError(getattr(result, "error", "Failed to create test feed"))
        feed_id = int(result.data["feed_id"])
        feed = db.session.get(Feed, feed_id)
        if not feed:
            raise RuntimeError("Test feed disappeared")
        if user:
            created, previous_count = _ensure_user_feed_membership(feed, user.id)
            if created and previous_count == 0:
                _whitelist_latest_for_first_member(feed, getattr(user, "id", None))
        return redirect(url_for("main.index"))
    except Exception as e:
        logger.error(f"Error adding test feed: {e}")
        return make_response((f"Error adding test feed: {e}", 500))


def _check_feed_allowance(user: User, url: str) -> Optional[ResponseReturnValue]:
    """Return a 402 response if adding `url` would exceed the user's feed allowance.

    Returns None when the add is allowed (admins are unlimited; joining a feed
    the user already belongs to never counts against the allowance).
    """
    if user.role == "admin":
        return None
    existing_feed = Feed.query.filter_by(rss_url=url).first()
    existing_membership = None
    if existing_feed:
        existing_membership = UserFeed.query.filter_by(
            feed_id=existing_feed.id, user_id=user.id
        ).first()
    # Use manual allowance if set, otherwise fall back to plan allowance
    allowance = user.manual_feed_allowance
    if allowance is None:
        allowance = getattr(user, "feed_allowance", 0) or 0
    if allowance > 0:
        current_count = _user_feed_count(user.id)
        if current_count >= allowance and existing_membership is None:
            return (
                jsonify(
                    {
                        "error": "FEED_LIMIT_REACHED",
                        "message": f"Your plan allows {allowance} feeds. Increase your plan to add more.",
                        "feeds_in_use": current_count,
                        "feed_allowance": allowance,
                    }
                ),
                402,
            )
    return None


@feed_bp.route("/feed", methods=["POST"])
def add_feed() -> ResponseReturnValue:
    """Add (or refresh) a feed from a form-posted URL and join the current user to it."""
    settings = current_app.config.get("AUTH_SETTINGS")
    user = None
    if settings and settings.require_auth:
        user, error = _require_user_or_error()
        if error:
            return error
    url = request.form.get("url")
    if not url:
        return make_response(("URL is required", 400))
    url = fix_url(url)
    if current_app.config.get("developer_mode") and url.startswith("http://test-feed/"):
        return _handle_developer_mode_feed(url, user)
    if not validators.url(url):
        return make_response(("Invalid URL", 400))
    try:
        if user:
            allowance_error = _check_feed_allowance(user, url)
            if allowance_error:
                return allowance_error
        feed = add_or_refresh_feed(url)
        if user:
            created, previous_count = _ensure_user_feed_membership(feed, user.id)
            if created and previous_count == 0:
                _whitelist_latest_for_first_member(feed, getattr(user, "id", None))
        elif not is_auth_enabled():
            # In no-auth mode, if this feed has no members, trigger whitelisting for the latest post.
            if UserFeed.query.filter_by(feed_id=feed.id).count() == 0:
                _whitelist_latest_for_first_member(feed, None)
        # Kick off job enqueueing in the background so the request returns fast.
        app = cast(Any, current_app)._get_current_object()
        Thread(
            target=_enqueue_pending_jobs_async,
            args=(app,),
            daemon=True,
            name="enqueue-jobs-after-add",
        ).start()
        return redirect(url_for("main.index"))
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f"Error adding feed: {e}")
        return make_response((f"Error adding feed: {e}", 500))


# NOTE(review): the URL converter (e.g. "<int:feed_id>") appears stripped by
# extraction from this route string — confirm against the repository.
@feed_bp.route("/api/feeds//share-link", methods=["POST"])
def create_feed_share_link(feed_id: int) -> ResponseReturnValue:
    """Create a tokenized share link for a feed (requires auth to be enabled)."""
    settings = current_app.config.get("AUTH_SETTINGS")
    if not settings or not settings.require_auth:
        return jsonify({"error": "Authentication is disabled."}), 404
    current = getattr(g, "current_user", None)
    if current is None:
        return jsonify({"error": "Authentication required."}), 401
    feed = Feed.query.get_or_404(feed_id)
    user = db.session.get(User, current.id)
    if user is None:
        return jsonify({"error": "User not found."}), 404
    result = writer_client.action(
        "create_feed_access_token",
        {"user_id": user.id, "feed_id": feed.id},
        wait=True,
    )
    if not result or not result.success or not isinstance(result.data, dict):
        return jsonify({"error": "Failed to create feed token"}), 500
    token_id = str(result.data["token_id"])
    secret = str(result.data["secret"])
    # Build an absolute URL on the same host with token/secret as query params.
    parsed = urlparse(request.host_url)
    netloc = parsed.netloc
    scheme = parsed.scheme
    path = f"/feed/{feed.id}"
    query = urlencode({"feed_token": token_id, "feed_secret": secret})
    prefilled_url = urlunparse((scheme, netloc, path, "", query, ""))
    return (
        jsonify(
            {
                "url": prefilled_url,
                "feed_token": token_id,
                "feed_secret": secret,
                "feed_id": feed.id,
            }
        ),
        201,
    )


@feed_bp.route("/api/feeds/search", methods=["GET"])
def search_feeds() -> ResponseReturnValue:
    """Proxy a podcast search and normalize results for the frontend."""
    term = (request.args.get("term") or "").strip()
    logger.info("Searching for podcasts with term: %s", term)
    if not term:
        return jsonify({"error": "term parameter is required"}), 400
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        # NOTE(review): the URL says podcastindex.org but the fields read below
        # (collectionName, artistName, artworkUrl100, resultCount) are
        # iTunes-search-style — confirm which provider this actually targets.
        response = requests.get(
            "http://api.podcastindex.org/search",
            headers=headers,
            params={"term": term},
            timeout=10,
        )
        response.raise_for_status()
        upstream_data = response.json()
    except requests.exceptions.RequestException as exc:
        logger.error("Podcast search request failed: %s", exc)
        return jsonify({"error": "Search request failed"}), 502
    except ValueError:
        logger.error("Podcast search returned non-JSON response")
        return (
            jsonify({"error": "Unexpected response from search provider"}),
            502,
        )
    results = upstream_data.get("results") or []
    transformed_results = []
    if current_app.config.get("developer_mode") and term.lower() == "test":
        logger.info("Developer mode test search - adding mock results")
        for i in range(1, 11):
            transformed_results.append(
                {
                    "title": f"Test Feed {i}",
                    "author": "Test Author",
                    "feedUrl": f"http://test-feed/{i}",
                    "artwork": "https://via.placeholder.com/150",
                    "genres": ["Test Genre"],
                }
            )
    else:
        logger.info(
            "(dev mode disabled) Podcast search returned %d results", len(results)
        )
        for item in results:
            feed_url = item.get("feedUrl")
            if not feed_url:
                # Entries without an RSS URL cannot be subscribed to; drop them.
                continue
            transformed_results.append(
                {
                    "title": item.get("collectionName")
                    or item.get("trackName")
                    or "Unknown title",
                    "author": item.get("artistName") or "",
                    "feedUrl": feed_url,
                    "artworkUrl": item.get("artworkUrl100")
                    or item.get("artworkUrl600")
                    or "",
                    "description": item.get("collectionCensoredName")
                    or item.get("trackCensoredName")
                    or "",
                    "genres": item.get("genres") or [],
                }
            )
    total = upstream_data.get("resultCount")
    if not isinstance(total, int) or total == 0:
        total = len(transformed_results)
    return jsonify(
        {
            "results": transformed_results,
            "total": total,
        }
    )


# NOTE(review): the URL converter (e.g. "<int:f_id>") appears stripped by
# extraction from this route string — confirm against the repository.
@feed_bp.route("/feed/", methods=["GET"])
def get_feed(f_id: int) -> Response:
    """Serve the processed RSS XML for one feed, refreshing it first."""
    if hasattr(g, "current_user") and g.current_user:
        update_user_last_active(g.current_user.id)
    feed = Feed.query.get_or_404(f_id)
    # Refresh the feed
    refresh_feed(feed)
    # Generate the XML
    xml_content = generate_feed_xml(feed)
    response = make_response(xml_content)
    response.headers["Content-Type"] = "application/rss+xml"
    return response


@feed_bp.route("/feed/", methods=["DELETE"])
def delete_feed(f_id: int) -> ResponseReturnValue:
    """Delete a feed, its audio files, directories, and DB rows (admin only)."""
    # pylint: disable=too-many-branches
    user, error = _require_user_or_error(allow_missing_auth=True)
    if error:
        return error
    feed = Feed.query.get_or_404(f_id)
    if user is not None and user.role != "admin":
        return (
            jsonify({"error": "Only administrators can delete feeds."}),
            403,
        )
    # Get all post IDs for this feed
    post_ids = [post.id for post in feed.posts]
    # Delete audio files if they exist
    for post in feed.posts:
        if post.unprocessed_audio_path and Path(post.unprocessed_audio_path).exists():
            try:
                Path(post.unprocessed_audio_path).unlink()
                logger.info(f"Deleted unprocessed audio: {post.unprocessed_audio_path}")
            except Exception as e:  # pylint: disable=broad-except
                logger.error(
                    f"Error deleting unprocessed audio {post.unprocessed_audio_path}: {e}"
                )
        if post.processed_audio_path and Path(post.processed_audio_path).exists():
            try:
                Path(post.processed_audio_path).unlink()
                logger.info(f"Deleted processed audio: {post.processed_audio_path}")
            except Exception as e:  # pylint: disable=broad-except
                logger.error(
                    f"Error deleting processed audio {post.processed_audio_path}: {e}"
                )
    # Clean up directory structures
    _cleanup_feed_directories(feed)
    try:
        result = writer_client.action(
            "delete_feed_cascade", {"feed_id": feed.id}, wait=True
        )
        if not result or not result.success:
            raise RuntimeError(getattr(result, "error", "Failed to delete feed"))
    except Exception as e:  # pylint: disable=broad-except
        logger.error("Failed to delete feed %s: %s", feed.id, e)
        return make_response(("Failed to delete feed", 500))
    logger.info(
        f"Deleted feed: {feed.title} (ID: {feed.id}) with {len(post_ids)} posts"
    )
    return make_response("", 204)


@feed_bp.route("/api/feeds//refresh", methods=["POST"])
def refresh_feed_endpoint(f_id: int) -> ResponseReturnValue:
    """
    Refresh the specified feed and return a JSON response indicating the result.
    """
    if hasattr(g, "current_user") and g.current_user:
        update_user_last_active(g.current_user.id)
    feed = Feed.query.get_or_404(f_id)
    feed_title = feed.title
    # Run the refresh in a background thread; respond 202 immediately.
    app = cast(Any, current_app)._get_current_object()
    Thread(
        target=_refresh_feed_background,
        args=(app, f_id),
        daemon=True,
        name=f"feed-refresh-{f_id}",
    ).start()
    return (
        jsonify(
            {
                "status": "accepted",
                "message": f'Feed "{feed_title}" refresh queued for processing',
            }
        ),
        202,
    )


@feed_bp.route("/api/feeds//settings", methods=["PATCH"])
def update_feed_settings_endpoint(feed_id: int) -> ResponseReturnValue:
    """Update per-feed settings (currently only the auto-whitelist override)."""
    _, error_response = require_admin("update feed settings")
    if error_response is not None:
        return error_response
    payload = request.get_json(silent=True) or {}
    if "auto_whitelist_new_episodes_override" not in payload:
        return jsonify({"error": "No settings provided."}), 400
    override = payload.get("auto_whitelist_new_episodes_override")
    # Tri-state: True/False override the global setting, None clears the override.
    if override is not None and not isinstance(override, bool):
        return (
            jsonify(
                {
                    "error": "auto_whitelist_new_episodes_override must be a boolean or null."
                }
            ),
            400,
        )
    result = writer_client.action(
        "update_feed_settings",
        {"feed_id": feed_id, "auto_whitelist_new_episodes_override": override},
        wait=True,
    )
    if result is None or not result.success:
        return (
            jsonify({"error": getattr(result, "error", "Failed to update feed")}),
            500,
        )
    feed = db.session.get(Feed, feed_id)
    if feed is None:
        return jsonify({"error": "Feed not found"}), 404
    return jsonify(_serialize_feed(feed, current_user=getattr(g, "current_user", None)))


def _refresh_feed_background(app: Flask, feed_id: int) -> None:
    """Background-thread worker: refresh one feed and enqueue any pending jobs."""
    with app.app_context():
        feed = db.session.get(Feed, feed_id)
        if not feed:
            logger.warning("Feed %s disappeared before refresh could run", feed_id)
            return
        try:
            refresh_feed(feed)
            get_jobs_manager().enqueue_pending_jobs(
                trigger="feed_refresh", context={"feed_id": feed_id}
            )
        except Exception as exc:  # pylint: disable=broad-except
            logger.error("Failed to refresh feed %s asynchronously: %s", feed_id, exc)


@feed_bp.route("/api/feeds/refresh-all", methods=["POST"])
def refresh_all_feeds_endpoint() -> Response:
    """Trigger a refresh for all feeds and enqueue pending jobs."""
    if hasattr(g, "current_user") and g.current_user:
        update_user_last_active(g.current_user.id)
    result = get_jobs_manager().start_refresh_all_feeds(trigger="manual_refresh")
    feed_count = Feed.query.count()
    return jsonify(
        {
            "status": "success",
            "feeds_refreshed": feed_count,
            "jobs_enqueued": result.get("enqueued", 0),
        }
    )


def _enqueue_pending_jobs_async(app: Flask) -> None:
    """Background-thread worker: enqueue pending jobs under an app context."""
    with app.app_context():
        try:
            get_jobs_manager().enqueue_pending_jobs(trigger="feed_refresh")
        except Exception as exc:  # pylint: disable=broad-except
            logger.error("Failed to enqueue pending jobs asynchronously: %s", exc)


def _cleanup_feed_directories(feed: Feed) -> None:
    """
    Clean up directory structures for a feed in both in/ and srv/ directories.
Args: feed: The Feed object being deleted """ # Clean up srv/ directory (processed audio) # srv/{sanitized_feed_title}/ sanitized_feed_title = sanitize_title(feed.title) # Use the same sanitization logic as in processing_paths.py sanitized_feed_title = re.sub( r"[^a-zA-Z0-9\s_.-]", "", sanitized_feed_title ).strip() sanitized_feed_title = sanitized_feed_title.rstrip(".") sanitized_feed_title = re.sub(r"\s+", "_", sanitized_feed_title) srv_feed_dir = get_srv_root() / sanitized_feed_title if srv_feed_dir.exists() and srv_feed_dir.is_dir(): try: # Remove all files in the directory first for file_path in srv_feed_dir.iterdir(): if file_path.is_file(): file_path.unlink() logger.info(f"Deleted processed audio file: {file_path}") # Remove the directory itself srv_feed_dir.rmdir() logger.info(f"Deleted processed audio directory: {srv_feed_dir}") except Exception as e: # pylint: disable=broad-except logger.error( f"Error deleting processed audio directory {srv_feed_dir}: {e}" ) # Clean up in/ directories (unprocessed audio) # in/{sanitized_post_title}/ for post in feed.posts: # type: ignore[attr-defined] sanitized_post_title = sanitize_title(post.title) in_post_dir = get_in_root() / sanitized_post_title if in_post_dir.exists() and in_post_dir.is_dir(): try: # Remove all files in the directory first for file_path in in_post_dir.iterdir(): if file_path.is_file(): file_path.unlink() logger.info(f"Deleted unprocessed audio file: {file_path}") # Remove the directory itself in_post_dir.rmdir() logger.info(f"Deleted unprocessed audio directory: {in_post_dir}") except Exception as e: # pylint: disable=broad-except logger.error( f"Error deleting unprocessed audio directory {in_post_dir}: {e}" ) @feed_bp.route("/", methods=["GET"]) def get_feed_by_alt_or_url(something_or_rss: str) -> Response: # first try to serve ANY static file matching the path if current_app.static_folder is not None: # Use Flask's safe helper to prevent directory traversal outside static_folder try: return 
send_from_directory(current_app.static_folder, something_or_rss) except Exception: # Not a valid static file; fall through to RSS/DB lookup pass feed = Feed.query.filter_by(rss_url=something_or_rss).first() if feed: xml_content = generate_feed_xml(feed) response = make_response(xml_content) response.headers["Content-Type"] = "application/rss+xml" return response return make_response(("Feed not found", 404)) @feed_bp.route("/feeds", methods=["GET"]) def api_feeds() -> ResponseReturnValue: settings = current_app.config.get("AUTH_SETTINGS") if settings and settings.require_auth: user, error = _require_user_or_error() if error: return error if user and user.role != "admin": feeds = ( Feed.query.join(UserFeed, UserFeed.feed_id == Feed.id) .filter(UserFeed.user_id == user.id) .all() ) # Hack: Always include Feed 1 feed_1 = Feed.query.get(1) if feed_1 and feed_1 not in feeds: feeds.append(feed_1) else: feeds = Feed.query.all() current_user = user else: feeds = Feed.query.all() current_user = getattr(g, "current_user", None) feeds_data = [_serialize_feed(feed, current_user=current_user) for feed in feeds] return jsonify(feeds_data) @feed_bp.route("/api/feeds//join", methods=["POST"]) def api_join_feed(feed_id: int) -> ResponseReturnValue: user, error = _require_user_or_error() if error: return error if user is None: return jsonify({"error": "Authentication required."}), 401 feed = Feed.query.get_or_404(feed_id) existing_membership = UserFeed.query.filter_by( feed_id=feed.id, user_id=user.id ).first() if user.role != "admin": # Use manual allowance if set, otherwise fall back to plan allowance allowance = user.manual_feed_allowance if allowance is None: allowance = getattr(user, "feed_allowance", 0) or 0 at_capacity = allowance > 0 and _user_feed_count(user.id) >= allowance missing_membership = existing_membership is None if at_capacity and missing_membership: return ( jsonify( { "error": "FEED_LIMIT_REACHED", "message": f"Your plan allows {allowance} feeds. 
Increase your plan to add more.", "feeds_in_use": _user_feed_count(user.id), "feed_allowance": allowance, } ), 402, ) if existing_membership: refreshed = Feed.query.get(feed_id) return jsonify(_serialize_feed(refreshed or feed, current_user=user)), 200 created, previous_count = _ensure_user_feed_membership( feed, getattr(user, "id", None) ) if created and previous_count == 0: _whitelist_latest_for_first_member(feed, getattr(user, "id", None)) refreshed = Feed.query.get(feed_id) return ( jsonify(_serialize_feed(refreshed or feed, current_user=user)), 200, ) @feed_bp.route("/api/feeds//exit", methods=["POST"]) def api_exit_feed(feed_id: int) -> ResponseReturnValue: user, error = _require_user_or_error() if error: return error if user is None: return jsonify({"error": "Authentication required."}), 401 feed = Feed.query.get_or_404(feed_id) writer_client.action( "remove_user_feed_membership", {"feed_id": feed.id, "user_id": user.id}, wait=True, ) refreshed = Feed.query.get(feed_id) return ( jsonify(_serialize_feed(refreshed or feed, current_user=user)), 200, ) @feed_bp.route("/api/feeds//leave", methods=["POST"]) def api_leave_feed(feed_id: int) -> ResponseReturnValue: """Remove current user membership; hide from their view.""" user, error = _require_user_or_error() if error: return error if user is None: return jsonify({"error": "Authentication required."}), 401 feed = Feed.query.get_or_404(feed_id) writer_client.action( "remove_user_feed_membership", {"feed_id": feed.id, "user_id": user.id}, wait=True, ) return jsonify({"status": "ok", "feed_id": feed.id}) @feed_bp.route("/feed/user/", methods=["GET"]) def get_user_aggregate_feed(user_id: int) -> Response: """Serve the aggregate RSS feed for a specific user.""" # Auth check is handled by middleware via feed_token # If auth is disabled, this is public. # If auth is enabled, middleware ensures we have a valid token for this user_id. 
    if is_auth_enabled():
        # Only the feed's owner or an admin may fetch another user's aggregate feed.
        current = getattr(g, "current_user", None)
        if current is None:
            return make_response(("Authentication required", 401))
        if current.role != "admin" and current.id != user_id:
            return make_response(("Forbidden", 403))
    user = db.session.get(User, user_id)
    if not user:
        if user_id == 0 and not is_auth_enabled():
            # Support anonymous aggregate feed when auth is disabled
            xml_content = generate_aggregate_feed_xml(None)
            response = make_response(xml_content)
            response.headers["Content-Type"] = "application/rss+xml"
            return response
        return make_response(("User not found", 404))
    xml_content = generate_aggregate_feed_xml(user)
    response = make_response(xml_content)
    response.headers["Content-Type"] = "application/rss+xml"
    return response


@feed_bp.route("/feed/aggregate", methods=["GET"])
def get_aggregate_feed_redirect() -> ResponseReturnValue:
    """Convenience endpoint to redirect to the user's aggregate feed."""
    settings = current_app.config.get("AUTH_SETTINGS")
    # Case 1: Auth Disabled -> Redirect to Admin User (or ID 0 if none exist)
    if not settings or not settings.require_auth:
        admin = User.query.filter_by(role="admin").first()
        user_id = admin.id if admin else 0
        return redirect(url_for("feed.get_user_aggregate_feed", user_id=user_id))
    # Case 2: Auth Enabled -> Require explicit user link
    # We cannot easily determine "current user" for a podcast player without a token.
    # If accessed via browser with session, we could redirect, but for consistency
    # we should probably just tell them to get their link.
    current = getattr(g, "current_user", None)
    if current:
        # Browser session available: redirect straight to the user's feed.
        return redirect(url_for("feed.get_user_aggregate_feed", user_id=current.id))
    return (
        jsonify(
            {
                "error": "Authentication required",
                "message": "Please use your unique aggregate feed URL from the dashboard.",
            }
        ),
        401,
    )


@feed_bp.route("/api/user/aggregate-link", methods=["POST"])
def create_aggregate_feed_link() -> ResponseReturnValue:
    """Generate a unique RSS link for the current user's aggregate feed."""
    settings = current_app.config.get("AUTH_SETTINGS")
    user = None
    if not settings or not settings.require_auth:
        # Auth disabled: Use admin user or first available user
        user = User.query.filter_by(role="admin").first()
        if not user:
            user = User.query.first()
        if not user:
            # Create a default admin user if none exists
            default_username = "admin"
            default_password = secrets.token_urlsafe(16)
            result = writer_client.action(
                "create_user",
                {
                    "username": default_username,
                    "password": default_password,
                    "role": "admin",
                },
                wait=True,
            )
            if result and result.success and isinstance(result.data, dict):
                user_id = result.data.get("user_id")
                if user_id:
                    user = db.session.get(User, user_id)
            if not user:
                return (
                    jsonify({"error": "No user found and failed to create one."}),
                    500,
                )
    else:
        user, error = _require_user_or_error()
        if error:
            return error
        if user is None:
            return jsonify({"error": "Authentication required."}), 401
    # Create a token with feed_id=None (Aggregate Token)
    result = writer_client.action(
        "create_feed_access_token",
        {"user_id": user.id, "feed_id": None},
        wait=True,
    )
    if not result or not result.success or not isinstance(result.data, dict):
        return jsonify({"error": "Failed to create aggregate feed token"}), 500
    token_id = str(result.data["token_id"])
    secret = str(result.data["secret"])
    parsed = urlparse(request.host_url)
    netloc = parsed.netloc
    scheme = parsed.scheme
    path = f"/feed/user/{user.id}"
    # If auth is disabled, we don't strictly need the token params,
    # but including them doesn't hurt and ensures the link
    # works if auth is enabled later.
    # However, to keep it clean for single-user mode:
    settings = current_app.config.get("AUTH_SETTINGS")
    if settings and settings.require_auth:
        query = urlencode({"feed_token": token_id, "feed_secret": secret})
    else:
        query = ""
    full_url = urlunparse((scheme, netloc, path, "", query, ""))
    return (
        jsonify(
            {
                "url": full_url,
                "feed_token": token_id,
                "feed_secret": secret,
            }
        ),
        201,
    )


def _require_user_or_error(
    allow_missing_auth: bool = False,
) -> tuple[User | None, ResponseReturnValue | None]:
    """Resolve the current authenticated user, or an error response tuple.

    Returns (user, None) on success and (None, (json, status)) on failure.
    With auth disabled, returns (None, None) only when allow_missing_auth is set.
    """
    settings = current_app.config.get("AUTH_SETTINGS")
    if not settings or not settings.require_auth:
        if allow_missing_auth:
            return None, None
        return None, (jsonify({"error": "Authentication is disabled."}), 404)
    current = getattr(g, "current_user", None)
    if current is None:
        return None, (jsonify({"error": "Authentication required."}), 401)
    user = _auth_get_user()
    if user is None:
        return None, (jsonify({"error": "User not found."}), 404)
    return user, None


def _serialize_feed(
    feed: Feed,
    *,
    current_user: Optional[User] = None,
) -> dict[str, Any]:
    """Build the JSON-serializable payload for a feed, relative to current_user."""
    auth_enabled = is_auth_enabled()
    member_ids = [membership.user_id for membership in getattr(feed, "user_feeds", [])]
    # In no-auth mode, everyone is functionally a member.
is_member = not auth_enabled or bool( current_user and getattr(current_user, "id", None) in member_ids ) # Hack: Always treat Feed 1 as a member if feed.id == 1 and (current_user or not auth_enabled): is_member = True is_active_subscription = False if is_member: if current_user: is_active_subscription = is_feed_active_for_user(feed.id, current_user) elif not auth_enabled: is_active_subscription = True feed_payload = { "id": feed.id, "title": feed.title, "rss_url": feed.rss_url, "description": feed.description, "author": feed.author, "image_url": feed.image_url, "auto_whitelist_new_episodes_override": getattr( feed, "auto_whitelist_new_episodes_override", None ), "posts_count": len(feed.posts), "member_count": len(member_ids), "is_member": is_member, "is_active_subscription": is_active_subscription, } return feed_payload ================================================ FILE: src/app/routes/jobs_routes.py ================================================ import logging import flask from flask import Blueprint, request from flask.typing import ResponseReturnValue from app.extensions import db from app.jobs_manager import get_jobs_manager from app.jobs_manager_run_service import build_run_status_snapshot from app.post_cleanup import cleanup_processed_posts, count_cleanup_candidates from app.runtime_config import config as runtime_config logger = logging.getLogger("global_logger") jobs_bp = Blueprint("jobs", __name__) @jobs_bp.route("/api/jobs/active", methods=["GET"]) def api_list_active_jobs() -> ResponseReturnValue: try: limit = int(request.args.get("limit", "100")) except ValueError: limit = 100 result = get_jobs_manager().list_active_jobs(limit=limit) return flask.jsonify(result) @jobs_bp.route("/api/jobs/all", methods=["GET"]) def api_list_all_jobs() -> ResponseReturnValue: try: limit = int(request.args.get("limit", "100")) except ValueError: limit = 100 result = get_jobs_manager().list_all_jobs_detailed(limit=limit) return flask.jsonify(result) 
@jobs_bp.route("/api/job-manager/status", methods=["GET"]) def api_job_manager_status() -> ResponseReturnValue: run_snapshot = build_run_status_snapshot(db.session) return flask.jsonify({"run": run_snapshot}) @jobs_bp.route("/api/jobs//cancel", methods=["POST"]) def api_cancel_job(job_id: str) -> ResponseReturnValue: try: result = get_jobs_manager().cancel_job(job_id) status_code = ( 200 if result.get("status") == "cancelled" else (404 if result.get("error_code") == "NOT_FOUND" else 400) ) db.session.expire_all() return flask.jsonify(result), status_code except Exception as e: logger.error(f"Failed to cancel job {job_id}: {e}") return ( flask.jsonify( { "status": "error", "error_code": "CANCEL_FAILED", "message": f"Failed to cancel job: {str(e)}", } ), 500, ) @jobs_bp.route("/api/jobs/cleanup/preview", methods=["GET"]) def api_cleanup_preview() -> ResponseReturnValue: retention = getattr(runtime_config, "post_cleanup_retention_days", None) count, cutoff = count_cleanup_candidates(retention) return flask.jsonify( { "count": count, "retention_days": retention, "cutoff_utc": cutoff.isoformat() if cutoff else None, } ) @jobs_bp.route("/api/jobs/cleanup/run", methods=["POST"]) def api_run_cleanup() -> ResponseReturnValue: retention = getattr(runtime_config, "post_cleanup_retention_days", None) if retention is None or retention <= 0: return flask.jsonify( { "status": "disabled", "message": "Cleanup is disabled because retention_days <= 0.", } ) try: removed = cleanup_processed_posts(retention) remaining, cutoff = count_cleanup_candidates(retention) except Exception as exc: # pylint: disable=broad-except logger.error("Manual cleanup failed: %s", exc, exc_info=True) return ( flask.jsonify( { "status": "error", "message": "Cleanup job failed. 
Check server logs for details.", } ), 500, ) return flask.jsonify( { "status": "ok", "removed_posts": removed, "remaining_candidates": remaining, "retention_days": retention, "cutoff_utc": cutoff.isoformat() if cutoff else None, } ) ================================================ FILE: src/app/routes/main_routes.py ================================================ import logging import os import flask from flask import Blueprint, send_from_directory from app.auth.guards import require_admin from app.extensions import db from app.models import Feed, Post, User from app.runtime_config import config from app.writer.client import writer_client logger = logging.getLogger("global_logger") logger = logging.getLogger("global_logger") main_bp = Blueprint("main", __name__) @main_bp.route("/") def index() -> flask.Response: """Serve the React app's index.html.""" static_folder = flask.current_app.static_folder if static_folder and os.path.exists(os.path.join(static_folder, "index.html")): return send_from_directory(static_folder, "index.html") feeds = Feed.query.all() return flask.make_response( flask.render_template("index.html", feeds=feeds, config=config), 200 ) @main_bp.route("/api/landing/status", methods=["GET"]) def landing_status() -> flask.Response: """Public landing-page status with user counts and limits. Intended for the unauthenticated landing page; returns current user count and configured total limit (if any) so the UI can show remaining slots. 
""" require_auth = False landing_enabled = False try: settings = flask.current_app.config.get("AUTH_SETTINGS") require_auth = bool(settings and settings.require_auth) except Exception: # pragma: no cover - defensive require_auth = False try: landing_enabled = bool(getattr(config, "enable_public_landing_page", False)) except Exception: # pragma: no cover - defensive landing_enabled = False try: user_count = int(User.query.count()) except Exception: # pragma: no cover - defensive user_count = 0 limit_raw = getattr(config, "user_limit_total", None) try: user_limit_total = int(limit_raw) if limit_raw is not None else None except Exception: # pragma: no cover - defensive user_limit_total = None slots_remaining = None if user_limit_total is not None: slots_remaining = max(user_limit_total - user_count, 0) return flask.jsonify( { "require_auth": require_auth, "landing_page_enabled": landing_enabled, "user_count": user_count, "user_limit_total": user_limit_total, "slots_remaining": slots_remaining, } ) @main_bp.route("/") def catch_all(path: str) -> flask.Response: """Serve React app for all frontend routes, or serve static files.""" # Don't handle API routes - let them be handled by API blueprint if path.startswith("api/"): flask.abort(404) static_folder = flask.current_app.static_folder if static_folder: # First try to serve a static file if it exists static_file_path = os.path.join(static_folder, path) if os.path.exists(static_file_path) and os.path.isfile(static_file_path): return send_from_directory(static_folder, path) # If it's not a static file and index.html exists, serve the React app if os.path.exists(os.path.join(static_folder, "index.html")): return send_from_directory(static_folder, "index.html") # Fallback to 404 flask.abort(404) @main_bp.route("/feed//toggle-whitelist-all/", methods=["POST"]) def whitelist_all(f_id: str, val: str) -> flask.Response: _, error_response = require_admin("toggle whitelist for all posts") if error_response: return error_response 
feed = Feed.query.get_or_404(f_id) new_status = val.lower() == "true" try: result = writer_client.action( "toggle_whitelist_all_for_feed", {"feed_id": feed.id, "new_status": new_status}, wait=True, ) if not result or not result.success: raise RuntimeError(getattr(result, "error", "Unknown writer error")) except Exception: # pylint: disable=broad-except return flask.make_response( ( flask.jsonify( { "error": "Database busy, please retry", "retry_after_seconds": 1, } ), 503, ) ) return flask.make_response("", 200) @main_bp.route("/set_whitelist//", methods=["GET"]) def set_whitelist(p_guid: str, val: str) -> flask.Response: logger.info(f"Setting whitelist status for post with GUID: {p_guid} to {val}") post = Post.query.filter_by(guid=p_guid).first() if post is None: return flask.make_response(("Post not found", 404)) new_status = val.lower() == "true" try: result = writer_client.update( "Post", post.id, {"whitelisted": new_status}, wait=True ) if not result or not result.success: raise RuntimeError(getattr(result, "error", "Unknown writer error")) db.session.expire(post) except Exception: # pylint: disable=broad-except return flask.make_response( ( flask.jsonify( { "error": "Database busy, please retry", "retry_after_seconds": 1, } ), 503, ) ) return index() ================================================ FILE: src/app/routes/post_routes.py ================================================ import logging import math import os from pathlib import Path from typing import Any, Dict, Optional, cast import flask from flask import Blueprint, g, jsonify, request, send_file from flask.typing import ResponseReturnValue from app.auth.guards import require_admin from app.auth.service import update_user_last_active from app.extensions import db from app.jobs_manager import get_jobs_manager from app.models import ( Feed, Identification, ModelCall, Post, TranscriptSegment, ) from app.posts import clear_post_processing_data from app.routes.post_stats_utils import ( 
count_model_calls, is_mixed_segment, parse_refined_windows, ) from app.runtime_config import config as runtime_config from app.writer.client import writer_client logger = logging.getLogger("global_logger") post_bp = Blueprint("post", __name__) def _is_latest_post(feed: Feed, post: Post) -> bool: """Return True if the post is the latest by release_date (fallback to id).""" latest = ( Post.query.filter_by(feed_id=feed.id) .order_by(Post.release_date.desc().nullslast(), Post.id.desc()) .first() ) return bool(latest and latest.id == post.id) def _increment_download_count(post: Post) -> None: """Safely increment the download counter for a post.""" try: writer_client.action( "increment_download_count", {"post_id": post.id}, wait=False ) except Exception as e: # pylint: disable=broad-except logger.error(f"Failed to increment download count for post {post.guid}: {e}") def _ensure_whitelisted_for_download( post: Post, p_guid: str ) -> Optional[flask.Response]: """Make sure a post is whitelisted before serving or queuing processing.""" if post.whitelisted: return None if not getattr(runtime_config, "autoprocess_on_download", False): logger.warning( "Post %s not whitelisted and auto-process is disabled", post.guid ) return flask.make_response(("Post not whitelisted", 403)) try: writer_client.action( "whitelist_post", {"post_id": post.id}, wait=True, ) post.whitelisted = True logger.info("Auto-whitelisted post %s on download request", p_guid) return None except Exception as exc: # pylint: disable=broad-except logger.warning( "Failed to auto-whitelist post %s on download: %s", post.guid, exc ) return flask.make_response(("Post not whitelisted", 403)) def _missing_processed_audio_response(post: Post, p_guid: str) -> flask.Response: """Return a response when processed audio is missing, optionally queueing work.""" if not getattr(runtime_config, "autoprocess_on_download", False): logger.warning("Processed audio not found for post: %s", post.id) return 
flask.make_response(("Processed audio not found", 404)) logger.info( "Auto-processing on download is enabled; queuing processing for %s", p_guid, ) requester = getattr(getattr(g, "current_user", None), "id", None) job_response = get_jobs_manager().start_post_processing( p_guid, priority="download", requested_by_user_id=requester, billing_user_id=requester, ) status = cast(Optional[str], job_response.get("status")) status_code = { "completed": 200, "skipped": 200, "error": 400, "running": 202, "started": 202, }.get(status or "pending", 202) message = job_response.get( "message", "Processing queued because audio was not ready for download", ) return flask.make_response( flask.jsonify({**job_response, "message": message}), status_code, ) @post_bp.route("/api/feeds//posts", methods=["GET"]) def api_feed_posts(feed_id: int) -> flask.Response: """Return a paginated JSON list of posts for a specific feed.""" # Ensure we have fresh data db.session.expire_all() feed = Feed.query.get_or_404(feed_id) # Pagination and filtering try: page = int(request.args.get("page", 1)) except (TypeError, ValueError): page = 1 page = max(page, 1) try: page_size = int(request.args.get("page_size", 25)) except (TypeError, ValueError): page_size = 25 page_size = max(1, min(page_size, 200)) whitelisted_only = str(request.args.get("whitelisted_only", "false")).lower() in { "1", "true", "yes", "on", } # Query posts directly to avoid stale relationship cache base_query = Post.query.filter_by(feed_id=feed.id) if whitelisted_only: base_query = base_query.filter_by(whitelisted=True) ordered_query = base_query.order_by( Post.release_date.desc().nullslast(), Post.id.desc() ) total_posts = ordered_query.count() whitelisted_total = Post.query.filter_by(feed_id=feed.id, whitelisted=True).count() db_posts = ordered_query.offset((page - 1) * page_size).limit(page_size).all() posts = [ { "id": post.id, "guid": post.guid, "title": post.title, "description": post.description, "release_date": ( 
post.release_date.isoformat() if post.release_date else None ), "duration": post.duration, "whitelisted": post.whitelisted, "has_processed_audio": post.processed_audio_path is not None, "has_unprocessed_audio": post.unprocessed_audio_path is not None, "download_url": post.download_url, "image_url": post.image_url, "download_count": post.download_count, } for post in db_posts ] total_pages = math.ceil(total_posts / page_size) if total_posts else 0 return flask.jsonify( { "items": posts, "page": page, "page_size": page_size, "total": total_posts, "total_pages": total_pages, "whitelisted_total": whitelisted_total, } ) @post_bp.route("/api/posts//processing-estimate", methods=["GET"]) def api_post_processing_estimate(p_guid: str) -> ResponseReturnValue: post = Post.query.filter_by(guid=p_guid).first() if post is None: return flask.make_response(flask.jsonify({"error": "Post not found"}), 404) feed = db.session.get(Feed, post.feed_id) if feed is None: return flask.make_response(flask.jsonify({"error": "Feed not found"}), 404) _, error = require_admin("estimate processing costs") if error: return error minutes = max(1.0, float(post.duration or 0) / 60.0) if post.duration else 60.0 return flask.jsonify( { "post_guid": post.guid, "estimated_minutes": minutes, "can_process": True, "reason": None, } ) @post_bp.route("/post//json", methods=["GET"]) def get_post_json(p_guid: str) -> flask.Response: logger.info(f"API request for post details with GUID: {p_guid}") post = Post.query.filter_by(guid=p_guid).first() if post is None: return flask.make_response(jsonify({"error": "Post not found"}), 404) segment_count = post.segments.count() transcript_segments = [] if segment_count > 0: sample_segments = post.segments.limit(5).all() for segment in sample_segments: transcript_segments.append( { "id": segment.id, "sequence_num": segment.sequence_num, "start_time": segment.start_time, "end_time": segment.end_time, "text": ( segment.text[:100] + "..." 
if len(segment.text) > 100 else segment.text ), } ) whisper_model_calls = [] for model_call in post.model_calls.filter( ModelCall.model_name.like("%whisper%") ).all(): whisper_model_calls.append( { "id": model_call.id, "model_name": model_call.model_name, "status": model_call.status, "first_segment": model_call.first_segment_sequence_num, "last_segment": model_call.last_segment_sequence_num, "timestamp": ( model_call.timestamp.isoformat() if model_call.timestamp else None ), "response": ( model_call.response[:100] + "..." if model_call.response and len(model_call.response) > 100 else model_call.response ), "error": model_call.error_message, } ) post_data = { "id": post.id, "guid": post.guid, "title": post.title, "feed_id": post.feed_id, "unprocessed_audio_path": post.unprocessed_audio_path, "processed_audio_path": post.processed_audio_path, "has_unprocessed_audio": post.unprocessed_audio_path is not None, "has_processed_audio": post.processed_audio_path is not None, "transcript_segment_count": segment_count, "transcript_sample": transcript_segments, "model_call_count": post.model_calls.count(), "whisper_model_calls": whisper_model_calls, "whitelisted": post.whitelisted, "download_count": post.download_count, } return flask.jsonify(post_data) @post_bp.route("/post//debug", methods=["GET"]) def post_debug(p_guid: str) -> flask.Response: """Debug view for a post, showing model calls, transcript segments, and identifications.""" post = Post.query.filter_by(guid=p_guid).first() if post is None: return flask.make_response(("Post not found", 404)) model_calls = ( ModelCall.query.filter_by(post_id=post.id) .order_by(ModelCall.model_name, ModelCall.first_segment_sequence_num) .all() ) transcript_segments = post.segments.all() identifications = ( Identification.query.join(TranscriptSegment) .filter(TranscriptSegment.post_id == post.id) .order_by(TranscriptSegment.sequence_num) .all() ) model_call_statuses, model_types = count_model_calls(model_calls) content_segments = sum(1 
for i in identifications if i.label == "content") ad_segments = sum(1 for i in identifications if i.label == "ad") stats = { "total_segments": len(transcript_segments), "total_model_calls": len(model_calls), "total_identifications": len(identifications), "content_segments": content_segments, "ad_segments_count": ad_segments, "model_call_statuses": model_call_statuses, "model_types": model_types, "download_count": post.download_count, } return flask.make_response( flask.render_template( "post_debug.html", post=post, model_calls=model_calls, transcript_segments=transcript_segments, identifications=identifications, stats=stats, ), 200, ) @post_bp.route("/api/posts//stats", methods=["GET"]) def api_post_stats(p_guid: str) -> flask.Response: """Get processing statistics for a post in JSON format.""" post = Post.query.filter_by(guid=p_guid).first() if post is None: return flask.make_response(flask.jsonify({"error": "Post not found"}), 404) model_calls = ( ModelCall.query.filter_by(post_id=post.id) .order_by(ModelCall.model_name, ModelCall.first_segment_sequence_num) .all() ) transcript_segments = post.segments.all() identifications = ( Identification.query.join(TranscriptSegment) .filter(TranscriptSegment.post_id == post.id) .order_by(TranscriptSegment.sequence_num) .all() ) model_call_statuses: Dict[str, int] = {} model_types: Dict[str, int] = {} for call in model_calls: if call.status not in model_call_statuses: model_call_statuses[call.status] = 0 model_call_statuses[call.status] += 1 if call.model_name not in model_types: model_types[call.model_name] = 0 model_types[call.model_name] += 1 content_segments = sum(1 for i in identifications if i.label == "content") ad_segments = sum(1 for i in identifications if i.label == "ad") # Refined ad windows are written by boundary refinement and are used for precise # cutting. 
We also derive a UI-only "mixed" flag for segments that overlap a # refined ad window but are not fully contained by it (i.e., segment contains # both content and ad). raw_refined = getattr(post, "refined_ad_boundaries", None) or [] refined_windows = parse_refined_windows(raw_refined) model_call_details = [] for call in model_calls: model_call_details.append( { "id": call.id, "model_name": call.model_name, "status": call.status, "segment_range": f"{call.first_segment_sequence_num}-{call.last_segment_sequence_num}", "first_segment_sequence_num": call.first_segment_sequence_num, "last_segment_sequence_num": call.last_segment_sequence_num, "timestamp": call.timestamp.isoformat() if call.timestamp else None, "retry_attempts": call.retry_attempts, "error_message": call.error_message, "prompt": call.prompt, "response": call.response, } ) transcript_segments_data = [] segment_mixed_by_id: Dict[int, bool] = {} for segment in transcript_segments: segment_identifications = [ i for i in identifications if i.transcript_segment_id == segment.id ] has_ad_label = any(i.label == "ad" for i in segment_identifications) primary_label = "ad" if has_ad_label else "content" seg_start = float(segment.start_time) seg_end = float(segment.end_time) mixed = bool(has_ad_label) and is_mixed_segment( seg_start=seg_start, seg_end=seg_end, refined_windows=refined_windows ) segment_mixed_by_id[int(segment.id)] = mixed transcript_segments_data.append( { "id": segment.id, "sequence_num": segment.sequence_num, "start_time": round(segment.start_time, 1), "end_time": round(segment.end_time, 1), "text": segment.text, "primary_label": primary_label, "mixed": mixed, "identifications": [ { "id": ident.id, "label": ident.label, "confidence": ( round(ident.confidence, 2) if ident.confidence else None ), "model_call_id": ident.model_call_id, } for ident in segment_identifications ], } ) identifications_data = [] for identification in identifications: segment = identification.transcript_segment 
# NOTE(review): the extracted rule read "/api/posts//whitelist", which never
# binds p_guid; restored the <p_guid> URL converter to match the signature.
@post_bp.route("/api/posts/<p_guid>/whitelist", methods=["POST"])
def api_toggle_whitelist(p_guid: str) -> ResponseReturnValue:
    """Toggle whitelist status for a post via API (admins only).

    Expects a JSON body with a boolean ``whitelisted`` field; an optional
    truthy ``trigger_processing`` field starts processing after whitelisting.
    Returns the post's guid, its new whitelist state, and (when triggered)
    the processing-job response.
    """
    post = Post.query.filter_by(guid=p_guid).first()
    if post is None:
        return flask.make_response(flask.jsonify({"error": "Post not found"}), 404)
    feed = db.session.get(Feed, post.feed_id)
    if feed is None:
        return flask.make_response(flask.jsonify({"error": "Feed not found"}), 404)
    user, error = require_admin("whitelist this episode")
    if error:
        return error
    # Belt-and-braces: require_admin may return a non-admin user in some
    # configurations, so the role is re-checked explicitly here.
    if user is not None and user.role != "admin":
        return (
            flask.jsonify(
                {
                    "error": "FORBIDDEN",
                    "message": "Only admins can change whitelist status.",
                }
            ),
            403,
        )
    data = request.get_json()
    if data is None or "whitelisted" not in data:
        return flask.make_response(
            flask.jsonify({"error": "Missing whitelisted field"}), 400
        )
    try:
        # All writes go through the writer service; wait=True so the change
        # is committed before we read the post back.
        writer_client.update(
            "Post", post.id, {"whitelisted": bool(data["whitelisted"])}, wait=True
        )
        # Refresh post object so the response reflects the committed value.
        db.session.expire(post)
    except Exception as e:  # pylint: disable=broad-except
        logger.error(f"Failed to toggle whitelist: {e}")
        return (
            flask.jsonify(
                {
                    "error": "Failed to update post",
                }
            ),
            500,
        )
    response_body: Dict[str, Any] = {
        "guid": post.guid,
        "whitelisted": post.whitelisted,
        "message": "Whitelist status updated successfully",
    }
    trigger_processing = bool(data.get("trigger_processing"))
    if post.whitelisted and trigger_processing:
        billing_user_id = getattr(user, "id", None)
        job_response = get_jobs_manager().start_post_processing(
            post.guid,
            priority="interactive",
            requested_by_user_id=billing_user_id,
            billing_user_id=billing_user_id,
        )
        response_body["processing_job"] = job_response
    return flask.jsonify(response_body)
""" feed = Feed.query.get_or_404(feed_id) _, error = require_admin("toggle whitelist for all posts") if error: return error if not feed.posts: return flask.jsonify( { "message": "No posts found in this feed", "whitelisted_count": 0, "total_count": 0, } ) all_whitelisted = all(post.whitelisted for post in feed.posts) new_status = not all_whitelisted try: result = writer_client.action( "toggle_whitelist_all_for_feed", {"feed_id": feed.id, "new_status": new_status}, wait=True, ) if not result or not result.success: raise RuntimeError(getattr(result, "error", "Unknown writer error")) updated = int((result.data or {}).get("updated_count") or 0) except Exception: # pylint: disable=broad-except return ( flask.jsonify( { "error": "Database busy, please retry", "retry_after_seconds": 1, } ), 503, ) whitelisted_count = Post.query.filter_by(feed_id=feed.id, whitelisted=True).count() total_count = Post.query.filter_by(feed_id=feed.id).count() return flask.jsonify( { "message": f"{'Whitelisted' if new_status else 'Unwhitelisted'} all posts", "whitelisted_count": whitelisted_count, "total_count": total_count, "all_whitelisted": new_status, "updated_count": updated, } ) @post_bp.route("/api/posts//process", methods=["POST"]) def api_process_post(p_guid: str) -> ResponseReturnValue: """Start processing a post and return immediately. Admin only. 
""" post = Post.query.filter_by(guid=p_guid).first() if not post: return ( flask.jsonify( { "status": "error", "error_code": "NOT_FOUND", "message": "Post not found", } ), 404, ) feed = db.session.get(Feed, post.feed_id) if feed is None: return ( flask.jsonify( { "status": "error", "error_code": "FEED_NOT_FOUND", "message": "Feed not found", } ), 404, ) user, error = require_admin("process this episode") if error: return error if not post.whitelisted: return ( flask.jsonify( { "status": "error", "error_code": "NOT_WHITELISTED", "message": "Post not whitelisted", } ), 400, ) if post.processed_audio_path and os.path.exists(post.processed_audio_path): return flask.jsonify( { "status": "completed", "message": "Post already processed", "download_url": f"/api/posts/{p_guid}/download", } ) billing_user_id = getattr(user, "id", None) try: result = get_jobs_manager().start_post_processing( p_guid, priority="interactive", requested_by_user_id=billing_user_id, billing_user_id=billing_user_id, ) status_code = 200 if result.get("status") in ("started", "completed") else 400 return flask.jsonify(result), status_code except Exception as e: logger.error(f"Failed to start processing job for {p_guid}: {e}") return ( flask.jsonify( { "status": "error", "error_code": "JOB_START_FAILED", "message": f"Failed to start processing job: {str(e)}", } ), 500, ) @post_bp.route("/api/posts//reprocess", methods=["POST"]) def api_reprocess_post(p_guid: str) -> ResponseReturnValue: """Clear all processing data for a post and start processing from scratch. Admin only. 
""" logger.info("[API] Reprocess requested for post_guid=%s", p_guid) post = Post.query.filter_by(guid=p_guid).first() if not post: logger.warning("[API] Reprocess: post not found for guid=%s", p_guid) return ( flask.jsonify( { "status": "error", "error_code": "NOT_FOUND", "message": "Post not found", } ), 404, ) feed = db.session.get(Feed, post.feed_id) if feed is None: logger.warning( "[API] Reprocess: feed not found for guid=%s feed_id=%s", p_guid, getattr(post, "feed_id", None), ) return ( flask.jsonify( { "status": "error", "error_code": "FEED_NOT_FOUND", "message": "Feed not found", } ), 404, ) user, error = require_admin("reprocess this episode") if error: logger.warning("[API] Reprocess: auth error for guid=%s", p_guid) return error if user and user.role != "admin": logger.warning( "[API] Reprocess: non-admin user attempted reprocess guid=%s user_id=%s role=%s", p_guid, getattr(user, "id", None), getattr(user, "role", None), ) return ( flask.jsonify( { "status": "error", "error_code": "REPROCESS_FORBIDDEN", "message": "Only admins can reprocess episodes.", } ), 403, ) if not post.whitelisted: logger.info( "[API] Reprocess: post not whitelisted guid=%s post_id=%s", p_guid, getattr(post, "id", None), ) return ( flask.jsonify( { "status": "error", "error_code": "NOT_WHITELISTED", "message": "Post not whitelisted", } ), 400, ) billing_user_id = getattr(user, "id", None) try: logger.info( "[API] Reprocess: cancelling jobs and clearing processing data guid=%s post_id=%s", p_guid, getattr(post, "id", None), ) get_jobs_manager().cancel_post_jobs(p_guid) clear_post_processing_data(post) logger.info( "[API] Reprocess: starting post processing guid=%s post_id=%s", p_guid, getattr(post, "id", None), ) result = get_jobs_manager().start_post_processing( p_guid, priority="interactive", requested_by_user_id=billing_user_id, billing_user_id=billing_user_id, ) status_code = 200 if result.get("status") in ("started", "completed") else 400 if result.get("status") == "started": 
result["message"] = "Post cleared and reprocessing started" logger.info( "[API] Reprocess: completed guid=%s status=%s code=%s", p_guid, result.get("status"), status_code, ) return flask.jsonify(result), status_code except Exception as e: logger.error(f"Failed to reprocess post {p_guid}: {e}", exc_info=True) return ( flask.jsonify( { "status": "error", "error_code": "REPROCESS_FAILED", "message": f"Failed to reprocess post: {str(e)}", } ), 500, ) @post_bp.route("/api/posts//status", methods=["GET"]) def api_post_status(p_guid: str) -> ResponseReturnValue: """Get the current processing status of a post via JobsManager.""" result = get_jobs_manager().get_post_status(p_guid) status_code = ( 200 if result.get("status") != "error" else (404 if result.get("error_code") == "NOT_FOUND" else 400) ) return flask.jsonify(result), status_code @post_bp.route("/api/posts//audio", methods=["GET"]) def api_get_post_audio(p_guid: str) -> ResponseReturnValue: """API endpoint to serve processed audio files with proper CORS headers.""" logger.info(f"API request for audio file with GUID: {p_guid}") post = Post.query.filter_by(guid=p_guid).first() if post is None: logger.warning(f"Post with GUID: {p_guid} not found") return flask.make_response( jsonify({"error": "Post not found", "error_code": "NOT_FOUND"}), 404 ) if not post.whitelisted: logger.warning(f"Post: {post.title} is not whitelisted") return flask.make_response( jsonify({"error": "Post not whitelisted", "error_code": "NOT_WHITELISTED"}), 403, ) if not post.processed_audio_path or not Path(post.processed_audio_path).exists(): logger.warning(f"Processed audio not found for post: {post.id}") return flask.make_response( jsonify( { "error": "Processed audio not available", "error_code": "AUDIO_NOT_READY", "message": "Post needs to be processed first", } ), 404, ) try: response = send_file( path_or_file=Path(post.processed_audio_path).resolve(), mimetype="audio/mpeg", as_attachment=False, ) response.headers["Accept-Ranges"] = 
"bytes" return response except Exception as e: # pylint: disable=broad-except logger.error(f"Error serving audio file for {p_guid}: {e}") return flask.make_response( jsonify( {"error": "Error serving audio file", "error_code": "SERVER_ERROR"} ), 500, ) @post_bp.route("/api/posts//download", methods=["GET"]) def api_download_post(p_guid: str) -> flask.Response: """API endpoint to download processed audio files.""" current_user = getattr(g, "current_user", None) if current_user: update_user_last_active(current_user.id) logger.info(f"Request to download post with GUID: {p_guid}") post = Post.query.filter_by(guid=p_guid).first() if post is None: logger.warning(f"Post with GUID: {p_guid} not found") return flask.make_response(("Post not found", 404)) whitelist_response = _ensure_whitelisted_for_download(post, p_guid) if whitelist_response: return whitelist_response if not post.processed_audio_path or not Path(post.processed_audio_path).exists(): return _missing_processed_audio_response(post, p_guid) try: response = send_file( path_or_file=Path(post.processed_audio_path).resolve(), mimetype="audio/mpeg", as_attachment=True, download_name=f"{post.title}.mp3", ) except Exception as e: # pylint: disable=broad-except logger.error(f"Error serving file for {p_guid}: {e}") return flask.make_response(("Error serving file", 500)) _increment_download_count(post) return response @post_bp.route("/api/posts//download/original", methods=["GET"]) def api_download_original_post(p_guid: str) -> flask.Response: """API endpoint to download original (unprocessed) audio files.""" logger.info(f"Request to download original post with GUID: {p_guid}") post = Post.query.filter_by(guid=p_guid).first() if post is None: logger.warning(f"Post with GUID: {p_guid} not found") return flask.make_response(("Post not found", 404)) if not post.whitelisted: logger.warning(f"Post: {post.title} is not whitelisted") return flask.make_response(("Post not whitelisted", 403)) if ( not post.unprocessed_audio_path 
def count_model_calls(
    model_calls: Iterable[Any],
) -> Tuple[Dict[str, int], Dict[str, int]]:
    """Tally model calls by status and by model name.

    Returns ``(status_counts, model_name_counts)``. A call whose ``status``
    or ``model_name`` attribute is missing or ``None`` is skipped for that
    particular tally only.
    """
    model_call_statuses: Dict[str, int] = {}
    model_types: Dict[str, int] = {}
    for call in model_calls:
        status = getattr(call, "status", None)
        model_name = getattr(call, "model_name", None)
        if status is not None:
            model_call_statuses[status] = model_call_statuses.get(status, 0) + 1
        if model_name is not None:
            model_types[model_name] = model_types.get(model_name, 0) + 1
    return model_call_statuses, model_types


def parse_refined_windows(raw_refined: Any) -> List[Tuple[float, float]]:
    """Extract valid ``(start, end)`` windows from raw refined-boundary data.

    ``raw_refined`` is treated as untrusted JSON-ish data: anything that is
    not a list of dicts carrying numeric ``refined_start`` < ``refined_end``
    is silently ignored.
    """
    refined_windows: List[Tuple[float, float]] = []
    if not isinstance(raw_refined, list):
        return refined_windows
    for item in raw_refined:
        if not isinstance(item, dict):
            continue
        start_raw = item.get("refined_start")
        end_raw = item.get("refined_end")
        if start_raw is None or end_raw is None:
            continue
        try:
            start_v = float(start_raw)
            end_v = float(end_raw)
        # Narrowed from a bare `except Exception`: only conversion failures
        # are expected here; unrelated bugs should surface, not be swallowed.
        except (TypeError, ValueError):
            continue
        # Zero-length or inverted windows are meaningless; drop them.
        if end_v > start_v:
            refined_windows.append((start_v, end_v))
    return refined_windows


def is_mixed_segment(
    *, seg_start: float, seg_end: float, refined_windows: List[Tuple[float, float]]
) -> bool:
    """Return True if the segment overlaps a window without being contained.

    A "mixed" segment straddles a refined ad-window boundary, i.e. it holds
    both ad and content audio. Touching endpoints count as overlap.
    """
    for win_start, win_end in refined_windows:
        overlaps = seg_start <= win_end and seg_end >= win_start
        if not overlaps:
            continue
        fully_contained = seg_start >= win_start and seg_end <= win_end
        if not fully_contained:
            return True
    return False
if is_test: from shared.test_utils import create_standard_test_config config = create_standard_test_config() else: config = RuntimeConfig( llm_api_key=None, llm_model=DEFAULTS.LLM_DEFAULT_MODEL, openai_base_url=None, openai_max_tokens=DEFAULTS.OPENAI_DEFAULT_MAX_TOKENS, openai_timeout=DEFAULTS.OPENAI_DEFAULT_TIMEOUT_SEC, output=OutputConfig( fade_ms=DEFAULTS.OUTPUT_FADE_MS, min_ad_segement_separation_seconds=DEFAULTS.OUTPUT_MIN_AD_SEGMENT_SEPARATION_SECONDS, min_ad_segment_length_seconds=DEFAULTS.OUTPUT_MIN_AD_SEGMENT_LENGTH_SECONDS, min_confidence=DEFAULTS.OUTPUT_MIN_CONFIDENCE, ), processing=ProcessingConfig( num_segments_to_input_to_prompt=DEFAULTS.PROCESSING_NUM_SEGMENTS_TO_INPUT_TO_PROMPT, max_overlap_segments=DEFAULTS.PROCESSING_MAX_OVERLAP_SEGMENTS, ), background_update_interval_minute=DEFAULTS.APP_BACKGROUND_UPDATE_INTERVAL_MINUTE, post_cleanup_retention_days=DEFAULTS.APP_POST_CLEANUP_RETENTION_DAYS, llm_max_concurrent_calls=DEFAULTS.LLM_DEFAULT_MAX_CONCURRENT_CALLS, llm_max_retry_attempts=DEFAULTS.LLM_DEFAULT_MAX_RETRY_ATTEMPTS, llm_enable_token_rate_limiting=DEFAULTS.LLM_ENABLE_TOKEN_RATE_LIMITING, llm_max_input_tokens_per_call=DEFAULTS.LLM_MAX_INPUT_TOKENS_PER_CALL, llm_max_input_tokens_per_minute=DEFAULTS.LLM_MAX_INPUT_TOKENS_PER_MINUTE, automatically_whitelist_new_episodes=DEFAULTS.APP_AUTOMATICALLY_WHITELIST_NEW_EPISODES, number_of_episodes_to_whitelist_from_archive_of_new_feed=DEFAULTS.APP_NUM_EPISODES_TO_WHITELIST_FROM_ARCHIVE_OF_NEW_FEED, whisper=LocalWhisperConfig(model=DEFAULTS.WHISPER_LOCAL_MODEL), enable_public_landing_page=DEFAULTS.APP_ENABLE_PUBLIC_LANDING_PAGE, user_limit_total=DEFAULTS.APP_USER_LIMIT_TOTAL, developer_mode=os.environ.get("DEVELOPER_MODE", "false").lower() == "true", autoprocess_on_download=DEFAULTS.APP_AUTOPROCESS_ON_DOWNLOAD, ) ================================================ FILE: src/app/static/.gitignore ================================================ # This file ensures the static directory exists in the repository. 
# Frontend build assets are generated here but not committed to git. * !.gitignore ================================================ FILE: src/app/templates/index.html ================================================ Podly - Redirecting to New UI

Welcome to Podly

We've moved to a new and improved interface!

You will be automatically redirected to our new UI in 5 seconds.

{% set redirect_url = "http://" + request.host.split(':')[0] + ":5001" %} Go to New UI Now

If you are not redirected automatically, click the button above.

================================================ FILE: src/app/timeout_decorator.py ================================================ import functools import threading from typing import Any, Callable, List, Optional, TypeVar T = TypeVar("T") class TimeoutException(Exception): """Custom exception to indicate a timeout.""" def timeout_decorator(timeout: int) -> Callable[[Callable[..., T]], Callable[..., T]]: """ Decorator to enforce a timeout on a function. If the function execution exceeds the timeout, a TimeoutException is raised. """ def decorator(func: Callable[..., T]) -> Callable[..., T]: @functools.wraps(func) def wrapper(*args: Any, **kwargs: Any) -> T: timeout_flag = threading.Event() result: List[Optional[T]] = [None] def target() -> None: try: result[0] = func(*args, **kwargs) except Exception as e: # pylint: disable=broad-exception-caught print(f"Exception in thread: {e}") finally: timeout_flag.set() thread = threading.Thread(target=target) thread.start() thread.join(timeout) if not timeout_flag.is_set(): raise TimeoutException( f"Function '{func.__name__}' exceeded timeout of {timeout} seconds." ) return result[0] # type: ignore return wrapper return decorator ================================================ FILE: src/app/writer/__init__.py ================================================ from .executor import CommandExecutor from .service import run_writer_service __all__ = ["CommandExecutor", "run_writer_service"] ================================================ FILE: src/app/writer/__main__.py ================================================ from .service import run_writer_service if __name__ == "__main__": run_writer_service() ================================================ FILE: src/app/writer/actions/__init__.py ================================================ """Writer action function re-exports. Mypy runs with `--no-implicit-reexport`, so imports use explicit aliasing. 
""" # pylint: disable=useless-import-alias from .cleanup import ( cleanup_missing_audio_paths_action as cleanup_missing_audio_paths_action, ) from .cleanup import cleanup_processed_post_action as cleanup_processed_post_action from .cleanup import ( clear_post_processing_data_action as clear_post_processing_data_action, ) from .feeds import add_feed_action as add_feed_action from .feeds import create_dev_test_feed_action as create_dev_test_feed_action from .feeds import create_feed_access_token_action as create_feed_access_token_action from .feeds import delete_feed_cascade_action as delete_feed_cascade_action from .feeds import ( ensure_user_feed_membership_action as ensure_user_feed_membership_action, ) from .feeds import increment_download_count_action as increment_download_count_action from .feeds import refresh_feed_action as refresh_feed_action from .feeds import ( remove_user_feed_membership_action as remove_user_feed_membership_action, ) from .feeds import ( toggle_whitelist_all_for_feed_action as toggle_whitelist_all_for_feed_action, ) from .feeds import touch_feed_access_token_action as touch_feed_access_token_action from .feeds import update_feed_settings_action as update_feed_settings_action from .feeds import ( whitelist_latest_post_for_feed_action as whitelist_latest_post_for_feed_action, ) from .feeds import whitelist_post_action as whitelist_post_action from .jobs import cancel_existing_jobs_action as cancel_existing_jobs_action from .jobs import cleanup_stale_jobs_action as cleanup_stale_jobs_action from .jobs import clear_all_jobs_action as clear_all_jobs_action from .jobs import create_job_action as create_job_action from .jobs import dequeue_job_action as dequeue_job_action from .jobs import mark_cancelled_action as mark_cancelled_action from .jobs import reassign_pending_jobs_action as reassign_pending_jobs_action from .jobs import update_job_status_action as update_job_status_action from .processor import insert_identifications_action as 
def cleanup_missing_audio_paths_action(params: Dict[str, Any]) -> int:
    """Null out audio paths that no longer exist on disk for whitelisted posts.

    For each affected post, the most recent non-active processing job is reset
    to ``pending`` so the post gets reprocessed. Returns the number of posts
    whose paths were cleared. ``params`` is unused (writer-action signature).
    """
    # Candidates: whitelisted posts that claim to have at least one audio file.
    inconsistent_posts = Post.query.filter(
        Post.whitelisted,
        (
            (Post.unprocessed_audio_path.isnot(None))
            | (Post.processed_audio_path.isnot(None))
        ),
    ).all()
    count = 0
    for post in inconsistent_posts:
        changed = False
        if post.processed_audio_path and not os.path.exists(post.processed_audio_path):
            post.processed_audio_path = None
            changed = True
        if post.unprocessed_audio_path and not os.path.exists(
            post.unprocessed_audio_path
        ):
            post.unprocessed_audio_path = None
            changed = True
        if changed:
            latest_job = (
                ProcessingJob.query.filter_by(post_guid=post.guid)
                .order_by(ProcessingJob.created_at.desc())
                .first()
            )
            # Don't touch jobs that are currently queued or executing.
            if latest_job and latest_job.status not in {"pending", "running"}:
                latest_job.status = "pending"
                latest_job.current_step = 0
                latest_job.progress_percentage = 0.0
                latest_job.step_name = "Not started"
                latest_job.error_message = None
                latest_job.started_at = None
                latest_job.completed_at = None
            count += 1
    return count
completed post_id=%s", post_id ) return {"post_id": post.id} def cleanup_processed_post_action(params: Dict[str, Any]) -> Dict[str, Any]: post_id = params.get("post_id") if not post_id: raise ValueError("post_id is required") post = db.session.get(Post, int(post_id)) if not post: raise ValueError(f"Post {post_id} not found") logger.info("[WRITER] cleanup_processed_post_action: post_id=%s", post_id) # Remove processing artifacts and dependent rows. clear_post_processing_data_action({"post_id": post.id}) post.whitelisted = False recalculate_run_counts(db.session) logger.info("[WRITER] cleanup_processed_post_action: completed post_id=%s", post_id) return {"post_id": post.id} ================================================ FILE: src/app/writer/actions/feeds.py ================================================ import hashlib import secrets import uuid from datetime import datetime from typing import Any, Dict from sqlalchemy import func from app.extensions import db from app.jobs_manager_run_service import recalculate_run_counts from app.models import ( Feed, FeedAccessToken, Identification, ModelCall, Post, ProcessingJob, TranscriptSegment, UserFeed, ) def refresh_feed_action(params: Dict[str, Any]) -> Dict[str, Any]: feed_id = params.get("feed_id") updates = params.get("updates", {}) new_posts_data = params.get("new_posts", []) feed = db.session.get(Feed, feed_id) if not feed: raise ValueError(f"Feed {feed_id} not found") for k, v in updates.items(): setattr(feed, k, v) created_posts = [] for post_data in new_posts_data: # Handle datetime deserialization if "release_date" in post_data and isinstance(post_data["release_date"], str): post_data["release_date"] = datetime.fromisoformat( post_data["release_date"] ) post = Post(**post_data) db.session.add(post) created_posts.append(post) db.session.flush() for post in created_posts: if post.whitelisted: job = ProcessingJob( id=str(uuid.uuid4()), post_guid=post.guid, status="pending", current_step=0, total_steps=4, 
progress_percentage=0.0, created_at=datetime.utcnow(), ) db.session.add(job) recalculate_run_counts(db.session) return {"feed_id": feed.id, "new_posts_count": len(created_posts)} def add_feed_action(params: Dict[str, Any]) -> Dict[str, Any]: feed_data = params.get("feed") if not isinstance(feed_data, dict): raise ValueError("feed data must be a dictionary") posts_data = params.get("posts", []) feed = Feed(**feed_data) db.session.add(feed) db.session.flush() created_posts = [] for post_data in posts_data: post_data["feed_id"] = feed.id if "release_date" in post_data and isinstance(post_data["release_date"], str): post_data["release_date"] = datetime.fromisoformat( post_data["release_date"] ) post = Post(**post_data) db.session.add(post) created_posts.append(post) db.session.flush() for post in created_posts: if post.whitelisted: job = ProcessingJob( id=str(uuid.uuid4()), post_guid=post.guid, status="pending", current_step=0, total_steps=4, progress_percentage=0.0, created_at=datetime.utcnow(), ) db.session.add(job) recalculate_run_counts(db.session) return {"feed_id": feed.id} def update_feed_settings_action(params: Dict[str, Any]) -> Dict[str, Any]: feed_id = params.get("feed_id") if not feed_id: raise ValueError("feed_id is required") feed = db.session.get(Feed, int(feed_id)) if not feed: raise ValueError(f"Feed {feed_id} not found") if "auto_whitelist_new_episodes_override" in params: feed.auto_whitelist_new_episodes_override = params.get( "auto_whitelist_new_episodes_override" ) db.session.flush() return {"feed_id": feed.id} def increment_download_count_action(params: Dict[str, Any]) -> Dict[str, Any]: post_id = params.get("post_id") if not post_id: raise ValueError("post_id is required") updated = Post.query.filter_by(id=post_id).update( {Post.download_count: func.coalesce(Post.download_count, 0) + 1}, synchronize_session=False, ) return {"post_id": post_id, "updated": updated} def whitelist_post_action(params: Dict[str, Any]) -> Dict[str, Any]: post_id = 
params.get("post_id") if not post_id: raise ValueError("post_id is required") updated = Post.query.filter_by(id=int(post_id)).update( {Post.whitelisted: True}, synchronize_session=False ) return {"post_id": int(post_id), "updated": int(updated)} def ensure_user_feed_membership_action(params: Dict[str, Any]) -> Dict[str, Any]: feed_id = params.get("feed_id") user_id = params.get("user_id") if not feed_id or not user_id: raise ValueError("feed_id and user_id are required") feed_id_i = int(feed_id) user_id_i = int(user_id) previous_count = int(UserFeed.query.filter_by(feed_id=feed_id_i).count()) existing = UserFeed.query.filter_by(feed_id=feed_id_i, user_id=user_id_i).first() if existing: return {"created": False, "previous_count": previous_count} db.session.add(UserFeed(feed_id=feed_id_i, user_id=user_id_i)) db.session.flush() return {"created": True, "previous_count": previous_count} def remove_user_feed_membership_action(params: Dict[str, Any]) -> Dict[str, Any]: feed_id = params.get("feed_id") user_id = params.get("user_id") if not feed_id or not user_id: raise ValueError("feed_id and user_id are required") removed = UserFeed.query.filter_by( feed_id=int(feed_id), user_id=int(user_id) ).delete(synchronize_session=False) return {"removed": int(removed)} def whitelist_latest_post_for_feed_action(params: Dict[str, Any]) -> Dict[str, Any]: feed_id = params.get("feed_id") if not feed_id: raise ValueError("feed_id is required") latest = ( Post.query.filter_by(feed_id=int(feed_id)) .order_by(Post.release_date.desc().nullslast(), Post.id.desc()) .first() ) if not latest: return {"updated": False} if latest.whitelisted: return {"updated": False, "post_guid": latest.guid} latest.whitelisted = True db.session.flush() return {"updated": True, "post_guid": latest.guid} def toggle_whitelist_all_for_feed_action(params: Dict[str, Any]) -> Dict[str, Any]: feed_id = params.get("feed_id") new_status = params.get("new_status") if feed_id is None or new_status is None: raise 
# NOTE(review): this extract stores many statements per physical line; below is
# the same token stream re-wrapped with conventional indentation.
# --- tail of a whitelist-toggle action whose head lies above this chunk;
# presumably the continuation of a `raise` — TODO confirm against full file ---
        ValueError("feed_id and new_status are required")
    # Bulk-update whitelisted flag for every post of the feed in one UPDATE.
    updated = Post.query.filter_by(feed_id=int(feed_id)).update(
        {Post.whitelisted: bool(new_status)},
        synchronize_session=False,
    )
    return {"feed_id": int(feed_id), "updated_count": int(updated)}


# Create a synthetic feed with pre-completed posts/jobs for dev/testing.
# Idempotent on rss_url: returns the existing feed with created=False.
def create_dev_test_feed_action(params: Dict[str, Any]) -> Dict[str, Any]:
    rss_url = params.get("rss_url")
    title = params.get("title")
    if not rss_url or not title:
        raise ValueError("rss_url and title are required")
    existing = Feed.query.filter_by(rss_url=rss_url).first()
    if existing:
        return {"feed_id": existing.id, "created": False}
    feed = Feed(
        title=title,
        rss_url=rss_url,
        image_url=params.get("image_url"),
        description=params.get("description"),
        author=params.get("author"),
    )
    db.session.add(feed)
    db.session.flush()  # obtain feed.id before building posts
    now = datetime.utcnow()
    # Use a larger default so dev/test feeds exercise paging in the UI
    post_count = int(params.get("post_count") or 30)
    for i in range(1, post_count + 1):
        guid = f"{params.get('guid_prefix') or 'test-guid'}-{feed.id}-{i}"
        post = Post(
            feed_id=feed.id,
            guid=guid,
            title=f"Test Episode {i}",
            download_url=f"{params.get('download_url_prefix') or 'http://test-feed'}/{feed.id}/{i}.mp3",
            release_date=now,
            duration=3600,
            description=f"Test episode description {i}",
            whitelisted=True,
        )
        db.session.add(post)
        db.session.flush()
        # Each test post gets a pre-completed processing job.
        job = ProcessingJob(
            post_guid=post.guid,
            status="completed",
            current_step=4,
            total_steps=4,
            progress_percentage=100.0,
            started_at=now,
            completed_at=now,
            step_name="completed",
        )
        db.session.add(job)
    return {"feed_id": feed.id, "created": True}


# Delete a feed and all dependent rows (segments, identifications, model
# calls, jobs, posts, tokens, memberships), batched to bound memory/locks.
def delete_feed_cascade_action(params: Dict[str, Any]) -> Dict[str, Any]:
    feed_id = params.get("feed_id")
    if not feed_id:
        raise ValueError("feed_id is required")
    feed_id_i = int(feed_id)
    feed = db.session.get(Feed, feed_id_i)
    if not feed:
        return {"deleted": False}
    post_rows = db.session.query(Post.id, Post.guid).filter_by(feed_id=feed_id_i).all()
    post_ids = [row[0] for row in post_rows]
    post_guids = [row[1] for row in post_rows]
    batch_size = 200
    if post_ids:
        # Delete transcript segments (and their identifications first, to
        # satisfy the FK) in batches of `batch_size` until none remain.
        while True:
            seg_ids = [
                seg_id
                for (seg_id,) in db.session.query(TranscriptSegment.id)
                .filter(TranscriptSegment.post_id.in_(post_ids))
                .limit(batch_size)
                .all()
            ]
            if not seg_ids:
                break
            db.session.query(Identification).filter(
                Identification.transcript_segment_id.in_(seg_ids)
            ).delete(synchronize_session=False)
            db.session.query(TranscriptSegment).filter(
                TranscriptSegment.id.in_(seg_ids)
            ).delete(synchronize_session=False)
        # Batched delete of model calls for these posts.
        while True:
            mc_ids = [
                mc_id
                for (mc_id,) in db.session.query(ModelCall.id)
                .filter(ModelCall.post_id.in_(post_ids))
                .limit(batch_size)
                .all()
            ]
            if not mc_ids:
                break
            db.session.query(ModelCall).filter(ModelCall.id.in_(mc_ids)).delete(
                synchronize_session=False
            )
        # Batched delete of processing jobs keyed by post GUID.
        while True:
            job_ids = [
                job_id
                for (job_id,) in db.session.query(ProcessingJob.id)
                .filter(ProcessingJob.post_guid.in_(post_guids))
                .limit(batch_size)
                .all()
            ]
            if not job_ids:
                break
            db.session.query(ProcessingJob).filter(
                ProcessingJob.id.in_(job_ids)
            ).delete(synchronize_session=False)
        db.session.query(Post).filter(Post.id.in_(post_ids)).delete(
            synchronize_session=False
        )
    FeedAccessToken.query.filter(FeedAccessToken.feed_id == feed_id_i).delete(
        synchronize_session=False
    )
    UserFeed.query.filter(UserFeed.feed_id == feed_id_i).delete(
        synchronize_session=False
    )
    db.session.delete(feed)
    return {"deleted": True, "feed_id": feed_id_i}


# SHA-256 hex digest of a token secret (stored alongside the plaintext).
def _hash_token(secret_value: str) -> str:
    return hashlib.sha256(secret_value.encode("utf-8")).hexdigest()


# Get-or-create a feed access token for a user; feed_id=None means an
# aggregate (all-feeds) token. Regenerates the secret for legacy rows that
# lack a stored plaintext secret.
def create_feed_access_token_action(params: Dict[str, Any]) -> Dict[str, Any]:
    user_id = params.get("user_id")
    feed_id = params.get("feed_id")
    if not user_id:
        raise ValueError("user_id is required")
    # feed_id can be None for aggregate tokens
    query = FeedAccessToken.query.filter_by(user_id=int(user_id), revoked=False)
    if feed_id is not None:
        query = query.filter_by(feed_id=int(feed_id))
    else:
        query = query.filter(FeedAccessToken.feed_id.is_(None))
    existing = query.first()
    if existing is not None:
        if existing.token_secret:
            return {"token_id": existing.token_id, "secret": existing.token_secret}
        secret_value = secrets.token_urlsafe(18)
        existing.token_hash = _hash_token(secret_value)
        existing.token_secret = secret_value
        db.session.flush()
        return {"token_id": existing.token_id, "secret": secret_value}
    token_id = uuid.uuid4().hex
    secret_value = secrets.token_urlsafe(18)
    token = FeedAccessToken(
        token_id=token_id,
        token_hash=_hash_token(secret_value),
        token_secret=secret_value,
        feed_id=int(feed_id) if feed_id is not None else None,
        user_id=int(user_id),
    )
    db.session.add(token)
    db.session.flush()
    return {"token_id": token_id, "secret": secret_value}


# Record last-used time for a token; backfills the plaintext secret when the
# caller supplies one and the row has none.
def touch_feed_access_token_action(params: Dict[str, Any]) -> Dict[str, Any]:
    token_id = params.get("token_id")
    secret_value = params.get("secret")
    if not token_id:
        raise ValueError("token_id is required")
    token = FeedAccessToken.query.filter_by(token_id=token_id, revoked=False).first()
    if token is None:
        return {"updated": False}
    token.last_used_at = datetime.utcnow()
    if token.token_secret is None and secret_value:
        token.token_secret = str(secret_value)
    db.session.flush()
    return {"updated": True}


================================================ FILE: src/app/writer/actions/jobs.py ================================================

from datetime import datetime, timedelta
from typing import Any, Dict, Optional

from app.extensions import db
from app.jobs_manager_run_service import recalculate_run_counts
from app.models import ProcessingJob


# Claim the oldest pending job if no job is currently running; marks it
# running and stamps started_at. Returns None when nothing is claimable.
def dequeue_job_action(params: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    run_id = params.get("run_id")
    # Check for running jobs
    running_job = (
        ProcessingJob.query.filter(ProcessingJob.status == "running")
        .order_by(ProcessingJob.started_at.desc().nullslast())
        .first()
    )
    if running_job:
        return None
    job = (
        ProcessingJob.query.filter(ProcessingJob.status == "pending")
        .order_by(ProcessingJob.created_at.asc())
        .first()
    )
    if not job:
        return None
    job.status = "running"
    job.started_at = datetime.utcnow()
    # (statement continues on the next line of this extract)
    if run_id and
# NOTE(review): re-wrapped token stream; first line continues the
# `if run_id and` condition cut at the previous line of this extract.
        job.jobs_manager_run_id != run_id:
        job.jobs_manager_run_id = run_id
    return {"job_id": job.id, "post_guid": job.post_guid}


# Delete all jobs created before the cutoff, regardless of status.
# NOTE(review): this also removes jobs still marked running — confirm intended.
def cleanup_stale_jobs_action(params: Dict[str, Any]) -> Dict[str, Any]:
    older_than_seconds = params.get("older_than_seconds", 3600)
    cutoff = datetime.utcnow() - timedelta(seconds=older_than_seconds)
    old_jobs = ProcessingJob.query.filter(ProcessingJob.created_at < cutoff).all()
    count = len(old_jobs)
    for job in old_jobs:
        db.session.delete(job)
    return {"count": count}


# Delete every processing job; returns the number removed.
def clear_all_jobs_action(params: Dict[str, Any]) -> int:
    all_jobs = ProcessingJob.query.all()
    count = len(all_jobs)
    for job in all_jobs:
        db.session.delete(job)
    return count


# Insert a job from a serialized dict (ISO created_at accepted) and refresh
# run counters when the job belongs to a jobs-manager run.
def create_job_action(params: Dict[str, Any]) -> Dict[str, Any]:
    job_data = params.get("job_data")
    if not isinstance(job_data, dict):
        raise ValueError("job_data must be a dictionary")
    # Convert date strings back to datetime objects if necessary
    if "created_at" in job_data and isinstance(job_data["created_at"], str):
        job_data["created_at"] = datetime.fromisoformat(job_data["created_at"])
    job = ProcessingJob(**job_data)
    db.session.add(job)
    if job.jobs_manager_run_id:
        recalculate_run_counts(db.session)
    db.session.flush()
    return {"job_id": job.id}


# Remove other pending/running jobs for the same post (dedup before a new
# attempt); returns how many were cancelled.
def cancel_existing_jobs_action(params: Dict[str, Any]) -> int:
    post_guid = params.get("post_guid")
    current_job_id = params.get("current_job_id")
    existing_jobs = (
        ProcessingJob.query.filter_by(post_guid=post_guid)
        .filter(
            ProcessingJob.status.in_(["pending", "running"]),
            ProcessingJob.id != current_job_id,
        )
        .all()
    )
    count = len(existing_jobs)
    for existing_job in existing_jobs:
        db.session.delete(existing_job)
    if count > 0:
        recalculate_run_counts(db.session)
    return count


# Update a job's status/step/progress, stamping started_at / completed_at on
# the relevant transitions; raises ValueError for unknown job ids.
def update_job_status_action(params: Dict[str, Any]) -> Dict[str, Any]:
    job_id = params.get("job_id")
    status = params.get("status")
    step = params.get("step")
    step_name = params.get("step_name")
    progress = params.get("progress")
    error_message = params.get("error_message")
    job = db.session.get(ProcessingJob, job_id)
    if not job:
        raise ValueError(f"Job {job_id} not found")
    job.status = status
    job.current_step = step
    job.step_name = step_name
    if progress is not None:
        job.progress_percentage = progress
    if error_message:
        job.error_message = error_message
    if status == "running" and not job.started_at:
        job.started_at = datetime.utcnow()
    elif (
        status in ["completed", "failed", "cancelled", "skipped"]
        and not job.completed_at
    ):
        job.completed_at = datetime.utcnow()
    if job.jobs_manager_run_id:
        recalculate_run_counts(db.session)
    return {"job_id": job.id, "status": job.status}


# Force a job to cancelled with an explanatory reason.
def mark_cancelled_action(params: Dict[str, Any]) -> Dict[str, Any]:
    job_id = params.get("job_id")
    reason = params.get("reason")
    job = db.session.get(ProcessingJob, job_id)
    if not job:
        raise ValueError(f"Job {job_id} not found")
    job.status = "cancelled"
    job.error_message = reason
    job.completed_at = datetime.utcnow()
    if job.jobs_manager_run_id:
        recalculate_run_counts(db.session)
    return {"job_id": job.id, "status": "cancelled"}


# Point all pending jobs at the given run id; returns the count moved.
def reassign_pending_jobs_action(params: Dict[str, Any]) -> int:
    run_id = params.get("run_id")
    if not run_id:
        return 0
    pending_jobs = (
        ProcessingJob.query.filter(ProcessingJob.status == "pending")
        .order_by(ProcessingJob.created_at.asc())
        .all()
    )
    reassigned = 0
    for job in pending_jobs:
        if job.jobs_manager_run_id != run_id:
            job.jobs_manager_run_id = run_id
            reassigned += 1
    if reassigned:
        recalculate_run_counts(db.session)
    return reassigned


================================================ FILE: src/app/writer/actions/processor.py ================================================

from __future__ import annotations

from datetime import datetime
from typing import Any, Dict, Iterable, List

from sqlalchemy.dialects.sqlite import insert as sqlite_insert
from sqlalchemy.exc import IntegrityError

from app.extensions import db
from app.models import Identification, ModelCall, TranscriptSegment


# Get-or-create a ModelCall row keyed by (post, model, segment range);
# body continues on the next line of this extract.
def upsert_model_call_action(params: Dict[str, Any]) -> Dict[str, Any]:
    post_id = params.get("post_id")
# NOTE(review): re-wrapped token stream; continues the body of
# upsert_model_call_action started on the previous line of this extract.
    model_name = params.get("model_name")
    first_seq = params.get("first_segment_sequence_num")
    last_seq = params.get("last_segment_sequence_num")
    prompt = params.get("prompt")
    if post_id is None or model_name is None or first_seq is None or last_seq is None:
        raise ValueError(
            "post_id, model_name, first_segment_sequence_num, last_segment_sequence_num are required"
        )
    if not isinstance(prompt, str) or not prompt:
        raise ValueError("prompt is required")

    # Latest matching row for this (post, model, segment-range) key.
    def _query() -> ModelCall | None:
        return (
            db.session.query(ModelCall)
            .filter_by(
                post_id=int(post_id),
                model_name=str(model_name),
                first_segment_sequence_num=int(first_seq),
                last_segment_sequence_num=int(last_seq),
            )
            .order_by(ModelCall.timestamp.desc())
            .first()
        )

    model_call = _query()
    if model_call is None:
        model_call = ModelCall(
            post_id=int(post_id),
            first_segment_sequence_num=int(first_seq),
            last_segment_sequence_num=int(last_seq),
            model_name=str(model_name),
            prompt=str(prompt),
            status="pending",
            timestamp=datetime.utcnow(),
            retry_attempts=0,
            error_message=None,
            response=None,
        )
        db.session.add(model_call)
        try:
            db.session.flush()
        except IntegrityError:
            # Lost a race to a concurrent insert: re-read the winner's row.
            db.session.rollback()
            model_call = _query()
            if model_call is None:
                raise
    # Match prior behavior: reset only when pending/failed_retries.
    if model_call.status in ["pending", "failed_retries"]:
        model_call.status = "pending"
        model_call.prompt = str(prompt)
        model_call.retry_attempts = 0
        model_call.error_message = None
        model_call.response = None
    db.session.flush()
    return {"model_call_id": int(model_call.id)}


# Whisper variant of the upsert: defaults the segment range to (0, -1) and
# applies caller-supplied (or default) reset fields unconditionally.
def upsert_whisper_model_call_action(params: Dict[str, Any]) -> Dict[str, Any]:
    post_id = params.get("post_id")
    model_name = params.get("model_name")
    first_seq = params.get("first_segment_sequence_num", 0)
    last_seq = params.get("last_segment_sequence_num", -1)
    prompt = params.get("prompt") or "Whisper transcription job"
    if post_id is None or model_name is None:
        raise ValueError("post_id and model_name are required")
    reset_fields: Dict[str, Any] = params.get("reset_fields") or {
        "status": "pending",
        "prompt": "Whisper transcription job",
        "retry_attempts": 0,
        "error_message": None,
        "response": None,
    }

    # Latest matching row for this (post, model, segment-range) key.
    def _query() -> ModelCall | None:
        return (
            db.session.query(ModelCall)
            .filter_by(
                post_id=int(post_id),
                model_name=str(model_name),
                first_segment_sequence_num=int(first_seq),
                last_segment_sequence_num=int(last_seq),
            )
            .order_by(ModelCall.timestamp.desc())
            .first()
        )

    model_call = _query()
    if model_call is None:
        model_call = ModelCall(
            post_id=int(post_id),
            model_name=str(model_name),
            first_segment_sequence_num=int(first_seq),
            last_segment_sequence_num=int(last_seq),
            prompt=str(prompt),
            status=str(reset_fields.get("status") or "pending"),
            retry_attempts=int(reset_fields.get("retry_attempts") or 0),
            error_message=reset_fields.get("error_message"),
            response=reset_fields.get("response"),
            timestamp=datetime.utcnow(),
        )
        db.session.add(model_call)
        try:
            db.session.flush()
        except IntegrityError:
            # Concurrent insert won; fall back to the existing row.
            db.session.rollback()
            model_call = _query()
            if model_call is None:
                raise
    # Only known attributes are applied from reset_fields.
    for k, v in reset_fields.items():
        if hasattr(model_call, k):
            setattr(model_call, k, v)
    db.session.flush()
    return {"model_call_id": int(model_call.id)}


# Coerce raw segment dicts to typed column values; non-dicts are dropped.
def _normalize_segments_payload(
    segments: Iterable[Dict[str, Any]],
) -> List[Dict[str, Any]]:
# NOTE(review): re-wrapped token stream; continues the body of
# _normalize_segments_payload started on the previous line of this extract.
    normalized: List[Dict[str, Any]] = []
    for seg in segments:
        if not isinstance(seg, dict):
            continue
        normalized.append(
            {
                "post_id": int(seg["post_id"]),
                "sequence_num": int(seg["sequence_num"]),
                "start_time": float(seg["start_time"]),
                "end_time": float(seg["end_time"]),
                "text": str(seg["text"]),
            }
        )
    return normalized


# Replace a post's transcript wholesale: deletes existing identifications and
# segments, bulk-inserts the new segments, and (optionally) marks the owning
# ModelCall successful with the new segment range.
def replace_transcription_action(params: Dict[str, Any]) -> Dict[str, Any]:
    post_id = params.get("post_id")
    segments = params.get("segments")
    model_call_id = params.get("model_call_id")
    if post_id is None:
        raise ValueError("post_id is required")
    if not isinstance(segments, list):
        raise ValueError("segments must be a list")
    post_id_i = int(post_id)
    seg_ids = [
        row[0]
        for row in db.session.query(TranscriptSegment.id)
        .filter(TranscriptSegment.post_id == post_id_i)
        .all()
    ]
    if seg_ids:
        # Identifications reference segments, so remove them first.
        db.session.query(Identification).filter(
            Identification.transcript_segment_id.in_(seg_ids)
        ).delete(synchronize_session=False)
    db.session.query(TranscriptSegment).filter(
        TranscriptSegment.post_id == post_id_i
    ).delete(synchronize_session=False)
    payload = []
    for i, seg in enumerate(segments):
        if not isinstance(seg, dict):
            continue
        payload.append(
            {
                "post_id": post_id_i,
                # Default sequence_num to the list position when absent.
                "sequence_num": int(seg.get("sequence_num", i)),
                "start_time": float(seg["start_time"]),
                "end_time": float(seg["end_time"]),
                "text": str(seg["text"]),
            }
        )
    if payload:
        db.session.execute(sqlite_insert(TranscriptSegment).values(payload))
    if model_call_id is not None:
        mc = db.session.get(ModelCall, int(model_call_id))
        if mc is not None:
            mc.first_segment_sequence_num = 0
            # NOTE(review): -1 when payload is empty — confirm acceptable.
            mc.last_segment_sequence_num = len(payload) - 1
            mc.response = f"{len(payload)} segments transcribed."
            mc.status = "success"
            mc.error_message = None
    db.session.flush()
    return {"post_id": post_id_i, "segment_count": len(payload)}


# Record a failure status/message on a ModelCall (default: failed_permanent).
def mark_model_call_failed_action(params: Dict[str, Any]) -> Dict[str, Any]:
    model_call_id = params.get("model_call_id")
    error_message = params.get("error_message")
    status = params.get("status", "failed_permanent")
    if model_call_id is None:
        raise ValueError("model_call_id is required")
    mc = db.session.get(ModelCall, int(model_call_id))
    if mc is None:
        return {"updated": False}
    mc.status = str(status)
    mc.error_message = str(error_message) if error_message is not None else None
    db.session.flush()
    return {"updated": True, "model_call_id": int(mc.id)}


# Bulk-insert identifications with INSERT OR IGNORE (duplicates skipped);
# label defaults to "ad". Returns the inserted rowcount.
def insert_identifications_action(params: Dict[str, Any]) -> Dict[str, Any]:
    identifications = params.get("identifications")
    if not isinstance(identifications, list):
        raise ValueError("identifications must be a list")
    values = []
    for ident in identifications:
        if not isinstance(ident, dict):
            continue
        values.append(
            {
                "transcript_segment_id": int(ident["transcript_segment_id"]),
                "model_call_id": int(ident["model_call_id"]),
                "label": str(ident.get("label") or "ad"),
                "confidence": ident.get("confidence"),
            }
        )
    if not values:
        return {"inserted": 0}
    stmt = sqlite_insert(Identification).values(values).prefix_with("OR IGNORE")
    result = db.session.execute(stmt)
    db.session.flush()
    return {"inserted": int(getattr(result, "rowcount", 0) or 0)}


# Delete the listed identification ids, then insert the replacements via
# insert_identifications_action.
def replace_identifications_action(params: Dict[str, Any]) -> Dict[str, Any]:
    delete_ids = params.get("delete_ids") or []
    new_identifications = params.get("new_identifications") or []
    if not isinstance(delete_ids, list) or not isinstance(new_identifications, list):
        raise ValueError("delete_ids and new_identifications must be lists")
    if delete_ids:
        db.session.query(Identification).filter(
            Identification.id.in_([int(i) for i in delete_ids])
        ).delete(synchronize_session=False)
    # (call continues on the next line of this extract)
    inserted = insert_identifications_action(
        {"identifications": new_identifications}
    ).get("inserted",
0) db.session.flush() return {"deleted": len(delete_ids), "inserted": int(inserted)} ================================================ FILE: src/app/writer/actions/system.py ================================================ import logging from datetime import datetime from typing import Any, Dict from app.extensions import db from app.jobs_manager_run_service import get_or_create_singleton_run from app.models import DiscordSettings logger = logging.getLogger("writer") def ensure_active_run_action(params: Dict[str, Any]) -> Dict[str, Any]: trigger = params.get("trigger", "system") context = params.get("context") logger.info( "[WRITER] ensure_active_run_action: trigger=%s context_keys=%s", trigger, list(context.keys()) if isinstance(context, dict) else None, ) run = get_or_create_singleton_run(db.session, trigger, context) db.session.flush() # Ensure ID is available logger.info( "[WRITER] ensure_active_run_action: obtained run_id=%s status=%s", getattr(run, "id", None), getattr(run, "status", None), ) return {"run_id": run.id} def update_discord_settings_action(params: Dict[str, Any]) -> Dict[str, Any]: settings = db.session.get(DiscordSettings, 1) if settings is None: settings = DiscordSettings(id=1) db.session.add(settings) for field in ( "client_id", "client_secret", "redirect_uri", "guild_ids", "allow_registration", ): if field in params: setattr(settings, field, params.get(field)) settings.updated_at = datetime.utcnow() db.session.flush() return {"updated": True} def update_combined_config_action(params: Dict[str, Any]) -> Dict[str, Any]: payload = params.get("payload") if not isinstance(payload, dict): raise ValueError("payload must be a dictionary") # Import locally to avoid cyclic dependencies from app.config_store import ( # pylint: disable=import-outside-toplevel hydrate_runtime_config_inplace, update_combined, ) updated = update_combined(payload) # Ensure the running process sees the new config immediately hydrate_runtime_config_inplace() # Reset processor 
# NOTE(review): re-wrapped token stream; the first line below is the tail of
# the comment "# Reset processor ..." cut at the previous line of this extract.
    # instance to pick up new config (e.g. litellm globals)
    # Import locally to avoid cyclic dependencies
    import importlib

    processor = importlib.import_module("app.processor")
    processor.ProcessorSingleton.reset_instance()
    if not isinstance(updated, dict):
        return {"updated": True}
    return updated


================================================ FILE: src/app/writer/actions/users.py ================================================

from datetime import datetime
from typing import Any, Dict

from app.extensions import db
from app.models import FeedAccessToken, User


# Create a user with a normalized (lowercased) username; validates password
# and role, and rejects duplicate usernames.
def create_user_action(params: Dict[str, Any]) -> Dict[str, Any]:
    username = (params.get("username") or "").strip().lower()
    password = params.get("password")
    role = params.get("role") or "user"
    if not username:
        raise ValueError("username is required")
    if not isinstance(password, str) or not password:
        raise ValueError("password is required")
    if role not in {"admin", "user"}:
        raise ValueError("role must be 'admin' or 'user'")
    if User.query.filter_by(username=username).first():
        raise ValueError("A user with that username already exists")
    user = User(username=username, role=role)
    user.set_password(password)
    db.session.add(user)
    db.session.flush()
    return {"user_id": user.id}


# Set a new password for an existing user.
def update_user_password_action(params: Dict[str, Any]) -> Dict[str, Any]:
    user_id = params.get("user_id")
    new_password = params.get("new_password")
    if not user_id:
        raise ValueError("user_id is required")
    if not isinstance(new_password, str) or not new_password:
        raise ValueError("new_password is required")
    user = db.session.get(User, int(user_id))
    if not user:
        raise ValueError(f"User {user_id} not found")
    user.set_password(new_password)
    db.session.flush()
    return {"user_id": user.id}


# Delete a user and their feed access tokens (which must go first — see the
# inline note about the non-nullable FK).
def delete_user_action(params: Dict[str, Any]) -> Dict[str, Any]:
    user_id = params.get("user_id")
    if not user_id:
        raise ValueError("user_id is required")
    user = db.session.get(User, int(user_id))
    if not user:
        return {"deleted": False}
    # FeedAccessToken.user_id is non-nullable; without cascading deletes SQLAlchemy
    # will attempt to NULL the FK when deleting a User, causing an IntegrityError.
    # Delete tokens explicitly as part of the writer action.
    tokens = (
        db.session.query(FeedAccessToken)
        .filter(FeedAccessToken.user_id == user.id)
        .all()
    )
    for token in tokens:
        db.session.delete(token)
    db.session.delete(user)
    return {"deleted": True}


# Change a user's role (admin/user only).
def set_user_role_action(params: Dict[str, Any]) -> Dict[str, Any]:
    user_id = params.get("user_id")
    role = params.get("role")
    if not user_id or not role:
        raise ValueError("user_id and role are required")
    if role not in {"admin", "user"}:
        raise ValueError("role must be 'admin' or 'user'")
    user = db.session.get(User, int(user_id))
    if not user:
        raise ValueError(f"User {user_id} not found")
    user.role = role
    db.session.flush()
    return {"user_id": user.id}


# Set or clear (None) a user's manual feed allowance; must be int-coercible.
def set_manual_feed_allowance_action(params: Dict[str, Any]) -> Dict[str, Any]:
    user_id = params.get("user_id")
    allowance = params.get("allowance")
    if not user_id:
        raise ValueError("user_id is required")
    user = db.session.get(User, int(user_id))
    if not user:
        raise ValueError(f"User {user_id} not found")
    if allowance is None:
        user.manual_feed_allowance = None
    else:
        try:
            user.manual_feed_allowance = int(allowance)
        except (ValueError, TypeError) as exc:
            raise ValueError("allowance must be an integer or None") from exc
    db.session.flush()
    return {"user_id": user.id}


# Find-or-create a local user for a Discord identity; refreshes the stored
# Discord username on match, and derives a unique local username on create.
def upsert_discord_user_action(params: Dict[str, Any]) -> Dict[str, Any]:
    discord_id = params.get("discord_id")
    discord_username = params.get("discord_username")
    allow_registration = bool(params.get("allow_registration", True))
    if not discord_id or not discord_username:
        raise ValueError("discord_id and discord_username are required")
    existing_user: User | None = User.query.filter_by(
        discord_id=str(discord_id)
    ).first()
    if existing_user:
        existing_user.discord_username = str(discord_username)
        db.session.flush()
        return {"user_id": existing_user.id, "created": False}
    if not allow_registration:
        raise ValueError("Self-registration via Discord is disabled")
    # Derive a unique username from the Discord handle, suffixing _N on clash.
    base_username = str(discord_username).lower().replace(" ", "_")[:50]
    username = base_username
    counter = 1
    while User.query.filter_by(username=username).first():
        username = f"{base_username}_{counter}"
        counter += 1
    new_user = User(
        username=username,
        password_hash="",
        role="user",
        discord_id=str(discord_id),
        discord_username=str(discord_username),
    )
    db.session.add(new_user)
    db.session.flush()
    return {"user_id": new_user.id, "created": True}


# Apply Stripe/billing fields present in params to a user by id.
def set_user_billing_fields_action(params: Dict[str, Any]) -> Dict[str, Any]:
    user_id = params.get("user_id")
    if not user_id:
        raise ValueError("user_id is required")
    user = db.session.get(User, int(user_id))
    if not user:
        raise ValueError(f"User {user_id} not found")
    if "stripe_customer_id" in params:
        user.stripe_customer_id = params.get("stripe_customer_id")
    if "stripe_subscription_id" in params:
        user.stripe_subscription_id = params.get("stripe_subscription_id")
    if "feed_allowance" in params:
        user.feed_allowance = int(params.get("feed_allowance") or 0)
    if "feed_subscription_status" in params:
        user.feed_subscription_status = params.get("feed_subscription_status") or ""
    db.session.flush()
    return {"user_id": user.id}


# Same as above but looked up by Stripe customer id; no-op when unknown.
def set_user_billing_by_customer_id_action(params: Dict[str, Any]) -> Dict[str, Any]:
    customer_id = params.get("stripe_customer_id")
    if not customer_id:
        raise ValueError("stripe_customer_id is required")
    user = User.query.filter_by(stripe_customer_id=customer_id).first()
    if not user:
        return {"updated": False}
    if "stripe_subscription_id" in params:
        user.stripe_subscription_id = params.get("stripe_subscription_id")
    if "feed_allowance" in params:
        user.feed_allowance = int(params.get("feed_allowance") or 0)
    if "feed_subscription_status" in params:
        user.feed_subscription_status = params.get("feed_subscription_status") or ""
    db.session.flush()
    return {"updated": True, "user_id": user.id}


# Stamp a user's last_active with the current UTC time;
# body continues on the next line of this extract.
def update_user_last_active_action(params: Dict[str, Any]) -> Dict[str, Any]:
# NOTE(review): re-wrapped token stream; continues the body of
# update_user_last_active_action started on the previous line of this extract.
    user_id = params.get("user_id")
    if not user_id:
        raise ValueError("user_id is required")
    user = db.session.get(User, int(user_id))
    if not user:
        raise ValueError(f"User {user_id} not found")
    user.last_active = datetime.utcnow()
    db.session.flush()
    return {"user_id": user.id, "last_active": user.last_active.isoformat()}


================================================ FILE: src/app/writer/client.py ================================================

import os
import uuid
from queue import Empty
from typing import Any, Callable, Dict, Optional, cast

from flask import current_app

from app.ipc import make_client_manager
from app.writer.model_ops import execute_model_command
from app.writer.protocol import WriteCommand, WriteCommandType, WriteResult


# Client side of the writer IPC: submits WriteCommands over a manager queue,
# with an in-process fallback used in tests or when the service is absent.
class WriterClient:
    def __init__(self) -> None:
        self.manager: Any = None
        self.queue: Any = None

    # Lazily connect to the writer service and fetch its command queue.
    def connect(self) -> None:
        if not self.manager:
            self.manager = make_client_manager()
            self.queue = self.manager.get_command_queue()  # pylint: disable=no-member

    # True when running under pytest, the explicit env flag, or a testing app.
    def _should_use_local_fallback(self) -> bool:
        if os.environ.get("PYTEST_CURRENT_TEST"):
            return True
        if os.environ.get("PODLY_WRITER_LOCAL_FALLBACK") == "1":
            return True
        try:
            return bool(getattr(current_app, "testing", False))
        except Exception:  # pylint: disable=broad-except
            return False

    # Execute a command in-process (mirrors the writer service's behavior):
    # discovers models, runs the command, commits on success / rolls back on
    # failure, and converts exceptions to failed WriteResults.
    def _local_execute(self, cmd: WriteCommand) -> WriteResult:
        # Import locally to avoid cyclic dependencies
        from app import models  # pylint: disable=import-outside-toplevel
        from app.extensions import db  # pylint: disable=import-outside-toplevel

        model_map: Dict[str, Any] = {}
        for name, obj in vars(models).items():
            if isinstance(obj, type) and issubclass(obj, db.Model) and obj != db.Model:
                model_map[name] = obj
        try:
            if cmd.type == WriteCommandType.TRANSACTION:
                return self._local_execute_transaction(cmd, model_map)
            result = self._local_execute_single(cmd, model_map)
            if result.success:
                db.session.commit()
            else:
                db.session.rollback()
            return result
        except Exception as exc:  # pylint: disable=broad-except
            db.session.rollback()
            return WriteResult(cmd.id, False, error=str(exc))

    # Dispatch one command: actions by name, otherwise model CRUD.
    def _local_execute_single(
        self, cmd: WriteCommand, model_map: Dict[str, Any]
    ) -> WriteResult:
        if cmd.type == WriteCommandType.ACTION:
            return self._local_execute_action(cmd)
        return self._local_execute_model(cmd, model_map)

    # Run a TRANSACTION's sub-commands in order; rollback and abort on the
    # first failure, commit once when all succeed.
    def _local_execute_transaction(
        self, cmd: WriteCommand, model_map: Dict[str, Any]
    ) -> WriteResult:
        # Import locally to avoid cyclic dependencies
        from app.extensions import db  # pylint: disable=import-outside-toplevel

        results = []
        for sub_cmd_data in cmd.data.get("commands", []):
            if isinstance(sub_cmd_data, dict):
                sub_cmd = WriteCommand(
                    id=sub_cmd_data.get("id", "sub"),
                    type=WriteCommandType(sub_cmd_data.get("type")),
                    model=sub_cmd_data.get("model"),
                    data=sub_cmd_data.get("data", {}),
                )
            else:
                sub_cmd = sub_cmd_data
            res = self._local_execute_single(sub_cmd, model_map)
            if not res.success:
                db.session.rollback()
                return WriteResult(
                    cmd.id,
                    False,
                    error=f"Transaction failed at {sub_cmd.id}: {res.error}",
                )
            results.append(res)
        db.session.commit()
        return WriteResult(cmd.id, True, data={"results": [r.data for r in results]})

    # Resolve "<name>_action" in app.writer.actions and invoke it.
    def _local_execute_action(self, cmd: WriteCommand) -> WriteResult:
        # Import locally to avoid cyclic dependencies
        # pylint: disable=import-outside-toplevel
        from app.writer import actions as writer_actions

        action_name = cmd.data.get("action")
        func_name = f"{action_name}_action" if action_name else None
        func_obj = getattr(writer_actions, func_name, None) if func_name else None
        if func_obj is None or not callable(func_obj):
            return WriteResult(cmd.id, False, error=f"Unknown action: {action_name}")
        func = cast(Callable[[Dict[str, Any]], Any], func_obj)
        result = func(cmd.data.get("params", {}))  # pylint: disable=not-callable
        return WriteResult(
            cmd.id,
            True,
            data=result if isinstance(result, dict) else {"result": result},
        )

    # CRUD against a mapped model class via the shared model-ops helper.
    def _local_execute_model(
        self, cmd: WriteCommand, model_map: Dict[str, Any]
    ) -> WriteResult:
        # Import locally to avoid cyclic dependencies
        from app.extensions import db  # pylint: disable=import-outside-toplevel

        if not cmd.model or cmd.model not in model_map:
            return WriteResult(cmd.id, False, error=f"Unknown model: {cmd.model}")
        model_cls = model_map[cmd.model]
        return execute_model_command(
            cmd=cmd, model_cls=model_cls, db_session=db.session
        )

    # Send a command to the writer service; optionally wait for its reply via
    # a temporary manager queue. Falls back to local execution when the
    # service is unreachable and the fallback conditions hold.
    def submit(
        self, cmd: WriteCommand, wait: bool = False, timeout: int = 10
    ) -> Optional[WriteResult]:
        if not self.queue:
            try:
                self.connect()
            except Exception:  # pylint: disable=broad-except
                if self._should_use_local_fallback():
                    result = self._local_execute(cmd)
                    return result if wait else None
                raise
        if wait:
            if not self.manager:
                raise RuntimeError("Manager not connected")
            # Create a temporary queue for the reply
            reply_q = self.manager.Queue()  # pylint: disable=no-member
            cmd.reply_queue = reply_q
        if self.queue:
            self.queue.put(cmd)
        if wait:
            try:
                return reply_q.get(timeout=timeout)  # type: ignore
            except Empty as exc:
                raise TimeoutError("Writer service did not respond") from exc
        return None

    # Convenience wrapper: CREATE on a model.
    def create(
        self, model: str, data: Dict[str, Any], wait: bool = True
    ) -> Optional[WriteResult]:
        cmd = WriteCommand(
            id=str(uuid.uuid4()), type=WriteCommandType.CREATE, model=model, data=data
        )
        return self.submit(cmd, wait=wait)

    # Convenience wrapper: UPDATE by primary key (pk is injected into data).
    def update(
        self, model: str, pk: Any, data: Dict[str, Any], wait: bool = True
    ) -> Optional[WriteResult]:
        data["id"] = pk
        cmd = WriteCommand(
            id=str(uuid.uuid4()), type=WriteCommandType.UPDATE, model=model, data=data
        )
        return self.submit(cmd, wait=wait)

    # Convenience wrapper: DELETE by primary key.
    def delete(self, model: str, pk: Any, wait: bool = True) -> Optional[WriteResult]:
        cmd = WriteCommand(
            id=str(uuid.uuid4()),
            type=WriteCommandType.DELETE,
            model=model,
            data={"id": pk},
        )
        return self.submit(cmd, wait=wait)

    # Convenience wrapper: named ACTION with params.
    def action(
        self, action_name: str, params: Dict[str, Any], wait: bool = True
    ) -> Optional[WriteResult]:
        cmd = WriteCommand(
            id=str(uuid.uuid4()),
            type=WriteCommandType.ACTION,
            model=None,
            data={"action": action_name, "params": params},
        )
        return self.submit(cmd, wait=wait)


# Singleton
# instance  (tail of the "# Singleton instance" comment split by this extract)
writer_client = WriterClient()


================================================ FILE: src/app/writer/executor.py ================================================

import logging
from typing import Any, Callable, Dict

from flask import Flask

from app import models
from app.extensions import db
from app.writer import actions as writer_actions
from app.writer.model_ops import execute_model_command
from app.writer.protocol import WriteCommand, WriteCommandType, WriteResult

logger = logging.getLogger("writer")


# Server side of the writer IPC: executes WriteCommands inside the Flask app
# context, with one commit/rollback per top-level command.
class CommandExecutor:
    def __init__(self, app: Flask):
        self.app = app
        self.models = self._discover_models()
        self.actions: Dict[str, Any] = {}  # Registry for custom actions
        self._register_default_actions()

    # Register every built-in writer action under its public name.
    def _register_default_actions(self) -> None:
        self.register_action(
            "ensure_active_run", writer_actions.ensure_active_run_action
        )
        self.register_action("dequeue_job", writer_actions.dequeue_job_action)
        self.register_action(
            "cleanup_stale_jobs", writer_actions.cleanup_stale_jobs_action
        )
        self.register_action("clear_all_jobs", writer_actions.clear_all_jobs_action)
        self.register_action(
            "cleanup_missing_audio_paths",
            writer_actions.cleanup_missing_audio_paths_action,
        )
        self.register_action("create_job", writer_actions.create_job_action)
        self.register_action(
            "cancel_existing_jobs", writer_actions.cancel_existing_jobs_action
        )
        self.register_action(
            "update_job_status", writer_actions.update_job_status_action
        )
        self.register_action("mark_cancelled", writer_actions.mark_cancelled_action)
        self.register_action(
            "reassign_pending_jobs", writer_actions.reassign_pending_jobs_action
        )
        self.register_action("refresh_feed", writer_actions.refresh_feed_action)
        self.register_action("add_feed", writer_actions.add_feed_action)
        self.register_action(
            "update_feed_settings", writer_actions.update_feed_settings_action
        )
        self.register_action(
            "clear_post_processing_data",
            writer_actions.clear_post_processing_data_action,
        )
        self.register_action(
            "cleanup_processed_post", writer_actions.cleanup_processed_post_action
        )
        self.register_action(
            "increment_download_count", writer_actions.increment_download_count_action
        )
        self.register_action(
            "set_user_billing_fields", writer_actions.set_user_billing_fields_action
        )
        self.register_action(
            "set_user_billing_by_customer_id",
            writer_actions.set_user_billing_by_customer_id_action,
        )
        self.register_action(
            "ensure_user_feed_membership",
            writer_actions.ensure_user_feed_membership_action,
        )
        self.register_action(
            "remove_user_feed_membership",
            writer_actions.remove_user_feed_membership_action,
        )
        self.register_action(
            "whitelist_latest_post_for_feed",
            writer_actions.whitelist_latest_post_for_feed_action,
        )
        self.register_action(
            "toggle_whitelist_all_for_feed",
            writer_actions.toggle_whitelist_all_for_feed_action,
        )
        self.register_action(
            "whitelist_post",
            writer_actions.whitelist_post_action,
        )
        self.register_action(
            "create_dev_test_feed", writer_actions.create_dev_test_feed_action
        )
        self.register_action(
            "delete_feed_cascade", writer_actions.delete_feed_cascade_action
        )
        self.register_action(
            "update_discord_settings", writer_actions.update_discord_settings_action
        )
        self.register_action(
            "update_combined_config", writer_actions.update_combined_config_action
        )
        self.register_action(
            "create_feed_access_token", writer_actions.create_feed_access_token_action
        )
        self.register_action(
            "touch_feed_access_token", writer_actions.touch_feed_access_token_action
        )
        self.register_action("create_user", writer_actions.create_user_action)
        self.register_action(
            "update_user_password", writer_actions.update_user_password_action
        )
        self.register_action("delete_user", writer_actions.delete_user_action)
        self.register_action("set_user_role", writer_actions.set_user_role_action)
        self.register_action(
            "set_manual_feed_allowance", writer_actions.set_manual_feed_allowance_action
        )
        self.register_action(
            "upsert_discord_user", writer_actions.upsert_discord_user_action
        )
        self.register_action(
            "upsert_model_call", writer_actions.upsert_model_call_action
        )
        self.register_action(
            "upsert_whisper_model_call", writer_actions.upsert_whisper_model_call_action
        )
        self.register_action(
            "replace_transcription", writer_actions.replace_transcription_action
        )
        self.register_action(
            "mark_model_call_failed", writer_actions.mark_model_call_failed_action
        )
        self.register_action(
            "insert_identifications", writer_actions.insert_identifications_action
        )
        self.register_action(
            "replace_identifications", writer_actions.replace_identifications_action
        )
        self.register_action(
            "update_user_last_active", writer_actions.update_user_last_active_action
        )

    def _discover_models(self) -> Dict[str, Any]:
        """Discover all SQLAlchemy models in app.models"""
        model_map = {}
        for name, obj in vars(models).items():
            if isinstance(obj, type) and issubclass(obj, db.Model) and obj != db.Model:
                model_map[name] = obj
        return model_map

    # Register (or override) a named action handler.
    def register_action(self, name: str, func: Callable[[Dict[str, Any]], Any]) -> None:
        self.actions[name] = func

    # Execute one command under the app context; commits on success, rolls
    # back on failure, and converts exceptions into failed WriteResults.
    def process_command(self, cmd: WriteCommand) -> WriteResult:
        with self.app.app_context():
            try:
                logger.info(
                    "[WRITER] Processing command: id=%s type=%s model=%s",
                    cmd.id,
                    cmd.type,
                    cmd.model,
                )
                if cmd.type == WriteCommandType.TRANSACTION:
                    result = self._handle_transaction(cmd)
                    if result.success:
                        logger.debug(
                            "[WRITER] Committing TRANSACTION command id=%s", cmd.id
                        )
                        db.session.commit()
                    else:
                        logger.debug(
                            "[WRITER] Rolling back TRANSACTION command id=%s", cmd.id
                        )
                        db.session.rollback()
                    return result
                # Single operation
                result = self._execute_single_command(cmd)
                if result.success:
                    # Suppress commit log for empty dequeue_job actions (polling)
                    is_polling_noop = (
                        cmd.type == WriteCommandType.ACTION
                        and cmd.data.get("action") == "dequeue_job"
                        and not result.data
                    )
                    if not is_polling_noop:
                        logger.info("[WRITER] Committing single command id=%s", cmd.id)
                    db.session.commit()
                else:
                    logger.info("[WRITER] Rolling back single command id=%s", cmd.id)
                    db.session.rollback()
                return result
            except Exception as e:
                logger.error(
                    "[WRITER] Error processing command id=%s: %s",
                    cmd.id,
                    e,
                    exc_info=True,
                )
                db.session.rollback()
                return WriteResult(cmd.id, False, error=str(e))

    # Dispatch a non-transaction command: ACTION by name, else model CRUD.
    def _execute_single_command(self, cmd: WriteCommand) -> WriteResult:
        if cmd.type == WriteCommandType.ACTION:
            return self._handle_action(cmd)
        if not cmd.model or cmd.model not in self.models:
            return WriteResult(cmd.id, False, error=f"Unknown model: {cmd.model}")
        model_cls = self.models[cmd.model]
        if cmd.type in (
            WriteCommandType.CREATE,
            WriteCommandType.UPDATE,
            WriteCommandType.DELETE,
        ):
            return execute_model_command(
                cmd=cmd, model_cls=model_cls, db_session=db.session
            )
        return WriteResult(cmd.id, False, error="Unknown command type")

    # Run a TRANSACTION's sub-commands in order; abort on first failure
    # (commit/rollback is left to process_command).
    def _handle_transaction(self, cmd: WriteCommand) -> WriteResult:
        sub_commands_data = cmd.data.get("commands", [])
        results = []
        try:
            for sub_cmd_data in sub_commands_data:
                if isinstance(sub_cmd_data, dict):
                    sub_cmd = WriteCommand(
                        id=sub_cmd_data.get("id", "sub"),
                        type=WriteCommandType(sub_cmd_data.get("type")),
                        model=sub_cmd_data.get("model"),
                        data=sub_cmd_data.get("data", {}),
                    )
                else:
                    sub_cmd = sub_cmd_data
                res = self._execute_single_command(sub_cmd)
                if not res.success:
                    # Let process_command handle rollback
                    return WriteResult(
                        cmd.id,
                        False,
                        error=f"Transaction failed at {sub_cmd.id}: {res.error}",
                    )
                results.append(res)
            # Let process_command handle commit
            return WriteResult(
                cmd.id,
                True,
                data={
                    "results": [
                        {
                            "command_id": r.command_id,
                            "success": r.success,
                            "data": r.data,
                            "error": r.error,
                        }
                        for r in results
                    ]
                },
            )
        except Exception as e:
            # Let process_command handle rollback
            return WriteResult(cmd.id, False, error=str(e))

    # Look up and invoke a registered action handler.
    def _handle_action(self, cmd: WriteCommand) -> WriteResult:
        action_name = cmd.data.get("action")
        if action_name not in self.actions:
            return WriteResult(cmd.id, False, error=f"Unknown action: {action_name}")
        func = self.actions[action_name]
        try:
            result = func(cmd.data.get("params", {}))
            # Commit is handled by process_command
            # (call continues on the next line of this extract)
            return WriteResult(cmd.id,
def execute_model_command(
    *,
    cmd: WriteCommand,
    model_cls: Any,
    db_session: Any,
) -> WriteResult:
    """Apply a CREATE/UPDATE/DELETE command to ``model_cls`` using ``db_session``.

    Never commits: CREATE only flushes (so autoincrement PKs are assigned)
    and the caller retains ownership of the transaction boundary.

    Args:
        cmd: The command; ``cmd.data`` carries column values, and for
            UPDATE/DELETE must contain the primary key under ``"id"``.
        model_cls: The SQLAlchemy model class to operate on.
        db_session: An active SQLAlchemy session.

    Returns:
        A WriteResult; for CREATE, ``data`` holds ``{"id": new_pk}`` when the
        model exposes an ``id`` attribute.
    """
    if cmd.type == WriteCommandType.CREATE:
        obj = model_cls(**cmd.data)
        db_session.add(obj)
        # Flush (not commit) so the DB assigns the primary key now.
        db_session.flush()
        data = {"id": obj.id} if hasattr(obj, "id") else None
        return WriteResult(cmd.id, True, data=data)

    if cmd.type == WriteCommandType.UPDATE:
        pk = cmd.data.get("id")
        # Bug fix: test for presence, not truthiness. A legitimate falsy
        # primary key (e.g. 0) was previously rejected as "missing".
        if pk is None:
            return WriteResult(cmd.id, False, error="Missing 'id' in data for UPDATE")
        obj = db_session.get(model_cls, pk)
        if not obj:
            return WriteResult(
                cmd.id, False, error=f"Record not found: {cmd.model} {pk}"
            )
        # Only set attributes the model actually has; unknown keys are ignored.
        for k, v in cmd.data.items():
            if k != "id" and hasattr(obj, k):
                setattr(obj, k, v)
        return WriteResult(cmd.id, True)

    if cmd.type == WriteCommandType.DELETE:
        pk = cmd.data.get("id")
        if pk is None:
            return WriteResult(cmd.id, False, error="Missing 'id' in data for DELETE")
        # DELETE is idempotent: deleting an already-absent record succeeds.
        obj = db_session.get(model_cls, pk)
        if obj:
            db_session.delete(obj)
        return WriteResult(cmd.id, True)

    return WriteResult(cmd.id, False, error="Unknown command type")
def run_writer_service() -> None:
    """Entry point for the single-writer process.

    Starts the IPC server, builds the writer Flask app and CommandExecutor,
    then loops forever pulling WriteCommands off the shared queue, executing
    them, and posting WriteResults back on each command's reply queue.
    dequeue_job polling commands are logged only when they return data, to
    keep the log free of idle-poll noise.
    """
    from app import create_writer_app

    logger.info("Starting Writer Service...")

    # 1. Start the IPC Server (daemon thread so it dies with the process).
    manager = make_server_manager()
    server = manager.get_server()
    server_thread = threading.Thread(target=server.serve_forever)
    server_thread.daemon = True
    server_thread.start()
    logger.info("IPC Server started on port 50001")

    # 2. Get the queue
    queue = get_queue()

    # 3. Initialize App and Executor
    app = create_writer_app()
    executor = CommandExecutor(app)

    logger.info("Writer Loop starting...")

    # 4. Writer Loop
    while True:
        try:
            # Blocking read of the next command from the IPC queue.
            cmd = queue.get()

            # Check if this is a polling command (dequeue_job); getattr is
            # used defensively since the object crossed a process boundary.
            is_polling = (
                getattr(cmd, "type", None) == WriteCommandType.ACTION
                and isinstance(getattr(cmd, "data", None), dict)
                and cmd.data.get("action") == "dequeue_job"
            )

            if not is_polling:
                logger.info(
                    "[WRITER] Received command: id=%s type=%s model=%s has_reply=%s",
                    getattr(cmd, "id", None),
                    getattr(cmd, "type", None),
                    getattr(cmd, "model", None),
                    bool(getattr(cmd, "reply_queue", None)),
                )

            result = executor.process_command(cmd)

            # Only log finished/reply if not polling or if polling actually did something
            if not is_polling or (result and result.data):
                logger.info(
                    "[WRITER] Finished command: id=%s success=%s error=%s",
                    getattr(result, "command_id", None),
                    getattr(result, "success", None),
                    getattr(result, "error", None),
                )

            # The reply is always sent when a reply queue exists; only the
            # log line is suppressed for empty polls.
            if cmd.reply_queue:
                if not is_polling or (result and result.data):
                    logger.info(
                        "[WRITER] Sending reply for command id=%s",
                        getattr(cmd, "id", None),
                    )
                cmd.reply_queue.put(result)
        except Exception as e:
            # Keep the writer alive on any error; brief sleep avoids a tight
            # failure loop if the queue itself is broken.
            logger.error("Error in writer loop: %s", e, exc_info=True)
            time.sleep(1)
**AD END INDICATORS** (extend boundary forward): - Sponsor conclusions: "Thanks to [sponsor]", "That's [website].com", "Use code [PROMO]" - Final CTAs: "Visit today", "Don't wait", "Get started now", "Learn more at..." - Transition back: "Now back to...", "Let's continue...", "So anyway...", "Where were we..." - Topic resumption: Clear return to previous discussion topic **CONTENT RESUMPTION SIGNALS** (stop ad boundary): - Natural conversation flow: Questions, responses, continued technical discussion - Topic changes: New subjects unrelated to sponsor - Interview continuation: "So tell me about...", "What do you think about..." - Technical deep-dives: Code examples, implementation details, architecture discussion **CONFIDENCE-BASED BOUNDARY RULES**: - **High Confidence (>0.9)**: Aggressive boundary extension, include subtle transitions - **Medium Confidence (0.7-0.9)**: Conservative extension, clear transition signals only - **Low Confidence (<0.7)**: Minimal changes, bias toward preserving content **ANALYSIS CONTEXT**: - **Detected Ad Block**: {{ad_start}}s - {{ad_end}}s - **Original Confidence**: {{ad_confidence}} **CONTEXT SEGMENTS**: {% for segment in context_segments -%} [{{segment.start_time}}] {{segment.text}} {% endfor %} **OUTPUT FORMAT**: Respond with valid JSON containing refined boundaries: ```json { "refined_start": {{ad_start}}, "refined_end": {{ad_end}}, "start_adjustment_reason": "reason for start boundary change", "end_adjustment_reason": "reason for end boundary change" } ``` **REFINEMENT GUIDELINES**: - If no refinement needed, return original timestamps with "No adjustment needed" reasons - Keep adjustments close to the detected timestamps - For confidence {{ad_confidence}}: {% if ad_confidence > 0.9 %}be aggressive with boundary extension{% elif ad_confidence > 0.7 %}be conservative, only extend for clear signals{% else %}minimal changes, preserve content{% endif %} - Always ensure refined_start < refined_end 
================================================ FILE: src/main.py ================================================ import os from waitress import serve from app import create_web_app def main() -> None: """Main entry point for the application.""" app = create_web_app() # Start the application server threads_env = os.environ.get("SERVER_THREADS") try: threads = int(threads_env) if threads_env is not None else 1 except ValueError: threads = 1 port = os.environ.get("PORT", 5001) serve( app, host="0.0.0.0", port=port, threads=threads, ) if __name__ == "__main__": main() ================================================ FILE: src/migrations/README ================================================ Single-database configuration for Flask. ================================================ FILE: src/migrations/alembic.ini ================================================ # A generic, single database configuration. [alembic] # template used to generate migration files # file_template = %%(rev)s_%%(slug)s script_location = %(here)s # set to 'true' to run the environment during # the 'revision' command, regardless of autogenerate # revision_environment = false # Logging configuration [loggers] keys = root,sqlalchemy,alembic,flask_migrate [handlers] keys = console [formatters] keys = generic [logger_root] level = DEBUG handlers = console qualname = [logger_sqlalchemy] level = WARN handlers = qualname = sqlalchemy.engine [logger_alembic] level = INFO handlers = qualname = alembic [logger_flask_migrate] level = INFO handlers = qualname = flask_migrate [handler_console] class = StreamHandler args = (sys.stderr,) level = NOTSET formatter = generic [formatter_generic] format = %(levelname)-5.5s [%(name)s] %(message)s datefmt = %H:%M:%S ================================================ FILE: src/migrations/env.py ================================================ import logging from logging.config import fileConfig from alembic import context from flask import current_app # this is the 
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name, disable_existing_loggers=False)
logger = logging.getLogger("alembic.env")


def get_engine():
    """Resolve the SQLAlchemy engine from the Flask-Migrate extension.

    Handles both old and new Flask-SQLAlchemy APIs.
    """
    try:
        # this works with Flask-SQLAlchemy<3 and Alchemical
        return current_app.extensions["migrate"].db.get_engine()
    except (TypeError, AttributeError):
        # this works with Flask-SQLAlchemy>=3
        return current_app.extensions["migrate"].db.engine


def get_engine_url():
    """Render the database URL for Alembic.

    '%' is doubled because configparser treats it as an interpolation marker.
    """
    try:
        return get_engine().url.render_as_string(hide_password=False).replace("%", "%%")
    except AttributeError:
        # Older SQLAlchemy URLs without render_as_string.
        return str(get_engine().url).replace("%", "%%")


# add your model's MetaData object here
# for 'autogenerate' support
# from myapp import mymodel
# target_metadata = mymodel.Base.metadata
config.set_main_option("sqlalchemy.url", get_engine_url())
target_db = current_app.extensions["migrate"].db

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.


def get_metadata():
    """Return the default MetaData, handling multi-metadata Flask-SQLAlchemy 3."""
    if hasattr(target_db, "metadatas"):
        return target_db.metadatas[None]
    return target_db.metadata


def run_migrations_offline():
    """Run migrations in 'offline' mode.

    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well.  By skipping the Engine creation
    we don't even need a DBAPI to be available.

    Calls to context.execute() here emit the given string to the
    script output.
    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(url=url, target_metadata=get_metadata(), literal_binds=True)

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online():
    """Run migrations in 'online' mode.

    In this scenario we need to create an Engine
    and associate a connection with the context.
    """

    # this callback is used to prevent an auto-migration from being generated
    # when there are no changes to the schema
    # reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html
    def process_revision_directives(context, revision, directives):
        if getattr(config.cmd_opts, "autogenerate", False):
            script = directives[0]
            if script.upgrade_ops.is_empty():
                directives[:] = []
                logger.info("No changes in schema detected.")

    conf_args = current_app.extensions["migrate"].configure_args
    if conf_args.get("process_revision_directives") is None:
        conf_args["process_revision_directives"] = process_revision_directives

    connectable = get_engine()

    with connectable.connect() as connection:
        context.configure(
            connection=connection, target_metadata=get_metadata(), **conf_args
        )

        with context.begin_transaction():
            context.run_migrations()


# Module-level dispatch: Alembic imports this file and runs it directly.
if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
revision = "0d954a44fa8e"
down_revision = "91ff431c832e"
branch_labels = None
depends_on = None


def upgrade():
    """Create the feed_access_token table (hashed per-user feed tokens)."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "feed_access_token",
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        # token_id is the public lookup key; token_hash stores the secret.
        sa.Column("token_id", sa.String(length=32), nullable=False),
        sa.Column("token_hash", sa.String(length=64), nullable=False),
        sa.Column("feed_id", sa.Integer(), nullable=False),
        sa.Column("user_id", sa.Integer(), nullable=False),
        sa.Column("created_at", sa.DateTime(), nullable=False),
        sa.Column("last_used_at", sa.DateTime(), nullable=True),
        sa.Column("revoked", sa.Boolean(), nullable=False),
        sa.ForeignKeyConstraint(
            ["feed_id"],
            ["feed.id"],
        ),
        sa.ForeignKeyConstraint(
            ["user_id"],
            ["users.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    # batch_alter_table keeps this SQLite-compatible.
    with op.batch_alter_table("feed_access_token", schema=None) as batch_op:
        batch_op.create_index(
            batch_op.f("ix_feed_access_token_token_id"), ["token_id"], unique=True
        )
    # ### end Alembic commands ###


def downgrade():
    """Drop the feed_access_token table and its index."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("feed_access_token", schema=None) as batch_op:
        batch_op.drop_index(batch_op.f("ix_feed_access_token_token_id"))

    op.drop_table("feed_access_token")
    # ### end Alembic commands ###
### with op.batch_alter_table("app_settings", schema=None) as batch_op: batch_op.add_column( sa.Column("env_config_hash", sa.String(length=64), nullable=True) ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("app_settings", schema=None) as batch_op: batch_op.drop_column("env_config_hash") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/185d3448990e_stripe.py ================================================ """stripe Revision ID: 185d3448990e Revises: 35b12b2d9feb Create Date: 2025-12-10 21:51:55.888021 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "185d3448990e" down_revision = "35b12b2d9feb" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### bind = op.get_bind() inspector = sa.inspect(bind) if inspector.has_table("credit_transaction"): indexes = [i["name"] for i in inspector.get_indexes("credit_transaction")] with op.batch_alter_table("credit_transaction", schema=None) as batch_op: if "ix_credit_transaction_feed_id" in indexes: batch_op.drop_index(batch_op.f("ix_credit_transaction_feed_id")) if "ix_credit_transaction_post_id" in indexes: batch_op.drop_index(batch_op.f("ix_credit_transaction_post_id")) if "ix_credit_transaction_user_created" in indexes: batch_op.drop_index(batch_op.f("ix_credit_transaction_user_created")) if "ix_credit_transaction_user_id" in indexes: batch_op.drop_index(batch_op.f("ix_credit_transaction_user_id")) op.drop_table("credit_transaction") if inspector.has_table("app_settings"): columns = [c["name"] for c in inspector.get_columns("app_settings")] with op.batch_alter_table("app_settings", schema=None) as batch_op: if "minutes_per_credit" in columns: batch_op.drop_column("minutes_per_credit") if inspector.has_table("users"): columns = [c["name"] for c in 
def upgrade():
    """Stripe migration: replace credit-balance billing with subscriptions.

    Drops credit_transaction / minutes_per_credit / credits_balance and adds
    Stripe subscription columns to users. Inspector guards make every step
    idempotent against partially-migrated databases.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    bind = op.get_bind()
    inspector = sa.inspect(bind)
    if inspector.has_table("credit_transaction"):
        indexes = [i["name"] for i in inspector.get_indexes("credit_transaction")]
        with op.batch_alter_table("credit_transaction", schema=None) as batch_op:
            if "ix_credit_transaction_feed_id" in indexes:
                batch_op.drop_index(batch_op.f("ix_credit_transaction_feed_id"))
            if "ix_credit_transaction_post_id" in indexes:
                batch_op.drop_index(batch_op.f("ix_credit_transaction_post_id"))
            if "ix_credit_transaction_user_created" in indexes:
                batch_op.drop_index(batch_op.f("ix_credit_transaction_user_created"))
            if "ix_credit_transaction_user_id" in indexes:
                batch_op.drop_index(batch_op.f("ix_credit_transaction_user_id"))
        op.drop_table("credit_transaction")
    if inspector.has_table("app_settings"):
        columns = [c["name"] for c in inspector.get_columns("app_settings")]
        with op.batch_alter_table("app_settings", schema=None) as batch_op:
            if "minutes_per_credit" in columns:
                batch_op.drop_column("minutes_per_credit")
    if inspector.has_table("users"):
        columns = [c["name"] for c in inspector.get_columns("users")]
        with op.batch_alter_table("users", schema=None) as batch_op:
            if "feed_allowance" not in columns:
                batch_op.add_column(
                    sa.Column("feed_allowance", sa.Integer(), nullable=False)
                )
            if "feed_subscription_status" not in columns:
                batch_op.add_column(
                    sa.Column(
                        "feed_subscription_status", sa.String(length=32), nullable=False
                    )
                )
            if "stripe_customer_id" not in columns:
                batch_op.add_column(
                    sa.Column("stripe_customer_id", sa.String(length=64), nullable=True)
                )
            if "stripe_subscription_id" not in columns:
                batch_op.add_column(
                    sa.Column(
                        "stripe_subscription_id", sa.String(length=64), nullable=True
                    )
                )
            if "credits_balance" in columns:
                batch_op.drop_column("credits_balance")
    # ### end Alembic commands ###


def downgrade():
    """Reverse the Stripe migration: restore credit-based billing schema."""
    # ### commands auto generated by Alembic - please adjust! ###
    bind = op.get_bind()
    inspector = sa.inspect(bind)
    if inspector.has_table("users"):
        columns = [c["name"] for c in inspector.get_columns("users")]
        with op.batch_alter_table("users", schema=None) as batch_op:
            if "credits_balance" not in columns:
                batch_op.add_column(
                    sa.Column(
                        "credits_balance",
                        sa.NUMERIC(precision=12, scale=2),
                        nullable=False,
                    )
                )
            if "stripe_subscription_id" in columns:
                batch_op.drop_column("stripe_subscription_id")
            if "stripe_customer_id" in columns:
                batch_op.drop_column("stripe_customer_id")
            if "feed_subscription_status" in columns:
                batch_op.drop_column("feed_subscription_status")
            if "feed_allowance" in columns:
                batch_op.drop_column("feed_allowance")
    if inspector.has_table("app_settings"):
        columns = [c["name"] for c in inspector.get_columns("app_settings")]
        with op.batch_alter_table("app_settings", schema=None) as batch_op:
            if "minutes_per_credit" not in columns:
                batch_op.add_column(
                    sa.Column(
                        "minutes_per_credit",
                        sa.INTEGER(),
                        server_default=sa.text("(60)"),
                        nullable=False,
                    )
                )
    if not inspector.has_table("credit_transaction"):
        op.create_table(
            "credit_transaction",
            sa.Column("id", sa.INTEGER(), nullable=False),
            sa.Column("user_id", sa.INTEGER(), nullable=False),
            sa.Column("feed_id", sa.INTEGER(), nullable=True),
            sa.Column("post_id", sa.INTEGER(), nullable=True),
            sa.Column("idempotency_key", sa.VARCHAR(length=128), nullable=True),
            sa.Column(
                "amount_signed", sa.NUMERIC(precision=12, scale=2), nullable=False
            ),
            sa.Column("type", sa.VARCHAR(length=32), nullable=False),
            sa.Column("note", sa.TEXT(), nullable=True),
            sa.Column("created_at", sa.DATETIME(), nullable=False),
            sa.ForeignKeyConstraint(
                ["feed_id"],
                ["feed.id"],
            ),
            sa.ForeignKeyConstraint(
                ["post_id"],
                ["post.id"],
            ),
            sa.ForeignKeyConstraint(
                ["user_id"],
                ["users.id"],
            ),
            sa.PrimaryKeyConstraint("id"),
            sa.UniqueConstraint("idempotency_key"),
        )
        with op.batch_alter_table("credit_transaction", schema=None) as batch_op:
            batch_op.create_index(
                batch_op.f("ix_credit_transaction_user_id"), ["user_id"], unique=False
            )
            batch_op.create_index(
                batch_op.f("ix_credit_transaction_user_created"),
                ["user_id", "created_at"],
                unique=False,
            )
            batch_op.create_index(
                batch_op.f("ix_credit_transaction_post_id"), ["post_id"], unique=False
            )
            batch_op.create_index(
                batch_op.f("ix_credit_transaction_feed_id"), ["feed_id"], unique=False
            )
    # ### end Alembic commands ###
### with op.batch_alter_table("app_settings", schema=None) as batch_op: batch_op.drop_column("post_cleanup_retention_days") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/2e25a15d11de_per_feed_auto_whitelist.py ================================================ """per feed auto whitelist Revision ID: 2e25a15d11de Revises: 82cfcc8e0326 Create Date: 2026-01-12 12:47:42.611999 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "2e25a15d11de" down_revision = "82cfcc8e0326" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("feed", schema=None) as batch_op: batch_op.add_column( sa.Column( "auto_whitelist_new_episodes_override", sa.Boolean(), nullable=True ) ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("feed", schema=None) as batch_op: batch_op.drop_column("auto_whitelist_new_episodes_override") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/31d767deb401_credits.py ================================================ """credits Revision ID: 31d767deb401 Revises: 608e0b27fcda Create Date: 2025-11-29 11:42:27.900494 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "31d767deb401" down_revision = "608e0b27fcda" branch_labels = None depends_on = None def upgrade(): bind = op.get_bind() inspector = sa.inspect(bind) existing_tables = set(inspector.get_table_names()) # ### commands auto generated by Alembic - please adjust! 
def upgrade():
    """Credits migration: credit_transaction table, per-user balance,
    per-feed sponsorship, and the minutes_per_credit app setting.

    Every step is guarded by inspector checks so the migration is safe to
    re-run against a partially-migrated database.
    """
    bind = op.get_bind()
    inspector = sa.inspect(bind)
    existing_tables = set(inspector.get_table_names())

    # ### commands auto generated by Alembic - please adjust! ###
    if "credit_transaction" not in existing_tables:
        op.create_table(
            "credit_transaction",
            sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
            sa.Column("user_id", sa.Integer(), nullable=False),
            sa.Column("feed_id", sa.Integer(), nullable=True),
            sa.Column("post_id", sa.Integer(), nullable=True),
            # idempotency_key is unique to guard against double-charging.
            sa.Column("idempotency_key", sa.String(length=128), nullable=True),
            sa.Column(
                "amount_signed", sa.Numeric(precision=12, scale=1), nullable=False
            ),
            sa.Column("type", sa.String(length=32), nullable=False),
            sa.Column("note", sa.Text(), nullable=True),
            sa.Column("created_at", sa.DateTime(), nullable=False),
            sa.ForeignKeyConstraint(
                ["feed_id"],
                ["feed.id"],
            ),
            sa.ForeignKeyConstraint(
                ["post_id"],
                ["post.id"],
            ),
            sa.ForeignKeyConstraint(
                ["user_id"],
                ["users.id"],
            ),
            sa.PrimaryKeyConstraint("id"),
            sa.UniqueConstraint("idempotency_key"),
        )
        with op.batch_alter_table("credit_transaction", schema=None) as batch_op:
            batch_op.create_index(
                batch_op.f("ix_credit_transaction_feed_id"), ["feed_id"], unique=False
            )
            batch_op.create_index(
                batch_op.f("ix_credit_transaction_post_id"), ["post_id"], unique=False
            )
            batch_op.create_index(
                "ix_credit_transaction_user_created",
                ["user_id", "created_at"],
                unique=False,
            )
            batch_op.create_index(
                batch_op.f("ix_credit_transaction_user_id"), ["user_id"], unique=False
            )

    if "app_settings" in existing_tables:
        app_columns = {col["name"] for col in inspector.get_columns("app_settings")}
        if "minutes_per_credit" not in app_columns:
            with op.batch_alter_table("app_settings", schema=None) as batch_op:
                batch_op.add_column(
                    sa.Column(
                        "minutes_per_credit",
                        sa.Integer(),
                        nullable=False,
                        server_default=sa.text("60"),
                    )
                )

    if "feed" in existing_tables:
        feed_columns = {col["name"] for col in inspector.get_columns("feed")}
        if "sponsor_user_id" not in feed_columns:
            with op.batch_alter_table("feed", schema=None) as batch_op:
                batch_op.add_column(
                    sa.Column("sponsor_user_id", sa.Integer(), nullable=True)
                )
                batch_op.add_column(sa.Column("sponsor_note", sa.Text(), nullable=True))
                batch_op.create_index(
                    batch_op.f("ix_feed_sponsor_user_id"),
                    ["sponsor_user_id"],
                    unique=False,
                )
                batch_op.create_foreign_key(
                    "fk_feed_sponsor_user_id",
                    "users",
                    ["sponsor_user_id"],
                    ["id"],
                )

    if "users" in existing_tables:
        user_columns = {col["name"] for col in inspector.get_columns("users")}
        if "credits_balance" not in user_columns:
            with op.batch_alter_table("users", schema=None) as batch_op:
                batch_op.add_column(
                    sa.Column(
                        "credits_balance",
                        sa.Numeric(precision=12, scale=1),
                        nullable=False,
                        # New users start with 1 credit.
                        server_default=sa.text("1"),
                    )
                )
    # ### end Alembic commands ###


def downgrade():
    """Reverse the credits migration, with the same idempotency guards."""
    bind = op.get_bind()
    inspector = sa.inspect(bind)
    existing_tables = set(inspector.get_table_names())

    # ### commands auto generated by Alembic - please adjust! ###
    if "users" in existing_tables:
        user_columns = {col["name"] for col in inspector.get_columns("users")}
        if "credits_balance" in user_columns:
            with op.batch_alter_table("users", schema=None) as batch_op:
                batch_op.drop_column("credits_balance")

    if "feed" in existing_tables:
        feed_columns = {col["name"] for col in inspector.get_columns("feed")}
        if "sponsor_user_id" in feed_columns or "sponsor_note" in feed_columns:
            with op.batch_alter_table("feed", schema=None) as batch_op:
                # FK and index must go before the columns they reference.
                if "fk_feed_sponsor_user_id" in {
                    fk["name"] for fk in inspector.get_foreign_keys("feed")
                }:
                    batch_op.drop_constraint(
                        "fk_feed_sponsor_user_id", type_="foreignkey"
                    )
                if "ix_feed_sponsor_user_id" in {
                    idx["name"] for idx in inspector.get_indexes("feed")
                }:
                    batch_op.drop_index(batch_op.f("ix_feed_sponsor_user_id"))
                if "sponsor_note" in feed_columns:
                    batch_op.drop_column("sponsor_note")
                if "sponsor_user_id" in feed_columns:
                    batch_op.drop_column("sponsor_user_id")

    if "app_settings" in existing_tables:
        app_columns = {col["name"] for col in inspector.get_columns("app_settings")}
        if "minutes_per_credit" in app_columns:
            with op.batch_alter_table("app_settings", schema=None) as batch_op:
                batch_op.drop_column("minutes_per_credit")

    if "credit_transaction" in existing_tables:
        with op.batch_alter_table("credit_transaction", schema=None) as batch_op:
            existing_indexes = {
                idx["name"] for idx in inspector.get_indexes("credit_transaction")
            }
            if batch_op.f("ix_credit_transaction_user_id") in existing_indexes:
                batch_op.drop_index(batch_op.f("ix_credit_transaction_user_id"))
            if "ix_credit_transaction_user_created" in existing_indexes:
                batch_op.drop_index("ix_credit_transaction_user_created")
            if batch_op.f("ix_credit_transaction_post_id") in existing_indexes:
                batch_op.drop_index(batch_op.f("ix_credit_transaction_post_id"))
            if batch_op.f("ix_credit_transaction_feed_id") in existing_indexes:
                batch_op.drop_index(batch_op.f("ix_credit_transaction_feed_id"))
        op.drop_table("credit_transaction")
    # ### end Alembic commands ###
### with op.batch_alter_table("app_settings", schema=None) as batch_op: batch_op.drop_column("enable_public_landing_page") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/3c7f5f7640e4_add_counters_reset_timestamp.py ================================================ """add counters reset timestamp to jobs_manager_run Revision ID: 3c7f5f7640e4 Revises: c0f8893ce927 Create Date: 2026-12-01 00:00:00.000000 """ from __future__ import annotations import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "3c7f5f7640e4" down_revision = "c0f8893ce927" branch_labels = None depends_on = None def upgrade() -> None: bind = op.get_bind() inspector = sa.inspect(bind) existing_tables = set(inspector.get_table_names()) if "jobs_manager_run" not in existing_tables: return columns = {col["name"] for col in inspector.get_columns("jobs_manager_run")} if "counters_reset_at" not in columns: with op.batch_alter_table("jobs_manager_run", schema=None) as batch_op: batch_op.add_column( sa.Column("counters_reset_at", sa.DateTime(), nullable=True) ) op.execute( sa.text( "UPDATE jobs_manager_run " "SET counters_reset_at = COALESCE(started_at, created_at, CURRENT_TIMESTAMP) " "WHERE counters_reset_at IS NULL" ) ) def downgrade() -> None: bind = op.get_bind() inspector = sa.inspect(bind) existing_tables = set(inspector.get_table_names()) if "jobs_manager_run" not in existing_tables: return columns = {col["name"] for col in inspector.get_columns("jobs_manager_run")} if "counters_reset_at" in columns: with op.batch_alter_table("jobs_manager_run", schema=None) as batch_op: batch_op.drop_column("counters_reset_at") ================================================ FILE: src/migrations/versions/3d232f215842_migration.py ================================================ """migration Revision ID: 3d232f215842 Revises: f7a4195e0953 Create Date: 2026-01-11 18:35:34.763013 """ import sqlalchemy as sa from 
alembic import op # revision identifiers, used by Alembic. revision = "3d232f215842" down_revision = "f7a4195e0953" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("llm_settings", schema=None) as batch_op: batch_op.add_column( sa.Column( "enable_word_level_boundary_refinder", sa.Boolean(), nullable=False, server_default=sa.text("0"), ) ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("llm_settings", schema=None) as batch_op: batch_op.drop_column("enable_word_level_boundary_refinder") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/3eb0a3a0870b_discord.py ================================================ """discord Revision ID: 3eb0a3a0870b Revises: 31d767deb401 Create Date: 2025-11-29 12:41:40.446049 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "3eb0a3a0870b" down_revision = "31d767deb401" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("users", schema=None) as batch_op: batch_op.add_column( sa.Column("discord_id", sa.String(length=32), nullable=True) ) batch_op.add_column( sa.Column("discord_username", sa.String(length=100), nullable=True) ) batch_op.create_index( batch_op.f("ix_users_discord_id"), ["discord_id"], unique=True ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("users", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_users_discord_id")) batch_op.drop_column("discord_username") batch_op.drop_column("discord_id") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/401071604e7b_config_tables.py ================================================ """Create settings tables and seed defaults Revision ID: 401071604e7b Revises: 611dcb5d7f12 Create Date: 2025-09-28 00:00:00.000000 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "401071604e7b" down_revision = "611dcb5d7f12" branch_labels = None depends_on = None def upgrade(): bind = op.get_bind() inspector = sa.inspect(bind) existing_tables = set(inspector.get_table_names()) if "llm_settings" not in existing_tables: op.create_table( "llm_settings", sa.Column("id", sa.Integer(), nullable=False), sa.Column("llm_api_key", sa.Text(), nullable=True), sa.Column( "llm_model", sa.Text(), nullable=False, server_default="groq/openai/gpt-oss-120b", ), sa.Column("openai_base_url", sa.Text(), nullable=True), sa.Column( "openai_timeout", sa.Integer(), nullable=False, server_default="300" ), sa.Column( "openai_max_tokens", sa.Integer(), nullable=False, server_default="4096" ), sa.Column( "llm_max_concurrent_calls", sa.Integer(), nullable=False, server_default="3", ), sa.Column( "llm_max_retry_attempts", sa.Integer(), nullable=False, server_default="5", ), sa.Column("llm_max_input_tokens_per_call", sa.Integer(), nullable=True), sa.Column( "llm_enable_token_rate_limiting", sa.Boolean(), nullable=False, server_default=sa.text("0"), ), sa.Column("llm_max_input_tokens_per_minute", sa.Integer(), nullable=True), sa.Column( "created_at", sa.DateTime(), nullable=False, server_default=sa.func.current_timestamp(), ), sa.Column( "updated_at", sa.DateTime(), nullable=False, server_default=sa.func.current_timestamp(), ), sa.PrimaryKeyConstraint("id"), ) if 
"whisper_settings" not in existing_tables: op.create_table( "whisper_settings", sa.Column("id", sa.Integer(), nullable=False), sa.Column("whisper_type", sa.Text(), nullable=False, server_default="groq"), sa.Column( "local_model", sa.Text(), nullable=False, server_default="base.en" ), sa.Column( "remote_model", sa.Text(), nullable=False, server_default="whisper-1" ), sa.Column("remote_api_key", sa.Text(), nullable=True), sa.Column( "remote_base_url", sa.Text(), nullable=False, server_default="https://api.openai.com/v1", ), sa.Column( "remote_language", sa.Text(), nullable=False, server_default="en" ), sa.Column( "remote_timeout_sec", sa.Integer(), nullable=False, server_default="600" ), sa.Column( "remote_chunksize_mb", sa.Integer(), nullable=False, server_default="24" ), sa.Column("groq_api_key", sa.Text(), nullable=True), sa.Column( "groq_model", sa.Text(), nullable=False, server_default="whisper-large-v3-turbo", ), sa.Column("groq_language", sa.Text(), nullable=False, server_default="en"), sa.Column( "groq_max_retries", sa.Integer(), nullable=False, server_default="3" ), sa.Column( "groq_initial_backoff", sa.Float(), nullable=False, server_default="1.0" ), sa.Column( "groq_backoff_factor", sa.Float(), nullable=False, server_default="2.0" ), sa.Column( "created_at", sa.DateTime(), nullable=False, server_default=sa.func.current_timestamp(), ), sa.Column( "updated_at", sa.DateTime(), nullable=False, server_default=sa.func.current_timestamp(), ), sa.PrimaryKeyConstraint("id"), ) if "processing_settings" not in existing_tables: op.create_table( "processing_settings", sa.Column("id", sa.Integer(), nullable=False), sa.Column( "system_prompt_path", sa.Text(), nullable=False, server_default="src/system_prompt.txt", ), sa.Column( "user_prompt_template_path", sa.Text(), nullable=False, server_default="src/user_prompt.jinja", ), sa.Column( "num_segments_to_input_to_prompt", sa.Integer(), nullable=False, server_default="60", ), sa.Column( "created_at", sa.DateTime(), 
nullable=False, server_default=sa.func.current_timestamp(), ), sa.Column( "updated_at", sa.DateTime(), nullable=False, server_default=sa.func.current_timestamp(), ), sa.PrimaryKeyConstraint("id"), ) if "output_settings" not in existing_tables: op.create_table( "output_settings", sa.Column("id", sa.Integer(), nullable=False), sa.Column("fade_ms", sa.Integer(), nullable=False, server_default="3000"), sa.Column( "min_ad_segement_separation_seconds", sa.Integer(), nullable=False, server_default="60", ), sa.Column( "min_ad_segment_length_seconds", sa.Integer(), nullable=False, server_default="14", ), sa.Column( "min_confidence", sa.Float(), nullable=False, server_default="0.8" ), sa.Column( "created_at", sa.DateTime(), nullable=False, server_default=sa.func.current_timestamp(), ), sa.Column( "updated_at", sa.DateTime(), nullable=False, server_default=sa.func.current_timestamp(), ), sa.PrimaryKeyConstraint("id"), ) if "app_settings" not in existing_tables: op.create_table( "app_settings", sa.Column("id", sa.Integer(), nullable=False), sa.Column("background_update_interval_minute", sa.Integer(), nullable=True), sa.Column( "automatically_whitelist_new_episodes", sa.Boolean(), nullable=False, server_default=sa.text("1"), ), sa.Column( "number_of_episodes_to_whitelist_from_archive_of_new_feed", sa.Integer(), nullable=False, server_default="1", ), sa.Column( "created_at", sa.DateTime(), nullable=False, server_default=sa.func.current_timestamp(), ), sa.Column( "updated_at", sa.DateTime(), nullable=False, server_default=sa.func.current_timestamp(), ), sa.PrimaryKeyConstraint("id"), ) # Seed singleton rows (id=1) - SQLite requires one statement per execute op.execute( sa.text("INSERT INTO llm_settings (id) VALUES (1) ON CONFLICT(id) DO NOTHING") ) op.execute( sa.text( "INSERT INTO whisper_settings (id) VALUES (1) ON CONFLICT(id) DO NOTHING" ) ) op.execute( sa.text( "INSERT INTO processing_settings (id) VALUES (1) ON CONFLICT(id) DO NOTHING" ) ) op.execute( sa.text( "INSERT INTO 
output_settings (id) VALUES (1) ON CONFLICT(id) DO NOTHING" ) ) op.execute( sa.text("INSERT INTO app_settings (id) VALUES (1) ON CONFLICT(id) DO NOTHING") ) def downgrade(): op.drop_table("app_settings") op.drop_table("output_settings") op.drop_table("processing_settings") op.drop_table("whisper_settings") op.drop_table("llm_settings") ================================================ FILE: src/migrations/versions/58b4eedd4c61_add_last_active_to_user.py ================================================ """add_last_active_to_user Revision ID: 58b4eedd4c61 Revises: 73a6b9f9b643 Create Date: 2025-12-20 14:01:36.022682 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "58b4eedd4c61" down_revision = "73a6b9f9b643" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("users", schema=None) as batch_op: batch_op.add_column(sa.Column("last_active", sa.DateTime(), nullable=True)) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("users", schema=None) as batch_op: batch_op.drop_column("last_active") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/5bccc39c9685_zero_initial_allowance.py ================================================ """zero initial allowance Revision ID: 5bccc39c9685 Revises: ab643af6472e Create Date: 2025-12-12 14:21:35.530141 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "5bccc39c9685" down_revision = "ab643af6472e" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### pass # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! 
### pass # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/608e0b27fcda_stronger_access_token.py ================================================ """stronger_access_token Revision ID: 608e0b27fcda Revises: f6d5fee57cc3 Create Date: 2025-11-05 21:27:10.923394 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "608e0b27fcda" down_revision = "f6d5fee57cc3" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("feed_access_token", schema=None) as batch_op: batch_op.add_column( sa.Column("token_secret", sa.String(length=128), nullable=True) ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("feed_access_token", schema=None) as batch_op: batch_op.drop_column("token_secret") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/611dcb5d7f12_add_image_url_to_post_model_for_episode_.py ================================================ """Add image_url to Post model for episode thumbnails Revision ID: 611dcb5d7f12 Revises: b038c2f99086 Create Date: 2025-05-25 13:39:49.168287 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "611dcb5d7f12" down_revision = "b038c2f99086" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("post", schema=None) as batch_op: batch_op.add_column(sa.Column("image_url", sa.Text(), nullable=True)) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("post", schema=None) as batch_op: batch_op.drop_column("image_url") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/6e0e16299dcb_alternate_feed_id.py ================================================ """alternate feed ID Revision ID: 6e0e16299dcb Revises: 770771437280 Create Date: 2024-11-23 11:04:37.861614 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "6e0e16299dcb" down_revision = "770771437280" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("feed", schema=None) as batch_op: batch_op.add_column(sa.Column("alt_id", sa.Text(), nullable=True)) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("feed", schema=None) as batch_op: batch_op.drop_column("alt_id") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/73a6b9f9b643_allow_null_feed_id_for_aggregate_tokens.py ================================================ """allow_null_feed_id_for_aggregate_tokens Revision ID: 73a6b9f9b643 Revises: 89d86978f407 Create Date: 2025-12-14 13:28:57.243239 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "73a6b9f9b643" down_revision = "89d86978f407" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("feed_access_token", schema=None) as batch_op: batch_op.alter_column("feed_id", existing_type=sa.INTEGER(), nullable=True) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("feed_access_token", schema=None) as batch_op: batch_op.alter_column("feed_id", existing_type=sa.INTEGER(), nullable=False) # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/770771437280_episode_whitelist.py ================================================ """episode whitelist Revision ID: 770771437280 Revises: fa3a95ecd67d Create Date: 2024-11-16 08:27:46.081562 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "770771437280" down_revision = "fa3a95ecd67d" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("post", schema=None) as batch_op: batch_op.add_column( sa.Column( "whitelisted", sa.Boolean(), nullable=False, server_default=sa.false() ) ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("post", schema=None) as batch_op: batch_op.drop_column("whitelisted") op.create_table( "ad_identification", sa.Column("id", sa.INTEGER(), nullable=False), sa.Column("post_id", sa.INTEGER(), nullable=False), sa.Column("content", sa.TEXT(), nullable=False), sa.Column("timestamp", sa.DATETIME(), nullable=True), sa.ForeignKeyConstraint( ["post_id"], ["post.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint("post_id"), ) op.create_table( "identification", sa.Column("id", sa.INTEGER(), nullable=False), sa.Column("post_id", sa.INTEGER(), nullable=False), sa.Column("content", sa.TEXT(), nullable=False), sa.Column("timestamp", sa.DATETIME(), nullable=True), sa.ForeignKeyConstraint( ["post_id"], ["post.id"], ), sa.PrimaryKeyConstraint("id"), sa.UniqueConstraint("post_id"), ) # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/7de4e57ec4bb_discord_settings.py 
================================================ """discord settings Revision ID: 7de4e57ec4bb Revises: 3eb0a3a0870b Create Date: 2025-11-29 12:47:45.289285 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "7de4e57ec4bb" down_revision = "3eb0a3a0870b" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### op.create_table( "discord_settings", sa.Column("id", sa.Integer(), nullable=False), sa.Column("client_id", sa.Text(), nullable=True), sa.Column("client_secret", sa.Text(), nullable=True), sa.Column("redirect_uri", sa.Text(), nullable=True), sa.Column("guild_ids", sa.Text(), nullable=True), sa.Column("allow_registration", sa.Boolean(), nullable=False), sa.Column("created_at", sa.DateTime(), nullable=False), sa.Column("updated_at", sa.DateTime(), nullable=False), sa.PrimaryKeyConstraint("id"), ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### op.drop_table("discord_settings") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/802a2365976d_gruanular_credits.py ================================================ """gruanular credits Revision ID: 802a2365976d Revises: 7de4e57ec4bb Create Date: 2025-11-29 19:10:18.950548 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "802a2365976d" down_revision = "7de4e57ec4bb" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("credit_transaction", schema=None) as batch_op: batch_op.alter_column( "amount_signed", existing_type=sa.NUMERIC(precision=12, scale=1), type_=sa.Numeric(precision=12, scale=2), existing_nullable=False, ) with op.batch_alter_table("users", schema=None) as batch_op: batch_op.alter_column( "credits_balance", existing_type=sa.NUMERIC(precision=12, scale=1), type_=sa.Numeric(precision=12, scale=2), existing_nullable=False, ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("users", schema=None) as batch_op: batch_op.alter_column( "credits_balance", existing_type=sa.Numeric(precision=12, scale=2), type_=sa.NUMERIC(precision=12, scale=1), existing_nullable=False, ) with op.batch_alter_table("credit_transaction", schema=None) as batch_op: batch_op.alter_column( "amount_signed", existing_type=sa.Numeric(precision=12, scale=2), type_=sa.NUMERIC(precision=12, scale=1), existing_nullable=False, ) # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/82cfcc8e0326_refined_cuts.py ================================================ """refined cuts Revision ID: 82cfcc8e0326 Revises: 3d232f215842 Create Date: 2026-01-11 20:44:32.127284 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "82cfcc8e0326" down_revision = "3d232f215842" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("post", schema=None) as batch_op: batch_op.add_column( sa.Column("refined_ad_boundaries", sa.JSON(), nullable=True) ) batch_op.add_column( sa.Column("refined_ad_boundaries_updated_at", sa.DateTime(), nullable=True) ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("post", schema=None) as batch_op: batch_op.drop_column("refined_ad_boundaries_updated_at") batch_op.drop_column("refined_ad_boundaries") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/89d86978f407_limit_users.py ================================================ """limit users Revision ID: 89d86978f407 Revises: 16311623dd58 Create Date: 2025-12-14 12:45:22.788888 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "89d86978f407" down_revision = "16311623dd58" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("app_settings", schema=None) as batch_op: batch_op.add_column(sa.Column("user_limit_total", sa.Integer(), nullable=True)) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("app_settings", schema=None) as batch_op: batch_op.drop_column("user_limit_total") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/91ff431c832e_download_count.py ================================================ """download_count Revision ID: 91ff431c832e Revises: 18c2402c9202 Create Date: 2025-11-03 23:24:04.934488 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "91ff431c832e" down_revision = "18c2402c9202" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("jobs_manager_run", schema=None) as batch_op: batch_op.alter_column( "created_at", existing_type=sa.DATETIME(), nullable=True, existing_server_default=sa.text("(CURRENT_TIMESTAMP)"), ) batch_op.alter_column( "updated_at", existing_type=sa.DATETIME(), nullable=True, existing_server_default=sa.text("(CURRENT_TIMESTAMP)"), ) batch_op.drop_column("previous_run_id") with op.batch_alter_table("post", schema=None) as batch_op: batch_op.add_column(sa.Column("download_count", sa.Integer(), nullable=True)) with op.batch_alter_table("users", schema=None) as batch_op: batch_op.drop_constraint(batch_op.f("uq_users_username"), type_="unique") batch_op.drop_index(batch_op.f("ix_users_username")) batch_op.create_index( batch_op.f("ix_users_username"), ["username"], unique=True ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("users", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_users_username")) batch_op.create_index( batch_op.f("ix_users_username"), ["username"], unique=False ) batch_op.create_unique_constraint(batch_op.f("uq_users_username"), ["username"]) with op.batch_alter_table("post", schema=None) as batch_op: batch_op.drop_column("download_count") with op.batch_alter_table("jobs_manager_run", schema=None) as batch_op: batch_op.add_column( sa.Column("previous_run_id", sa.VARCHAR(length=36), nullable=True) ) batch_op.alter_column( "updated_at", existing_type=sa.DATETIME(), nullable=False, existing_server_default=sa.text("(CURRENT_TIMESTAMP)"), ) batch_op.alter_column( "created_at", existing_type=sa.DATETIME(), nullable=False, existing_server_default=sa.text("(CURRENT_TIMESTAMP)"), ) # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/999b921ffc58_migration.py ================================================ """migration Revision ID: 999b921ffc58 Revises: 401071604e7b Create Date: 
2025-10-18 15:11:24.463135 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "999b921ffc58" down_revision = "401071604e7b" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### bind = op.get_bind() inspector = sa.inspect(bind) existing_tables = set(inspector.get_table_names()) # Create jobs_manager_run table only if it doesn't exist (makes migration idempotent) if "jobs_manager_run" not in existing_tables: op.create_table( "jobs_manager_run", sa.Column("id", sa.String(length=36), nullable=False), sa.Column( "status", sa.String(length=50), nullable=False, server_default="pending" ), sa.Column("trigger", sa.String(length=100), nullable=False), sa.Column("started_at", sa.DateTime(), nullable=True), sa.Column("completed_at", sa.DateTime(), nullable=True), sa.Column("total_jobs", sa.Integer(), nullable=False, server_default="0"), sa.Column("queued_jobs", sa.Integer(), nullable=False, server_default="0"), sa.Column("running_jobs", sa.Integer(), nullable=False, server_default="0"), sa.Column( "completed_jobs", sa.Integer(), nullable=False, server_default="0" ), sa.Column("failed_jobs", sa.Integer(), nullable=False, server_default="0"), sa.Column("context_json", sa.JSON(), nullable=True), sa.Column("previous_run_id", sa.String(length=36), nullable=True), sa.Column( "created_at", sa.DateTime(), nullable=False, server_default=sa.func.current_timestamp(), ), sa.Column( "updated_at", sa.DateTime(), nullable=False, server_default=sa.func.current_timestamp(), ), sa.PrimaryKeyConstraint("id"), ) # Index on status for quick filtering (create only if missing) if "jobs_manager_run" in existing_tables: existing_indexes = { idx["name"] for idx in inspector.get_indexes("jobs_manager_run") } else: existing_indexes = set() if "ix_jobs_manager_run_status" not in existing_indexes: op.create_index( "ix_jobs_manager_run_status", "jobs_manager_run", ["status"], unique=False ) # 
Add jobs_manager_run_id column and FK to processing_job only if column doesn't exist processing_cols = {col["name"] for col in inspector.get_columns("processing_job")} if "jobs_manager_run_id" not in processing_cols: with op.batch_alter_table("processing_job", schema=None) as batch_op: batch_op.add_column( sa.Column("jobs_manager_run_id", sa.String(length=36), nullable=True) ) batch_op.create_index( batch_op.f("ix_processing_job_jobs_manager_run_id"), ["jobs_manager_run_id"], unique=False, ) batch_op.create_foreign_key( "fk_processing_job_jobs_manager_run_id", "jobs_manager_run", ["jobs_manager_run_id"], ["id"], ) with op.batch_alter_table("whisper_settings", schema=None) as batch_op: batch_op.drop_column("groq_initial_backoff") batch_op.drop_column("groq_backoff_factor") # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### bind = op.get_bind() inspector = sa.inspect(bind) existing_tables = set(inspector.get_table_names()) with op.batch_alter_table("whisper_settings", schema=None) as batch_op: batch_op.add_column( sa.Column( "groq_backoff_factor", sa.FLOAT(), server_default=sa.text("'2.0'"), nullable=False, ) ) batch_op.add_column( sa.Column( "groq_initial_backoff", sa.FLOAT(), server_default=sa.text("'1.0'"), nullable=False, ) ) with op.batch_alter_table("processing_job", schema=None) as batch_op: # Only drop FK/index/column if they exist processing_cols = { col["name"] for col in inspector.get_columns("processing_job") } if "jobs_manager_run_id" in processing_cols: batch_op.drop_constraint( "fk_processing_job_jobs_manager_run_id", type_="foreignkey" ) batch_op.drop_index(batch_op.f("ix_processing_job_jobs_manager_run_id")) batch_op.drop_column("jobs_manager_run_id") # Drop jobs_manager_run index and table if present if "jobs_manager_run" in existing_tables: # drop index if exists try: op.drop_index("ix_jobs_manager_run_status", table_name="jobs_manager_run") except Exception: # ignore if index doesn't exist 
pass op.drop_table("jobs_manager_run") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/a6f5df1a50ac_add_users_table.py ================================================ """add users table Revision ID: a6f5df1a50ac Revises: 3c7f5f7640e4 Create Date: 2024-05-15 00:00:00.000000 """ from __future__ import annotations import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "a6f5df1a50ac" down_revision = "3c7f5f7640e4" branch_labels = None depends_on = None def upgrade() -> None: op.create_table( "users", sa.Column("id", sa.Integer(), primary_key=True, autoincrement=True), sa.Column("username", sa.String(length=255), nullable=False), sa.Column("password_hash", sa.String(length=255), nullable=False), sa.Column("role", sa.String(length=50), nullable=False, server_default="user"), sa.Column( "created_at", sa.DateTime(), nullable=False, server_default=sa.text("CURRENT_TIMESTAMP"), ), sa.Column( "updated_at", sa.DateTime(), nullable=False, server_default=sa.text("CURRENT_TIMESTAMP"), ), sa.UniqueConstraint("username", name="uq_users_username"), ) op.create_index("ix_users_username", "users", ["username"], unique=False) def downgrade() -> None: op.drop_index("ix_users_username", table_name="users") op.drop_table("users") ================================================ FILE: src/migrations/versions/ab643af6472e_add_manual_feed_allowance_to_user.py ================================================ """add_manual_feed_allowance_to_user Revision ID: ab643af6472e Revises: 185d3448990e Create Date: 2025-12-12 14:06:14.400553 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "ab643af6472e" down_revision = "185d3448990e" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("feed", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_feed_sponsor_user_id")) batch_op.drop_constraint( batch_op.f("fk_feed_sponsor_user_id"), type_="foreignkey" ) batch_op.drop_column("sponsor_user_id") batch_op.drop_column("sponsor_note") with op.batch_alter_table("users", schema=None) as batch_op: batch_op.add_column( sa.Column("manual_feed_allowance", sa.Integer(), nullable=True) ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("users", schema=None) as batch_op: batch_op.drop_column("manual_feed_allowance") with op.batch_alter_table("feed", schema=None) as batch_op: batch_op.add_column(sa.Column("sponsor_note", sa.TEXT(), nullable=True)) batch_op.add_column(sa.Column("sponsor_user_id", sa.INTEGER(), nullable=True)) batch_op.create_foreign_key( batch_op.f("fk_feed_sponsor_user_id"), "users", ["sponsor_user_id"], ["id"] ) batch_op.create_index( batch_op.f("ix_feed_sponsor_user_id"), ["sponsor_user_id"], unique=False ) # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/b038c2f99086_add_processingjob_table_for_async_.py ================================================ """Add ProcessingJob table for async episode processing Revision ID: b038c2f99086 Revises: b92e47a03bb2 Create Date: 2025-05-25 12:18:50.783647 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "b038c2f99086" down_revision = "b92e47a03bb2" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! 
### op.create_table( "processing_job", sa.Column("id", sa.String(length=36), nullable=False), sa.Column("post_guid", sa.String(length=255), nullable=False), sa.Column("status", sa.String(length=50), nullable=False), sa.Column("current_step", sa.Integer(), nullable=True), sa.Column("step_name", sa.String(length=100), nullable=True), sa.Column("total_steps", sa.Integer(), nullable=True), sa.Column("progress_percentage", sa.Float(), nullable=True), sa.Column("started_at", sa.DateTime(), nullable=True), sa.Column("completed_at", sa.DateTime(), nullable=True), sa.Column("error_message", sa.Text(), nullable=True), sa.Column("scheduler_job_id", sa.String(length=255), nullable=True), sa.Column("created_at", sa.DateTime(), nullable=True), sa.PrimaryKeyConstraint("id"), ) with op.batch_alter_table("processing_job", schema=None) as batch_op: batch_op.create_index( batch_op.f("ix_processing_job_created_at"), ["created_at"], unique=False ) batch_op.create_index( batch_op.f("ix_processing_job_post_guid"), ["post_guid"], unique=False ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("processing_job", schema=None) as batch_op: batch_op.drop_index(batch_op.f("ix_processing_job_post_guid")) batch_op.drop_index(batch_op.f("ix_processing_job_created_at")) op.drop_table("processing_job") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/b92e47a03bb2_refactor_transcripts_to_db_tables_.py ================================================ """Refactor transcripts to DB tables: TranscriptSegment, ModelCall, Identification Revision ID: b92e47a03bb2 Revises: ded4b70feadb Create Date: 2025-05-11 12:24:43.232263 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. 
revision = "b92e47a03bb2"
down_revision = "ded4b70feadb"
branch_labels = None
depends_on = None


def upgrade():
    """Replace the legacy single-blob ``transcript`` table with normalized tables.

    Creates ``model_call`` (one row per LLM request over a contiguous segment
    range), ``transcript_segment`` (one row per timed segment) and
    ``identification`` (per-segment labels produced by a model call), each with
    a unique composite index, then drops ``transcript``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "model_call",
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("post_id", sa.Integer(), nullable=False),
        sa.Column("first_segment_sequence_num", sa.Integer(), nullable=False),
        sa.Column("last_segment_sequence_num", sa.Integer(), nullable=False),
        sa.Column("model_name", sa.String(), nullable=False),
        sa.Column("prompt", sa.Text(), nullable=False),
        sa.Column("response", sa.Text(), nullable=True),
        sa.Column("timestamp", sa.DateTime(), nullable=False),
        sa.Column("status", sa.String(), nullable=False),
        sa.Column("error_message", sa.Text(), nullable=True),
        sa.Column("retry_attempts", sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(
            ["post_id"],
            ["post.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    # Unique: at most one call per (post, segment range, model) combination.
    with op.batch_alter_table("model_call", schema=None) as batch_op:
        batch_op.create_index(
            "ix_model_call_post_chunk_model",
            [
                "post_id",
                "first_segment_sequence_num",
                "last_segment_sequence_num",
                "model_name",
            ],
            unique=True,
        )
    op.create_table(
        "transcript_segment",
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("post_id", sa.Integer(), nullable=False),
        sa.Column("sequence_num", sa.Integer(), nullable=False),
        sa.Column("start_time", sa.Float(), nullable=False),
        sa.Column("end_time", sa.Float(), nullable=False),
        sa.Column("text", sa.Text(), nullable=False),
        sa.ForeignKeyConstraint(
            ["post_id"],
            ["post.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    # Segments are addressed by (post_id, sequence_num), hence the unique index.
    with op.batch_alter_table("transcript_segment", schema=None) as batch_op:
        batch_op.create_index(
            "ix_transcript_segment_post_id_sequence_num",
            ["post_id", "sequence_num"],
            unique=True,
        )
    op.create_table(
        "identification",
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("transcript_segment_id", sa.Integer(), nullable=False),
        sa.Column("model_call_id", sa.Integer(), nullable=False),
        sa.Column("confidence", sa.Float(), nullable=True),
        sa.Column("label", sa.String(), nullable=False),
        sa.ForeignKeyConstraint(
            ["model_call_id"],
            ["model_call.id"],
        ),
        sa.ForeignKeyConstraint(
            ["transcript_segment_id"],
            ["transcript_segment.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    # One label per (segment, model call, label) triple.
    with op.batch_alter_table("identification", schema=None) as batch_op:
        batch_op.create_index(
            "ix_identification_segment_call_label",
            ["transcript_segment_id", "model_call_id", "label"],
            unique=True,
        )
    # The old one-row-per-post transcript storage is superseded by the above.
    op.drop_table("transcript")
    # ### end Alembic commands ###


def downgrade():
    """Recreate the legacy ``transcript`` table and drop the normalized tables.

    NOTE(review): transcript content is NOT restored — data held in the
    normalized tables is lost on downgrade.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "transcript",
        sa.Column("id", sa.INTEGER(), nullable=False),
        sa.Column("post_id", sa.INTEGER(), nullable=False),
        sa.Column("content", sa.TEXT(), nullable=False),
        sa.Column("timestamp", sa.DATETIME(), nullable=True),
        sa.ForeignKeyConstraint(
            ["post_id"],
            ["post.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("post_id"),
    )
    # Drop in reverse dependency order: identification -> segment -> model_call.
    with op.batch_alter_table("identification", schema=None) as batch_op:
        batch_op.drop_index("ix_identification_segment_call_label")
    op.drop_table("identification")
    with op.batch_alter_table("transcript_segment", schema=None) as batch_op:
        batch_op.drop_index("ix_transcript_segment_post_id_sequence_num")
    op.drop_table("transcript_segment")
    with op.batch_alter_table("model_call", schema=None) as batch_op:
        batch_op.drop_index("ix_model_call_post_chunk_model")
    op.drop_table("model_call")
    # ### end Alembic commands ###


================================================
FILE: src/migrations/versions/bae70e584468_.py
================================================
"""empty message

Revision ID: bae70e584468
Revises:
Create Date: 2024-10-20 14:45:30.170794

"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "bae70e584468"
down_revision = None
branch_labels = None
depends_on = None


def upgrade():
    """Create the initial schema: ``feed``, ``post`` and ``transcript``."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "feed",
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("title", sa.Text(), nullable=False),
        sa.Column("description", sa.Text(), nullable=True),
        sa.Column("author", sa.Text(), nullable=True),
        sa.Column("rss_url", sa.Text(), nullable=False, unique=True),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_table(
        "post",
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("feed_id", sa.Integer(), nullable=False),
        sa.Column("guid", sa.Text(), nullable=False, unique=True),
        sa.Column("download_url", sa.Text(), nullable=False, unique=True),
        sa.Column("title", sa.Text(), nullable=False),
        sa.Column("description", sa.Text(), nullable=True),
        sa.Column("release_date", sa.Date(), nullable=True),
        sa.Column("duration", sa.Integer(), nullable=True),
        sa.ForeignKeyConstraint(
            ["feed_id"],
            ["feed.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    # One transcript blob per post (post_id is unique).
    op.create_table(
        "transcript",
        sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
        sa.Column("post_id", sa.Integer(), nullable=False, unique=True),
        sa.Column("content", sa.Text(), nullable=False),
        sa.Column("timestamp", sa.DateTime(), nullable=True),
        sa.ForeignKeyConstraint(
            ["post_id"],
            ["post.id"],
        ),
        sa.PrimaryKeyConstraint("id"),
    )
    pass  # leftover from the auto-generated template
    # ### end Alembic commands ###


def downgrade():
    """Base revision downgrade.

    NOTE(review): the created tables are not dropped here — confirm this is
    deliberate before relying on a downgrade below the base revision.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    pass
    # ### end Alembic commands ###


================================================
FILE: src/migrations/versions/c0f8893ce927_add_skipped_jobs_columns.py
================================================
"""add skipped jobs counters

Revision ID: c0f8893ce927
Revises: 999b921ffc58
Create Date: 2026-11-27 00:00:00.000000

"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "c0f8893ce927"
down_revision = "999b921ffc58"
branch_labels = None
depends_on = None


def upgrade():
    """Add the ``skipped_jobs`` counter column to ``jobs_manager_run``.

    Written defensively: it is a no-op when the table is missing and skips the
    ADD COLUMN when the column already exists, so it is safe to run against
    databases in mixed states.
    """
    bind = op.get_bind()
    inspector = sa.inspect(bind)
    existing_tables = set(inspector.get_table_names())
    if "jobs_manager_run" not in existing_tables:
        # Table not created yet on this installation; nothing to migrate.
        return
    columns = {col["name"] for col in inspector.get_columns("jobs_manager_run")}
    if "skipped_jobs" not in columns:
        with op.batch_alter_table("jobs_manager_run", schema=None) as batch_op:
            batch_op.add_column(
                sa.Column(
                    "skipped_jobs",
                    sa.Integer(),
                    nullable=False,
                    server_default="0",
                )
            )
    # Align existing rows to default value. Only relevant when the column
    # pre-existed as nullable; a freshly added NOT NULL column has no NULLs.
    op.execute(
        sa.text(
            "UPDATE jobs_manager_run SET skipped_jobs = 0 WHERE skipped_jobs IS NULL"
        )
    )


def downgrade():
    """Drop ``skipped_jobs`` again, with the same table/column guards."""
    bind = op.get_bind()
    inspector = sa.inspect(bind)
    existing_tables = set(inspector.get_table_names())
    if "jobs_manager_run" not in existing_tables:
        return
    columns = {col["name"] for col in inspector.get_columns("jobs_manager_run")}
    if "skipped_jobs" in columns:
        with op.batch_alter_table("jobs_manager_run", schema=None) as batch_op:
            batch_op.drop_column("skipped_jobs")


================================================
FILE: src/migrations/versions/ded4b70feadb_add_image_metadata_to_feed.py
================================================
"""Add image metadata to feed

Revision ID: ded4b70feadb
Revises: 6e0e16299dcb
Create Date: 2025-03-01 14:30:20.177608

"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "ded4b70feadb"
down_revision = "6e0e16299dcb"
branch_labels = None
depends_on = None


def upgrade():
    """Add the nullable ``image_url`` column to ``feed``."""
    with op.batch_alter_table("feed", schema=None) as batch_op:
        batch_op.add_column(sa.Column("image_url", sa.Text(), nullable=True))
    pass  # leftover from the auto-generated template


def downgrade():
    """Remove ``feed.image_url``."""
    with op.batch_alter_table("feed", schema=None) as batch_op:
        batch_op.drop_column("image_url")
    pass  # leftover from the auto-generated template


================================================
FILE: src/migrations/versions/e1325294473b_add_autoprocess_on_download.py
================================================
"""add autoprocess_on_download

Revision ID: e1325294473b
Revises: 58b4eedd4c61
Create Date: 2025-12-25 20:45:12.595954

"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "e1325294473b"
down_revision = "58b4eedd4c61"
branch_labels = None
depends_on = None


def upgrade():
    """Add boolean ``app_settings.autoprocess_on_download`` (default false)."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("app_settings", schema=None) as batch_op:
        batch_op.add_column(
            sa.Column(
                "autoprocess_on_download",
                sa.Boolean(),
                nullable=False,
                server_default=sa.false(),  # ensure existing SQLite rows get a value
            )
        )
    # ### end Alembic commands ###


def downgrade():
    """Drop ``app_settings.autoprocess_on_download``."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("app_settings", schema=None) as batch_op:
        batch_op.drop_column("autoprocess_on_download")
    # ### end Alembic commands ###


================================================
FILE: src/migrations/versions/eb51923af483_multiple_supporters.py
================================================
"""multiple supporters

Revision ID: eb51923af483
Revises: 802a2365976d
Create Date: 2025-12-01 22:25:13.104687

"""

from datetime import datetime

import sqlalchemy as sa
from alembic import op
from sqlalchemy import inspect

# revision identifiers, used by Alembic.
revision = "eb51923af483"
down_revision = "802a2365976d"
branch_labels = None
depends_on = None


def _table_exists(table_name: str) -> bool:
    """Check if a table exists in the database."""
    connection = op.get_bind()
    inspector = inspect(connection)
    return table_name in inspector.get_table_names()


def _column_exists(table_name: str, column_name: str) -> bool:
    """Check if a column exists in a table."""
    connection = op.get_bind()
    inspector = inspect(connection)
    columns = [col["name"] for col in inspector.get_columns(table_name)]
    return column_name in columns


def upgrade():
    """Introduce many-to-many feed supporters.

    Creates ``feed_supporter`` (guarded so re-runs are safe), adds
    ``requested_by_user_id``/``billing_user_id`` FKs to ``processing_job``,
    and backfills one supporter row per existing ``feed.sponsor_user_id`` so
    current sponsors keep their access.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # Create feed_supporter table if it doesn't exist
    if not _table_exists("feed_supporter"):
        op.create_table(
            "feed_supporter",
            sa.Column("id", sa.Integer(), autoincrement=True, nullable=False),
            sa.Column("feed_id", sa.Integer(), nullable=False),
            sa.Column("user_id", sa.Integer(), nullable=False),
            sa.Column("created_at", sa.DateTime(), nullable=False),
            sa.ForeignKeyConstraint(
                ["feed_id"],
                ["feed.id"],
            ),
            sa.ForeignKeyConstraint(
                ["user_id"],
                ["users.id"],
            ),
            sa.PrimaryKeyConstraint("id"),
            sa.UniqueConstraint(
                "feed_id", "user_id", name="uq_feed_supporter_feed_user"
            ),
        )
    # Add columns to processing_job if they don't exist
    # (requested_by_user_id is used as the sentinel for both columns).
    if not _column_exists("processing_job", "requested_by_user_id"):
        with op.batch_alter_table("processing_job", schema=None) as batch_op:
            batch_op.add_column(
                sa.Column("requested_by_user_id", sa.Integer(), nullable=True)
            )
            batch_op.add_column(
                sa.Column("billing_user_id", sa.Integer(), nullable=True)
            )
            batch_op.create_foreign_key(
                "fk_processing_job_billing_user_id",
                "users",
                ["billing_user_id"],
                ["id"],
            )
            batch_op.create_foreign_key(
                "fk_processing_job_requested_by_user_id",
                "users",
                ["requested_by_user_id"],
                ["id"],
            )
    # Seed supporter rows for existing sponsors so they keep access permissions.
    connection = op.get_bind()
    feed_supporter_table = sa.table(
        "feed_supporter",
        sa.column("feed_id", sa.Integer),
        sa.column("user_id", sa.Integer),
        sa.column("created_at", sa.DateTime),
    )
    # Check which sponsor/feed combos already exist
    existing = set()
    result = connection.execute(sa.text("SELECT feed_id, user_id FROM feed_supporter"))
    for row in result:
        existing.add((row._mapping["feed_id"], row._mapping["user_id"]))
    result = connection.execute(
        sa.text(
            "SELECT id AS feed_id, sponsor_user_id FROM feed WHERE sponsor_user_id IS NOT NULL"
        )
    )
    inserts = []
    seen = set()  # de-duplicates within this result set; `existing` covers the DB
    for row in result:
        feed_id = row._mapping["feed_id"]
        user_id = row._mapping["sponsor_user_id"]
        if not user_id:
            continue
        key = (feed_id, user_id)
        if key in seen or key in existing:
            continue
        seen.add(key)
        inserts.append(
            {
                "feed_id": feed_id,
                "user_id": user_id,
                # NOTE(review): datetime.utcnow() is naive and deprecated since
                # Python 3.12; datetime.now(timezone.utc) is the modern form.
                "created_at": datetime.utcnow(),
            }
        )
    if inserts:
        op.bulk_insert(feed_supporter_table, inserts)
    # ### end Alembic commands ###


def downgrade():
    """Revert the supporter schema changes.

    NOTE(review): unlike upgrade(), this has no existence guards, so it will
    fail if the constraints or table were never created.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("processing_job", schema=None) as batch_op:
        batch_op.drop_constraint(
            "fk_processing_job_requested_by_user_id", type_="foreignkey"
        )
        batch_op.drop_constraint(
            "fk_processing_job_billing_user_id", type_="foreignkey"
        )
        batch_op.drop_column("billing_user_id")
        batch_op.drop_column("requested_by_user_id")
    op.drop_table("feed_supporter")
    # ### end Alembic commands ###


================================================
FILE: src/migrations/versions/f6d5fee57cc3_tz_fix.py
================================================
"""tz_fix

Revision ID: f6d5fee57cc3
Revises: 0d954a44fa8e
Create Date: 2025-11-04 22:31:38.563280

"""

import datetime

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "f6d5fee57cc3"
down_revision = "0d954a44fa8e"
branch_labels = None
depends_on = None


def upgrade():
    """Convert ``post.release_date`` from DATE to timezone-aware DATETIME.

    Strategy: add ``release_date_tmp`` (DateTime with tz), copy every value
    over as UTC (plain dates become UTC midnight), drop the old column, then
    rename the temp column back. Each destructive phase is guarded by a fresh
    column inspection so a partially applied run can be resumed.
    """
    bind = op.get_bind()
    inspector = sa.inspect(bind)
    column_names = {col["name"] for col in inspector.get_columns("post")}
    # Resume case: data already copied and old column dropped; just rename.
    if "release_date" not in column_names and "release_date_tmp" in column_names:
        with op.batch_alter_table("post", schema=None) as batch_op:
            batch_op.alter_column("release_date_tmp", new_column_name="release_date")
        return
    if "release_date" not in column_names:
        # Nothing to migrate (already applied manually, or table missing column)
        return
    if "release_date_tmp" not in column_names:
        with op.batch_alter_table("post", schema=None) as batch_op:
            batch_op.add_column(
                sa.Column("release_date_tmp", sa.DateTime(timezone=True), nullable=True)
            )
    metadata = sa.MetaData()
    post = sa.Table("post", metadata, autoload_with=bind)
    select_stmt = sa.select(post.c.id, post.c.release_date)
    rows = bind.execute(select_stmt).fetchall()
    # Row-by-row copy: dates become UTC midnight, datetimes are tagged as UTC.
    for row in rows:
        if row.release_date is None:
            continue
        if isinstance(row.release_date, datetime.datetime):
            dt = row.release_date
        else:
            dt = datetime.datetime.combine(row.release_date, datetime.time())
        dt = dt.replace(tzinfo=datetime.timezone.utc)
        bind.execute(
            post.update().where(post.c.id == row.id).values(release_date_tmp=dt)
        )
    # Re-inspect before each destructive step to keep the migration resumable.
    inspector = sa.inspect(bind)
    column_names = {col["name"] for col in inspector.get_columns("post")}
    if "release_date" in column_names:
        with op.batch_alter_table("post", schema=None) as batch_op:
            batch_op.drop_column("release_date")
    inspector = sa.inspect(bind)
    column_names = {col["name"] for col in inspector.get_columns("post")}
    if "release_date_tmp" in column_names:
        with op.batch_alter_table("post", schema=None) as batch_op:
            batch_op.alter_column("release_date_tmp", new_column_name="release_date")


def downgrade():
    """Inverse conversion: DATETIME back to plain DATE via ``release_date_date``.

    Mirrors upgrade(): add temp DATE column, copy values as UTC dates, drop the
    datetime column, rename the temp column back; each phase is guarded.
    """
    bind = op.get_bind()
    inspector = sa.inspect(bind)
    column_names = {col["name"] for col in inspector.get_columns("post")}
    # Resume case: already copied and dropped; just rename.
    if "release_date" not in column_names and "release_date_date" in column_names:
        with op.batch_alter_table("post", schema=None) as batch_op:
            batch_op.alter_column("release_date_date", new_column_name="release_date")
        return
    if "release_date" not in column_names:
        # Nothing to revert
        return
    if "release_date_date" not in column_names:
        with op.batch_alter_table("post", schema=None) as batch_op:
            batch_op.add_column(
                sa.Column("release_date_date", sa.DATE(), nullable=True)
            )
    metadata = sa.MetaData()
    post = sa.Table("post", metadata, autoload_with=bind)
    select_stmt = sa.select(post.c.id, post.c.release_date)
    rows = bind.execute(select_stmt).fetchall()
    for row in rows:
        if row.release_date is None:
            continue
        if isinstance(row.release_date, datetime.datetime):
            dt = row.release_date
        else:
            dt = datetime.datetime.combine(row.release_date, datetime.time())
        # NOTE(review): astimezone() on a naive datetime assumes local time —
        # confirm stored values are always tz-aware after the upgrade path.
        date_only = dt.astimezone(datetime.timezone.utc).date()
        bind.execute(
            post.update().where(post.c.id == row.id).values(release_date_date=date_only)
        )
    inspector = sa.inspect(bind)
    column_names = {col["name"] for col in inspector.get_columns("post")}
    if "release_date" in column_names:
        with op.batch_alter_table("post", schema=None) as batch_op:
            batch_op.drop_column("release_date")
    inspector = sa.inspect(bind)
    column_names = {col["name"] for col in inspector.get_columns("post")}
    if "release_date_date" in column_names:
        with op.batch_alter_table("post", schema=None) as batch_op:
            batch_op.alter_column("release_date_date", new_column_name="release_date")


================================================
FILE: src/migrations/versions/f7a4195e0953_add_enable_boundary_refinement_to_llm_.py
================================================
"""add enable_boundary_refinement to llm_settings

Revision ID: f7a4195e0953
Revises: e1325294473b
Create Date: 2026-01-06 23:02:56.142954

"""

import sqlalchemy as sa
from alembic import op

# revision identifiers, used by Alembic.
revision = "f7a4195e0953" down_revision = "e1325294473b" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("llm_settings", schema=None) as batch_op: batch_op.add_column( sa.Column( "enable_boundary_refinement", sa.Boolean(), nullable=False, server_default=sa.text("1"), ) ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("llm_settings", schema=None) as batch_op: batch_op.drop_column("enable_boundary_refinement") # ### end Alembic commands ### ================================================ FILE: src/migrations/versions/fa3a95ecd67d_audio_processing_paths.py ================================================ """audio processing paths Revision ID: fa3a95ecd67d Revises: bae70e584468 Create Date: 2024-11-09 16:48:09.337029 """ import sqlalchemy as sa from alembic import op # revision identifiers, used by Alembic. revision = "fa3a95ecd67d" down_revision = "bae70e584468" branch_labels = None depends_on = None def upgrade(): # ### commands auto generated by Alembic - please adjust! ### with op.batch_alter_table("post", schema=None) as batch_op: batch_op.add_column( sa.Column("unprocessed_audio_path", sa.Text(), nullable=True) ) batch_op.add_column(sa.Column("processed_audio_path", sa.Text(), nullable=True)) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! 
### with op.batch_alter_table("post", schema=None) as batch_op: batch_op.drop_column("processed_audio_path") batch_op.drop_column("unprocessed_audio_path") # ### end Alembic commands ### ================================================ FILE: src/podcast_processor/__init__.py ================================================ from warnings import filterwarnings from beartype.claw import beartype_this_package from beartype.roar import BeartypeDecorHintPep585DeprecationWarning beartype_this_package() filterwarnings("ignore", category=BeartypeDecorHintPep585DeprecationWarning) ================================================ FILE: src/podcast_processor/ad_classifier.py ================================================ import logging import math import time # pylint: disable=too-many-lines from datetime import datetime from typing import Any, Dict, List, Optional, Set, Tuple, Union import litellm from jinja2 import Template from litellm.exceptions import InternalServerError from litellm.types.utils import Choices from pydantic import ValidationError from sqlalchemy import and_ from app.extensions import db from app.models import Identification, ModelCall, Post, TranscriptSegment from app.writer.client import writer_client from podcast_processor.boundary_refiner import BoundaryRefiner from podcast_processor.cue_detector import CueDetector from podcast_processor.llm_concurrency_limiter import ( ConcurrencyContext, LLMConcurrencyLimiter, get_concurrency_limiter, ) from podcast_processor.model_output import ( AdSegmentPredictionList, clean_and_parse_model_output, ) from podcast_processor.prompt import transcript_excerpt_for_prompt from podcast_processor.token_rate_limiter import ( TokenRateLimiter, configure_rate_limiter_for_model, ) from podcast_processor.transcribe import Segment from podcast_processor.word_boundary_refiner import WordBoundaryRefiner from shared.config import Config, TestWhisperConfig from shared.llm_utils import model_uses_max_completion_tokens class 
ClassifyParams: def __init__( self, system_prompt: str, user_prompt_template: Template, post: Post, num_segments_per_prompt: int, max_overlap_segments: int, ): self.system_prompt = system_prompt self.user_prompt_template = user_prompt_template self.post = post self.num_segments_per_prompt = num_segments_per_prompt self.max_overlap_segments = max_overlap_segments class ClassifyException(Exception): """Custom exception for classification errors.""" class AdClassifier: """Handles the classification of ad segments in podcast transcripts.""" def __init__( self, config: Config, logger: Optional[logging.Logger] = None, model_call_query: Optional[Any] = None, identification_query: Optional[Any] = None, db_session: Optional[Any] = None, ): self.config = config self.logger = logger or logging.getLogger("global_logger") self.model_call_query = model_call_query or ModelCall.query self.identification_query = identification_query or Identification.query self.db_session = db_session or db.session # Initialize rate limiter for the configured model self.rate_limiter: Optional[TokenRateLimiter] if self.config.llm_enable_token_rate_limiting: tokens_per_minute = self.config.llm_max_input_tokens_per_minute if tokens_per_minute is None: # Use model-specific defaults self.rate_limiter = configure_rate_limiter_for_model( self.config.llm_model ) else: # Use custom limit from podcast_processor.token_rate_limiter import get_rate_limiter self.rate_limiter = get_rate_limiter(tokens_per_minute) self.logger.info( f"Using custom token rate limit: {tokens_per_minute}/min" ) else: self.rate_limiter = None self.logger.info("Token rate limiting disabled") # Initialize concurrency limiter for LLM API calls self.concurrency_limiter: Optional[LLMConcurrencyLimiter] max_concurrent = getattr(self.config, "llm_max_concurrent_calls", 3) if max_concurrent > 0: self.concurrency_limiter = get_concurrency_limiter(max_concurrent) self.logger.info( f"LLM concurrency limiting enabled: max {max_concurrent} 
concurrent calls" ) else: self.concurrency_limiter = None self.logger.info("LLM concurrency limiting disabled") # Initialize cue detector for neighbor expansion self.cue_detector = CueDetector() # Initialize boundary refiner (conditionally based on config) self.boundary_refiner: Optional[BoundaryRefiner] = None if config.enable_boundary_refinement: if getattr(config, "enable_word_level_boundary_refinder", False): self.boundary_refiner = WordBoundaryRefiner(config, self.logger) # type: ignore[assignment] self.logger.info("Word-level boundary refiner enabled") else: self.boundary_refiner = BoundaryRefiner(config, self.logger) self.logger.info("Boundary refinement enabled") else: self.logger.info("Boundary refinement disabled via config") def classify( self, *, transcript_segments: List[TranscriptSegment], system_prompt: str, user_prompt_template: Template, post: Post, ) -> None: """ Classifies transcript segments to identify ad segments. Args: transcript_segments: List of transcript segments to classify system_prompt: System prompt for the LLM user_prompt_template: User prompt template for the LLM post: Post containing the podcast to classify """ self.logger.info( f"Starting ad classification for post {post.id} with {len(transcript_segments)} segments." ) if not transcript_segments: self.logger.info( f"No transcript segments to classify for post {post.id}. Skipping." 
) return classify_params = ClassifyParams( system_prompt=system_prompt, user_prompt_template=user_prompt_template, post=post, num_segments_per_prompt=self.config.processing.num_segments_to_input_to_prompt, max_overlap_segments=self.config.processing.max_overlap_segments, ) total_segments = len(transcript_segments) try: current_index = 0 next_overlap_segments: List[TranscriptSegment] = [] max_iterations = ( total_segments + 10 ) # Safety limit to prevent infinite loops iteration_count = 0 while current_index < total_segments and iteration_count < max_iterations: consumed_segments, next_overlap_segments = self._step( classify_params, next_overlap_segments, current_index, transcript_segments, ) current_index += consumed_segments iteration_count += 1 if consumed_segments == 0: self.logger.error( f"No progress made in iteration {iteration_count} for post {post.id}. " "Breaking to avoid infinite loop." ) break # Expand neighbors using bulk operations # NOTE: Use self.db_session.query() instead of self.identification_query # to ensure all operations use the same session consistently. 
ad_identifications = ( self.db_session.query(Identification) .join(TranscriptSegment) .filter( TranscriptSegment.post_id == post.id, Identification.label == "ad", ) .all() ) if ad_identifications: # Get model_call from first identification model_call = ( ad_identifications[0].model_call if ad_identifications else None ) if model_call: created = self.expand_neighbors_bulk( ad_identifications=ad_identifications, model_call=model_call, post_id=post.id, window=5, ) self.logger.info( f"Created {created} neighbor identifications via bulk ops" ) # Pass 2: Refine boundaries if self.boundary_refiner: self._refine_boundaries(transcript_segments, post) except ClassifyException as e: self.logger.error(f"Classification failed for post {post.id}: {e}") return def _step( self, classify_params: ClassifyParams, prev_overlap_segments: List[TranscriptSegment], current_index: int, transcript_segments: List[TranscriptSegment], ) -> Tuple[int, List[TranscriptSegment]]: overlap_segments = self._apply_overlap_cap(prev_overlap_segments) remaining_segments = transcript_segments[current_index:] ( chunk_segments, user_prompt_str, consumed_segments, token_limit_trimmed, ) = self._build_chunk_payload( overlap_segments=overlap_segments, remaining_segments=remaining_segments, total_segments=transcript_segments, post=classify_params.post, system_prompt=classify_params.system_prompt, user_prompt_template=classify_params.user_prompt_template, max_new_segments=classify_params.num_segments_per_prompt, ) if not chunk_segments or consumed_segments <= 0: self.logger.error( "No progress made while building classification chunk for post %s. " "Stopping to avoid infinite loop.", classify_params.post.id, ) raise ClassifyException( "No progress made while building classification chunk." ) if token_limit_trimmed: self.logger.debug( "Token limit trimming applied for post %s at transcript index %s. 
" "Processing chunk with %s new segments across %s total segments.", classify_params.post.id, current_index, consumed_segments, len(chunk_segments), ) identified_segments = self._process_chunk( chunk_segments=chunk_segments, system_prompt=classify_params.system_prompt, user_prompt_str=user_prompt_str, post=classify_params.post, ) next_overlap_segments = self._compute_next_overlap_segments( chunk_segments=chunk_segments, identified_segments=identified_segments, max_overlap_segments=classify_params.max_overlap_segments, ) if next_overlap_segments: self.logger.debug( "Carrying forward %s overlap segments for post %s: %s", len(next_overlap_segments), classify_params.post.id, [seg.sequence_num for seg in next_overlap_segments], ) return consumed_segments, next_overlap_segments def _process_chunk( self, *, chunk_segments: List[TranscriptSegment], system_prompt: str, post: Post, user_prompt_str: str, ) -> List[TranscriptSegment]: """Process a chunk of transcript segments for classification.""" if not chunk_segments: return [] first_seq_num = chunk_segments[0].sequence_num last_seq_num = chunk_segments[-1].sequence_num self.logger.info( f"Processing classification for post {post.id}, segments {first_seq_num}-{last_seq_num}." ) model_call = self._get_or_create_model_call( post=post, first_seq_num=first_seq_num, last_seq_num=last_seq_num, user_prompt_str=user_prompt_str, ) if not model_call: self.logger.error("ModelCall object is unexpectedly None. Skipping chunk.") return [] if self._should_call_llm(model_call): self._perform_llm_call( model_call=model_call, system_prompt=system_prompt, ) if model_call.status == "success" and model_call.response: return self._process_successful_response( model_call=model_call, current_chunk_db_segments=chunk_segments, ) if model_call.status != "success": self.logger.info( f"LLM call for ModelCall {model_call.id} was not successful (status: {model_call.status}). No identifications to process." 
) return [] def _build_chunk_payload( self, *, overlap_segments: List[TranscriptSegment], remaining_segments: List[TranscriptSegment], total_segments: List[TranscriptSegment], post: Post, system_prompt: str, user_prompt_template: Template, max_new_segments: int, ) -> Tuple[List[TranscriptSegment], str, int, bool]: """Construct chunk data while enforcing overlap and token constraints.""" if not remaining_segments: return ([], "", 0, False) capped_overlap = self._apply_overlap_cap(overlap_segments) new_segment_count = min(max_new_segments, len(remaining_segments)) token_limit_trimmed = False while new_segment_count > 0: base_segments = remaining_segments[:new_segment_count] chunk_segments = self._combine_overlap_segments( overlap_segments=capped_overlap, base_segments=base_segments, ) if not chunk_segments: return ([], "", 0, token_limit_trimmed) includes_start = ( chunk_segments[0].id == total_segments[0].id if total_segments else False ) includes_end = ( chunk_segments[-1].id == total_segments[-1].id if total_segments else False ) user_prompt_str = self._generate_user_prompt( current_chunk_db_segments=chunk_segments, post=post, user_prompt_template=user_prompt_template, includes_start=includes_start, includes_end=includes_end, ) if ( self.config.llm_max_input_tokens_per_call is not None and not self._validate_token_limit(user_prompt_str, system_prompt) ): token_limit_trimmed = True if new_segment_count == 1: self.logger.warning( "Even single segment at transcript index %s exceeds token limit " "for post %s. 
Proceeding with minimal chunk.", base_segments[0].sequence_num, post.id, ) return (chunk_segments, user_prompt_str, new_segment_count, True) new_segment_count -= 1 continue return ( chunk_segments, user_prompt_str, new_segment_count, token_limit_trimmed, ) return ([], "", 0, token_limit_trimmed) def _combine_overlap_segments( self, *, overlap_segments: List[TranscriptSegment], base_segments: List[TranscriptSegment], ) -> List[TranscriptSegment]: """Combine overlap and new segments while preserving order and removing duplicates.""" combined: List[TranscriptSegment] = [] seen_ids: Set[int] = set() for segment in overlap_segments: if segment.id not in seen_ids: combined.append(segment) seen_ids.add(segment.id) for segment in base_segments: if segment.id not in seen_ids: combined.append(segment) seen_ids.add(segment.id) self.logger.debug( "Combined overlap (%s segments) and base (%s segments) into %s total segments. " "Overlap seq nums: %s, Base seq nums: %s", len(overlap_segments), len(base_segments), len(combined), [seg.sequence_num for seg in overlap_segments], [seg.sequence_num for seg in base_segments], ) return combined def _compute_next_overlap_segments( self, *, chunk_segments: List[TranscriptSegment], identified_segments: List[TranscriptSegment], max_overlap_segments: int, ) -> List[TranscriptSegment]: """Determine which segments should be carried forward to the next chunk.""" if max_overlap_segments <= 0 or not chunk_segments: return [] # Baseline: carry ~50% of the chunk to guarantee overlap even without detections base_tail_count = max(1, math.ceil(len(chunk_segments) / 2)) overlap_candidates = list(chunk_segments[-base_tail_count:]) if identified_segments: # Preserve from earliest detected ad through the end of the chunk identified_ids = {seg.id for seg in identified_segments} earliest_index = None for i, seg in enumerate(chunk_segments): if seg.id in identified_ids: earliest_index = i break if earliest_index is not None: ad_tail = 
chunk_segments[earliest_index:] overlap_candidates = self._combine_overlap_segments( overlap_segments=ad_tail, base_segments=overlap_candidates, ) # Conditional tail replay: always include the final ~15 seconds when ads are present tail_replay_segments = self._segments_covering_tail( chunk_segments=chunk_segments, seconds=15.0 ) overlap_candidates = self._combine_overlap_segments( overlap_segments=tail_replay_segments, base_segments=overlap_candidates, ) capped = self._apply_overlap_cap( overlap_candidates, max_override=max_overlap_segments ) self.logger.debug( "Carrying forward %s overlap segments: seq_nums %s (identified=%s)", len(capped), [seg.sequence_num for seg in capped], bool(identified_segments), ) return capped def _apply_overlap_cap( self, overlap_segments: List[TranscriptSegment], max_override: Optional[int] = None, ) -> List[TranscriptSegment]: """Ensure stored overlap obeys configured limits.""" max_overlap = ( self.config.processing.max_overlap_segments if max_override is None else max_override ) if max_overlap <= 0 or not overlap_segments: if max_overlap <= 0 and overlap_segments: self.logger.debug( "Discarding %s overlap segments because max_overlap_segments is %s.", len(overlap_segments), max_overlap, ) return [] if max_overlap <= 0 else list(overlap_segments) if len(overlap_segments) <= max_overlap: self.logger.debug( "Overlap cap check: %s segments within limit of %s, no trimming needed", len(overlap_segments), max_overlap, ) return list(overlap_segments) trimmed = overlap_segments[-max_overlap:] self.logger.debug( "Overlap cap enforcement: trimming from %s to %s segments (max=%s). 
" "Keeping seq_nums: %s", len(overlap_segments), len(trimmed), max_overlap, [seg.sequence_num for seg in trimmed], ) return trimmed def _segments_covering_tail( self, *, chunk_segments: List[TranscriptSegment], seconds: float ) -> List[TranscriptSegment]: """Return the minimal set of segments covering the last `seconds` of audio.""" if not chunk_segments: return [] last_end_time = ( chunk_segments[-1].end_time if chunk_segments[-1].end_time is not None else chunk_segments[-1].start_time ) cutoff = last_end_time - seconds tail_segments: List[TranscriptSegment] = [] for seg in reversed(chunk_segments): tail_segments.append(seg) if seg.start_time <= cutoff: break return list(reversed(tail_segments)) def _validate_token_limit(self, user_prompt_str: str, system_prompt: str) -> bool: """Validate that the prompt doesn't exceed the configured token limit.""" if self.config.llm_max_input_tokens_per_call is None: return True # Create messages as they would be sent to the API messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt_str}, ] # Count tokens (reuse the existing token counting logic from rate limiter) if self.rate_limiter: token_count = self.rate_limiter.count_tokens( messages, self.config.llm_model ) else: # Fallback token estimation if no rate limiter total_chars = len(system_prompt) + len(user_prompt_str) token_count = total_chars // 4 # ~4 characters per token is_valid = token_count <= self.config.llm_max_input_tokens_per_call if not is_valid: self.logger.debug( f"Prompt exceeds token limit: {token_count} > {self.config.llm_max_input_tokens_per_call}" ) else: self.logger.debug( f"Prompt within token limit: {token_count} <= {self.config.llm_max_input_tokens_per_call}" ) return is_valid def _prepare_api_call( self, model_call_obj: ModelCall, system_prompt: str ) -> Optional[Dict[str, Any]]: """Prepare API call arguments and validate token limits.""" # Prepare messages for the API call messages = [ {"role": "system", 
"content": system_prompt}, {"role": "user", "content": model_call_obj.prompt}, ] # Use rate limiter to wait if necessary and track token usage if self.rate_limiter: self.rate_limiter.wait_if_needed(messages, model_call_obj.model_name) # Get usage stats for logging usage_stats = self.rate_limiter.get_usage_stats() self.logger.info( f"Token usage: {usage_stats['current_usage']}/{usage_stats['limit']} " f"({usage_stats['usage_percentage']:.1f}%) for ModelCall {model_call_obj.id}" ) # Final validation: Check per-call token limit before making API call if self.config.llm_max_input_tokens_per_call is not None: if not self._validate_token_limit(model_call_obj.prompt, system_prompt): error_msg = ( f"Prompt for ModelCall {model_call_obj.id} exceeds configured " f"token limit of {self.config.llm_max_input_tokens_per_call}. " f"Consider reducing num_segments_to_input_to_prompt." ) self.logger.error(error_msg) if model_call_obj.id is not None: res = writer_client.update( "ModelCall", model_call_obj.id, {"status": "failed", "error_message": error_msg}, wait=True, ) if not res or not res.success: raise RuntimeError( getattr(res, "error", "Failed to update ModelCall") ) # Update local object to reflect database state model_call_obj.status = "failed" model_call_obj.error_message = error_msg return None # Prepare completion arguments completion_args = { "model": model_call_obj.model_name, "messages": messages, "timeout": self.config.openai_timeout, } # Use max_completion_tokens for newer OpenAI models (o1, gpt-5, gpt-4o variants) # OpenAI deprecated max_tokens for these models in favor of max_completion_tokens # Check if this is a model that requires max_completion_tokens # This includes: gpt-5, gpt-4o variants, o1 series, and latest chatgpt models uses_max_completion_tokens = model_uses_max_completion_tokens( model_call_obj.model_name ) # Debug logging to help diagnose model parameter issues self.logger.info( f"Model: '{model_call_obj.model_name}', using max_completion_tokens: 
{uses_max_completion_tokens}" ) if uses_max_completion_tokens: completion_args["max_completion_tokens"] = self.config.openai_max_tokens else: # For older models and non-OpenAI models, use max_tokens completion_args["max_tokens"] = self.config.openai_max_tokens return completion_args def _generate_user_prompt( self, *, current_chunk_db_segments: List[TranscriptSegment], post: Post, user_prompt_template: Template, includes_start: bool, includes_end: bool, ) -> str: """Generate the user prompt string for the LLM.""" temp_pydantic_segments_for_prompt = [ Segment(start=db_seg.start_time, end=db_seg.end_time, text=db_seg.text) for db_seg in current_chunk_db_segments ] return user_prompt_template.render( podcast_title=post.title, podcast_topic=post.description if post.description else "", transcript=transcript_excerpt_for_prompt( segments=temp_pydantic_segments_for_prompt, includes_start=includes_start, includes_end=includes_end, ), ) def _get_or_create_model_call( self, *, post: Post, first_seq_num: int, last_seq_num: int, user_prompt_str: str, ) -> Optional[ModelCall]: """Get an existing ModelCall or create a new one via writer.""" model = self.config.llm_model result = writer_client.action( "upsert_model_call", { "post_id": post.id, "model_name": model, "first_segment_sequence_num": first_seq_num, "last_segment_sequence_num": last_seq_num, "prompt": user_prompt_str, }, wait=True, ) if not result or not result.success: raise RuntimeError(getattr(result, "error", "Failed to upsert ModelCall")) model_call_id = (result.data or {}).get("model_call_id") if model_call_id is None: raise RuntimeError("Writer did not return model_call_id") model_call = self.db_session.get(ModelCall, int(model_call_id)) if model_call is None: raise RuntimeError(f"ModelCall {model_call_id} not found after upsert") return model_call def _should_call_llm(self, model_call: ModelCall) -> bool: """Determine if an LLM call should be made.""" return model_call.status not in ("success", 
"failed_permanent") def _perform_llm_call(self, *, model_call: ModelCall, system_prompt: str) -> None: """Perform the LLM call for classification.""" self.logger.info( f"Calling LLM for ModelCall {model_call.id} (post {model_call.post_id}, segments {model_call.first_segment_sequence_num}-{model_call.last_segment_sequence_num})." ) try: if isinstance(self.config.whisper, TestWhisperConfig): self._handle_test_mode_call(model_call) else: self._call_model(model_call_obj=model_call, system_prompt=system_prompt) except Exception as e: # pylint: disable=broad-exception-caught self.logger.error( f"LLM interaction via _call_model for ModelCall {model_call.id} resulted in an exception: {e}", exc_info=True, ) def _handle_test_mode_call(self, model_call: ModelCall) -> None: """Handle LLM call in test mode.""" self.logger.info("Test mode: Simulating successful LLM call for classify.") test_response = AdSegmentPredictionList(ad_segments=[]).model_dump_json() res = writer_client.update( "ModelCall", model_call.id, { "response": test_response, "status": "success", "error_message": None, "retry_attempts": 1, }, wait=True, ) if not res or not res.success: raise RuntimeError(getattr(res, "error", "Failed to update ModelCall")) # Update local object to reflect database state model_call.status = "success" model_call.response = test_response model_call.error_message = None def _process_successful_response( self, *, model_call: ModelCall, current_chunk_db_segments: List[TranscriptSegment], ) -> List[TranscriptSegment]: """Process a successful LLM response and create Identification records.""" self.logger.info( f"LLM call for ModelCall {model_call.id} was successful. Parsing response." 
) try: prediction_list = clean_and_parse_model_output(model_call.response) created_identification_count, matched_segments = ( self._create_identifications( prediction_list=prediction_list, current_chunk_db_segments=current_chunk_db_segments, model_call=model_call, ) ) if created_identification_count > 0: self.logger.info( f"Created {created_identification_count} new Identification records for ModelCall {model_call.id}." ) return matched_segments except (ValidationError, AssertionError) as e: self.logger.error( f"Error processing LLM response for ModelCall {model_call.id}: {e}", exc_info=True, ) return [] def _create_identifications( self, *, prediction_list: AdSegmentPredictionList, current_chunk_db_segments: List[TranscriptSegment], model_call: ModelCall, ) -> Tuple[int, List[TranscriptSegment]]: """Create Identification records from the prediction list.""" to_insert: List[Dict[str, Any]] = [] matched_segments: List[TranscriptSegment] = [] processed_segment_ids: Set[int] = set() content_type = prediction_list.content_type for pred in prediction_list.ad_segments: adjusted_confidence = self._adjust_confidence( base_confidence=pred.confidence, content_type=content_type, ) if adjusted_confidence < self.config.output.min_confidence: self.logger.info( f"Ad prediction offset {pred.segment_offset:.2f} for post {model_call.post_id} ignored due to low confidence: {pred.confidence:.2f} (min: {self.config.output.min_confidence})" ) continue matched_segment = self._find_matching_segment( segment_offset=pred.segment_offset, current_chunk_db_segments=current_chunk_db_segments, ) if not matched_segment: self.logger.warning( f"Could not find matching TranscriptSegment for ad prediction offset {pred.segment_offset:.2f} in post {model_call.post_id}, chunk {model_call.first_segment_sequence_num}-{model_call.last_segment_sequence_num}. 
Confidence: {pred.confidence:.2f}" ) continue if matched_segment.id in processed_segment_ids: continue processed_segment_ids.add(matched_segment.id) matched_segments.append(matched_segment) if self._segment_has_ad_identification(matched_segment.id): self.logger.debug( "Segment %s for post %s already has an ad identification; skipping new record.", matched_segment.id, model_call.post_id, ) continue to_insert.append( { "transcript_segment_id": matched_segment.id, "model_call_id": model_call.id, "label": "ad", "confidence": adjusted_confidence, } ) self._maybe_add_preroll_context( matched_segment=matched_segment, current_chunk_db_segments=current_chunk_db_segments, model_call=model_call, processed_segment_ids=processed_segment_ids, matched_segments=matched_segments, base_confidence=adjusted_confidence, to_insert=to_insert, ) if not to_insert: return 0, matched_segments res = writer_client.action( "insert_identifications", {"identifications": to_insert}, wait=True, ) if not res or not res.success: raise RuntimeError( getattr(res, "error", "Failed to insert identifications") ) inserted = int((res.data or {}).get("inserted") or 0) return inserted, matched_segments def _adjust_confidence( self, *, base_confidence: float, content_type: Optional[str] ) -> float: """Demote confidence for self-promo/educational contexts.""" if not content_type: return base_confidence if content_type in {"educational/self_promo", "technical_discussion"}: return max(0.0, base_confidence - 0.25) if content_type == "transition": return max(0.0, base_confidence - 0.1) return base_confidence def _maybe_add_preroll_context( self, *, matched_segment: TranscriptSegment, current_chunk_db_segments: List[TranscriptSegment], model_call: ModelCall, processed_segment_ids: Set[int], matched_segments: List[TranscriptSegment], base_confidence: float, to_insert: List[Dict[str, Any]], ) -> int: """If an ad is detected within the first 45s, include up to 3 preceding intro segments.""" if 
matched_segment.start_time > 45.0: return 0 created = 0 matched_index = current_chunk_db_segments.index(matched_segment) start_index = max(0, matched_index - 3) for seg in current_chunk_db_segments[start_index:matched_index]: if seg.id in processed_segment_ids: continue if self._segment_has_ad_identification(seg.id): continue processed_segment_ids.add(seg.id) matched_segments.append(seg) to_insert.append( { "transcript_segment_id": seg.id, "model_call_id": model_call.id, "label": "ad", "confidence": max( base_confidence, self.config.output.min_confidence ), } ) created += 1 if created: self.logger.debug( "Pre-roll look-back added %s intro segments before %s (post %s)", created, matched_segment.sequence_num, model_call.post_id, ) return created def _find_matching_segment( self, *, segment_offset: float, current_chunk_db_segments: List[TranscriptSegment], ) -> Optional[TranscriptSegment]: """Find the TranscriptSegment that matches the given segment offset.""" min_diff = float("inf") matched_segment = None for ts_segment in current_chunk_db_segments: diff = abs(ts_segment.start_time - segment_offset) if diff < min_diff and diff < 0.5: # Tolerance of 0.5 seconds matched_segment = ts_segment min_diff = diff return matched_segment def _segment_has_ad_identification(self, transcript_segment_id: int) -> bool: """Check if a transcript segment already has an ad identification. NOTE: Uses self.db_session.query() for session consistency. 
""" return ( self.db_session.query(Identification) .filter_by( transcript_segment_id=transcript_segment_id, label="ad", ) .first() is not None ) def _is_retryable_error(self, error: Exception) -> bool: """Determine if an error should be retried.""" if isinstance(error, InternalServerError): return True # Check for retryable HTTP errors in other exception types error_str = str(error).lower() return ( "503" in error_str or "service unavailable" in error_str or "rate_limit_error" in error_str or "ratelimiterror" in error_str or "429" in error_str or "rate limit" in error_str ) def _call_model( self, model_call_obj: ModelCall, system_prompt: str, max_retries: Optional[int] = None, ) -> Optional[str]: """Call the LLM model with retry logic.""" # Use configured retry count if not specified retry_count = ( max_retries if max_retries is not None else getattr(self.config, "llm_max_retry_attempts", 3) ) last_error: Optional[Exception] = None raw_response_content = None original_retry_attempts = ( 0 if model_call_obj.retry_attempts is None else model_call_obj.retry_attempts ) for attempt in range(retry_count): retry_attempts_value = original_retry_attempts + attempt + 1 current_attempt_num = attempt + 1 self.logger.info( f"Calling model {model_call_obj.model_name} for ModelCall {model_call_obj.id} (attempt {current_attempt_num}/{retry_count})" ) try: # Persist retry attempt + pending status via writer if model_call_obj.id is not None: pending_res = writer_client.update( "ModelCall", model_call_obj.id, {"status": "pending", "retry_attempts": retry_attempts_value}, wait=True, ) if not pending_res or not pending_res.success: raise RuntimeError( getattr(pending_res, "error", "Failed to update ModelCall") ) # Prepare API call and validate token limits completion_args = self._prepare_api_call(model_call_obj, system_prompt) if completion_args is None: return None # Token limit exceeded # Use concurrency limiter if available if self.concurrency_limiter: with 
ConcurrencyContext(self.concurrency_limiter, timeout=30.0): response = litellm.completion(**completion_args) else: response = litellm.completion(**completion_args) response_first_choice = response.choices[0] assert isinstance(response_first_choice, Choices) content = response_first_choice.message.content assert content is not None raw_response_content = content success_res = writer_client.update( "ModelCall", model_call_obj.id, { "response": raw_response_content, "status": "success", "error_message": None, "retry_attempts": retry_attempts_value, }, wait=True, ) if not success_res or not success_res.success: raise RuntimeError( getattr(success_res, "error", "Failed to update ModelCall") ) # Update local object to reflect database state model_call_obj.status = "success" model_call_obj.response = raw_response_content model_call_obj.error_message = None self.logger.info( f"Model call {model_call_obj.id} successful on attempt {current_attempt_num}." ) return raw_response_content except Exception as e: last_error = e if self._is_retryable_error(e): self._handle_retryable_error( model_call_obj=model_call_obj, error=e, attempt=attempt, current_attempt_num=current_attempt_num, ) # Continue to next retry else: self.logger.error( f"Non-retryable LLM error for ModelCall {model_call_obj.id} (attempt {current_attempt_num}): {e}", exc_info=True, ) fail_res = writer_client.update( "ModelCall", model_call_obj.id, {"status": "failed_permanent", "error_message": str(e)}, wait=True, ) if not fail_res or not fail_res.success: raise RuntimeError( getattr(fail_res, "error", "Failed to update ModelCall") ) from e # Update local object to reflect database state model_call_obj.status = "failed_permanent" model_call_obj.error_message = str(e) raise # Re-raise non-retryable exceptions immediately # If we get here, all retries were exhausted self._handle_retry_exhausted(model_call_obj, retry_count, last_error) if last_error: raise last_error raise RuntimeError( f"Maximum retries 
({retry_count}) exceeded for ModelCall {model_call_obj.id}." ) def _handle_retryable_error( self, *, model_call_obj: ModelCall, error: Union[InternalServerError, Exception], attempt: int, current_attempt_num: int, ) -> None: """Handle a retryable error during LLM call.""" self.logger.error( f"LLM retryable error for ModelCall {model_call_obj.id} (attempt {current_attempt_num}): {error}" ) res = writer_client.update( "ModelCall", model_call_obj.id, {"error_message": str(error)}, wait=True, ) if not res or not res.success: raise RuntimeError(getattr(res, "error", "Failed to update ModelCall")) # Update local object to reflect database state model_call_obj.error_message = str(error) # Use longer backoff for rate limiting errors error_str = str(error).lower() if any( term in error_str for term in ["rate_limit_error", "ratelimiterror", "429", "rate limit"] ): # For rate limiting, use longer backoff: 60, 120, 240 seconds wait_time = 60 * (2**attempt) self.logger.info( f"Rate limit detected. Waiting {wait_time}s before retry for ModelCall {model_call_obj.id}." ) else: # For other errors, use shorter exponential backoff: 1, 2, 4 seconds wait_time = (2**attempt) * 1 self.logger.info( f"Waiting {wait_time}s before next retry for ModelCall {model_call_obj.id}." ) time.sleep(wait_time) def _handle_retry_exhausted( self, model_call_obj: ModelCall, max_retries: int, last_error: Optional[Exception], ) -> None: """Handle the case when all retries are exhausted.""" self.logger.error( f"Failed to call model for ModelCall {model_call_obj.id} after {max_retries} attempts." ) if last_error: error_message = str(last_error) else: error_message = f"Maximum retries ({max_retries}) exceeded without a specific InternalServerError." 
res = writer_client.update( "ModelCall", model_call_obj.id, {"status": "failed_retries", "error_message": error_message}, wait=True, ) if not res or not res.success: raise RuntimeError(getattr(res, "error", "Failed to update ModelCall")) # Update local object to reflect database state model_call_obj.status = "failed_retries" model_call_obj.error_message = error_message def _get_segments_bulk( self, post_id: int, sequence_numbers: List[int] ) -> Dict[int, TranscriptSegment]: """Fetch multiple segments in one query. NOTE: Must use self.db_session.query() instead of TranscriptSegment.query to ensure we use the same session. Using TranscriptSegment.query (the Flask-SQLAlchemy scoped session) can lead to SQLite lock issues when another query on self.db_session is mid-transaction. """ segments = ( self.db_session.query(TranscriptSegment) .filter( and_( TranscriptSegment.post_id == post_id, TranscriptSegment.sequence_num.in_(sequence_numbers), ) ) .all() ) return {seg.sequence_num: seg for seg in segments} def _get_existing_ids_bulk( self, post_id: int, model_call_id: int ) -> Set[Tuple[int, int, str]]: """Fetch all existing identifications as a set for O(1) lookup. NOTE: Uses self.db_session.query() for session consistency. 
""" ids = ( self.db_session.query(Identification) .join(TranscriptSegment) .filter( and_( TranscriptSegment.post_id == post_id, Identification.model_call_id == model_call_id, ) ) .all() ) return {(i.transcript_segment_id, i.model_call_id, i.label) for i in ids} def _create_identifications_bulk( self, identifications: List[Dict[str, Any]] ) -> int: """Bulk insert identifications""" if not identifications: return 0 res = writer_client.action( "insert_identifications", {"identifications": identifications}, wait=True, ) if not res or not res.success: raise RuntimeError( getattr(res, "error", "Failed to insert identifications") ) return int((res.data or {}).get("inserted") or 0) def expand_neighbors_bulk( self, ad_identifications: List[Identification], model_call: ModelCall, post_id: int, window: int = 5, ) -> int: """Expand neighbors using bulk operations (3 queries instead of 900)""" # PHASE 1: Bulk data collection (2 queries) # Collect all sequence numbers we need sequence_numbers = set() for ident in ad_identifications: base_seq = ident.transcript_segment.sequence_num for offset in range(-window, window + 1): sequence_numbers.add(base_seq + offset) # Query 1: Bulk fetch segments segments_by_seq = self._get_segments_bulk(post_id, list(sequence_numbers)) # Query 2: Bulk fetch existing identifications existing = self._get_existing_ids_bulk(post_id, model_call.id) # PHASE 2: In-memory processing (0 queries) to_create = [] for ident in ad_identifications: base_seq = ident.transcript_segment.sequence_num for offset in range(-window, window + 1): if offset == 0: continue neighbor_seq = base_seq + offset seg = segments_by_seq.get(neighbor_seq) if not seg: continue # Check if already exists (O(1) lookup) key = (seg.id, model_call.id, "ad") if key in existing: continue text = seg.text or "" signals = self.cue_detector.analyze(text) has_strong_cue = ( signals["url"] or signals["promo"] or signals["phone"] or signals["cta"] ) is_transition = signals["transition"] is_self_promo 
= signals["self_promo"] gap_seconds = abs( (seg.start_time or 0.0) - (ident.transcript_segment.start_time or 0.0) ) if not self._should_expand_neighbor( has_strong_cue=has_strong_cue, is_transition=is_transition, gap_seconds=gap_seconds, ): continue confidence = self._neighbor_confidence( has_strong_cue=has_strong_cue, is_transition=is_transition, is_self_promo=is_self_promo, gap_seconds=gap_seconds, ) to_create.append( { "transcript_segment_id": seg.id, "model_call_id": model_call.id, "label": "ad", "confidence": confidence, } ) existing.add(key) # Avoid duplicates in this batch # PHASE 3: Bulk insert (1 query) if to_create: return self._create_identifications_bulk(to_create) return 0 def _should_expand_neighbor( self, *, has_strong_cue: bool, is_transition: bool, gap_seconds: float, ) -> bool: if not self.config.enable_boundary_refinement: return has_strong_cue if has_strong_cue or is_transition: return True return gap_seconds <= 10.0 @staticmethod def _neighbor_confidence( *, has_strong_cue: bool, is_transition: bool, is_self_promo: bool, gap_seconds: float, ) -> float: confidence = 0.72 if is_transition else 0.75 if has_strong_cue: confidence = 0.85 if gap_seconds <= 10.0 else 0.8 if is_self_promo: confidence = max(0.5, confidence - 0.25) return confidence def _refine_boundaries( self, transcript_segments: List[TranscriptSegment], post: Post ) -> None: """Apply boundary refinement to detected ads. NOTE: Uses self.db_session.query() for session consistency. """ if not self.boundary_refiner: return # Latest refined boundaries for downstream audio cuts. Overwrites prior # values for the post ("latest successful" semantics). 
refined_boundaries: List[Dict[str, Any]] = [] # Get ad identifications identifications = ( self.db_session.query(Identification) .join(TranscriptSegment) .filter(TranscriptSegment.post_id == post.id, Identification.label == "ad") .all() ) # Group into ad blocks ad_blocks = self._group_into_blocks(identifications) for block in ad_blocks: # Skip low confidence or very short blocks if block["confidence"] < 0.6 or (block["end"] - block["start"]) < 15.0: continue # Refine seq_nums = [ ident.transcript_segment.sequence_num for ident in block["identifications"] if ident.transcript_segment is not None ] refinement = self.boundary_refiner.refine( ad_start=block["start"], ad_end=block["end"], confidence=block["confidence"], all_segments=[ { "sequence_num": s.sequence_num, "start_time": s.start_time, "text": s.text, "end_time": s.end_time, } for s in transcript_segments ], post_id=post.id, first_seq_num=min(seq_nums) if seq_nums else None, last_seq_num=max(seq_nums) if seq_nums else None, ) # Apply refinement: delete old identifications, create new ones # Note: Get model_call from block identifications model_call = ( block["identifications"][0].model_call if block["identifications"] else None ) if model_call: self._apply_refinement( block, refinement, transcript_segments, post, model_call ) refined_boundaries.append( { "orig_start": float(block["start"]), "orig_end": float(block["end"]), "refined_start": float(refinement.refined_start), "refined_end": float(refinement.refined_end), "confidence": float(block.get("confidence", 0.0) or 0.0), } ) # Store latest refined boundaries on the post so audio processing can cut # using refined timestamps (including word-level refined start times). # Clear the value when we have no refined boundaries so stale data doesn't # affect future audio cuts. 
try: res = writer_client.update( "Post", post.id, { "refined_ad_boundaries": refined_boundaries or None, "refined_ad_boundaries_updated_at": datetime.utcnow(), }, wait=True, ) if not res or not res.success: raise RuntimeError( getattr(res, "error", "Failed to update refined ad boundaries") ) except Exception as exc: # pylint: disable=broad-except # Best-effort: cutting can fall back to segment-derived windows. self.logger.warning( "Failed to persist refined ad boundaries for post %s: %s", post.id, exc, ) def _group_into_blocks( self, identifications: List[Identification] ) -> List[Dict[str, Any]]: """Group adjacent identifications into ad blocks""" if not identifications: return [] identifications = sorted( identifications, key=lambda i: i.transcript_segment.start_time ) blocks: List[Dict[str, Any]] = [] current: List[Identification] = [] for ident in identifications: if ( not current or ident.transcript_segment.start_time - current[-1].transcript_segment.end_time <= 10.0 ): current.append(ident) else: blocks.append(self._create_block(current)) current = [ident] if current: blocks.append(self._create_block(current)) return blocks def _create_block(self, identifications: List[Identification]) -> Dict[str, Any]: return { "start": min(i.transcript_segment.start_time for i in identifications), "end": max(i.transcript_segment.end_time for i in identifications), "confidence": sum(i.confidence for i in identifications) / len(identifications), "identifications": identifications, } def _apply_refinement( self, block: Dict[str, Any], refinement: Any, transcript_segments: List[TranscriptSegment], post: Post, model_call: ModelCall, ) -> None: """Update identifications based on refined boundaries""" delete_ids = [ i.id for i in block.get("identifications", []) if getattr(i, "id", None) is not None ] new_identifications: List[Dict[str, Any]] = [] for seg in transcript_segments: seg_start = float(seg.start_time or 0.0) seg_end = float(seg.end_time or seg_start) # Keep segments 
that overlap the refined window. This preserves the # containing segment when refined boundaries fall mid-segment. if seg_start <= float(refinement.refined_end) and seg_end >= float( refinement.refined_start ): new_identifications.append( { "transcript_segment_id": seg.id, "model_call_id": model_call.id, "label": "ad", "confidence": block["confidence"], } ) res = writer_client.action( "replace_identifications", {"delete_ids": delete_ids, "new_identifications": new_identifications}, wait=True, ) if not res or not res.success: raise RuntimeError( getattr(res, "error", "Failed to replace identifications") ) ================================================ FILE: src/podcast_processor/ad_merger.py ================================================ import re from dataclasses import dataclass from typing import Dict, List, Pattern from app.models import Identification, TranscriptSegment @dataclass class AdGroup: segments: List[TranscriptSegment] identifications: List[Identification] start_time: float end_time: float confidence_avg: float keywords: List[str] class AdMerger: def __init__(self) -> None: self.url_pattern: Pattern[str] = re.compile( r"\b([a-z0-9\-\.]+\.(?:com|net|org|io))\b", re.I ) self.promo_pattern: Pattern[str] = re.compile( r"\b(code|promo|save)\s+\w+\b", re.I ) self.phone_pattern: Pattern[str] = re.compile(r"\b\d{3}[ -]?\d{3}[ -]?\d{4}\b") def merge( self, ad_segments: List[TranscriptSegment], identifications: List[Identification], max_gap: float = 8.0, min_content_gap: float = 12.0, ) -> List[AdGroup]: """Merge ad segments using content analysis""" if not ad_segments: return [] # Sort by time ad_segments = sorted(ad_segments, key=lambda s: s.start_time) # Group by proximity groups = self._group_by_proximity(ad_segments, identifications, max_gap) # Refine using content analysis groups = self._refine_by_content(groups, min_content_gap) # Filter weak groups return [g for g in groups if self._is_valid_group(g)] def _group_by_proximity( self, segments: 
List[TranscriptSegment], identifications: List[Identification], max_gap: float, ) -> List[AdGroup]: """Initial grouping by time proximity""" id_lookup: Dict[int, Identification] = { i.transcript_segment_id: i for i in identifications } groups: List[AdGroup] = [] current: List[TranscriptSegment] = [] for seg in segments: if not current or seg.start_time - current[-1].end_time <= max_gap: current.append(seg) else: if current: groups.append(self._create_group(current, id_lookup)) current = [seg] if current: groups.append(self._create_group(current, id_lookup)) return groups def _create_group( self, segments: List[TranscriptSegment], id_lookup: Dict[int, Identification], ) -> AdGroup: ids = [id_lookup[s.id] for s in segments if s.id in id_lookup] return AdGroup( segments=segments, identifications=ids, start_time=segments[0].start_time, end_time=segments[-1].end_time, confidence_avg=sum(i.confidence for i in ids) / len(ids) if ids else 0.0, keywords=self._extract_keywords(segments), ) def _extract_keywords(self, segments: List[TranscriptSegment]) -> List[str]: """Extract URLs, promo codes, brands""" text = " ".join(s.text or "" for s in segments).lower() keywords: List[str] = [] # URLs keywords.extend(self.url_pattern.findall(text)) # Promo codes keywords.extend(self.promo_pattern.findall(text)) # Phone numbers if self.phone_pattern.search(text): keywords.append("phone") # Brand names (capitalized words appearing 2+ times) words = re.findall(r"\b[A-Z][a-z]+\b", " ".join(s.text for s in segments)) counts: Dict[str, int] = {} for word in words: if len(word) > 3: counts[word] = counts.get(word, 0) + 1 keywords.extend(w.lower() for w, c in counts.items() if c >= 2) return list(set(keywords)) def _refine_by_content( self, groups: List[AdGroup], min_content_gap: float ) -> List[AdGroup]: """Merge groups with shared sponsors""" if len(groups) <= 1: return groups refined: List[AdGroup] = [] i = 0 while i < len(groups): current = groups[i] if i + 1 < len(groups): next_group = 
groups[i + 1] gap = next_group.start_time - current.end_time if gap <= min_content_gap and self._should_merge(current, next_group): # Merge merged = AdGroup( segments=current.segments + next_group.segments, identifications=current.identifications + next_group.identifications, start_time=current.start_time, end_time=next_group.end_time, confidence_avg=( current.confidence_avg + next_group.confidence_avg ) / 2, keywords=list(set(current.keywords + next_group.keywords)), ) refined.append(merged) i += 2 else: refined.append(current) i += 1 else: refined.append(current) i += 1 return refined def _should_merge(self, group1: AdGroup, group2: AdGroup) -> bool: """Check if groups belong to same sponsor""" # High confidence → merge if group1.confidence_avg >= 0.9 and group2.confidence_avg >= 0.9: return True # Shared keywords (URL or brand) shared = set(group1.keywords) & set(group2.keywords) if len(shared) >= 1: return True # Small gap with good confidence gap = group2.start_time - group1.end_time if ( gap <= 10.0 and group1.confidence_avg >= 0.8 and group2.confidence_avg >= 0.8 ): return True return False def _is_valid_group(self, group: AdGroup) -> bool: """Filter out weak single-segment groups""" duration = group.end_time - group.start_time if duration > 180.0 and not group.keywords and group.confidence_avg < 0.9: # Long sponsor monologues without clear cues are likely educational/self-promo return False if len(group.segments) < 2 or duration <= 10.0: # Keep only if has strong keywords or high confidence return len(group.keywords) >= 1 or group.confidence_avg >= 0.9 return True ================================================ FILE: src/podcast_processor/audio.py ================================================ import logging import math import os import tempfile from pathlib import Path from typing import List, Optional, Tuple import ffmpeg # type: ignore[import-untyped] logger = logging.getLogger("global_logger") def get_audio_duration_ms(file_path: str) -> 
def clip_segments_with_fade(
    ad_segments_ms: List[Tuple[int, int]],
    fade_ms: int,
    in_path: str,
    out_path: str,
) -> None:
    """Remove ad spans from an audio file, writing the result to ``out_path``.

    Tries the fade-based complex ffmpeg filter graph first and falls back to a
    simple extract-and-concat strategy when the complex graph fails.

    Args:
        ad_segments_ms: (start_ms, end_ms) ad spans to cut.
        fade_ms: fade duration applied around each cut by the complex path.
        in_path: source audio file path.
        out_path: destination audio file path.

    Raises:
        ValueError: if the input file's duration cannot be probed.
    """
    audio_duration_ms = get_audio_duration_ms(in_path)
    # Bug fix: this was `assert audio_duration_ms is not None`, which is
    # stripped under `python -O`; raise explicitly so a failed probe is
    # always reported instead of crashing later with a confusing error.
    if audio_duration_ms is None:
        raise ValueError(f"Could not determine audio duration for {in_path}")
    # Try the complex filter approach first, fall back to simple if it fails.
    # Catch both ffmpeg.Error (runtime) and broader exceptions (filter graph construction)
    try:
        _clip_segments_complex(
            ad_segments_ms, fade_ms, in_path, out_path, audio_duration_ms
        )
    except ffmpeg.Error as e:
        err_msg = e.stderr.decode() if getattr(e, "stderr", None) else str(e)
        logger.warning(
            "Complex filter failed (ffmpeg error), trying simple approach: %s", err_msg
        )
        _clip_segments_simple(ad_segments_ms, in_path, out_path, audio_duration_ms)
    except Exception as e:  # pylint: disable=broad-except
        # Catches filter graph construction errors like "multiple outgoing edges"
        logger.warning(
            "Complex filter failed (graph error), trying simple approach: %s", e
        )
        _clip_segments_simple(ad_segments_ms, in_path, out_path, audio_duration_ms)
.filter("afade", t="out", ss=0, d=fade_ms / 1000.0), ffmpeg.input(in_path) .filter("atrim", start=(end_ms - fade_ms) / 1000.0, end=end_ms / 1000.0) .filter("afade", t="in", ss=0, d=fade_ms / 1000.0), ] ) last_end = end_ms if last_end != audio_duration_ms: trimmed_list.append( ffmpeg.input(in_path).filter( "atrim", start=last_end / 1000.0, end=audio_duration_ms / 1000.0 ) ) logger.info( "[FFMPEG_CONCAT] Starting audio concatenation: %s -> %s (%d segments)", in_path, out_path, len(trimmed_list), ) ffmpeg.concat(*trimmed_list, v=0, a=1).output(out_path).overwrite_output().run() logger.info("[FFMPEG_CONCAT] Completed audio concatenation: %s", out_path) def _clip_segments_simple( ad_segments_ms: List[Tuple[int, int]], in_path: str, out_path: str, audio_duration_ms: int, ) -> None: """Simpler approach without fades - more reliable for many segments.""" # Build list of segments to keep (inverse of ad segments) keep_segments: List[Tuple[int, int]] = [] last_end = 0 for start_ms, end_ms in ad_segments_ms: if start_ms > last_end: keep_segments.append((last_end, start_ms)) last_end = end_ms if last_end < audio_duration_ms: keep_segments.append((last_end, audio_duration_ms)) if not keep_segments: raise ValueError("No audio segments to keep after ad removal") logger.info( "[FFMPEG_SIMPLE] Starting simple concat with %d segments", len(keep_segments) ) # Create temp directory for intermediate files with tempfile.TemporaryDirectory() as temp_dir: segment_files = [] # Extract each segment to keep for i, (start_ms, end_ms) in enumerate(keep_segments): segment_path = os.path.join(temp_dir, f"segment_{i}.mp3") start_sec = start_ms / 1000.0 duration_sec = (end_ms - start_ms) / 1000.0 ( ffmpeg.input(in_path) .output( segment_path, ss=start_sec, t=duration_sec, acodec="libmp3lame", q=2 ) .overwrite_output() .run(quiet=True) ) segment_files.append(segment_path) # Create concat file list concat_list_path = os.path.join(temp_dir, "concat_list.txt") with open(concat_list_path, "w", 
encoding="utf-8") as file_list: for seg_file in segment_files: file_list.write(f"file '{seg_file}'\n") # Concatenate all segments ( ffmpeg.input(concat_list_path, format="concat", safe=0) .output(out_path, acodec="libmp3lame", q=2) .overwrite_output() .run(quiet=True) ) logger.info("[FFMPEG_SIMPLE] Completed simple audio concatenation: %s", out_path) def trim_file(in_path: Path, out_path: Path, start_ms: int, end_ms: int) -> None: duration_ms = end_ms - start_ms if duration_ms <= 0: return start_sec = max(start_ms, 0) / 1000.0 duration_sec = duration_ms / 1000.0 logger.debug( "[FFMPEG_TRIM] Trimming %s -> %s (start=%.2fs, duration=%.2fs)", in_path, out_path, start_sec, duration_sec, ) ( ffmpeg.input(str(in_path)) .output( str(out_path), ss=start_sec, t=duration_sec, acodec="copy", vn=None, ) .overwrite_output() .run() ) def split_audio( audio_file_path: Path, audio_chunk_path: Path, chunk_size_bytes: int, ) -> List[Tuple[Path, int]]: audio_chunk_path.mkdir(parents=True, exist_ok=True) logger.info( "[FFMPEG_SPLIT] Splitting audio file: %s into chunks of %d bytes", audio_file_path, chunk_size_bytes, ) duration_ms = get_audio_duration_ms(str(audio_file_path)) assert duration_ms is not None if chunk_size_bytes <= 0: raise ValueError("chunk_size_bytes must be a positive integer") file_size_bytes = audio_file_path.stat().st_size if file_size_bytes == 0: raise ValueError("Cannot split zero-byte audio file") chunk_ratio = chunk_size_bytes / file_size_bytes chunk_duration_ms = max(1, math.ceil(duration_ms * chunk_ratio)) num_chunks = max(1, math.ceil(duration_ms / chunk_duration_ms)) logger.info( "[FFMPEG_SPLIT] Will create %d chunks (duration per chunk: %d ms)", num_chunks, chunk_duration_ms, ) chunks: List[Tuple[Path, int]] = [] for i in range(num_chunks): start_offset_ms = i * chunk_duration_ms if start_offset_ms >= duration_ms: break end_offset_ms = min(duration_ms, (i + 1) * chunk_duration_ms) export_path = audio_chunk_path / f"{i}.mp3" logger.debug( "[FFMPEG_SPLIT] 
Creating chunk %d/%d: %s", i + 1, num_chunks, export_path ) trim_file(audio_file_path, export_path, start_offset_ms, end_offset_ms) chunks.append((export_path, start_offset_ms)) logger.info("[FFMPEG_SPLIT] Split complete: created %d chunks", len(chunks)) return chunks ================================================ FILE: src/podcast_processor/audio_processor.py ================================================ import logging from typing import Any, List, Optional, Tuple from app.extensions import db from app.models import Identification, ModelCall, Post, TranscriptSegment from app.writer.client import writer_client from podcast_processor.ad_merger import AdMerger from podcast_processor.audio import clip_segments_with_fade, get_audio_duration_ms from shared.config import Config class AudioProcessor: """Handles audio processing and ad segment removal from podcast files.""" def __init__( self, config: Config, logger: Optional[logging.Logger] = None, identification_query: Optional[Any] = None, transcript_segment_query: Optional[Any] = None, model_call_query: Optional[Any] = None, db_session: Optional[Any] = None, ): self.logger = logger or logging.getLogger("global_logger") self.config = config self._identification_query_provided = identification_query is not None self.identification_query = identification_query or Identification.query self.transcript_segment_query = ( transcript_segment_query or TranscriptSegment.query ) self.model_call_query = model_call_query or ModelCall.query self.db_session = db_session or db.session self.ad_merger = AdMerger() def get_ad_segments(self, post: Post) -> List[Tuple[float, float]]: """ Retrieves ad segments from the database for a given post. NOTE: Uses self.db_session.query() instead of self.identification_query to ensure all operations use the same session consistently. 
        Args:
            post: The Post object to retrieve ad segments for

        Returns:
            A list of tuples containing start and end times (in seconds) of ad segments
        """
        self.logger.info(f"Retrieving ad segments from database for post {post.id}.")
        query = (
            self.identification_query
            if self._identification_query_provided
            else self.db_session.query(Identification)
        )
        ad_identifications = (
            query.join(
                TranscriptSegment,
                Identification.transcript_segment_id == TranscriptSegment.id,
            )
            .join(ModelCall, Identification.model_call_id == ModelCall.id)
            .filter(
                TranscriptSegment.post_id == post.id,
                Identification.label == "ad",
                Identification.confidence >= self.config.output.min_confidence,
                ModelCall.status == "success",
                # Only consider identifications from successful LLM calls
            )
            .all()
        )

        if not ad_identifications:
            self.logger.info(
                f"No ad segments found meeting criteria for post {post.id}."
            )
            return []

        # Get full segment objects with text for content analysis
        # Filter out any identifications with missing segments (DB integrity check)
        ad_segments_with_text = []
        valid_identifications = []
        for ident in ad_identifications:
            segment = ident.transcript_segment
            if segment:
                ad_segments_with_text.append(segment)
                valid_identifications.append(ident)
            else:
                # This should ideally not happen if DB integrity is maintained
                self.logger.warning(
                    f"Identification {ident.id} for post {post.id} refers to a missing TranscriptSegment {ident.transcript_segment_id}. Skipping."
                )

        if not ad_segments_with_text:
            self.logger.info(
                f"No valid ad segments with transcript data for post {post.id}."
            )
            return []

        # Content-aware merge
        ad_groups = self.ad_merger.merge(
            ad_segments=ad_segments_with_text,
            identifications=valid_identifications,
            max_gap=float(self.config.output.min_ad_segment_separation_seconds),
            min_content_gap=12.0,
        )

        # If boundary refinement persisted refined windows on the post, prefer those
        # refined timestamps for audio cutting (this allows word-level refinement to
        # affect the actual cut start time).
        if getattr(self.config, "enable_boundary_refinement", False):
            self._apply_refined_boundaries(post, ad_groups)

        self.logger.info(
            f"Merged {len(ad_segments_with_text)} segments into {len(ad_groups)} groups for post {post.id}"
        )

        # Convert to time tuples for merge_ad_segments()
        ad_segments_times = [(g.start_time, g.end_time) for g in ad_groups]
        ad_segments_times.sort(key=lambda x: x[0])
        return ad_segments_times

    def _apply_refined_boundaries(self, post: Post, ad_groups: Any) -> None:
        # Tighten each merged ad group to the refined boundary windows stored
        # on the post row, when any overlap exists.
        post_row = self._safe_get_post_row(post)
        refined = getattr(post_row, "refined_ad_boundaries", None) if post_row else None
        parsed = self._parse_refined_boundaries(refined)
        if not parsed:
            return
        for group in ad_groups:
            overlap_window = self._refined_overlap_window_for_group(group, parsed)
            if overlap_window is None:
                continue
            refined_start_min, refined_end_max = overlap_window
            # Only shrink the group toward the refined window; never expand it.
            new_start = max(group.start_time, refined_start_min)
            new_end = min(group.end_time, refined_end_max)
            if new_end > new_start:
                group.start_time = new_start
                group.end_time = new_end

    def _safe_get_post_row(self, post: Post) -> Optional[Post]:
        # Best-effort re-fetch; returns None rather than raising on session errors.
        try:
            return self.db_session.get(Post, post.id)
        except Exception:  # pylint: disable=broad-except
            return None

    @staticmethod
    def _parse_refined_boundaries(
        refined: Any,
    ) -> List[Tuple[float, float, float, float]]:
        # Parse a list of {orig_start, orig_end, refined_start, refined_end}
        # dicts into float tuples, skipping malformed or inverted entries.
        if not refined or not isinstance(refined, list):
            return []
        parsed: List[Tuple[float, float, float, float]] = []
        for item in refined:
            if not isinstance(item, dict):
                continue
            orig_start_raw = item.get("orig_start")
            orig_end_raw = item.get("orig_end")
            refined_start_raw = item.get("refined_start")
            refined_end_raw = item.get("refined_end")
            if (
                orig_start_raw is None
                or orig_end_raw is None
                or refined_start_raw is None
                or refined_end_raw is None
            ):
                continue
            try:
                orig_start = float(orig_start_raw)
                orig_end = float(orig_end_raw)
                refined_start = float(refined_start_raw)
                refined_end = float(refined_end_raw)
            except Exception:  # pylint: disable=broad-except
                continue
            if refined_end <= refined_start:
                continue
            parsed.append((orig_start, orig_end, refined_start, refined_end))
        return parsed

    @staticmethod
    def _refined_overlap_window_for_group(
        group: Any,
        parsed: List[Tuple[float, float, float, float]],
    ) -> Optional[Tuple[float, float]]:
        # Collect refined windows whose *original* span overlaps the group and
        # return their union bounds, or None if no window overlaps.
        overlaps: List[Tuple[float, float]] = []
        for orig_start, orig_end, refined_start, refined_end in parsed:
            overlap = max(
                0.0,
                min(group.end_time, orig_end) - max(group.start_time, orig_start),
            )
            if overlap > 0.0:
                overlaps.append((refined_start, refined_end))
        if not overlaps:
            return None
        refined_start_min = min(s for s, _ in overlaps)
        refined_end_max = max(e for _, e in overlaps)
        return refined_start_min, refined_end_max

    def merge_ad_segments(
        self,
        *,
        duration_ms: int,
        ad_segments: List[Tuple[float, float]],
        min_ad_segment_length_seconds: float,
        min_ad_segment_separation_seconds: float,
    ) -> List[Tuple[int, int]]:
        """
        Merges nearby ad segments and filters out segments that are too short.

        Args:
            duration_ms: Duration of the audio in milliseconds
            ad_segments: List of ad segments as (start, end) tuples in seconds
            min_ad_segment_length_seconds: Minimum length of an ad segment to retain
            min_ad_segment_separation_seconds: Minimum separation between segments before merging

        Returns:
            List of merged ad segments as (start, end) tuples in milliseconds
        """
        audio_duration_seconds = duration_ms / 1000.0
        self.logger.info(
            f"Creating new audio with ads segments removed between: {ad_segments}"
        )
        if not ad_segments:
            return []
        ad_segments = sorted(ad_segments)
        # Remember a segment that ends near the end of the audio so it survives
        # the min-length filter (it will be extended to the end later).
        last_segment = self._get_last_segment_if_near_end(
            ad_segments,
            audio_duration_seconds=audio_duration_seconds,
            min_separation=min_ad_segment_separation_seconds,
        )
        ad_segments = self._merge_close_segments(
            ad_segments, min_separation=min_ad_segment_separation_seconds
        )
        ad_segments = self._filter_short_segments(
            ad_segments, min_length=min_ad_segment_length_seconds
        )
        ad_segments = self._restore_last_segment_if_needed(ad_segments, last_segment)
        ad_segments = self._extend_last_segment_to_end_if_needed(
            ad_segments,
            audio_duration_seconds=audio_duration_seconds,
            min_separation=min_ad_segment_separation_seconds,
        )
        self.logger.info(f"Joined ad segments into: {ad_segments}")
        return [(int(start * 1000), int(end * 1000)) for start, end in ad_segments]

    def _get_last_segment_if_near_end(
        self,
        ad_segments: List[Tuple[float, float]],
        *,
        audio_duration_seconds: float,
        min_separation: float,
    ) -> Optional[Tuple[float, float]]:
        # Returns the final segment if it ends within min_separation of the audio end.
        if not ad_segments:
            return None
        if (audio_duration_seconds - ad_segments[-1][1]) < min_separation:
            return ad_segments[-1]
        return None

    def _merge_close_segments(
        self,
        ad_segments: List[Tuple[float, float]],
        *,
        min_separation: float,
    ) -> List[Tuple[float, float]]:
        # Collapse neighbours whose gap is below min_separation (in-place walk).
        merged = list(ad_segments)
        i = 0
        while i < len(merged) - 1:
            if merged[i][1] + min_separation >= merged[i + 1][0]:
                merged[i] = (merged[i][0], merged[i + 1][1])
                merged.pop(i + 1)
            else:
                i += 1
        return merged

    def _filter_short_segments(
        self,
        ad_segments: List[Tuple[float, float]],
        *,
        min_length: float,
    ) -> List[Tuple[float, float]]:
        return [s for s in ad_segments if (s[1] - s[0]) >= min_length]

    def _restore_last_segment_if_needed(
        self,
        ad_segments: List[Tuple[float, float]],
        last_segment: Optional[Tuple[float, float]],
    ) -> List[Tuple[float, float]]:
        # Re-append the near-end segment if filtering/merging dropped it.
        if last_segment is None:
            return ad_segments
        if not ad_segments or ad_segments[-1] != last_segment:
            return [*ad_segments, last_segment]
        return ad_segments

    def _extend_last_segment_to_end_if_needed(
        self,
        ad_segments: List[Tuple[float, float]],
        *,
        audio_duration_seconds: float,
        min_separation: float,
    ) -> List[Tuple[float, float]]:
        # Snap a segment that ends near the audio end all the way to the end.
        if not ad_segments:
            return ad_segments
        if (audio_duration_seconds - ad_segments[-1][1]) < min_separation:
            return [*ad_segments[:-1], (ad_segments[-1][0], audio_duration_seconds)]
        return ad_segments

    def process_audio(self, post: Post, output_path: str) -> None:
        """
        Process the podcast audio by removing ad segments.

        Args:
            post: The Post object containing the podcast to process
            output_path: Path where the processed audio file should be saved
        """
        ad_segments = self.get_ad_segments(post)
        duration_ms = get_audio_duration_ms(post.unprocessed_audio_path)
        if duration_ms is None:
            raise ValueError(
                f"Could not determine duration for audio: {post.unprocessed_audio_path}"
            )
        # Store duration in seconds
        post.duration = duration_ms / 1000.0
        # NOTE(review): this uses `min_ad_segement_separation_seconds` (sic)
        # while get_ad_segments uses `min_ad_segment_separation_seconds` —
        # verify which spelling the Config.output schema actually declares.
        merged_ad_segments = self.merge_ad_segments(
            duration_ms=duration_ms,
            ad_segments=ad_segments,
            min_ad_segment_length_seconds=float(
                self.config.output.min_ad_segment_length_seconds
            ),
            min_ad_segment_separation_seconds=float(
                self.config.output.min_ad_segement_separation_seconds
            ),
        )
        clip_segments_with_fade(
            in_path=post.unprocessed_audio_path,
            ad_segments_ms=merged_ad_segments,
            fade_ms=self.config.output.fade_ms,
            out_path=output_path,
        )
        post.processed_audio_path = output_path
        # Persist through the writer service; treat a failed write as fatal.
        result = writer_client.update(
            "Post",
            post.id,
            {"processed_audio_path": output_path, "duration": post.duration},
            wait=True,
        )
        if not result or not result.success:
            raise RuntimeError(getattr(result, "error", "Failed to update post"))
        # Expire the local copy so later reads reload the written values.
        try:
            self.db_session.expire(post)
        except Exception:  # pylint: disable=broad-except
            pass
        self.logger.info(
            f"Audio processing complete for post {post.id}, saved to {output_path}"
        )


================================================
FILE: src/podcast_processor/boundary_refiner.py
================================================

"""LLM-based boundary refiner.

Note: We intentionally share some call-setup patterns with WordBoundaryRefiner.
Pylint may flag these as R0801 (duplicate-code); we ignore that for this module.
""" # pylint: disable=duplicate-code import json import logging import re from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional import litellm from jinja2 import Template from app.writer.client import writer_client from shared.config import Config # Internal defaults for boundary expansion; not user-configurable. MAX_START_EXTENSION_SECONDS = 30.0 MAX_END_EXTENSION_SECONDS = 15.0 @dataclass class BoundaryRefinement: refined_start: float refined_end: float start_adjustment_reason: str end_adjustment_reason: str class BoundaryRefiner: def __init__(self, config: Config, logger: Optional[logging.Logger] = None): self.config = config self.logger = logger or logging.getLogger(__name__) self.template = self._load_template() def _load_template(self) -> Template: path = ( Path(__file__).resolve().parent.parent # project src root / "boundary_refinement_prompt.jinja" ) if path.exists(): return Template(path.read_text()) # Minimal fallback return Template( """Refine ad boundaries. 
Ad: {{ad_start}}s-{{ad_end}}s {% for seg in context_segments %}[{{seg.start_time}}] {{seg.text}} {% endfor %} Return JSON: {"refined_start": {{ad_start}}, "refined_end": {{ad_end}}, "start_reason": "", "end_reason": ""}""" ) def refine( self, ad_start: float, ad_end: float, confidence: float, all_segments: List[Dict[str, Any]], *, post_id: Optional[int] = None, first_seq_num: Optional[int] = None, last_seq_num: Optional[int] = None, ) -> BoundaryRefinement: """Refine ad boundaries using LLM analysis and record the call in ModelCall.""" self.logger.debug( "Refining boundaries", extra={ "ad_start": ad_start, "ad_end": ad_end, "confidence": confidence, "segments_count": len(all_segments), }, ) context = self._get_context(ad_start, ad_end, all_segments) self.logger.debug( "Context window selected", extra={ "context_size": len(context), "first_seg": context[0] if context else None, }, ) prompt = self.template.render( ad_start=ad_start, ad_end=ad_end, ad_confidence=confidence, context_segments=context, ) model_call_id: Optional[int] = None raw_response: Optional[str] = None # Record the intent to call the LLM when we have enough context to do so if ( post_id is not None and first_seq_num is not None and last_seq_num is not None ): try: res = writer_client.action( "upsert_model_call", { "post_id": post_id, "model_name": self.config.llm_model, "first_segment_sequence_num": first_seq_num, "last_segment_sequence_num": last_seq_num, "prompt": prompt, }, wait=True, ) if res and res.success: model_call_id = (res.data or {}).get("model_call_id") except Exception as e: # best-effort; do not block refinement self.logger.warning( "Boundary refine: failed to upsert ModelCall: %s", e ) try: response = litellm.completion( model=self.config.llm_model, messages=[{"role": "user", "content": prompt}], temperature=0.1, max_tokens=4096, timeout=self.config.openai_timeout, api_key=self.config.llm_api_key, base_url=self.config.openai_base_url, ) choice = response.choices[0] if 
response.choices else None content = "" if choice: # Prefer chat content; fall back to text for completion-style responses content = ( getattr(getattr(choice, "message", None), "content", None) or "" ) if not content: content = getattr(choice, "text", "") or "" raw_response = content self.logger.debug( "LLM response received", extra={ "model": self.config.llm_model, "content_preview": content[:200], }, ) # Full response for debugging parse issues; remove or redact if noisy. raw_preview = content[:1000] self.logger.debug( "LLM response raw (%s chars, preview up to 1000): %r", len(content), raw_preview, extra={"model": self.config.llm_model}, ) # Log the full response object so provider quirks are visible. try: response_payload = ( response.model_dump() if hasattr(response, "model_dump") else response ) self.logger.debug( "LLM full response object", extra={"response_payload": response_payload}, ) except Exception: self.logger.debug("LLM full response object unavailable", exc_info=True) # Persist the raw response immediately so it's available even if parsing fails. self._update_model_call( model_call_id, status="received_response", response=raw_response, error_message=None, ) # Parse JSON (strip markdown fences). Log parse diagnostics so failures are actionable. 
cleaned = re.sub(r"```json|```", "", content.strip()) json_candidates = re.findall(r"\{.*?\}", cleaned, re.DOTALL) parse_error: Optional[str] = None parsed: Optional[Dict[str, Any]] = None for candidate in json_candidates: try: parsed = json.loads(candidate) break except Exception as exc: # capture the last parse error for logging parse_error = str(exc) if parsed: refined = self._validate( ad_start, ad_end, BoundaryRefinement( refined_start=float(parsed["refined_start"]), refined_end=float(parsed["refined_end"]), start_adjustment_reason=parsed.get( "start_adjustment_reason", parsed.get("start_reason", "") ), end_adjustment_reason=parsed.get( "end_adjustment_reason", parsed.get("end_reason", "") ), ), ) self._update_model_call( model_call_id, status="success", response=raw_response, error_message=None, ) self.logger.info( "LLM refinement applied", extra={ "refined_start": refined.refined_start, "refined_end": refined.refined_end, }, ) return refined self.logger.warning( "Boundary refinement LLM response had no parseable JSON; falling back to heuristic", extra={ "model_call_id": model_call_id, "ad_start": ad_start, "ad_end": ad_end, "json_candidate_count": len(json_candidates), "parse_error": parse_error, "first_candidate_preview": ( json_candidates[0][:200] if json_candidates else None ), "content_preview": (content or "")[:200], "raw_response": raw_response, "raw_response_len": len(content), }, ) # Also emit the raw response in-band so it shows up in plain-text logs. 
self.logger.debug( "Boundary refinement raw response (len=%s): %r", len(content), raw_preview, extra={"model_call_id": model_call_id}, ) self._update_model_call( model_call_id, status="success_heuristic", response=raw_response, error_message=parse_error or "parse_failed", ) except Exception as e: self._update_model_call( model_call_id, status="failed_permanent", response=raw_response, error_message=str(e), ) self.logger.warning(f"LLM refinement failed: {e}, using heuristic") # Fallback: heuristic refinement return self._heuristic_refine(ad_start, ad_end, context) def _update_model_call( self, model_call_id: Optional[int], *, status: str, response: Optional[str], error_message: Optional[str], ) -> None: """Best-effort ModelCall updater; no-op if call creation failed.""" if model_call_id is None: return try: writer_client.update( "ModelCall", int(model_call_id), { "status": status, "response": response, "error_message": error_message, "retry_attempts": 1, }, wait=True, ) except Exception as exc: # best-effort; do not block refinement self.logger.warning( "Boundary refine: failed to update ModelCall %s: %s", model_call_id, exc, ) def _get_context( self, ad_start: float, ad_end: float, all_segments: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: """Get ±8 segments around ad""" ad_segs = [s for s in all_segments if ad_start <= s["start_time"] <= ad_end] if not ad_segs: return [] first_idx = all_segments.index(ad_segs[0]) last_idx = all_segments.index(ad_segs[-1]) start_idx = max(0, first_idx - 8) end_idx = min(len(all_segments), last_idx + 9) return all_segments[start_idx:end_idx] def _heuristic_refine( self, ad_start: float, ad_end: float, context: List[Dict[str, Any]] ) -> BoundaryRefinement: """Simple pattern-based refinement""" intro_patterns = ["brought to you", "sponsor", "let me tell you"] outro_patterns = [".com", "thanks to", "use code", "visit"] refined_start = ad_start refined_end = ad_end # Check before ad for intros for seg in context: if seg["start_time"] 
< ad_start: if any(p in seg["text"].lower() for p in intro_patterns): self.logger.debug( "Intro pattern matched", extra={ "matched_text": seg["text"], "start_time": seg["start_time"], }, ) refined_start = seg["start_time"] # Check after ad for outros for seg in context: if seg["start_time"] > ad_end: if any(p in seg["text"].lower() for p in outro_patterns): self.logger.debug( "Outro pattern matched", extra={ "matched_text": seg["text"], "start_time": seg["start_time"], }, ) refined_end = seg.get("end_time", seg["start_time"] + 5.0) result = BoundaryRefinement( refined_start, refined_end, "heuristic", "heuristic", ) self.logger.info( "Heuristic refinement applied", extra={ "refined_start": result.refined_start, "refined_end": result.refined_end, }, ) return result def _validate( self, orig_start: float, orig_end: float, refinement: BoundaryRefinement ) -> BoundaryRefinement: """Constrain refinement to reasonable bounds""" max_start_ext = MAX_START_EXTENSION_SECONDS max_end_ext = MAX_END_EXTENSION_SECONDS refinement.refined_start = max( refinement.refined_start, orig_start - max_start_ext ) refinement.refined_end = min(refinement.refined_end, orig_end + max_end_ext) if refinement.refined_start >= refinement.refined_end: refinement.refined_start = orig_start refinement.refined_end = orig_end self.logger.debug( "Refinement validated", extra={ "orig_start": orig_start, "orig_end": orig_end, "refined_start": refinement.refined_start, "refined_end": refinement.refined_end, }, ) return refinement ================================================ FILE: src/podcast_processor/cue_detector.py ================================================ import re from typing import Dict, List, Pattern, Tuple class CueDetector: def __init__(self) -> None: self.url_pattern: Pattern[str] = re.compile( r"\b([a-z0-9\-\.]+\.(?:com|net|org|io))\b", re.I ) self.promo_pattern: Pattern[str] = re.compile( r"\b(code|promo|save|discount)\s+\w+\b", re.I ) self.phone_pattern: Pattern[str] = re.compile( 
r"\b(?:\+?1[ -]?)?\d{3}[ -]?\d{3}[ -]?\d{4}\b" ) self.cta_pattern: Pattern[str] = re.compile( r"\b(visit|go to|check out|head over|sign up|start today|start now|use code|offer|deal|free trial)\b", re.I, ) self.transition_pattern: Pattern[str] = re.compile( r"\b(back to the show|after the break|stay tuned|we'll be right back|now back)\b", re.I, ) self.self_promo_pattern: Pattern[str] = re.compile( r"\b(my|our)\s+(book|course|newsletter|fund|patreon|substack|community|platform)\b", re.I, ) def has_cue(self, text: str) -> bool: return bool( self.url_pattern.search(text) or self.promo_pattern.search(text) or self.phone_pattern.search(text) or self.cta_pattern.search(text) ) def analyze(self, text: str) -> Dict[str, bool]: return { "url": bool(self.url_pattern.search(text)), "promo": bool(self.promo_pattern.search(text)), "phone": bool(self.phone_pattern.search(text)), "cta": bool(self.cta_pattern.search(text)), "transition": bool(self.transition_pattern.search(text)), "self_promo": bool(self.self_promo_pattern.search(text)), } def highlight_cues(self, text: str) -> str: """ Highlights detected cues in the text by wrapping them in *** ***. Useful for drawing attention to cues in LLM prompts. 
""" matches: List[Tuple[int, int]] = [] patterns = [ self.url_pattern, self.promo_pattern, self.phone_pattern, self.cta_pattern, self.transition_pattern, self.self_promo_pattern, ] for pattern in patterns: for match in pattern.finditer(text): matches.append(match.span()) if not matches: return text # Sort by start, then end (descending) to handle containment matches.sort(key=lambda x: (x[0], -x[1])) # Merge overlapping intervals merged: List[Tuple[int, int]] = [] if matches: curr_start, curr_end = matches[0] for next_start, next_end in matches[1:]: if next_start < curr_end: # Overlap curr_end = max(curr_end, next_end) else: merged.append((curr_start, curr_end)) curr_start, curr_end = next_start, next_end merged.append((curr_start, curr_end)) # Reconstruct string backwards to avoid index shifting result_parts = [] last_idx = len(text) for start, end in reversed(merged): result_parts.append(text[end:last_idx]) # Unchanged suffix result_parts.append(" ***") result_parts.append(text[start:end]) # The match result_parts.append("*** ") last_idx = start result_parts.append(text[:last_idx]) # Remaining prefix return "".join(reversed(result_parts)) ================================================ FILE: src/podcast_processor/llm_concurrency_limiter.py ================================================ """ LLM concurrency limiter to control the number of simultaneous LLM API calls. This module provides a semaphore-based concurrency control mechanism to prevent too many simultaneous LLM API calls, which can help avoid rate limiting and improve system stability. """ import logging import threading from typing import Any, Optional logger = logging.getLogger(__name__) class LLMConcurrencyLimiter: """Controls the number of concurrent LLM API calls using a semaphore.""" def __init__(self, max_concurrent_calls: int): """ Initialize the concurrency limiter. 
Args: max_concurrent_calls: Maximum number of simultaneous LLM API calls allowed """ if max_concurrent_calls <= 0: raise ValueError("max_concurrent_calls must be greater than 0") self.max_concurrent_calls = max_concurrent_calls self._semaphore = threading.Semaphore(max_concurrent_calls) logger.info( f"LLM concurrency limiter initialized with {max_concurrent_calls} max concurrent calls" ) def acquire(self, timeout: Optional[float] = None) -> bool: """ Acquire a slot for making an LLM API call. Note: Consider using ConcurrencyContext for automatic resource management. Args: timeout: Maximum time to wait for a slot in seconds. None means wait indefinitely. Returns: True if a slot was acquired, False if timeout occurred """ # Disable specific pylint warning for this line as manual semaphore control is needed acquired = self._semaphore.acquire( # pylint: disable=consider-using-with timeout=timeout ) if acquired: logger.debug("Acquired LLM concurrency slot") else: logger.warning( f"Failed to acquire LLM concurrency slot within {timeout}s timeout" ) return acquired def release(self) -> None: """ Release a slot after completing an LLM API call. Note: Consider using ConcurrencyContext for automatic resource management. 
""" self._semaphore.release() logger.debug("Released LLM concurrency slot") def get_available_slots(self) -> int: """Get the number of currently available slots.""" return self._semaphore._value def get_active_calls(self) -> int: """Get the number of currently active LLM calls.""" return self.max_concurrent_calls - self._semaphore._value # Global concurrency limiter instance _CONCURRENCY_LIMITER: Optional[LLMConcurrencyLimiter] = None def get_concurrency_limiter(max_concurrent_calls: int = 3) -> LLMConcurrencyLimiter: """Get or create the global concurrency limiter instance.""" global _CONCURRENCY_LIMITER # pylint: disable=global-statement if ( _CONCURRENCY_LIMITER is None or _CONCURRENCY_LIMITER.max_concurrent_calls != max_concurrent_calls ): _CONCURRENCY_LIMITER = LLMConcurrencyLimiter(max_concurrent_calls) return _CONCURRENCY_LIMITER class ConcurrencyContext: """Context manager for controlling LLM API call concurrency.""" def __init__(self, limiter: LLMConcurrencyLimiter, timeout: Optional[float] = None): """ Initialize the context manager. Args: limiter: The concurrency limiter to use timeout: Maximum time to wait for a slot """ self.limiter = limiter self.timeout = timeout self.acquired = False def __enter__(self) -> "ConcurrencyContext": """Acquire a concurrency slot.""" self.acquired = self.limiter.acquire(timeout=self.timeout) if not self.acquired: raise RuntimeError( f"Could not acquire LLM concurrency slot within {self.timeout}s" ) return self def __exit__( self, exc_type: Optional[type], exc_val: Optional[BaseException], exc_tb: Optional[Any], ) -> None: """Release the concurrency slot.""" if self.acquired: self.limiter.release() ================================================ FILE: src/podcast_processor/llm_error_classifier.py ================================================ """ Enhanced error classification for LLM API calls. Provides more robust and extensible error handling beyond simple string matching. 
""" import re from typing import Union from litellm.exceptions import InternalServerError class LLMErrorClassifier: """Classifies LLM API errors into retryable and non-retryable categories.""" # Rate limiting error patterns RATE_LIMIT_PATTERNS = [ re.compile(r"rate.?limit", re.IGNORECASE), re.compile(r"too many requests", re.IGNORECASE), re.compile(r"quota.?exceeded", re.IGNORECASE), re.compile(r"429", re.IGNORECASE), # HTTP 429 status ] # Timeout error patterns TIMEOUT_PATTERNS = [ re.compile(r"timeout", re.IGNORECASE), re.compile(r"timed.?out", re.IGNORECASE), re.compile(r"408", re.IGNORECASE), # HTTP 408 status re.compile(r"504", re.IGNORECASE), # HTTP 504 status ] # Server error patterns (retryable) SERVER_ERROR_PATTERNS = [ re.compile(r"internal.?server.?error", re.IGNORECASE), re.compile(r"502", re.IGNORECASE), # Bad Gateway re.compile(r"503", re.IGNORECASE), # Service Unavailable re.compile(r"500", re.IGNORECASE), # Internal Server Error ] # Non-retryable error patterns NON_RETRYABLE_PATTERNS = [ re.compile(r"authentication", re.IGNORECASE), re.compile(r"authorization", re.IGNORECASE), re.compile(r"invalid.?api.?key", re.IGNORECASE), re.compile(r"401", re.IGNORECASE), # Unauthorized re.compile(r"403", re.IGNORECASE), # Forbidden re.compile(r"400", re.IGNORECASE), # Bad Request re.compile(r"invalid.?parameter", re.IGNORECASE), ] @classmethod def is_retryable_error(cls, error: Union[Exception, str]) -> bool: """ Determine if an error should be retried. 
Args: error: Exception instance or error string Returns: True if the error should be retried, False otherwise """ # Handle specific exception types if isinstance(error, InternalServerError): return True # Convert to string for pattern matching error_str = str(error) # Check for non-retryable errors first (higher priority) if cls._matches_patterns(error_str, cls.NON_RETRYABLE_PATTERNS): return False # Check for retryable error patterns retryable_patterns = ( cls.RATE_LIMIT_PATTERNS + cls.TIMEOUT_PATTERNS + cls.SERVER_ERROR_PATTERNS ) return cls._matches_patterns(error_str, retryable_patterns) @classmethod def get_error_category(cls, error: Union[Exception, str]) -> str: """ Categorize the error type for better handling. Returns: One of: 'rate_limit', 'timeout', 'server_error', 'auth_error', 'client_error', 'unknown' """ error_str = str(error) if cls._matches_patterns(error_str, cls.RATE_LIMIT_PATTERNS): return "rate_limit" if cls._matches_patterns(error_str, cls.TIMEOUT_PATTERNS): return "timeout" if cls._matches_patterns(error_str, cls.SERVER_ERROR_PATTERNS): return "server_error" if cls._matches_patterns(error_str, cls.NON_RETRYABLE_PATTERNS): if any( pattern.search(error_str) for pattern in [ re.compile(r"authentication", re.IGNORECASE), re.compile(r"authorization", re.IGNORECASE), re.compile(r"401", re.IGNORECASE), re.compile(r"403", re.IGNORECASE), ] ): return "auth_error" return "client_error" return "unknown" @classmethod def get_suggested_backoff(cls, error: Union[Exception, str], attempt: int) -> float: """ Get suggested backoff time based on error type and attempt number. 
Args: error: The error that occurred attempt: Current attempt number (0-based) Returns: Suggested backoff time in seconds """ category = cls.get_error_category(error) base_backoff = float(2**attempt) # Exponential backoff # Adjust based on error type if category == "rate_limit": return base_backoff * 2.0 # Longer backoff for rate limits if category == "timeout": return base_backoff * 1.5 # Moderate backoff for timeouts if category == "server_error": return base_backoff # Standard backoff for server errors return base_backoff @staticmethod def _matches_patterns(text: str, patterns: list[re.Pattern[str]]) -> bool: """Check if text matches any of the provided regex patterns.""" return any(pattern.search(text) for pattern in patterns) ================================================ FILE: src/podcast_processor/llm_model_call_utils.py ================================================ from __future__ import annotations import logging from typing import Any, Optional from app.writer.client import writer_client def render_prompt_and_upsert_model_call( *, template: Any, ad_start: float, ad_end: float, confidence: float, context_segments: Any, post_id: Optional[int], first_seq_num: Optional[int], last_seq_num: Optional[int], model_name: str, logger: logging.Logger, log_prefix: str, ) -> tuple[str, Optional[int]]: prompt = template.render( ad_start=ad_start, ad_end=ad_end, ad_confidence=confidence, context_segments=context_segments, ) model_call_id = try_upsert_model_call( post_id=post_id, first_seq_num=first_seq_num, last_seq_num=last_seq_num, model_name=model_name, prompt=prompt, logger=logger, log_prefix=log_prefix, ) return prompt, model_call_id def try_upsert_model_call( *, post_id: Optional[int], first_seq_num: Optional[int], last_seq_num: Optional[int], model_name: str, prompt: str, logger: logging.Logger, log_prefix: str, ) -> Optional[int]: """Best-effort ModelCall creation. Returns model_call_id if successfully created/upserted, else None. 
""" if post_id is None or first_seq_num is None or last_seq_num is None: return None try: res = writer_client.action( "upsert_model_call", { "post_id": post_id, "model_name": model_name, "first_segment_sequence_num": first_seq_num, "last_segment_sequence_num": last_seq_num, "prompt": prompt, }, wait=True, ) if res and res.success: return (res.data or {}).get("model_call_id") except Exception as exc: # best-effort logger.warning("%s: failed to upsert ModelCall: %s", log_prefix, exc) return None def try_update_model_call( model_call_id: Optional[int], *, status: str, response: Optional[str], error_message: Optional[str], logger: logging.Logger, log_prefix: str, ) -> None: """Best-effort ModelCall updater; no-op if call creation failed.""" if model_call_id is None: return try: writer_client.update( "ModelCall", int(model_call_id), { "status": status, "response": response, "error_message": error_message, "retry_attempts": 1, }, wait=True, ) except Exception as exc: # best-effort logger.warning( "%s: failed to update ModelCall %s: %s", log_prefix, model_call_id, exc, ) def extract_litellm_content(response: Any) -> str: """Extracts the primary text content from a litellm completion response.""" choices = getattr(response, "choices", None) or [] choice = choices[0] if choices else None if not choice: return "" # Prefer chat content; fall back to text for completion-style responses content = getattr(getattr(choice, "message", None), "content", None) or "" if not content: content = getattr(choice, "text", "") or "" return str(content) ================================================ FILE: src/podcast_processor/model_output.py ================================================ import logging import re from typing import List, Literal, Optional from pydantic import BaseModel logger = logging.getLogger(__name__) class AdSegmentPrediction(BaseModel): segment_offset: float confidence: float class AdSegmentPredictionList(BaseModel): ad_segments: List[AdSegmentPrediction] 
content_type: Optional[ Literal[ "technical_discussion", "educational/self_promo", "promotional_external", "transition", ] ] = None confidence: Optional[float] = None def _attempt_json_repair(json_str: str) -> str: """ Attempt to repair truncated JSON by adding missing closing brackets. This handles cases where the LLM response was cut off mid-JSON, e.g., '{"ad_segments":[{"segment_offset":10.5,"confidence":0.92}' """ # Count opening and closing brackets/braces open_braces = json_str.count("{") close_braces = json_str.count("}") open_brackets = json_str.count("[") close_brackets = json_str.count("]") # If brackets are balanced, no repair needed if open_braces == close_braces and open_brackets == close_brackets: return json_str logger.warning( f"Detected unbalanced JSON: {open_braces} '{{' vs {close_braces} '}}', " f"{open_brackets} '[' vs {close_brackets} ']'. Attempting repair." ) # Remove any trailing incomplete key-value pair # e.g., '..."confidence":0.9' or '..."key":"val' or '..."key":' # First, try to find the last complete value repaired = json_str.rstrip() # If ends with a comma, remove it (incomplete next element) repaired = repaired.rstrip(",") # If ends with a colon or incomplete string, try to truncate to last complete element # Pattern: ends with "key": or "key":"incomplete or similar incomplete_patterns = [ r',"[^"]*":\s*$', # ,"key": r',"[^"]*":\s*"[^"]*$', # ,"key":"incomplete ] for pattern in incomplete_patterns: match = re.search(pattern, repaired) if match: repaired = repaired[: match.start()] logger.debug(f"Removed incomplete trailing content: {match.group()}") break # Recount after cleanup open_braces = repaired.count("{") close_braces = repaired.count("}") open_brackets = repaired.count("[") close_brackets = repaired.count("]") # Add missing closing brackets/braces in the right order # We need to determine the order based on the structure # Typically for our schema it's: ]} to close ad_segments array and outer object missing_brackets = 
close_brackets - open_brackets # negative means we need more ] missing_braces = close_braces - open_braces # negative means we need more } if missing_brackets < 0: repaired += "]" * abs(missing_brackets) if missing_braces < 0: repaired += "}" * abs(missing_braces) logger.info("Repaired JSON by adding missing closing brackets/braces") return repaired def clean_and_parse_model_output(model_output: str) -> AdSegmentPredictionList: start_marker, end_marker = "{", "}" assert ( model_output.count(start_marker) >= 1 ), f"No opening brace found in: {model_output[:200]}" start_idx = model_output.index(start_marker) model_output = model_output[start_idx:] # If we have at least as many closing braces as opening braces, trim to the last # closing brace to drop any trailing non-JSON content. Otherwise, keep the # content as-is so we can attempt repair on truncated JSON. open_braces = model_output.count(start_marker) close_braces = model_output.count(end_marker) if close_braces >= open_braces and close_braces > 0: model_output = model_output[: 1 + model_output.rindex(end_marker)] model_output = model_output.replace("'", '"') model_output = model_output.replace("\n", "") model_output = model_output.strip() # First attempt: try to parse as-is try: return AdSegmentPredictionList.parse_raw(model_output) except Exception as first_error: logger.debug(f"Initial parse failed: {first_error}") # Second attempt: try to repair truncated JSON try: repaired_output = _attempt_json_repair(model_output) result = AdSegmentPredictionList.parse_raw(repaired_output) logger.info("Successfully parsed model output after JSON repair") return result except Exception as repair_error: logger.error( f"JSON repair also failed. 
Original output (first 500 chars): {model_output[:500]}" ) # Re-raise the original error with more context raise first_error from repair_error ================================================ FILE: src/podcast_processor/podcast_downloader.py ================================================ from __future__ import annotations import logging import os import re from pathlib import Path from typing import Any, Iterator, Optional, Set import requests import validators from flask import abort from shared.interfaces import Post from shared.processing_paths import get_in_root logger = logging.getLogger(__name__) DOWNLOAD_DIR = str(get_in_root()) class PodcastDownloader: """ Handles downloading podcast episodes with robust file checking and path management. """ def __init__( self, download_dir: str = DOWNLOAD_DIR, logger: Optional[logging.Logger] = None ): self.download_dir = download_dir self.logger = logger or logging.getLogger(__name__) def download_episode(self, post: Post, dest_path: str) -> Optional[str]: """ Download a podcast episode if it doesn't already exist. Args: post: The Post object containing the podcast episode to download Returns: Path to the downloaded file, or None if download failed """ # Destination is required; ensure parent directory exists download_path = dest_path Path(download_path).parent.mkdir(parents=True, exist_ok=True) if not download_path: self.logger.error(f"Invalid download path for post {post.id}") return None # First, check if the file truly exists and has nonzero size. 
try: if os.path.isfile(download_path) and os.path.getsize(download_path) > 0: self.logger.info("Episode already downloaded.") return download_path self.logger.info("File is zero bytes, re-downloading.") # else except FileNotFoundError: # Covers both "file actually missing" and "broken symlink" pass # If we get here, the file is missing or zero bytes -> perform download audio_link = post.download_url if audio_link is None or not validators.url(audio_link): abort(404) return None self.logger.info(f"Downloading {audio_link} into {download_path}...") referer = "https://open.acast.com/" if "acast.com" in audio_link else None headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", "Referer": referer, } with requests.get( audio_link, stream=True, timeout=60, headers=headers ) as response: if response.status_code == 200: with open(download_path, "wb") as file: for chunk in response.iter_content(chunk_size=8192): file.write(chunk) self.logger.info("Download complete.") else: self.logger.info( f"Failed to download the podcast episode, response: {response.status_code}" ) return None return download_path def get_and_make_download_path(self, post_title: str) -> Path: """ Generate the download path for a post and create necessary directories. 
Args: post_title: The title of the post to generate a path for Returns: Path object for the download location """ sanitized_title = sanitize_title(post_title) post_directory = sanitized_title post_filename = sanitized_title + ".mp3" post_directory_path = Path(self.download_dir) / post_directory post_directory_path.mkdir(parents=True, exist_ok=True) return post_directory_path / post_filename def sanitize_title(title: str) -> str: """Sanitize a title for use in file paths.""" return re.sub(r"[^a-zA-Z0-9\s]", "", title) def find_audio_link(entry: Any) -> str: """Find the audio link in a feed entry.""" audio_mime_types: Set[str] = { "audio/mpeg", "audio/mp3", "audio/x-mp3", "audio/mpeg3", "audio/mp4", "audio/m4a", "audio/x-m4a", "audio/aac", "audio/wav", "audio/x-wav", "audio/ogg", "audio/opus", "audio/flac", } for url in _iter_enclosure_audio_urls(entry, audio_mime_types): return url for url in _iter_link_audio_urls(entry, audio_mime_types, match_any_audio=False): return url for url in _iter_link_audio_urls(entry, audio_mime_types, match_any_audio=True): return url return str(getattr(entry, "id", "")) def _iter_enclosure_audio_urls(entry: Any, audio_mime_types: Set[str]) -> Iterator[str]: enclosures = getattr(entry, "enclosures", None) or [] for enclosure in enclosures: enc_type = (getattr(enclosure, "type", "") or "").lower() if enc_type not in audio_mime_types: continue href = getattr(enclosure, "href", None) if href: yield str(href) url = getattr(enclosure, "url", None) if url: yield str(url) def _iter_link_audio_urls( entry: Any, audio_mime_types: Set[str], *, match_any_audio: bool, ) -> Iterator[str]: links = getattr(entry, "links", None) or [] for link in links: link_type = (getattr(link, "type", "") or "").lower() if match_any_audio: if not link_type.startswith("audio/"): continue else: if link_type not in audio_mime_types: continue href = getattr(link, "href", None) if href: yield str(href) # Backward compatibility - create a default instance 
_default_downloader = PodcastDownloader()


def download_episode(post: Post, dest_path: str) -> Optional[str]:
    """Module-level wrapper delegating to the default PodcastDownloader."""
    return _default_downloader.download_episode(post, dest_path)


def get_and_make_download_path(post_title: str) -> Path:
    """Module-level wrapper delegating to the default PodcastDownloader."""
    return _default_downloader.get_and_make_download_path(post_title)


================================================
FILE: src/podcast_processor/podcast_processor.py
================================================
import logging
import os
import shutil
import threading
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional

import litellm
from jinja2 import Template
from sqlalchemy.orm import object_session

from app.extensions import db
from app.models import Post, ProcessingJob, TranscriptSegment
from app.writer.client import writer_client
from podcast_processor.ad_classifier import AdClassifier
from podcast_processor.audio_processor import AudioProcessor
from podcast_processor.podcast_downloader import PodcastDownloader, sanitize_title
from podcast_processor.processing_status_manager import ProcessingStatusManager
from podcast_processor.prompt import (
    DEFAULT_SYSTEM_PROMPT_PATH,
    DEFAULT_USER_PROMPT_TEMPLATE_PATH,
)
from podcast_processor.transcription_manager import TranscriptionManager
from shared.config import Config
from shared.processing_paths import (
    ProcessingPaths,
    get_job_unprocessed_path,
    get_srv_root,
    paths_from_unprocessed_path,
)

logger = logging.getLogger("global_logger")


def get_post_processed_audio_path(post: Post) -> Optional[ProcessingPaths]:
    """
    Generate the processed audio path based on the post's unprocessed audio path.
    Returns None if unprocessed_audio_path is not set.
    """
    unprocessed_path = post.unprocessed_audio_path
    if not unprocessed_path or not isinstance(unprocessed_path, str):
        logger.warning(f"Post {post.id} has no unprocessed_audio_path.")
        return None

    title = post.feed.title
    if not title or not isinstance(title, str):
        logger.warning(f"Post {post.id} has no feed title.")
        return None

    return paths_from_unprocessed_path(unprocessed_path, title)


def get_post_processed_audio_path_cached(
    post: Post, feed_title: str
) -> Optional[ProcessingPaths]:
    """
    Generate the processed audio path using cached feed title to avoid ORM access.
    Returns None if unprocessed_audio_path is not set.
    """
    unprocessed_path = post.unprocessed_audio_path
    if not unprocessed_path or not isinstance(unprocessed_path, str):
        logger.warning(f"Post {post.id} has no unprocessed_audio_path.")
        return None

    if not feed_title or not isinstance(feed_title, str):
        logger.warning(f"Post {post.id} has no feed title.")
        return None

    return paths_from_unprocessed_path(unprocessed_path, feed_title)


class PodcastProcessor:
    """
    Main coordinator for podcast processing workflow.
    Delegates to specialized components for transcription, ad classification,
    and audio processing.
    """

    # lock_lock guards the locks dict itself; per-post locks serialize
    # processing of a single episode.
    lock_lock = threading.Lock()
    locks: Dict[str, threading.Lock] = {}  # Now keyed by post GUID instead of file path

    def __init__(
        self,
        config: Config,
        logger: Optional[logging.Logger] = None,
        transcription_manager: Optional[TranscriptionManager] = None,
        ad_classifier: Optional[AdClassifier] = None,
        audio_processor: Optional[AudioProcessor] = None,
        status_manager: Optional[ProcessingStatusManager] = None,
        db_session: Optional[Any] = None,
        downloader: Optional[PodcastDownloader] = None,
    ) -> None:
        """Wire up collaborators; any omitted dependency gets a default impl."""
        super().__init__()
        self.logger = logger or logging.getLogger("global_logger")
        self.output_dir = str(get_srv_root())
        self.config: Config = config
        self.db_session = db_session or db.session

        # Initialize downloader
        self.downloader = downloader or PodcastDownloader(logger=self.logger)

        # Initialize status manager
        self.status_manager = status_manager or ProcessingStatusManager(
            self.db_session, self.logger
        )

        # NOTE(review): mutates litellm module-level globals — affects all
        # litellm users in this process, not just this instance.
        litellm.api_base = self.config.openai_base_url
        litellm.api_key = self.config.llm_api_key

        # Initialize components with default implementations if not provided
        if transcription_manager is None:
            self.transcription_manager = TranscriptionManager(self.logger, config)
        else:
            self.transcription_manager = transcription_manager

        if ad_classifier is None:
            self.ad_classifier = AdClassifier(config)
        else:
            self.ad_classifier = ad_classifier

        if audio_processor is None:
            self.audio_processor = AudioProcessor(config=config, logger=self.logger)
        else:
            self.audio_processor = audio_processor

    # pylint: disable=too-many-branches, too-many-statements
    def process(
        self,
        post: Post,
        job_id: str,
        cancel_callback: Optional[Callable[[], bool]] = None,
    ) -> str:
        """
        Process a podcast by downloading, transcribing, identifying ads, and removing ad segments.
        Updates the existing job record for tracking progress.

        Args:
            post: The Post object containing the podcast to process
            job_id: Job ID of the existing job to update (required)
            cancel_callback: Optional callback to check for cancellation

        Returns:
            Path to the processed audio file
        """
        job = self.db_session.get(ProcessingJob, job_id)
        if not job:
            raise ProcessorException(f"Job with ID {job_id} not found")

        # Cache job and post attributes early to avoid ORM access after expire_all()
        # This includes relationship access like post.feed.title
        cached_post_guid = post.guid
        cached_post_title = post.title
        cached_feed_title = post.feed.title
        cached_job_id = job.id
        cached_current_step = job.current_step

        try:
            self.logger.debug(
                "processor.process enter: job_id=%s post_guid=%s job_bound=%s",
                job_id,
                getattr(post, "guid", None),
                object_session(job) is not None,
            )
            # Update job to running status
            self.status_manager.update_job_status(
                job, "running", 0, "Starting processing"
            )

            # Validate post
            if not post.whitelisted:
                raise ProcessorException(
                    f"Post with GUID {cached_post_guid} not whitelisted"
                )

            # Check if processed audio already exists (database or disk)
            if self._check_existing_processed_audio(post):
                self.status_manager.update_job_status(
                    job, "completed", 4, "Processing complete", 100.0
                )
                return str(post.processed_audio_path)

            simulated_path = self._simulate_developer_processing(
                post,
                job,
                cached_post_guid,
                cached_post_title,
                cached_feed_title,
                cached_job_id,
            )
            if simulated_path:
                return simulated_path

            # Step 1: Download (if needed)
            self._handle_download_step(
                post, job, cached_post_guid, cached_post_title, cached_job_id
            )
            self._raise_if_cancelled(job, 1, cancel_callback)

            # Get processing paths and acquire lock
            processed_audio_path = self._acquire_processing_lock(
                post, job, cached_post_guid, cached_job_id, cached_feed_title
            )

            try:
                if os.path.exists(processed_audio_path):
                    self.logger.info(f"Audio already processed: {post}")
                    # Update the database with the processed audio path
                    self._remove_unprocessed_audio(post)
                    result = writer_client.update(
                        "Post",
                        post.id,
                        {
                            "processed_audio_path": processed_audio_path,
                            "unprocessed_audio_path": None,
                        },
                        wait=True,
                    )
                    if not result or not result.success:
                        raise RuntimeError(
                            getattr(result, "error", "Failed to update post")
                        )
                    self.status_manager.update_job_status(
                        job, "completed", 4, "Processing complete", 100.0
                    )
                    return processed_audio_path

                # Perform the main processing steps
                self._perform_processing_steps(
                    post, job, processed_audio_path, cancel_callback
                )

                self.logger.info(f"Processing podcast: {post} complete")
                return processed_audio_path
            finally:
                # Release lock using cached GUID without touching ORM state after potential rollback
                try:
                    if cached_post_guid is not None:
                        lock = PodcastProcessor.locks.get(cached_post_guid)
                        if lock is not None and lock.locked():
                            lock.release()
                except Exception:
                    # Best-effort lock release; avoid masking original exceptions
                    pass
        except ProcessorException as e:
            error_msg = str(e)
            if "Processing job in progress" in error_msg:
                self.status_manager.update_job_status(
                    job,
                    "failed",
                    cached_current_step,
                    "Another processing job is already running for this episode",
                )
            else:
                self.status_manager.update_job_status(
                    job, "failed", cached_current_step, error_msg
                )
            raise
        except Exception as e:
            self.logger.error(
                "processor.process unexpected error: job_id=%s %s",
                job_id,
                e,
                exc_info=True,
            )
            self.status_manager.update_job_status(
                job, "failed", cached_current_step, f"Unexpected error: {str(e)}"
            )
            raise

    def _acquire_processing_lock(
        self,
        post: Post,
        job: ProcessingJob,
        post_guid: str,
        job_id: str,
        feed_title: str,
    ) -> str:
        """
        Acquire processing lock for the post and return the processed audio path.
        Lock is now based on post GUID for better granularity and reliability.

        Args:
            post: The Post object to process
            job: The ProcessingJob for tracking
            post_guid: Cached post GUID to avoid ORM access
            job_id: Cached job ID to avoid ORM access
            feed_title: Cached feed title to avoid ORM access

        Returns:
            Path to the processed audio file

        Raises:
            ProcessorException: If lock cannot be acquired or paths are invalid
        """
        # Get processing paths
        working_paths = get_post_processed_audio_path_cached(post, feed_title)
        if working_paths is None:
            raise ProcessorException("Processed audio path not found")

        processed_audio_path = str(working_paths.post_processed_audio_path)

        # Use post GUID as lock key instead of file path for better granularity
        lock_key = post_guid

        # Acquire lock (this is where we cancel existing jobs if we can get the lock)
        locked = False
        with PodcastProcessor.lock_lock:
            if lock_key not in PodcastProcessor.locks:
                # First processor for this GUID: create and immediately hold the lock.
                PodcastProcessor.locks[lock_key] = threading.Lock()
                PodcastProcessor.locks[lock_key].acquire(blocking=False)
                locked = True

        if not locked and not PodcastProcessor.locks[lock_key].acquire(blocking=False):
            raise ProcessorException("Processing job in progress")

        # Cancel existing jobs since we got the lock
        self.status_manager.cancel_existing_jobs(post_guid, job_id)

        self.make_dirs(working_paths)
        return processed_audio_path

    def _perform_processing_steps(
        self,
        post: Post,
        job: ProcessingJob,
        processed_audio_path: str,
        cancel_callback: Optional[Callable[[], bool]] = None,
    ) -> None:
        """
        Perform the main processing steps: transcription, ad classification,
        and audio processing.

        Args:
            post: The Post object to process
            job: The ProcessingJob for tracking
            processed_audio_path: Path where the processed audio will be saved
        """
        # Step 2: Transcribe audio
        self.status_manager.update_job_status(
            job, "running", 2, "Transcribing audio", 50.0
        )
        transcript_segments = self.transcription_manager.transcribe(post)
        self._raise_if_cancelled(job, 2, cancel_callback)

        # Step 3: Classify ad segments
        self._classify_ad_segments(post, job, transcript_segments)
        self._raise_if_cancelled(job, 3, cancel_callback)

        # Step 4: Process audio (remove ad segments)
        self.status_manager.update_job_status(
            job, "running", 4, "Processing audio", 90.0
        )
        self.audio_processor.process_audio(post, processed_audio_path)

        # Update the database with the processed audio path
        self._remove_unprocessed_audio(post)
        result = writer_client.update(
            "Post",
            post.id,
            {
                "processed_audio_path": processed_audio_path,
                "unprocessed_audio_path": None,
            },
            wait=True,
        )
        if not result or not result.success:
            raise RuntimeError(getattr(result, "error", "Failed to update post"))

        # Mark job complete
        self.status_manager.update_job_status(
            job, "completed", 4, "Processing complete", 100.0
        )

    def _raise_if_cancelled(
        self,
        job: ProcessingJob,
        current_step: int,
        cancel_callback: Optional[Callable[[], bool]],
    ) -> None:
        """Helper to centralize cancellation checking and update job state."""
        if cancel_callback and cancel_callback():
            self.status_manager.update_job_status(
                job, "cancelled", current_step, "Cancellation requested"
            )
            raise ProcessorException("Cancelled")

    def _classify_ad_segments(
        self,
        post: Post,
        job: ProcessingJob,
        transcript_segments: List[TranscriptSegment],
    ) -> None:
        """
        Classify ad segments in the transcript.

        Args:
            post: The Post object being processed
            job: The ProcessingJob for tracking
            transcript_segments: The transcript segments to classify
        """
        self.status_manager.update_job_status(
            job, "running", 3, "Identifying ads", 75.0
        )
        user_prompt_template = self.get_user_prompt_template(
            DEFAULT_USER_PROMPT_TEMPLATE_PATH
        )
        system_prompt = self.get_system_prompt(DEFAULT_SYSTEM_PROMPT_PATH)

        self.ad_classifier.classify(
            transcript_segments=transcript_segments,
            system_prompt=system_prompt,
            user_prompt_template=user_prompt_template,
            post=post,
        )

    def _simulate_developer_processing(
        self,
        post: Post,
        job: ProcessingJob,
        post_guid: str,
        post_title: str,
        feed_title: str,
        job_id: str,
    ) -> Optional[str]:
        """Short-circuit processing for developer-mode test feeds.

        When developer mode is enabled and a post comes from a synthetic test
        feed (download_url contains "test-feed"), skip the full pipeline and
        copy a tiny bundled MP3 into the expected processed/unprocessed
        locations. This keeps the UI happy without relying on external
        downloads or LLM calls.
        """
        download_url = (post.download_url or "").lower()
        is_test_feed = "test-feed" in download_url or post_guid.startswith("test-guid")
        # NOTE(review): the docstring says developer mode AND test feed, but the
        # condition below short-circuits for EITHER — confirm intended behavior.
        if not (self.config.developer_mode or is_test_feed):
            return None

        sample_audio = (
            Path(__file__).resolve().parent.parent / "tests" / "data" / "count_0_99.mp3"
        )
        if not sample_audio.exists():
            self.status_manager.update_job_status(
                job,
                "failed",
                job.current_step or 0,
                "Developer sample audio missing",
            )
            raise ProcessorException("Developer sample audio missing")

        self.status_manager.update_job_status(
            job,
            "running",
            1,
            "Simulating processing (developer mode)",
            25.0,
        )

        unprocessed_path = get_job_unprocessed_path(post_guid, job_id, post_title)
        unprocessed_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(sample_audio, unprocessed_path)

        processed_path = (
            get_srv_root()
            / sanitize_title(feed_title)
            / f"{sanitize_title(post_title)}.mp3"
        )
        processed_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(sample_audio, processed_path)

        result = writer_client.update(
            "Post",
            post.id,
            {
                "unprocessed_audio_path": str(unprocessed_path),
                "processed_audio_path": str(processed_path),
            },
            wait=True,
        )
        if not result or not result.success:
            raise RuntimeError(getattr(result, "error", "Failed to update post"))

        self.status_manager.update_job_status(
            job,
            "completed",
            4,
            "Processing complete (developer mode)",
            100.0,
        )
        return str(processed_path)

    def _handle_download_step(
        self,
        post: Post,
        job: ProcessingJob,
        post_guid: str,
        post_title: str,
        job_id: str,
    ) -> None:
        """
        Handle the download step with progress tracking and robust file checking.
        This method checks for existing files on disk before downloading.

        Args:
            post: The Post object being processed
            job: The ProcessingJob for tracking
            post_guid: Cached post GUID to avoid ORM access
            post_title: Cached post title to avoid ORM access
            job_id: Cached job ID to avoid ORM access
        """
        # If we have a path in the database, check if the file actually exists
        if post.unprocessed_audio_path is not None:
            if (
                os.path.exists(post.unprocessed_audio_path)
                and os.path.getsize(post.unprocessed_audio_path) > 0
            ):
                self.logger.debug(
                    f"Unprocessed audio already available at: {post.unprocessed_audio_path}"
                )
                return
            self.logger.info(
                f"Database path {post.unprocessed_audio_path} doesn't exist or is empty, resetting"
            )
            result = writer_client.update(
                "Post", post.id, {"unprocessed_audio_path": None}, wait=True
            )
            if not result or not result.success:
                raise RuntimeError(getattr(result, "error", "Failed to update post"))

        # Compute a unique per-job expected path
        expected_unprocessed_path = get_job_unprocessed_path(
            post_guid, job_id, post_title
        )
        if (
            expected_unprocessed_path.exists()
            and expected_unprocessed_path.stat().st_size > 0
        ):
            # Found a local unprocessed file
            unprocessed_path_str = str(expected_unprocessed_path.resolve())
            self.logger.info(
                f"Found existing unprocessed audio for post '{post_title}' at '{unprocessed_path_str}'. "
                "Updated the database path."
            )
            result = writer_client.update(
                "Post",
                post.id,
                {"unprocessed_audio_path": unprocessed_path_str},
                wait=True,
            )
            if not result or not result.success:
                raise RuntimeError(getattr(result, "error", "Failed to update post"))
            return

        # Need to download the file
        self.status_manager.update_job_status(
            job, "running", 1, "Downloading episode", 25.0
        )
        self.logger.info(f"Downloading post: {post_title}")
        download_path = self.downloader.download_episode(
            post, dest_path=str(expected_unprocessed_path)
        )
        if download_path is None:
            raise ProcessorException("Download failed")
        result = writer_client.update(
            "Post", post.id, {"unprocessed_audio_path": download_path}, wait=True
        )
        if not result or not result.success:
            raise RuntimeError(getattr(result, "error", "Failed to update post"))

    def make_dirs(self, processing_paths: ProcessingPaths) -> None:
        """Create necessary directories for output files."""
        if processing_paths.post_processed_audio_path:
            processing_paths.post_processed_audio_path.parent.mkdir(
                parents=True, exist_ok=True
            )

    def get_system_prompt(self, system_prompt_path: str) -> str:
        """Load the system prompt from a file."""
        with open(system_prompt_path, "r") as f:
            return f.read()

    def get_user_prompt_template(self, prompt_template_path: str) -> Template:
        """Load the user prompt template from a file."""
        with open(prompt_template_path, "r") as f:
            return Template(f.read())

    def remove_audio_files_and_reset_db(self, post_id: Optional[int]) -> None:
        """
        Removes unprocessed/processed audio for the given post from disk,
        and resets the DB fields so the next run will re-download the files.
        """
        if post_id is None:
            return

        post = self.db_session.get(Post, post_id)
        if not post:
            self.logger.warning(
                f"Could not find Post with ID {post_id} to remove files."
            )
            return

        if post.unprocessed_audio_path and os.path.isfile(post.unprocessed_audio_path):
            try:
                os.remove(post.unprocessed_audio_path)
                self.logger.info(
                    f"Removed unprocessed file: {post.unprocessed_audio_path}"
                )
            except OSError as e:
                self.logger.error(
                    f"Failed to remove unprocessed file '{post.unprocessed_audio_path}': {e}"
                )

        if post.processed_audio_path and os.path.isfile(post.processed_audio_path):
            try:
                os.remove(post.processed_audio_path)
                self.logger.info(f"Removed processed file: {post.processed_audio_path}")
            except OSError as e:
                self.logger.error(
                    f"Failed to remove processed file '{post.processed_audio_path}': {e}"
                )

        result = writer_client.update(
            "Post",
            post.id,
            {"unprocessed_audio_path": None, "processed_audio_path": None},
            wait=True,
        )
        if not result or not result.success:
            raise RuntimeError(getattr(result, "error", "Failed to update post"))

    def _remove_unprocessed_audio(self, post: Post) -> None:
        """
        Delete the downloaded source audio and clear its DB reference.

        Used after we have a finalized processed file so stale downloads do not
        accumulate on disk.
        """
        path = post.unprocessed_audio_path
        if not path:
            return
        if os.path.isfile(path):
            try:
                os.remove(path)
                self.logger.info("Removed unprocessed file after processing: %s", path)
            except OSError as exc:  # best-effort cleanup
                self.logger.warning(
                    "Failed to remove unprocessed file '%s': %s", path, exc
                )
        # Clears only the in-memory attribute; callers persist via writer_client.
        post.unprocessed_audio_path = None

    def _check_existing_processed_audio(self, post: Post) -> bool:
        """
        Check if processed audio already exists, either in database or on disk.
        Updates the database path if found on disk.

        Returns:
            True if processed audio exists and is valid, False otherwise
        """
        # If we have a path in the database, check if the file actually exists
        if post.processed_audio_path is not None:
            if (
                os.path.exists(post.processed_audio_path)
                and os.path.getsize(post.processed_audio_path) > 0
            ):
                self.logger.info(
                    f"Processed audio already available at: {post.processed_audio_path}"
                )
                return True
            self.logger.info(
                f"Database path {post.processed_audio_path} doesn't exist or is empty, resetting"
            )
            result = writer_client.update(
                "Post", post.id, {"processed_audio_path": None}, wait=True
            )
            if not result or not result.success:
                raise RuntimeError(getattr(result, "error", "Failed to update post"))

        # Check if file exists on disk at expected location
        safe_feed_title = sanitize_title(post.feed.title)
        safe_post_title = sanitize_title(post.title)
        expected_processed_path = (
            get_srv_root() / safe_feed_title / f"{safe_post_title}.mp3"
        )
        if (
            expected_processed_path.exists()
            and expected_processed_path.stat().st_size > 0
        ):
            # Found a local processed file
            processed_path_str = str(expected_processed_path.resolve())
            self.logger.info(
                f"Found existing processed audio for post '{post.title}' at '{processed_path_str}'. "
                "Updated the database path."
) result = writer_client.update( "Post", post.id, {"processed_audio_path": processed_path_str}, wait=True, ) if not result or not result.success: raise RuntimeError(getattr(result, "error", "Failed to update post")) return True return False class ProcessorException(Exception): """Exception raised for podcast processing errors.""" ================================================ FILE: src/podcast_processor/processing_status_manager.py ================================================ import logging import uuid from datetime import datetime from typing import Any, Optional, cast from sqlalchemy.orm import object_session from app.models import ProcessingJob from app.writer.client import writer_client class ProcessingStatusManager: """ Manages processing job status, creation, updates, and cleanup. Handles all database operations related to job tracking via Writer Service. """ def __init__(self, db_session: Any, logger: Optional[logging.Logger] = None): self.db_session = db_session self.logger = logger or logging.getLogger(__name__) def generate_job_id(self) -> str: """Generate a unique job ID.""" return str(uuid.uuid4()) def create_job( self, post_guid: str, job_id: str, run_id: Optional[str] = None, *, requested_by_user_id: Optional[int] = None, billing_user_id: Optional[int] = None, ) -> ProcessingJob: """Create a new pending job record for the provided post.""" job_data = { "id": job_id, "jobs_manager_run_id": run_id, "post_guid": post_guid, "status": "pending", "current_step": 0, "total_steps": 4, "progress_percentage": 0.0, "created_at": datetime.utcnow().isoformat(), "requested_by_user_id": requested_by_user_id, "billing_user_id": billing_user_id, } writer_client.action("create_job", {"job_data": job_data}, wait=True) self.db_session.expire_all() job = self.db_session.get(ProcessingJob, job_id) if not job: raise RuntimeError(f"Failed to create job {job_id}") return cast(ProcessingJob, job) def cancel_existing_jobs(self, post_guid: str, current_job_id: str) -> 
None: """Delete any existing active jobs for this post.""" writer_client.action( "cancel_existing_jobs", {"post_guid": post_guid, "current_job_id": current_job_id}, wait=True, ) self.db_session.expire_all() def update_job_status( self, job: ProcessingJob, status: str, step: int, step_name: str, progress: Optional[float] = None, ) -> None: """Update job status in database.""" # Cache job attributes before any operations that might expire the object job_id = job.id total_steps = job.total_steps is_bound = object_session(job) is not None self.logger.info( "[JOB_STATUS_UPDATE] job_id=%s status=%s step=%s step_name=%s bound=%s", job_id, status, step, step_name, is_bound, ) if progress is None: progress = (step / total_steps) * 100.0 writer_client.action( "update_job_status", { "job_id": job_id, "status": status, "step": step, "step_name": step_name, "progress": progress, }, wait=True, ) self.db_session.expire_all() if status in {"failed", "cancelled"}: self.logger.error( "[JOB_STATUS_ERROR] job_id=%s post_guid=%s status=%s step=%s step_name=%s progress=%.2f", job_id, job.post_guid, # post_guid is safe - not cached but accessed before expire_all status, step, step_name, progress, ) def mark_cancelled(self, job_id: str, error_message: Optional[str] = None) -> None: writer_client.action( "mark_cancelled", {"job_id": job_id, "reason": error_message}, wait=True ) self.db_session.expire_all() self.logger.info(f"Successfully cancelled job {job_id}") ================================================ FILE: src/podcast_processor/prompt.py ================================================ from typing import List from podcast_processor.cue_detector import CueDetector from podcast_processor.model_output import AdSegmentPrediction, AdSegmentPredictionList from podcast_processor.transcribe import Segment DEFAULT_SYSTEM_PROMPT_PATH = "src/system_prompt.txt" DEFAULT_USER_PROMPT_TEMPLATE_PATH = "src/user_prompt.jinja" _cue_detector = CueDetector() def transcript_excerpt_for_prompt( 
segments: List[Segment], includes_start: bool, includes_end: bool ) -> str: excerpts = [ f"[{segment.start}] {_cue_detector.highlight_cues(segment.text)}" for segment in segments ] if includes_start: excerpts.insert(0, "[TRANSCRIPT START]") if includes_end: excerpts.append("[TRANSCRIPT END]") return "\n".join(excerpts) def generate_system_prompt() -> str: valid_empty_example = AdSegmentPredictionList(ad_segments=[]).model_dump_json( exclude_none=True ) output_for_one_shot_example = AdSegmentPredictionList( ad_segments=[ AdSegmentPrediction(segment_offset=59.8, confidence=0.95), AdSegmentPrediction(segment_offset=64.8, confidence=0.9), AdSegmentPrediction(segment_offset=73.8, confidence=0.92), AdSegmentPrediction(segment_offset=77.8, confidence=0.98), AdSegmentPrediction(segment_offset=79.8, confidence=0.9), ], content_type="promotional_external", confidence=0.96, ).model_dump_json(exclude_none=True) example_output_for_prompt = output_for_one_shot_example.strip() one_shot_transcript_example = transcript_excerpt_for_prompt( [ Segment(start=53.8, end=-1, text="That's all coming after the break."), Segment( start=59.8, end=-1, text="On this week's episode of Wildcard, actor Chris Pine tells " "us, it's okay not to be perfect.", ), Segment( start=64.8, end=-1, text="My film got absolutely decimated when it premiered, which " "brings up for me one of my primary triggers or whatever it was " "like, not being liked.", ), Segment( start=73.8, end=-1, text="I'm Rachel Martin, Chris Pine on How to Find Joy in Imperfection.", ), Segment( start=77.8, end=-1, text="That's on the new podcast, Wildcard.", ), Segment( start=79.8, end=-1, text="The Game Where Cards control the conversation.", ), Segment( start=83.8, end=-1, text="And welcome back to the show, today we're talking to Professor Hopkins", ), ], includes_start=False, includes_end=False, ) technical_example = transcript_excerpt_for_prompt( [ Segment( start=4762.7, end=-1, text="Our brains are configured differently.", ), 
Segment( start=4765.6, end=-1, text="My brain is configured perfectly for Ruby, perfectly for a dynamically typed language.", ), Segment( start=4831.3, end=-1, text="Shopify exists at a scale most programmers never touch, and it still runs on Rails.", ), Segment(start=4933.2, end=-1, text="Shopify.com has supported this show."), ], includes_start=False, includes_end=False, ) # pylint: disable=line-too-long return f"""Your job is to identify advertisements in podcast transcript excerpts with high precision, continuity awareness, and content-context sensitivity. CRITICAL: distinguish external sponsor ads from technical discussion and self-promotion. CONTENT-AWARE TAXONOMY: - technical_discussion: Educational content, case studies, implementation details. Company names may appear as examples; do not mark as ads. - educational/self_promo: Host discussing their own products, newsletters, funds, or courses (may include CTAs but are first-party). - promotional_external: True sponsor ads for external companies with sales intent, URLs, promo codes, or explicit offers. - transition: Brief bumpers that connect to or from ads; include if they are part of an ad block. JSON CONTRACT (strict): - Always respond with: {{"ad_segments": [...], "content_type": "", "confidence": <0.0-1.0>}} - Each ad_segments item must be: {{"segment_offset": , "confidence": <0.0-1.0>}} - If there are no ads, respond with: {valid_empty_example} (no extra keys). DURATION AND CUE GUIDANCE: - Ads are typically 15–120 seconds and contain CTAs, URLs/domains, promo/discount codes, phone numbers, or phrases like "brought to you by". - Integrated ads can be longer but maintain sales intent; continuous mention of the same sponsor for >3 minutes without CTAs is likely educational/self_promo. - Pre-roll/mid-roll/post-roll intros ("a word from our sponsor") and quick outros ("back to the show") belong to the ad block. 
DECISION RULES: 1) Continuous ads: once an ad starts, follow it to its natural conclusion; include 1–5 second transitions. 2) Strong cues: treat URLs/domains, promo/discount language, and phone numbers as strong sponsor indicators. 3) Self-promotion guardrail: host promoting their own products/platforms → classify as educational/self_promo with lower confidence unless explicit external sponsorship language is present. 4) Boundary bias: if later segments clearly form an ad for a sponsor, pull in the prior two intro/transition lines as ad content. 5) Prefer labeling as content unless multiple strong ad cues appear with clear external branding. This transcript excerpt is broken into segments starting with a timestamp [X] (seconds). Output every segment that is advertisement content. Example (external sponsor with CTA): {one_shot_transcript_example} Output: {example_output_for_prompt} Example (technical mention, not an ad): {technical_example} Output: {{"ad_segments": [{{"segment_offset": 4933.2, "confidence": 0.75}}], "content_type": "technical_discussion", "confidence": 0.45}} \n\n""" ================================================ FILE: src/podcast_processor/token_rate_limiter.py ================================================ """ Token-based rate limiting for LLM API calls. This module provides client-side rate limiting based on input token consumption to prevent hitting API provider rate limits (e.g., Anthropic's 30,000 tokens/minute). """ import logging import threading import time from collections import deque from datetime import datetime from typing import Dict, List, Optional, Tuple, Union logger = logging.getLogger(__name__) class TokenRateLimiter: """ Client-side rate limiter that tracks token usage over time windows. Prevents hitting API rate limits by calculating token usage and waiting when necessary before making API calls. """ def __init__(self, tokens_per_minute: int = 30000, window_minutes: int = 1): """ Initialize the rate limiter. 
Args: tokens_per_minute: Maximum tokens allowed per minute window_minutes: Time window for rate limiting (default: 1 minute) """ self.tokens_per_minute = tokens_per_minute self.window_seconds = window_minutes * 60 self.token_usage: deque[Tuple[float, int]] = ( deque() ) # [(timestamp, token_count), ...] self.lock = threading.Lock() logger.info( f"Initialized TokenRateLimiter: {tokens_per_minute} tokens/{window_minutes}min" ) def count_tokens(self, messages: List[Dict[str, str]], model: str) -> int: """ Count tokens in messages using litellm's token counting. Args: messages: List of message dicts with 'role' and 'content' model: Model name for accurate token counting Returns: Number of input tokens """ try: # Simple token estimation: ~4 characters per token total_chars = sum(len(msg.get("content", "")) for msg in messages) estimated_tokens = total_chars // 4 logger.debug(f"Estimated {estimated_tokens} tokens for model {model}") return estimated_tokens except Exception as e: # Fallback: conservative estimate logger.warning(f"Token counting failed, using fallback. Error: {e}") return 1000 # Conservative fallback def _cleanup_old_usage(self, current_time: float) -> None: """Remove token usage records outside the time window.""" cutoff_time = current_time - self.window_seconds while self.token_usage and self.token_usage[0][0] < cutoff_time: self.token_usage.popleft() def _get_current_usage(self, current_time: float) -> int: """Get total token usage within the current time window.""" self._cleanup_old_usage(current_time) return sum(count for _, count in self.token_usage) def check_rate_limit( self, messages: List[Dict[str, str]], model: str ) -> Tuple[bool, float]: """ Check if we can make an API call without hitting rate limits. 
Args: messages: Messages to send to the API model: Model name Returns: Tuple of (can_proceed, wait_seconds) - can_proceed: True if call can be made immediately - wait_seconds: Seconds to wait if can_proceed is False """ token_count = self.count_tokens(messages, model) current_time = time.time() with self.lock: current_usage = self._get_current_usage(current_time) # Check if adding this request would exceed the limit if current_usage + token_count <= self.tokens_per_minute: return True, 0.0 # Calculate wait time: find when oldest tokens will expire if not self.token_usage: return True, 0.0 oldest_time = self.token_usage[0][0] wait_seconds = (oldest_time + self.window_seconds) - current_time wait_seconds = max(0, wait_seconds) logger.info( f"Rate limit check: current={current_usage}, " f"requested={token_count}, " f"limit={self.tokens_per_minute}, " f"wait={wait_seconds:.1f}s" ) return False, wait_seconds def record_usage(self, messages: List[Dict[str, str]], model: str) -> None: """ Record token usage for a successful API call. Args: messages: Messages that were sent to the API model: Model name that was used """ token_count = self.count_tokens(messages, model) current_time = time.time() with self.lock: self.token_usage.append((current_time, token_count)) logger.debug( f"Recorded {token_count} tokens at {datetime.fromtimestamp(current_time)}" ) def wait_if_needed(self, messages: List[Dict[str, str]], model: str) -> None: """ Wait if necessary to avoid hitting rate limits, then record usage. 
Args: messages: Messages to send to the API model: Model name """ can_proceed, wait_seconds = self.check_rate_limit(messages, model) if not can_proceed and wait_seconds > 0: logger.info( f"Rate limiting: waiting {wait_seconds:.1f}s to avoid API limits" ) time.sleep(wait_seconds) # Record the usage immediately before making the call self.record_usage(messages, model) def get_usage_stats(self) -> Dict[str, Union[int, float]]: """Get current usage statistics.""" current_time = time.time() with self.lock: current_usage = self._get_current_usage(current_time) usage_percentage = (current_usage / self.tokens_per_minute) * 100 return { "current_usage": current_usage, "limit": self.tokens_per_minute, "usage_percentage": usage_percentage, "window_seconds": self.window_seconds, "active_records": len(self.token_usage), } # Global rate limiter instance _RATE_LIMITER: Optional[TokenRateLimiter] = None # pylint: disable=invalid-name def get_rate_limiter(tokens_per_minute: int = 30000) -> TokenRateLimiter: """Get or create the global rate limiter instance.""" global _RATE_LIMITER # pylint: disable=global-statement if _RATE_LIMITER is None or _RATE_LIMITER.tokens_per_minute != tokens_per_minute: _RATE_LIMITER = TokenRateLimiter(tokens_per_minute=tokens_per_minute) return _RATE_LIMITER def configure_rate_limiter_for_model(model: str) -> TokenRateLimiter: """ Configure rate limiter with appropriate limits for the given model. 
Args: model: Model name (e.g., "anthropic/claude-sonnet-4-20250514") Returns: Configured TokenRateLimiter instance """ # Model-specific rate limits (tokens per minute) model_limits = { # Anthropic models "anthropic/claude-3-5-sonnet-20240620": 30000, "anthropic/claude-sonnet-4-20250514": 30000, "anthropic/claude-3-opus-20240229": 30000, # OpenAI models "gpt-4o-mini": 200000, "gpt-4o": 150000, "gpt-4": 40000, # Google Gemini models "gemini/gemini-3-flash-preview": 60000, "gemini/gemini-2.5-flash": 60000, "gemini/gemini-2.5-pro": 30000, } # Extract base model name and find limit tokens_per_minute = 30000 # Conservative default for model_pattern, limit in model_limits.items(): if model_pattern in model: tokens_per_minute = limit break logger.info( f"Configured rate limiter for {model}: {tokens_per_minute} tokens/minute" ) return get_rate_limiter(tokens_per_minute) ================================================ FILE: src/podcast_processor/transcribe.py ================================================ import logging import shutil import time from abc import ABC, abstractmethod from pathlib import Path from typing import Any, List from groq import Groq from openai import OpenAI from openai.types.audio.transcription_segment import TranscriptionSegment from pydantic import BaseModel from podcast_processor.audio import split_audio from shared.config import GroqWhisperConfig, RemoteWhisperConfig class Segment(BaseModel): start: float end: float text: str class Transcriber(ABC): @property @abstractmethod def model_name(self) -> str: pass @abstractmethod def transcribe(self, audio_file_path: str) -> List[Segment]: pass class LocalTranscriptSegment(BaseModel): id: int seek: int start: float end: float text: str tokens: List[int] temperature: float avg_logprob: float compression_ratio: float no_speech_prob: float def to_segment(self) -> Segment: return Segment(start=self.start, end=self.end, text=self.text) class TestWhisperTranscriber(Transcriber): def __init__(self, logger: 
logging.Logger): self.logger = logger @property def model_name(self) -> str: return "test_whisper" def transcribe(self, _: str) -> List[Segment]: self.logger.info("Using test whisper") return [ Segment(start=0, end=1, text="This is a test"), Segment(start=1, end=2, text="This is another test"), ] class LocalWhisperTranscriber(Transcriber): def __init__(self, logger: logging.Logger, whisper_model: str): self.logger = logger self.whisper_model = whisper_model @property def model_name(self) -> str: return f"local_{self.whisper_model}" @staticmethod def convert_to_pydantic( transcript_data: List[Any], ) -> List[LocalTranscriptSegment]: return [LocalTranscriptSegment(**item) for item in transcript_data] @staticmethod def local_seg_to_seg(local_segments: List[LocalTranscriptSegment]) -> List[Segment]: return [seg.to_segment() for seg in local_segments] def transcribe(self, audio_file_path: str) -> List[Segment]: # Import whisper only when needed to avoid CUDA dependencies during module import try: import whisper # type: ignore[import-untyped] except ImportError as e: self.logger.error(f"Failed to import whisper: {e}") raise ImportError( "whisper library is required for LocalWhisperTranscriber" ) from e self.logger.info("Using local whisper") models = whisper.available_models() self.logger.info(f"Available models: {models}") model = whisper.load_model(name=self.whisper_model) self.logger.info("Beginning transcription") start = time.time() result = model.transcribe(audio_file_path, fp16=False, language="English") end = time.time() elapsed = end - start self.logger.info(f"Transcription completed in {elapsed}") segments = result["segments"] typed_segments = self.convert_to_pydantic(segments) return self.local_seg_to_seg(typed_segments) class OpenAIWhisperTranscriber(Transcriber): def __init__(self, logger: logging.Logger, config: RemoteWhisperConfig): self.logger = logger self.config = config self.openai_client = OpenAI( base_url=config.base_url, api_key=config.api_key, 
timeout=config.timeout_sec, ) @property def model_name(self) -> str: return self.config.model # e.g. "whisper-1" def transcribe(self, audio_file_path: str) -> List[Segment]: self.logger.info( "[WHISPER_REMOTE] Starting remote whisper transcription for: %s", audio_file_path, ) audio_chunk_path = audio_file_path + "_parts" chunks = split_audio( Path(audio_file_path), Path(audio_chunk_path), self.config.chunksize_mb * 1024 * 1024, ) self.logger.info("[WHISPER_REMOTE] Processing %d chunks", len(chunks)) all_segments: List[TranscriptionSegment] = [] for idx, chunk in enumerate(chunks): chunk_path, offset = chunk self.logger.info( "[WHISPER_REMOTE] Processing chunk %d/%d: %s", idx + 1, len(chunks), chunk_path, ) segments = self.get_segments_for_chunk(str(chunk_path)) self.logger.info( "[WHISPER_REMOTE] Chunk %d/%d complete: %d segments", idx + 1, len(chunks), len(segments), ) all_segments.extend(self.add_offset_to_segments(segments, offset)) shutil.rmtree(audio_chunk_path) self.logger.info( "[WHISPER_REMOTE] Transcription complete: %d total segments", len(all_segments), ) return self.convert_segments(all_segments) @staticmethod def convert_segments(segments: List[TranscriptionSegment]) -> List[Segment]: return [ Segment( start=seg.start, end=seg.end, text=seg.text, ) for seg in segments ] @staticmethod def add_offset_to_segments( segments: List[TranscriptionSegment], offset_ms: int ) -> List[TranscriptionSegment]: offset_sec = float(offset_ms) / 1000.0 for segment in segments: segment.start += offset_sec segment.end += offset_sec return segments def get_segments_for_chunk(self, chunk_path: str) -> List[TranscriptionSegment]: with open(chunk_path, "rb") as f: self.logger.info( "[WHISPER_API_CALL] Sending chunk to API: %s (timeout=%ds)", chunk_path, self.config.timeout_sec, ) transcription = self.openai_client.audio.transcriptions.create( model=self.config.model, file=f, timestamp_granularities=["segment"], language=self.config.language, response_format="verbose_json", ) 
self.logger.debug("Got transcription") segments = transcription.segments assert segments is not None self.logger.debug(f"Got {len(segments)} segments") return segments class GroqTranscriptionSegment(BaseModel): start: float end: float text: str class GroqWhisperTranscriber(Transcriber): def __init__(self, logger: logging.Logger, config: GroqWhisperConfig): self.logger = logger self.config = config self.client = Groq( api_key=config.api_key, max_retries=config.max_retries, ) @property def model_name(self) -> str: return f"groq_{self.config.model}" def transcribe(self, audio_file_path: str) -> List[Segment]: self.logger.info( "[WHISPER_GROQ] Starting Groq whisper transcription for: %s", audio_file_path, ) audio_chunk_path = audio_file_path + "_parts" chunks = split_audio( Path(audio_file_path), Path(audio_chunk_path), 12 * 1024 * 1024 ) self.logger.info("[WHISPER_GROQ] Processing %d chunks", len(chunks)) all_segments: List[GroqTranscriptionSegment] = [] for idx, chunk in enumerate(chunks): chunk_path, offset = chunk self.logger.info( "[WHISPER_GROQ] Processing chunk %d/%d: %s", idx + 1, len(chunks), chunk_path, ) segments = self.get_segments_for_chunk(str(chunk_path)) self.logger.info( "[WHISPER_GROQ] Chunk %d/%d complete: %d segments", idx + 1, len(chunks), len(segments), ) all_segments.extend(self.add_offset_to_segments(segments, offset)) shutil.rmtree(audio_chunk_path) self.logger.info( "[WHISPER_GROQ] Transcription complete: %d total segments", len(all_segments), ) return self.convert_segments(all_segments) @staticmethod def convert_segments(segments: List[GroqTranscriptionSegment]) -> List[Segment]: return [ Segment( start=seg.start, end=seg.end, text=seg.text, ) for seg in segments ] @staticmethod def add_offset_to_segments( segments: List[GroqTranscriptionSegment], offset_ms: int ) -> List[GroqTranscriptionSegment]: offset_sec = float(offset_ms) / 1000.0 for segment in segments: segment.start += offset_sec segment.end += offset_sec return segments def 
get_segments_for_chunk(self, chunk_path: str) -> List[GroqTranscriptionSegment]: self.logger.info("[GROQ_API_CALL] Sending chunk to Groq API: %s", chunk_path) transcription = self.client.audio.transcriptions.create( file=Path(chunk_path), model=self.config.model, response_format="verbose_json", # Ensure segments are included language=self.config.language, ) self.logger.info( "[GROQ_API_CALL] Received response from Groq API for: %s", chunk_path ) if transcription.segments is None: # type: ignore [attr-defined] self.logger.warning( "[GROQ_API_CALL] No segments found in transcription for %s", chunk_path ) return [] groq_segments = [ GroqTranscriptionSegment( start=seg["start"], end=seg["end"], text=seg["text"] ) for seg in transcription.segments # type: ignore [attr-defined] ] self.logger.info( "[GROQ_API_CALL] Got %d segments from chunk", len(groq_segments) ) return groq_segments ================================================ FILE: src/podcast_processor/transcription_manager.py ================================================ import logging from typing import Any, List, Optional from app.extensions import db from app.models import ModelCall, Post, TranscriptSegment from app.writer.client import writer_client from shared.config import ( Config, GroqWhisperConfig, LocalWhisperConfig, RemoteWhisperConfig, TestWhisperConfig, ) from .transcribe import ( GroqWhisperTranscriber, LocalWhisperTranscriber, OpenAIWhisperTranscriber, TestWhisperTranscriber, Transcriber, ) class TranscriptionManager: """Handles the transcription of podcast audio files.""" def __init__( self, logger: logging.Logger, config: Config, model_call_query: Optional[Any] = None, segment_query: Optional[Any] = None, db_session: Optional[Any] = None, transcriber: Optional[Transcriber] = None, ): self.logger = logger self.config = config self.transcriber = transcriber or self._create_transcriber() self._model_call_query_provided = model_call_query is not None self.model_call_query = model_call_query or 
ModelCall.query self._segment_query_provided = segment_query is not None self.segment_query = segment_query or TranscriptSegment.query self.db_session = db_session or db.session def _create_transcriber(self) -> Transcriber: """Create the appropriate transcriber based on configuration.""" assert self.config.whisper is not None, ( "validate_whisper_config ensures that even if old style whisper " "config is given, it will be translated and config.whisper set." ) if isinstance(self.config.whisper, TestWhisperConfig): return TestWhisperTranscriber(self.logger) if isinstance(self.config.whisper, RemoteWhisperConfig): return OpenAIWhisperTranscriber(self.logger, self.config.whisper) if isinstance(self.config.whisper, LocalWhisperConfig): return LocalWhisperTranscriber(self.logger, self.config.whisper.model) if isinstance(self.config.whisper, GroqWhisperConfig): return GroqWhisperTranscriber(self.logger, self.config.whisper) raise ValueError(f"unhandled whisper config {self.config.whisper}") def _check_existing_transcription( self, post: Post ) -> Optional[List[TranscriptSegment]]: """Checks for existing successful transcription and returns segments if valid. NOTE: Defaults to using self.db_session for queries to keep a single session, but will honor injected model_call_query/segment_query when provided (e.g. tests). """ model_call_query = ( self.model_call_query if self._model_call_query_provided else self.db_session.query(ModelCall) ) segment_query = ( self.segment_query if self._segment_query_provided else self.db_session.query(TranscriptSegment) ) existing_whisper_call = ( model_call_query.filter_by( post_id=post.id, model_name=self.transcriber.model_name, status="success", ) .order_by(ModelCall.timestamp.desc()) .first() ) if existing_whisper_call: self.logger.info( f"Found existing successful Whisper ModelCall {existing_whisper_call.id} for post {post.id}." 
) db_segments: List[TranscriptSegment] = ( segment_query.filter_by(post_id=post.id) .order_by(TranscriptSegment.sequence_num) .all() ) if db_segments: if ( existing_whisper_call.last_segment_sequence_num == len(db_segments) - 1 ): self.logger.info( f"Returning {len(db_segments)} existing transcript segments from database for post {post.id}." ) return db_segments self.logger.warning( f"ModelCall {existing_whisper_call.id} for post {post.id} indicates {existing_whisper_call.last_segment_sequence_num + 1} segments, but found {len(db_segments)} in DB. Re-transcribing." ) else: self.logger.warning( f"Successful ModelCall {existing_whisper_call.id} found for post {post.id}, but no transcript segments in DB. Re-transcribing." ) else: self.logger.info( f"No existing successful Whisper ModelCall found for post {post.id} with model {self.transcriber.model_name}. Proceeding to transcribe." ) return None def _get_or_create_whisper_model_call(self, post: Post) -> ModelCall: """Create or reuse the placeholder ModelCall row for a Whisper run via writer.""" result = writer_client.action( "upsert_whisper_model_call", { "post_id": post.id, "model_name": self.transcriber.model_name, "first_segment_sequence_num": 0, "last_segment_sequence_num": -1, "prompt": "Whisper transcription job", }, wait=True, ) if not result or not result.success: raise RuntimeError(getattr(result, "error", "Failed to upsert ModelCall")) model_call_id = (result.data or {}).get("model_call_id") if model_call_id is None: raise RuntimeError("Writer did not return model_call_id") model_call = self.db_session.get(ModelCall, int(model_call_id)) if model_call is None: raise RuntimeError(f"ModelCall {model_call_id} not found after upsert") return model_call def transcribe(self, post: Post) -> List[TranscriptSegment]: """ Transcribes a podcast audio file, or retrieves existing transcription. 
Args: post: The Post object containing the podcast audio to transcribe Returns: A list of TranscriptSegment objects with the transcription results """ self.logger.info( f"Starting transcription process for post {post.id} using {self.transcriber.model_name}" ) existing_segments = self._check_existing_transcription(post) if existing_segments is not None: return existing_segments # Create or reuse the ModelCall record for this transcription attempt current_whisper_call = self._get_or_create_whisper_model_call(post) self.logger.info( f"Prepared Whisper ModelCall {current_whisper_call.id} for post {post.id}." ) try: self.logger.info( f"[TRANSCRIBE_START] Calling transcriber {self.transcriber.model_name} for post {post.id}, audio: {post.unprocessed_audio_path}" ) # Expire session state before long-running transcription to avoid stale locks self.db_session.expire_all() pydantic_segments = self.transcriber.transcribe(post.unprocessed_audio_path) self.logger.info( f"[TRANSCRIBE_COMPLETE] Transcription by {self.transcriber.model_name} for post {post.id} resulted in {len(pydantic_segments)} segments." ) segments_payload = [ { "sequence_num": i, "start_time": round(seg.start, 1), "end_time": round(seg.end, 1), "text": seg.text, } for i, seg in enumerate(pydantic_segments or []) ] write_res = writer_client.action( "replace_transcription", { "post_id": post.id, "segments": segments_payload, "model_call_id": current_whisper_call.id, }, wait=True, ) if not write_res or not write_res.success: raise RuntimeError( getattr(write_res, "error", "Failed to persist transcription") ) segment_query = ( self.segment_query if self._segment_query_provided else self.db_session.query(TranscriptSegment) ) db_segments: List[TranscriptSegment] = ( segment_query.filter_by(post_id=post.id) .order_by(TranscriptSegment.sequence_num) .all() ) self.logger.info( f"Successfully stored {len(db_segments)} transcript segments and updated ModelCall {current_whisper_call.id} for post {post.id}." 
) return db_segments except Exception as e: self.logger.error( f"Transcription failed for post {post.id} using {self.transcriber.model_name}. Error: {e}", exc_info=True, ) fail_res = writer_client.action( "mark_model_call_failed", { "model_call_id": current_whisper_call.id, "error_message": str(e), "status": "failed_permanent", }, wait=True, ) if not fail_res or not fail_res.success: self.logger.error( "Failed to mark ModelCall %s as failed via writer: %s", current_whisper_call.id, getattr(fail_res, "error", None), ) raise ================================================ FILE: src/podcast_processor/word_boundary_refiner.py ================================================ """LLM-based word-boundary refiner. Note: We intentionally share some call-setup patterns with BoundaryRefiner. Pylint may flag these as R0801 (duplicate-code); we ignore that for this module. """ # pylint: disable=duplicate-code import json import logging import re from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, cast import litellm from jinja2 import Template from podcast_processor.llm_model_call_utils import ( extract_litellm_content, render_prompt_and_upsert_model_call, try_update_model_call, ) from shared.config import Config # Keep the same internal bounds as the existing BoundaryRefiner. MAX_START_EXTENSION_SECONDS = 30.0 MAX_END_EXTENSION_SECONDS = 15.0 @dataclass class WordBoundaryRefinement: refined_start: float refined_end: float start_adjustment_reason: str end_adjustment_reason: str class WordBoundaryRefiner: """Refine ad start boundary by finding the first ad word and estimating its time. This refiner is intentionally heuristic-timed because we only have segment-level timestamps today. 
""" def __init__(self, config: Config, logger: Optional[logging.Logger] = None): self.config = config self.logger = logger or logging.getLogger(__name__) self.template = self._load_template() def _load_template(self) -> Template: path = ( Path(__file__).resolve().parent.parent # project src root / "word_boundary_refinement_prompt.jinja" ) if path.exists(): return Template(path.read_text()) return Template( """Find start/end phrases for the ad break. Ad: {{ad_start}}s-{{ad_end}}s {% for seg in context_segments %}[seq={{seg.sequence_num}} start={{seg.start_time}} end={{seg.end_time}}] {{seg.text}} {% endfor %} Return JSON: {"refined_start_segment_seq": 0, "refined_start_phrase": "", "refined_end_segment_seq": 0, "refined_end_phrase": "", "start_adjustment_reason": "", "end_adjustment_reason": ""} """ ) def refine( self, ad_start: float, ad_end: float, confidence: float, all_segments: List[Dict[str, Any]], *, post_id: Optional[int] = None, first_seq_num: Optional[int] = None, last_seq_num: Optional[int] = None, ) -> WordBoundaryRefinement: context = self._get_context( ad_start, ad_end, all_segments, first_seq_num=first_seq_num, last_seq_num=last_seq_num, ) prompt, model_call_id = render_prompt_and_upsert_model_call( template=self.template, ad_start=ad_start, ad_end=ad_end, confidence=confidence, context_segments=context, post_id=post_id, first_seq_num=first_seq_num, last_seq_num=last_seq_num, model_name=self.config.llm_model, logger=self.logger, log_prefix="Word boundary refine", ) raw_response: Optional[str] = None try: response = litellm.completion( model=self.config.llm_model, messages=[{"role": "user", "content": prompt}], temperature=0.1, max_tokens=2048, timeout=self.config.openai_timeout, api_key=self.config.llm_api_key, base_url=self.config.openai_base_url, ) content = extract_litellm_content(response) raw_response = content self._update_model_call( model_call_id, status="received_response", response=raw_response, error_message=None, ) parsed = 
self._parse_json(content) if not parsed: self.logger.warning( "Word boundary refine: no parseable JSON; falling back to original start", extra={"content_preview": (content or "")[:200]}, ) self._update_model_call( model_call_id, status="success_heuristic", response=raw_response, error_message="parse_failed", ) return self._fallback(ad_start, ad_end) payload = self._extract_payload(parsed) refined_start, start_changed, start_reason, start_err = self._refine_start( ad_start=ad_start, all_segments=all_segments, context_segments=context, start_segment_seq=payload["start_segment_seq"], start_phrase=payload["start_phrase"], start_word=payload["start_word"], start_occurrence=payload["start_occurrence"], start_word_index=payload["start_word_index"], start_reason=payload["start_reason"], ) refined_end, end_changed, end_reason, end_err = self._refine_end( ad_end=ad_end, all_segments=all_segments, context_segments=context, end_segment_seq=payload["end_segment_seq"], end_phrase=payload["end_phrase"], end_reason=payload["end_reason"], ) partial_errors = [e for e in [start_err, end_err] if e] # If caller didn't provide reasons, default to unchanged for untouched sides. start_reason = self._default_reason(start_reason, changed=start_changed) end_reason = self._default_reason(end_reason, changed=end_changed) # Guardrail: never return an invalid window. 
if refined_end <= refined_start: self._update_model_call( model_call_id, status="success_heuristic", response=raw_response, error_message="invalid_refined_window", ) return self._fallback(ad_start, ad_end) self._update_model_call( model_call_id, status=self._result_status(start_changed, end_changed, partial_errors), response=raw_response, error_message=(",".join(partial_errors) if partial_errors else None), ) result = WordBoundaryRefinement( refined_start=refined_start, refined_end=refined_end, start_adjustment_reason=start_reason, end_adjustment_reason=end_reason, ) self._update_model_call( model_call_id, status="success", response=raw_response, error_message=None, ) return result except Exception as exc: self._update_model_call( model_call_id, status="failed_permanent", response=raw_response, error_message=str(exc), ) self.logger.warning("Word boundary refine failed: %s", exc) return self._fallback(ad_start, ad_end) def _fallback(self, ad_start: float, ad_end: float) -> WordBoundaryRefinement: return WordBoundaryRefinement( refined_start=ad_start, refined_end=ad_end, start_adjustment_reason="heuristic_fallback", end_adjustment_reason="unchanged", ) def _constrain_start(self, estimated_start: float, orig_start: float) -> float: return max(estimated_start, orig_start - MAX_START_EXTENSION_SECONDS) def _constrain_end(self, estimated_end: float, orig_end: float) -> float: # Allow slight forward extension (for late boundary) but cap it. 
return min(estimated_end, orig_end + MAX_END_EXTENSION_SECONDS) def _parse_json(self, content: str) -> Optional[Dict[str, Any]]: cleaned = re.sub(r"```json|```", "", (content or "").strip()) json_candidates = re.findall(r"\{.*?\}", cleaned, re.DOTALL) for candidate in json_candidates: try: loaded = json.loads(candidate) if isinstance(loaded, dict): return cast(Dict[str, Any], loaded) except Exception: continue return None @staticmethod def _has_text(value: Any) -> bool: if value is None: return False try: return bool(str(value).strip()) except Exception: return False def _extract_payload(self, parsed: Dict[str, Any]) -> Dict[str, Any]: occurrence = parsed.get("occurrence") if occurrence is None: occurrence = parsed.get("occurance") return { "start_segment_seq": parsed.get("refined_start_segment_seq"), "start_phrase": parsed.get("refined_start_phrase"), "end_segment_seq": parsed.get("refined_end_segment_seq"), "end_phrase": parsed.get("refined_end_phrase"), "start_word": parsed.get("refined_start_word"), "start_occurrence": occurrence, "start_word_index": parsed.get("refined_start_word_index"), "start_reason": str(parsed.get("start_adjustment_reason") or ""), "end_reason": str(parsed.get("end_adjustment_reason") or ""), } @staticmethod def _default_reason(reason: str, *, changed: bool) -> str: if reason: return reason return "refined" if changed else "unchanged" @staticmethod def _result_status( start_changed: bool, end_changed: bool, partial_errors: List[str] ) -> str: if partial_errors and not start_changed and not end_changed: return "success_heuristic" return "success" def _refine_start( self, *, ad_start: float, all_segments: List[Dict[str, Any]], context_segments: List[Dict[str, Any]], start_segment_seq: Any, start_phrase: Any, start_word: Any, start_occurrence: Any, start_word_index: Any, start_reason: str, ) -> Tuple[float, bool, str, Optional[str]]: if self._has_text(start_phrase): estimated_start = self._estimate_phrase_time( all_segments=all_segments, 
    def _refine_end(
        self,
        *,
        ad_end: float,
        all_segments: List[Dict[str, Any]],
        context_segments: List[Dict[str, Any]],
        end_segment_seq: Any,
        end_phrase: Any,
        end_reason: str,
    ) -> Tuple[float, bool, str, Optional[str]]:
        """Refine the ad end boundary from the LLM-provided end phrase.

        Returns:
            (refined_end, changed, reason, error) — ``error`` is the short
            code "end_phrase_not_found" when the phrase could not be located,
            else None.
        """
        # No usable phrase from the LLM -> keep the original boundary.
        if not self._has_text(end_phrase):
            return float(ad_end), False, (end_reason or "unchanged"), None
        estimated_end = self._estimate_phrase_time(
            all_segments=all_segments,
            context_segments=context_segments,
            preferred_segment_seq=end_segment_seq,
            phrase=end_phrase,
            direction="end",
        )
        if estimated_end is None:
            return float(ad_end), False, end_reason, "end_phrase_not_found"
        # Cap how far past the original end the boundary may move.
        return (
            self._constrain_end(float(estimated_end), ad_end),
            True,
            end_reason,
            None,
        )

    def _get_context(
        self,
        ad_start: float,
        ad_end: float,
        all_segments: List[Dict[str, Any]],
        *,
        first_seq_num: Optional[int],
        last_seq_num: Optional[int],
    ) -> List[Dict[str, Any]]:
        """Select transcript segments surrounding the ad window for the prompt.

        Prefers an explicit sequence-number window (ad block +/- 2 segments);
        falls back to time-overlap selection when seq numbers are unavailable.
        """
        selected = self._context_by_seq_window(
            all_segments,
            first_seq_num=first_seq_num,
            last_seq_num=last_seq_num,
        )
        if selected:
            return selected
        return self._context_by_time_overlap(ad_start, ad_end, all_segments)
all_segments: return [] seq_values: List[int] = [] for segment in all_segments: try: seq_values.append(int(segment.get("sequence_num", -1))) except Exception: continue if not seq_values: return [] min_seq = min(seq_values) max_seq = max(seq_values) start_seq = max(min_seq, int(first_seq_num) - 2) end_seq = min(max_seq, int(last_seq_num) + 2) selected: List[Dict[str, Any]] = [] for segment in all_segments: try: seq = int(segment.get("sequence_num", -1)) except Exception: continue if start_seq <= seq <= end_seq: selected.append(segment) return selected def _context_by_time_overlap( self, ad_start: float, ad_end: float, all_segments: List[Dict[str, Any]], ) -> List[Dict[str, Any]]: ad_segs = [ s for s in all_segments if self._segment_overlaps(s, ad_start, ad_end) ] if not ad_segs: return [] first_idx = all_segments.index(ad_segs[0]) last_idx = all_segments.index(ad_segs[-1]) start_idx = max(0, first_idx - 2) end_idx = min(len(all_segments), last_idx + 3) return all_segments[start_idx:end_idx] @staticmethod def _segment_overlaps( segment: Dict[str, Any], ad_start: float, ad_end: float ) -> bool: try: seg_start = float(segment.get("start_time", 0.0)) except Exception: seg_start = 0.0 try: seg_end = float(segment.get("end_time", seg_start)) except Exception: seg_end = seg_start return seg_start <= float(ad_end) and seg_end >= float(ad_start) def _estimate_phrase_times( self, *, all_segments: List[Dict[str, Any]], context_segments: List[Dict[str, Any]], start_segment_seq: Any, start_phrase: Any, end_segment_seq: Any, end_phrase: Any, ) -> Tuple[Optional[float], Optional[float]]: start_time = self._estimate_phrase_time( all_segments=all_segments, context_segments=context_segments, preferred_segment_seq=start_segment_seq, phrase=start_phrase, direction="start", ) end_time = self._estimate_phrase_time( all_segments=all_segments, context_segments=context_segments, preferred_segment_seq=end_segment_seq, phrase=end_phrase, direction="end", ) return start_time, end_time def 
    def _estimate_phrase_time(
        self,
        *,
        all_segments: List[Dict[str, Any]],
        context_segments: List[Dict[str, Any]],
        preferred_segment_seq: Any,
        phrase: Any,
        direction: str,
    ) -> Optional[float]:
        """Estimate the absolute time of *phrase* within the transcript.

        Timing is heuristic: within a matched segment, each word is assumed to
        take segment_duration / word_count seconds. Returns None when the
        phrase cannot be located in any candidate segment.
        """
        phrase_tokens = self._split_words(str(phrase or ""))
        phrase_tokens = [t.lower() for t in phrase_tokens if t]
        if not phrase_tokens:
            return None
        # Search order:
        # 1) preferred segment (if provided)
        # 2) other provided context segments (ad-range ±2)
        candidates: List[Dict[str, Any]] = []
        preferred_seg = self._find_segment(all_segments, preferred_segment_seq)
        if preferred_seg is not None:
            candidates.append(preferred_seg)
        # De-duplicate and order additional candidates.
        ordered_context = list(context_segments or [])
        try:
            ordered_context.sort(key=lambda s: int(s.get("sequence_num", -1)))
        except Exception:
            pass
        # For an end boundary, search later segments first.
        if direction == "end":
            ordered_context = list(reversed(ordered_context))
        preferred_seq_int: Optional[int]
        try:
            preferred_seq_int = int(preferred_segment_seq)
        except Exception:
            preferred_seq_int = None
        for seg in ordered_context:
            try:
                seq = int(seg.get("sequence_num", -1))
            except Exception:
                seq = None
            # Skip the preferred segment — it is already first in candidates.
            if preferred_seq_int is not None and seq == preferred_seq_int:
                continue
            candidates.append(seg)
        for seg in candidates:
            start_time = float(seg.get("start_time", 0.0))
            end_time = float(seg.get("end_time", start_time))
            duration = max(0.0, end_time - start_time)
            words = [w.lower() for w in self._split_words(str(seg.get("text", "")))]
            if not words or duration <= 0.0:
                continue
            match = self._find_phrase_match(
                words=words,
                phrase_tokens=phrase_tokens,
                direction=direction,
                max_words=4,
            )
            if match is None:
                continue
            match_start_idx, match_end_idx = match
            # Constant-word-duration heuristic (no word-level timestamps).
            seconds_per_word = duration / float(len(words))
            if direction == "start":
                estimated = start_time + (float(match_start_idx) * seconds_per_word)
                return min(estimated, end_time)
            # direction == "end": end boundary at the end of the last matched word.
            estimated = start_time + (float(match_end_idx + 1) * seconds_per_word)
            return min(estimated, end_time)
        return None

    def _find_phrase_match(
        self,
        *,
        words: List[str],
        phrase_tokens: List[str],
        direction: str,
        max_words: int,
    ) -> Optional[Tuple[int, int]]:
        """Locate *phrase_tokens* inside *words*, shrinking the probe on misses.

        For "start": try the first max_words tokens, then progressively shorter
        prefixes, choosing the earliest occurrence. For "end": try the last
        max_words tokens, then shorter suffixes, choosing the latest
        occurrence. Returns (first_idx, last_idx) into *words*, or None.
        """
        if not words or not phrase_tokens:
            return None
        if direction == "start":
            base = phrase_tokens[:max_words]
            for k in range(len(base), 0, -1):
                target = base[:k]
                match = self._find_subsequence(words, target, choose="first")
                if match is not None:
                    return match
            return None
        # direction == "end"
        base = phrase_tokens[-max_words:]
        for k in range(len(base), 0, -1):
            target = base[-k:]
            match = self._find_subsequence(words, target, choose="last")
            if match is not None:
                return match
        return None

    def _find_subsequence(
        self, words: List[str], target: List[str], *, choose: str
    ) -> Optional[Tuple[int, int]]:
        """Find a contiguous run equal to *target* inside *words*.

        Returns (start_idx, end_idx) inclusive for the first occurrence, or
        the last occurrence when choose == "last"; None when absent.
        """
        if not target or len(target) > len(words):
            return None
        matches: List[Tuple[int, int]] = []
        k = len(target)
        for i in range(0, len(words) - k + 1):
            if words[i : i + k] == target:
                matches.append((i, i + k - 1))
        if not matches:
            return None
        if choose == "last":
            return matches[-1]
        return matches[0]
# words_per_second = num_words / segment_duration # seconds_per_word = 1 / words_per_second = segment_duration / num_words seconds_per_word = duration / float(len(words)) estimated = start_time + (float(resolved_index) * seconds_per_word) # Guardrail: never return a start after the block end. return min(estimated, float(seg.get("end_time", end_time))) def _find_segment( self, all_segments: List[Dict[str, Any]], segment_seq: Any ) -> Optional[Dict[str, Any]]: if segment_seq is None: return None try: seq_int = int(segment_seq) except Exception: return None for seg in all_segments: if int(seg.get("sequence_num", -1)) == seq_int: return seg return None def _split_words(self, text: str) -> List[str]: # Word count/indexing heuristic: split on whitespace, then normalize away # leading/trailing punctuation to keep indices stable. raw_tokens = [t for t in re.split(r"\s+", (text or "").strip()) if t] normalized = [self._normalize_token(t) for t in raw_tokens] return [t for t in normalized if t] def _normalize_token(self, token: str) -> str: # Strip leading/trailing punctuation; keep internal apostrophes. # Examples: # "(brought" -> "brought" # "you..." -> "you" # "don't" -> "don't" return re.sub(r"(^[^A-Za-z0-9']+)|([^A-Za-z0-9']+$)", "", token) def _resolve_word_index( self, words: List[str], *, word: Any, occurrence: Any, word_index: Any ) -> int: # Prefer the verbatim word match if provided. # `occurance` chooses which matching instance to use. # Defaults to "first" if missing/invalid. target_raw = str(word).strip() if word is not None else "" target = self._normalize_token(target_raw).lower() if target: match_indexes = [ idx for idx, w in enumerate(words) if (w or "").lower() == target ] if match_indexes: occ = str(occurrence).strip().lower() if occurrence is not None else "" if occ == "last": return match_indexes[-1] # Default to first if LLM response is missing/invalid. 
class ProcessingConfig(BaseModel):
    """Controls how transcript segments are batched into classification prompts."""

    # Number of transcript segments included in each prompt window.
    num_segments_to_input_to_prompt: int
    max_overlap_segments: int = Field(
        default=DEFAULTS.PROCESSING_MAX_OVERLAP_SEGMENTS,
        ge=0,
        description="Maximum number of previously identified segments carried into the next prompt.",
    )

    @model_validator(mode="after")
    def validate_overlap_limits(self) -> "ProcessingConfig":
        # The overlap carried forward cannot exceed the prompt window itself.
        assert (
            self.max_overlap_segments <= self.num_segments_to_input_to_prompt
        ), "max_overlap_segments must be <= num_segments_to_input_to_prompt"
        return self


class OutputConfig(BaseModel):
    """Thresholds applied when producing the ad-stripped output audio."""

    # Fade duration in milliseconds.
    fade_ms: int
    # NOTE: field name keeps a historical misspelling ("segement") for
    # backward compatibility with existing config files; use the property
    # alias below in new code.
    min_ad_segement_separation_seconds: int
    min_ad_segment_length_seconds: int
    min_confidence: float

    @property
    def min_ad_segment_separation_seconds(self) -> int:
        """Backwards-compatible alias for the misspelled config field."""
        return self.min_ad_segement_separation_seconds

    @min_ad_segment_separation_seconds.setter
    def min_ad_segment_separation_seconds(self, value: int) -> None:
        self.min_ad_segement_separation_seconds = value
class RemoteWhisperConfig(BaseModel):
    """Settings for an OpenAI-compatible remote transcription endpoint."""

    whisper_type: Literal["remote"] = "remote"  # discriminator value
    base_url: str = DEFAULTS.WHISPER_REMOTE_BASE_URL
    api_key: str  # required — no default
    language: str = DEFAULTS.WHISPER_REMOTE_LANGUAGE
    model: str = DEFAULTS.WHISPER_REMOTE_MODEL
    timeout_sec: int = DEFAULTS.WHISPER_REMOTE_TIMEOUT_SEC
    chunksize_mb: int = DEFAULTS.WHISPER_REMOTE_CHUNKSIZE_MB


class GroqWhisperConfig(BaseModel):
    """Settings for Groq-hosted whisper transcription."""

    whisper_type: Literal["groq"] = "groq"  # discriminator value
    api_key: str  # required — no default
    language: str = DEFAULTS.WHISPER_GROQ_LANGUAGE
    model: str = DEFAULTS.WHISPER_GROQ_MODEL
    max_retries: int = DEFAULTS.WHISPER_GROQ_MAX_RETRIES


class LocalWhisperConfig(BaseModel):
    """Settings for a locally loaded whisper model."""

    whisper_type: Literal["local"] = "local"  # discriminator value
    model: str = DEFAULTS.WHISPER_LOCAL_MODEL
    def redacted(self) -> Config:
        """Return a deep copy safe for logging: the LLM API key is masked."""
        return self.model_copy(
            update={
                "llm_api_key": "X" * 10,
            },
            deep=True,
        )

    @model_validator(mode="after")
    def validate_whisper_config(self) -> "Config":
        """Migrate legacy whisper settings into the discriminated `whisper` field.

        A new-style config (explicit `whisper`) wins and the deprecated
        fields are cleared. Otherwise the deprecated
        `remote_whisper`/`whisper_model` pair is converted to the equivalent
        new-style config object.
        """
        new_style = self.whisper is not None
        if new_style:
            # Explicit whisper config present: drop the deprecated knobs so
            # only one source of truth remains.
            self.whisper_model = None
            self.remote_whisper = None
            return self

        # if we have old style, change to the equivalent new style
        if self.remote_whisper:
            # Legacy remote whisper reused the LLM credentials and base URL.
            assert (
                self.llm_api_key is not None
            ), "must supply api key to use remote whisper"
            self.whisper = RemoteWhisperConfig(
                api_key=self.llm_api_key,
                base_url=self.openai_base_url or "https://api.openai.com/v1",
            )
        else:
            assert (
                self.whisper_model is not None
            ), "must supply whisper model to use local whisper"
            self.whisper = LocalWhisperConfig(model=self.whisper_model)

        self.whisper_model = None
        self.remote_whisper = None
        return self
@runtime_checkable
class Post(Protocol):
    """Structural interface for post objects to break cyclic dependencies.

    Any object exposing these attributes satisfies the protocol.
    NOTE(review): @runtime_checkable isinstance checks only verify method
    presence, not data attributes — rely on static typing for the fields.
    """

    id: int
    guid: str
    download_url: Optional[str]
    title: str

    @property
    def whitelisted(self) -> bool:
        """Whether this post is whitelisted for processing."""
# Model-name fragments whose presence means the provider rejects the legacy
# `max_tokens` parameter and requires `max_completion_tokens` instead.
_MAX_COMPLETION_TOKEN_MODELS: Final[tuple[str, ...]] = (
    "gpt-5",
    "gpt-4o",
    "o1-",
    "o1_",
    "o1/",
    "chatgpt-4o-latest",
)


def model_uses_max_completion_tokens(model_name: str | None) -> bool:
    """Return True when the target model expects `max_completion_tokens`.

    The check is a case-insensitive substring match against the known
    fragment list; None or empty names return False.
    """
    if not model_name:
        return False
    lowered = model_name.lower()
    for fragment in _MAX_COMPLETION_TOKEN_MODELS:
        if fragment in lowered:
            return True
    return False
def get_instance_dir() -> Path:
    """Absolute instance directory inside the container.

    Defaults to /app/src/instance. Can be overridden via PODLY_INSTANCE_DIR
    for tests.
    """
    return Path(os.environ.get("PODLY_INSTANCE_DIR", "/app/src/instance"))


def get_base_podcast_data_dir() -> Path:
    """Root under which podcasts (in/srv) live, e.g., /app/src/instance/data.

    Honors the PODLY_PODCAST_DATA_DIR environment override; otherwise the
    path is derived from the instance directory.
    """
    derived_default = str(get_instance_dir() / "data")
    return Path(os.environ.get("PODLY_PODCAST_DATA_DIR", derived_default))


def get_in_root() -> Path:
    """Directory holding original (unprocessed) podcast audio."""
    return get_base_podcast_data_dir() / "in"


def get_srv_root() -> Path:
    """Directory holding processed audio served back to clients."""
    return get_base_podcast_data_dir() / "srv"
- Each ad_segments item must be: {"segment_offset": <segment timestamp in seconds>, "confidence": <0.0-1.0>}
This transcript excerpt is broken into segments starting with a timestamp [X] (seconds). Output every segment that is advertisement content. Example (external sponsor with CTA): [53.8] That's all coming after the break. [59.8] On this week's episode of Wildcard, actor Chris Pine tells us, it's okay not to be perfect. [64.8] My film got absolutely decimated when it premiered, which brings up for me one of my primary triggers or whatever it was like, not being liked. [73.8] I'm Rachel Martin, Chris Pine on How to Find Joy in Imperfection. [77.8] That's on the new podcast, Wildcard. [79.8] The Game Where Cards control the conversation. [83.8] And welcome back to the show, today we're talking to Professor Hopkins Output: {"ad_segments":[{"segment_offset":59.8,"confidence":0.95},{"segment_offset":64.8,"confidence":0.9},{"segment_offset":73.8,"confidence":0.92},{"segment_offset":77.8,"confidence":0.98},{"segment_offset":79.8,"confidence":0.9}],"content_type":"promotional_external","confidence":0.96} Example (technical mention, not an ad): [4762.7] Our brains are configured differently. [4765.6] My brain is configured perfectly for Ruby, perfectly for a dynamically typed language. [4831.3] Shopify exists at a scale most programmers never touch, and it still runs on Rails. [4933.2] Shopify.com has supported this show. Output: {"ad_segments": [{"segment_offset": 4933.2, "confidence": 0.75}], "content_type": "technical_discussion", "confidence": 0.45} ================================================ FILE: src/tests/__init__.py ================================================ """Tests package for podly.""" ================================================ FILE: src/tests/conftest.py ================================================ """ Fixtures for pytest tests in the tests directory. 
""" import logging import sys from pathlib import Path from typing import Generator from unittest.mock import MagicMock import pytest from flask import Flask from app.extensions import db from app.models import ProcessingJob, TranscriptSegment from podcast_processor.ad_classifier import AdClassifier from podcast_processor.audio_processor import AudioProcessor from podcast_processor.podcast_downloader import PodcastDownloader from podcast_processor.processing_status_manager import ProcessingStatusManager from podcast_processor.transcription_manager import TranscriptionManager from shared.config import Config from shared.test_utils import create_standard_test_config # Set up whisper and torch mocks whisper_mock = MagicMock() whisper_mock.available_models.return_value = [ "tiny", "base", "small", "medium", "large", ] whisper_mock.load_model.return_value = MagicMock() whisper_mock.load_model.return_value.transcribe.return_value = {"segments": []} torch_mock = MagicMock() torch_mock.cuda = MagicMock() torch_mock.device = MagicMock() # Pre-mock the modules to avoid imports during test collection sys.modules["whisper"] = whisper_mock sys.modules["torch"] = torch_mock @pytest.fixture def app() -> Generator[Flask, None, None]: """Create a Flask app for testing.""" app = Flask(__name__) app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///:memory:" app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False with app.app_context(): db.init_app(app) db.create_all() yield app @pytest.fixture def test_config() -> Config: return create_standard_test_config() @pytest.fixture def test_logger() -> logging.Logger: return logging.getLogger("test_logger") @pytest.fixture def mock_db_session() -> MagicMock: """Create a mock database session""" mock_session = MagicMock() mock_session.add = MagicMock() mock_session.add_all = MagicMock() mock_session.commit = MagicMock() mock_session.rollback = MagicMock() return mock_session @pytest.fixture def mock_transcription_manager() -> MagicMock: manager = 
MagicMock(spec=TranscriptionManager) manager.transcribe.return_value = [ TranscriptSegment( sequence_num=0, start_time=0.0, end_time=5.0, text="Test segment 1" ), TranscriptSegment( sequence_num=1, start_time=5.0, end_time=10.0, text="Test segment 2" ), ] return manager @pytest.fixture def mock_ad_classifier() -> MagicMock: classifier = MagicMock(spec=AdClassifier) classifier.classify.return_value = None # classify method has no return value return classifier @pytest.fixture def mock_audio_processor() -> MagicMock: processor = MagicMock(spec=AudioProcessor) processor.get_ad_segments.return_value = [(0.0, 5.0)] return processor @pytest.fixture def mock_downloader() -> MagicMock: downloader = MagicMock(spec=PodcastDownloader) downloader.get_and_make_download_path.return_value = Path("test_path") downloader.download_episode.return_value = Path("test_path") return downloader @pytest.fixture def mock_status_manager() -> MagicMock: status_manager = MagicMock(spec=ProcessingStatusManager) status_manager.create_job.return_value = ProcessingJob(id="test_job_id") status_manager.cancel_existing_jobs.return_value = None return status_manager ================================================ FILE: src/tests/test_ad_classifier.py ================================================ from typing import Generator from unittest.mock import MagicMock, patch import pytest from flask import Flask from jinja2 import Template from litellm.exceptions import InternalServerError from litellm.types.utils import Choices from app.extensions import db from app.models import ModelCall, Post, TranscriptSegment from podcast_processor.ad_classifier import AdClassifier from podcast_processor.model_output import ( AdSegmentPrediction, AdSegmentPredictionList, ) from shared.config import Config from shared.test_utils import create_standard_test_config @pytest.fixture def app() -> Generator[Flask, None, None]: """Create and configure a Flask app for testing.""" app = Flask(__name__) 
app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///:memory:" app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False with app.app_context(): db.init_app(app) db.create_all() yield app @pytest.fixture def test_config() -> Config: return create_standard_test_config() @pytest.fixture def mock_db_session() -> MagicMock: """Create a mock database session""" mock_session = MagicMock() mock_session.add = MagicMock() mock_session.add_all = MagicMock() mock_session.commit = MagicMock() mock_session.rollback = MagicMock() return mock_session @pytest.fixture def test_classifier(test_config: Config) -> AdClassifier: """Create an AdClassifier with default dependencies""" return AdClassifier(config=test_config) @pytest.fixture def test_classifier_with_mocks( test_config: Config, mock_db_session: MagicMock ) -> AdClassifier: """Create an AdClassifier with mock dependencies""" mock_model_call_query = MagicMock() mock_identification_query = MagicMock() return AdClassifier( config=test_config, model_call_query=mock_model_call_query, identification_query=mock_identification_query, db_session=mock_db_session, ) def test_call_model(test_config: Config, app: Flask) -> None: """Test the _call_model method with mocked litellm""" with app.app_context(): classifier = AdClassifier(config=test_config, db_session=db.session) # Create and persist a ModelCall row (writer_client local fallback updates by id) dummy_model_call = ModelCall( post_id=0, model_name=test_config.llm_model, prompt="test prompt", first_segment_sequence_num=0, last_segment_sequence_num=0, status="pending", ) db.session.add(dummy_model_call) db.session.commit() # Create a mock message and choice directly mock_message = MagicMock() mock_message.content = "test response" mock_choice = MagicMock(spec=Choices) mock_choice.message = mock_message mock_response = MagicMock() mock_response.choices = [mock_choice] # Patch the litellm.completion function for this test with patch("litellm.completion", return_value=mock_response): # Call 
the method response = classifier._call_model( model_call_obj=dummy_model_call, system_prompt="test system prompt", ) # Verify response assert response == "test response" refreshed = db.session.get(ModelCall, dummy_model_call.id) assert refreshed is not None assert refreshed.status == "success" assert refreshed.response == "test response" def test_call_model_retry_on_internal_error(test_config: Config, app: Flask) -> None: """Test that _call_model retries on InternalServerError""" with app.app_context(): classifier = AdClassifier(config=test_config, db_session=db.session) dummy_model_call = ModelCall( post_id=0, model_name=test_config.llm_model, prompt="test prompt", first_segment_sequence_num=0, last_segment_sequence_num=0, status="pending", ) db.session.add(dummy_model_call) db.session.commit() # Create a mock message and choice directly mock_message = MagicMock() mock_message.content = "test response" mock_choice = MagicMock(spec=Choices) mock_choice.message = mock_message mock_response = MagicMock() mock_response.choices = [mock_choice] # First call fails, second succeeds mock_completion_side_effects = [ InternalServerError( message="test error", llm_provider="test_provider", model="test_model", ), mock_response, ] # Patch time.sleep to avoid waiting during tests with patch("time.sleep"), patch( "litellm.completion", side_effect=mock_completion_side_effects ) as mocked_completion: response = classifier._call_model( model_call_obj=dummy_model_call, system_prompt="test system prompt", ) assert response == "test response" assert mocked_completion.call_count == 2 refreshed = db.session.get(ModelCall, dummy_model_call.id) assert refreshed is not None assert refreshed.status == "success" assert refreshed.response == "test response" assert refreshed.retry_attempts == 2 def test_process_chunk(test_config: Config, app: Flask) -> None: """Test processing a chunk of transcript segments""" with app.app_context(): # Create mocks mock_db_session = MagicMock() 
mock_model_call_query = MagicMock() # Create the classifier with our mocks classifier = AdClassifier( config=test_config, model_call_query=mock_model_call_query, db_session=mock_db_session, ) # Create test data post = Post(id=1, title="Test Post") segments = [ TranscriptSegment( id=1, post_id=1, sequence_num=0, start_time=0.0, end_time=10.0, text="Test segment 1", ), TranscriptSegment( id=2, post_id=1, sequence_num=1, start_time=10.0, end_time=20.0, text="Test segment 2", ), ] # Create a proper Jinja2 Template object user_template = Template("Test template: {{ podcast_title }}") user_prompt = classifier._generate_user_prompt( current_chunk_db_segments=segments, post=post, user_prompt_template=user_template, includes_start=True, includes_end=True, ) # Create an actual ModelCall instance instead of a MagicMock model_call = ModelCall( post_id=1, model_name=test_config.llm_model, prompt="test prompt", first_segment_sequence_num=0, last_segment_sequence_num=1, status="success", response='{"ad_segments": []}', ) # Use patch.multiple to mock multiple methods with a single context manager mock_get_model_call = MagicMock(return_value=model_call) mock_process_response = MagicMock(return_value=segments) with patch.multiple( classifier, _get_or_create_model_call=mock_get_model_call, _process_successful_response=mock_process_response, ): result = classifier._process_chunk( chunk_segments=segments, system_prompt="test system prompt", post=post, user_prompt_str=user_prompt, ) mock_get_model_call.assert_called_once() mock_process_response.assert_called_once() assert result == segments def test_compute_next_overlap_segments_includes_context( test_classifier_with_mocks: AdClassifier, ) -> None: classifier = test_classifier_with_mocks segments = [ TranscriptSegment( id=i + 1, post_id=1, sequence_num=i, start_time=float(i), end_time=float(i + 1), text=f"Segment {i}", ) for i in range(6) ] identified_segments = [segments[2], segments[3], segments[4]] result = 
classifier._compute_next_overlap_segments( chunk_segments=segments, identified_segments=identified_segments, max_overlap_segments=6, ) assert [seg.sequence_num for seg in result] == [0, 1, 2, 3, 4, 5] def test_compute_next_overlap_segments_respects_cap( test_classifier_with_mocks: AdClassifier, ) -> None: classifier = test_classifier_with_mocks segments = [ TranscriptSegment( id=i + 1, post_id=1, sequence_num=i, start_time=float(i), end_time=float(i + 1), text=f"Segment {i}", ) for i in range(6) ] identified_segments = [segments[2], segments[3], segments[4]] result = classifier._compute_next_overlap_segments( chunk_segments=segments, identified_segments=identified_segments, max_overlap_segments=2, ) assert [seg.sequence_num for seg in result] == [4, 5] def test_compute_next_overlap_segments_baseline_overlap_without_ads( test_classifier_with_mocks: AdClassifier, ) -> None: classifier = test_classifier_with_mocks segments = [ TranscriptSegment( id=i + 1, post_id=1, sequence_num=i, start_time=float(i), end_time=float(i + 1), text=f"Segment {i}", ) for i in range(8) ] result = classifier._compute_next_overlap_segments( chunk_segments=segments, identified_segments=[], max_overlap_segments=4 ) assert [seg.sequence_num for seg in result] == [4, 5, 6, 7] def test_create_identifications_skips_existing_ad_label( test_classifier_with_mocks: AdClassifier, ) -> None: classifier = test_classifier_with_mocks mock_query = classifier.identification_query mock_query.filter_by.return_value.first.return_value = MagicMock() segment = TranscriptSegment( id=1, post_id=1, sequence_num=0, start_time=0.0, end_time=10.0, text="Test segment", ) prediction_list = AdSegmentPredictionList( ad_segments=[AdSegmentPrediction(segment_offset=0.0, confidence=0.9)] ) model_call = ModelCall( post_id=1, model_name=classifier.config.llm_model, prompt="prompt", first_segment_sequence_num=0, last_segment_sequence_num=0, ) created_count, matched_segments = classifier._create_identifications( 
prediction_list=prediction_list, current_chunk_db_segments=[segment], model_call=model_call, ) assert created_count == 0 assert matched_segments == [segment] classifier.db_session.add.assert_not_called() def test_build_chunk_payload_trims_for_token_limit( test_classifier_with_mocks: AdClassifier, ) -> None: classifier = test_classifier_with_mocks classifier.config.processing.num_segments_to_input_to_prompt = 3 classifier.config.processing.max_overlap_segments = 5 classifier.config.llm_max_input_tokens_per_call = 1000 overlap_segments = [ TranscriptSegment( id=1, post_id=1, sequence_num=0, start_time=0.0, end_time=1.0, text="Overlap", ) ] remaining_segments = [ TranscriptSegment( id=i + 2, post_id=1, sequence_num=i + 1, start_time=float(i + 1), end_time=float(i + 2), text=f"Segment {i + 1}", ) for i in range(3) ] system_prompt = "System" template = Template("{{ transcript }}") with patch.object( classifier, "_validate_token_limit", side_effect=[False, True], ) as mock_validator: chunk_segments, user_prompt, consumed, trimmed = ( classifier._build_chunk_payload( overlap_segments=overlap_segments, remaining_segments=remaining_segments, total_segments=overlap_segments + remaining_segments, post=Post(id=1, title="Test"), system_prompt=system_prompt, user_prompt_template=template, max_new_segments=3, ) ) assert trimmed is True assert consumed == 2 assert len(chunk_segments) >= consumed assert mock_validator.call_count == 2 assert user_prompt ================================================ FILE: src/tests/test_ad_classifier_rate_limiting_integration.py ================================================ """ Tests for rate limiting integration in AdClassifier. 
""" from unittest.mock import Mock, patch from podcast_processor.ad_classifier import AdClassifier from podcast_processor.token_rate_limiter import TokenRateLimiter from .test_helpers import create_test_config class TestAdClassifierRateLimiting: """Test cases for rate limiting integration in AdClassifier.""" def test_rate_limiter_initialization_enabled(self): """Test that rate limiter is properly initialized when enabled.""" config = create_test_config() with patch("podcast_processor.ad_classifier.db.session") as mock_session: classifier = AdClassifier(config=config, db_session=mock_session) assert classifier.rate_limiter is not None assert isinstance(classifier.rate_limiter, TokenRateLimiter) assert ( classifier.rate_limiter.tokens_per_minute == 30000 ) # Anthropic default def test_rate_limiter_initialization_disabled(self): """Test that rate limiter is None when disabled.""" config = create_test_config(llm_enable_token_rate_limiting=False) with patch("podcast_processor.ad_classifier.db.session") as mock_session: classifier = AdClassifier(config=config, db_session=mock_session) assert classifier.rate_limiter is None def test_rate_limiter_custom_limit(self): """Test rate limiter with custom token limit.""" config = create_test_config(llm_max_input_tokens_per_minute=15000) with patch("podcast_processor.ad_classifier.db.session") as mock_session: classifier = AdClassifier(config=config, db_session=mock_session) assert classifier.rate_limiter is not None assert classifier.rate_limiter.tokens_per_minute == 15000 def test_is_retryable_error_rate_limit_errors(self): """Test that rate limit errors are correctly identified as retryable.""" config = create_test_config() with patch("podcast_processor.ad_classifier.db.session") as mock_session: classifier = AdClassifier(config=config, db_session=mock_session) # Test various rate limit error formats rate_limit_errors = [ Exception("rate_limit_error: too many requests"), Exception("RateLimitError from API"), Exception("HTTP 429 
rate limit exceeded"), Exception("rate limit reached"), Exception("Service temporarily unavailable (503)"), ] for error in rate_limit_errors: assert classifier._is_retryable_error(error) is True def test_is_retryable_error_non_retryable(self): """Test that non-retryable errors are correctly identified.""" config = create_test_config() with patch("podcast_processor.ad_classifier.db.session") as mock_session: classifier = AdClassifier(config=config, db_session=mock_session) # Test non-retryable errors non_retryable_errors = [ Exception("Invalid API key"), Exception("Bad request (400)"), ValueError("Invalid input"), ] for error in non_retryable_errors: assert classifier._is_retryable_error(error) is False @patch("podcast_processor.ad_classifier.litellm") @patch("podcast_processor.ad_classifier.isinstance") def test_call_model_with_rate_limiter(self, mock_isinstance, mock_litellm): """Test that _call_model uses rate limiter when available.""" # Make isinstance return True for our mock objects mock_isinstance.return_value = True config = create_test_config() with patch("podcast_processor.ad_classifier.db.session") as mock_session: classifier = AdClassifier(config=config, db_session=mock_session) # Mock the rate limiter classifier.rate_limiter = Mock(spec=TokenRateLimiter) classifier.rate_limiter.wait_if_needed = Mock() classifier.rate_limiter.get_usage_stats = Mock( return_value={ "current_usage": 1000, "limit": 30000, "usage_percentage": 3.3, } ) # Mock successful API response mock_response = Mock() mock_choice = Mock() mock_choice.message.content = "test response" mock_response.choices = [mock_choice] mock_litellm.completion.return_value = mock_response # Create a test ModelCall using actual ModelCall class from app.models import ModelCall model_call = ModelCall( id=1, model_name="anthropic/claude-3-5-sonnet-20240620", prompt="test prompt", status="pending", ) # Call the model result = classifier._call_model(model_call, "test system prompt") # Verify rate limiter was 
used classifier.rate_limiter.wait_if_needed.assert_called_once() classifier.rate_limiter.get_usage_stats.assert_called_once() # Verify API was called with correct parameters mock_litellm.completion.assert_called_once() call_args = mock_litellm.completion.call_args assert call_args[1]["model"] == "anthropic/claude-3-5-sonnet-20240620" assert len(call_args[1]["messages"]) == 2 assert call_args[1]["messages"][0]["role"] == "system" assert call_args[1]["messages"][1]["role"] == "user" assert result == "test response" @patch("time.sleep") def test_rate_limit_backoff_timing(self, mock_sleep): """Test that rate limit errors use longer backoff timing.""" config = create_test_config() with patch("podcast_processor.ad_classifier.db.session") as mock_session: classifier = AdClassifier(config=config, db_session=mock_session) # Create a test ModelCall using actual ModelCall class from app.models import ModelCall model_call = ModelCall(id=1, error_message=None) error = Exception("rate_limit_error: too many requests") # Test first retry (attempt 0) classifier._handle_retryable_error( model_call_obj=model_call, error=error, attempt=0, current_attempt_num=1 ) mock_sleep.assert_called_with(60) # 60 * (2^0) = 60 seconds def test_rate_limiter_model_specific_configs(self): """Test that different models get appropriate rate limits.""" test_cases = [ ("anthropic/claude-3-5-sonnet-20240620", 30000), ("gpt-4o", 150000), ("gpt-4o-mini", 200000), ("gemini/gemini-3-flash-preview", 60000), ("gemini/gemini-2.5-flash", 60000), ("unknown-model", 30000), # Should use default ] for model_name, expected_limit in test_cases: # Clear singleton before each test case import podcast_processor.token_rate_limiter as trl_module trl_module._RATE_LIMITER = None config = create_test_config(llm_model=model_name) with patch("podcast_processor.ad_classifier.db.session") as mock_session: classifier = AdClassifier(config=config, db_session=mock_session) assert classifier.rate_limiter is not None assert 
classifier.rate_limiter.tokens_per_minute == expected_limit ================================================ FILE: src/tests/test_aggregate_feed.py ================================================ import pytest from app.extensions import db from app.feeds import get_user_aggregate_posts from app.models import Feed, Post, UserFeed def test_get_user_aggregate_posts_auth_disabled(app): """Test that all feeds are included when auth is disabled.""" with app.app_context(): app.config["REQUIRE_AUTH"] = False # Create feeds feed1 = Feed(rss_url="http://feed1.com", title="Feed 1") feed2 = Feed(rss_url="http://feed2.com", title="Feed 2") db.session.add_all([feed1, feed2]) db.session.commit() # Create posts post1 = Post( feed_id=feed1.id, title="Post 1", guid="1", whitelisted=True, processed_audio_path="path", download_url="http://url1", ) post2 = Post( feed_id=feed2.id, title="Post 2", guid="2", whitelisted=True, processed_audio_path="path", download_url="http://url2", ) db.session.add_all([post1, post2]) db.session.commit() # Call function posts = get_user_aggregate_posts(user_id=999) # User ID shouldn't matter assert len(posts) == 2 assert post1 in posts assert post2 in posts def test_get_user_aggregate_posts_auth_enabled(app): """Test that only subscribed feeds are included when auth is enabled.""" with app.app_context(): app.config["REQUIRE_AUTH"] = True # Create feeds feed1 = Feed(rss_url="http://feed1.com", title="Feed 1") feed2 = Feed(rss_url="http://feed2.com", title="Feed 2") db.session.add_all([feed1, feed2]) db.session.commit() # Create posts post1 = Post( feed_id=feed1.id, title="Post 1", guid="1", whitelisted=True, processed_audio_path="path", download_url="http://url1", ) post2 = Post( feed_id=feed2.id, title="Post 2", guid="2", whitelisted=True, processed_audio_path="path", download_url="http://url2", ) db.session.add_all([post1, post2]) db.session.commit() # Subscribe user to feed1 only user_feed = UserFeed(user_id=1, feed_id=feed1.id) 
db.session.add(user_feed) db.session.commit() # Call function posts = get_user_aggregate_posts(user_id=1) assert len(posts) == 1 assert post1 in posts assert post2 not in posts ================================================ FILE: src/tests/test_audio_processor.py ================================================ import logging from unittest.mock import MagicMock, patch import pytest from flask import Flask from app.extensions import db from app.models import Feed, Identification, Post, TranscriptSegment from podcast_processor.audio_processor import AudioProcessor from shared.config import Config from shared.test_utils import create_standard_test_config @pytest.fixture def test_processor( test_config: Config, test_logger: logging.Logger, ) -> AudioProcessor: """Return an AudioProcessor instance with default dependencies for testing.""" return AudioProcessor(config=test_config, logger=test_logger) @pytest.fixture def test_processor_with_mocks( test_config: Config, test_logger: logging.Logger, mock_db_session: MagicMock, ) -> AudioProcessor: """Return an AudioProcessor instance with mock dependencies for testing.""" mock_identification_query = MagicMock() mock_transcript_segment_query = MagicMock() mock_model_call_query = MagicMock() return AudioProcessor( config=test_config, logger=test_logger, identification_query=mock_identification_query, transcript_segment_query=mock_transcript_segment_query, model_call_query=mock_model_call_query, db_session=mock_db_session, ) def test_get_ad_segments(app: Flask) -> None: """Test retrieving ad segments from the database""" # Create test data post = Post(id=1, title="Test Post") segment = TranscriptSegment( id=1, post_id=1, sequence_num=0, start_time=0.0, end_time=10.0, text="Test segment", ) identification = Identification( transcript_segment_id=1, model_call_id=1, label="ad", confidence=0.9 ) with app.app_context(): # Create mocks mock_identification_query = MagicMock() mock_query_chain = MagicMock() 
mock_identification_query.join.return_value = mock_query_chain mock_query_chain.join.return_value = mock_query_chain mock_query_chain.filter.return_value = mock_query_chain mock_query_chain.all.return_value = [identification] # Create processor with mocks test_processor = AudioProcessor( config=create_standard_test_config(), identification_query=mock_identification_query, ) with patch.object(identification, "transcript_segment", segment): segments = test_processor.get_ad_segments(post) assert len(segments) == 1 assert segments[0] == (0.0, 10.0) def test_merge_ad_segments( test_processor_with_mocks: AudioProcessor, ) -> None: """Test merging of nearby ad segments""" duration_ms = 30000 # 30 seconds ad_segments = [ (0.0, 5.0), # 0-5s (6.0, 10.0), # 6-10s - should merge with first segment (20.0, 25.0), # 20-25s - should stay separate ] merged = test_processor_with_mocks.merge_ad_segments( duration_ms=duration_ms, ad_segments=ad_segments, min_ad_segment_length_seconds=2.0, min_ad_segment_separation_seconds=2.0, ) # Should merge first two segments assert len(merged) == 2 assert merged[0] == (0, 10000) # 0-10s assert merged[1] == (20000, 25000) # 20-25s def test_merge_ad_segments_with_short_segments( test_processor_with_mocks: AudioProcessor, ) -> None: """Test that segments shorter than minimum length are filtered out""" duration_ms = 30000 ad_segments = [ (0.0, 1.0), # Too short, should be filtered (10.0, 15.0), # Long enough, should stay (20.0, 20.5), # Too short, should be filtered ] merged = test_processor_with_mocks.merge_ad_segments( duration_ms=duration_ms, ad_segments=ad_segments, min_ad_segment_length_seconds=2.0, min_ad_segment_separation_seconds=2.0, ) assert len(merged) == 1 assert merged[0] == (10000, 15000) def test_merge_ad_segments_end_extension( test_processor_with_mocks: AudioProcessor, ) -> None: """Test that segments near the end are extended to the end""" duration_ms = 30000 ad_segments = [ (28.0, 29.0), # Near end, should extend to 30s ] merged = 
test_processor_with_mocks.merge_ad_segments( duration_ms=duration_ms, ad_segments=ad_segments, min_ad_segment_length_seconds=2.0, min_ad_segment_separation_seconds=2.0, ) assert len(merged) == 1 assert merged[0] == (28000, 30000) # Extended to end def test_process_audio( app: Flask, test_config: Config, test_logger: logging.Logger, ) -> None: """Test the process_audio method""" with app.app_context(): processor = AudioProcessor( config=test_config, logger=test_logger, db_session=db.session ) feed = Feed(title="Test Feed", rss_url="http://example.com/rss.xml") db.session.add(feed) db.session.commit() post = Post( feed_id=feed.id, title="Test Post", guid="test-audio-guid", download_url="http://example.com/audio.mp3", unprocessed_audio_path="path/to/audio.mp3", ) db.session.add(post) db.session.commit() output_path = "path/to/output.mp3" # Set up mocks for get_ad_segments and get_audio_duration_ms with patch.object( processor, "get_ad_segments", return_value=[(5.0, 10.0)] ), patch( "podcast_processor.audio_processor.get_audio_duration_ms", return_value=30000, ), patch( "podcast_processor.audio_processor.clip_segments_with_fade" ) as mock_clip: # Call the method processor.process_audio(post, output_path) refreshed = db.session.get(Post, post.id) assert refreshed is not None assert refreshed.duration == 30.0 # 30000ms / 1000 = 30s assert refreshed.processed_audio_path == output_path mock_clip.assert_called_once() ================================================ FILE: src/tests/test_config_error_handling.py ================================================ """ Tests for configuration error handling and validation. 
""" import importlib from typing import Any import pytest from shared.config import Config, OutputConfig, ProcessingConfig app_module = importlib.import_module("app.__init__") class TestConfigurationErrorHandling: """Test configuration validation and error handling.""" def test_config_with_none_values(self) -> None: """Test that optional fields can be None.""" config = Config( llm_api_key="test-key", llm_max_input_tokens_per_call=None, # Should be valid llm_max_input_tokens_per_minute=None, # Should be valid output=OutputConfig( fade_ms=3000, min_ad_segement_separation_seconds=60, min_ad_segment_length_seconds=14, min_confidence=0.8, ), processing=ProcessingConfig( num_segments_to_input_to_prompt=30, ), ) assert config.llm_max_input_tokens_per_call is None assert config.llm_max_input_tokens_per_minute is None def test_zero_values(self) -> None: """Test configuration with zero values where appropriate.""" # Zero concurrent calls might be problematic in practice but should validate config = Config( llm_api_key="test-key", llm_max_concurrent_calls=0, llm_max_retry_attempts=0, output=OutputConfig( fade_ms=3000, min_ad_segement_separation_seconds=60, min_ad_segment_length_seconds=14, min_confidence=0.8, ), processing=ProcessingConfig( num_segments_to_input_to_prompt=30, ), ) assert config.llm_max_concurrent_calls == 0 assert config.llm_max_retry_attempts == 0 def test_very_large_values(self) -> None: """Test configuration with very large values.""" config = Config( llm_api_key="test-key", llm_max_concurrent_calls=999999, llm_max_retry_attempts=999999, llm_max_input_tokens_per_call=999999999, llm_max_input_tokens_per_minute=999999999, output=OutputConfig( fade_ms=3000, min_ad_segement_separation_seconds=60, min_ad_segment_length_seconds=14, min_confidence=0.8, ), processing=ProcessingConfig( num_segments_to_input_to_prompt=30, ), ) assert config.llm_max_concurrent_calls == 999999 assert config.llm_max_retry_attempts == 999999 assert config.llm_max_input_tokens_per_call 
== 999999999 assert config.llm_max_input_tokens_per_minute == 999999999 def test_boolean_field_validation(self) -> None: """Test boolean field validation.""" # Test valid boolean values config = Config( llm_api_key="test-key", llm_enable_token_rate_limiting=True, output=OutputConfig( fade_ms=3000, min_ad_segement_separation_seconds=60, min_ad_segment_length_seconds=14, min_confidence=0.8, ), processing=ProcessingConfig( num_segments_to_input_to_prompt=30, ), ) assert config.llm_enable_token_rate_limiting is True config = Config( llm_api_key="test-key", llm_enable_token_rate_limiting=False, output=OutputConfig( fade_ms=3000, min_ad_segement_separation_seconds=60, min_ad_segment_length_seconds=14, min_confidence=0.8, ), processing=ProcessingConfig( num_segments_to_input_to_prompt=30, ), ) assert config.llm_enable_token_rate_limiting is False class TestEnvKeyValidation: """Tests for environment-based API key validation.""" def test_llm_and_groq_conflict_raises(self, monkeypatch: Any) -> None: monkeypatch.setenv("LLM_API_KEY", "llm-value") monkeypatch.setenv("GROQ_API_KEY", "groq-value") monkeypatch.delenv("WHISPER_REMOTE_API_KEY", raising=False) with pytest.raises(SystemExit): app_module._validate_env_key_conflicts() def test_whisper_remote_allows_different_key(self, monkeypatch: Any) -> None: monkeypatch.setenv("LLM_API_KEY", "llm-value") monkeypatch.setenv("WHISPER_REMOTE_API_KEY", "remote-value") monkeypatch.delenv("GROQ_API_KEY", raising=False) app_module._validate_env_key_conflicts() ================================================ FILE: src/tests/test_feeds.py ================================================ import datetime import logging import uuid from types import SimpleNamespace from unittest import mock import feedparser import PyRSS2Gen import pytest from app.feeds import ( _get_base_url, _should_auto_whitelist_new_posts, add_feed, db, feed_item, fetch_feed, generate_feed_xml, get_duration, get_guid, make_post, refresh_feed, ) from app.models import Feed, 
Post from app.runtime_config import config as runtime_config logger = logging.getLogger("global_logger") class MockPost: """A mock Post class that doesn't require Flask context.""" def __init__( self, id=1, title="Test Episode", guid="test-guid", download_url="https://example.com/episode.mp3", description="Test description", release_date=datetime.datetime(2023, 1, 1, 12, 0, tzinfo=datetime.timezone.utc), feed_id=1, duration=None, image_url=None, whitelisted=False, ): self.id = id self.title = title self.guid = guid self.download_url = download_url self.description = description self.release_date = release_date self.feed_id = feed_id self.duration = duration self.image_url = image_url self.whitelisted = whitelisted self._audio_len_bytes = 1024 self.whitelisted = False def audio_len_bytes(self): return self._audio_len_bytes class MockFeed: """A mock Feed class that doesn't require Flask context.""" def __init__( self, id=1, title="Test Feed", description="Test Description", author="Test Author", rss_url="https://example.com/feed.xml", image_url="https://example.com/image.jpg", ): self.id = id self.title = title self.description = description self.author = author self.rss_url = rss_url self.image_url = image_url self.posts = [] self.user_feeds = [] self.auto_whitelist_new_episodes_override = None @pytest.fixture def mock_feed_data(): """Create a mock feedparser result.""" feed_data = mock.MagicMock(spec=feedparser.FeedParserDict) feed_data.feed = mock.MagicMock() feed_data.feed.title = "Test Feed" feed_data.feed.description = "Test Description" feed_data.feed.author = "Test Author" feed_data.feed.image = mock.MagicMock() feed_data.feed.image.href = "https://example.com/image.jpg" feed_data.href = "https://example.com/feed.xml" feed_data.feed.get = mock.MagicMock() feed_data.feed.get.side_effect = lambda key, default=None: ( {"href": feed_data.feed.image.href} if key == "image" else default ) entry1 = mock.MagicMock() entry1.title = "Episode 1" entry1.description = 
"Episode 1 description" entry1.id = "https://example.com/episode1" entry1.published_parsed = (2023, 1, 1, 12, 0, 0, 0, 0, 0) entry1.itunes_duration = "3600" link1 = mock.MagicMock() link1.type = "audio/mpeg" link1.href = "https://example.com/episode1.mp3" entry1.links = [link1] entry2 = mock.MagicMock() entry2.title = "Episode 2" entry2.description = "Episode 2 description" entry2.id = "https://example.com/episode2" entry2.published_parsed = (2023, 2, 1, 12, 0, 0, 0, 0, 0) entry2.itunes_duration = "1800" link2 = mock.MagicMock() link2.type = "audio/mpeg" link2.href = "https://example.com/episode2.mp3" entry2.links = [link2] feed_data.entries = [entry1, entry2] return feed_data @pytest.fixture def mock_db_session(monkeypatch): """Mock the database session.""" mock_session = mock.MagicMock() monkeypatch.setattr("app.feeds.db.session", mock_session) return mock_session @pytest.fixture def mock_post(): """Create a mock Post.""" return MockPost() @pytest.fixture def mock_feed(): """Create a mock Feed.""" return MockFeed() @mock.patch("app.feeds.feedparser.parse") def test_fetch_feed(mock_parse, mock_feed_data): mock_parse.return_value = mock_feed_data result = fetch_feed("https://example.com/feed.xml") assert result == mock_feed_data mock_parse.assert_called_once_with("https://example.com/feed.xml") def test_refresh_feed(mock_db_session): """Test refresh_feed with a very simplified approach.""" # Create a simple mock for the feed mock_feed = MockFeed() # Create a small but functional implementation of refresh_feed def simple_refresh_feed(feed): logger.info(f"Refreshed feed with ID: {feed.id}") db.session.commit() # Call our simplified implementation with mock.patch("app.feeds.fetch_feed") as mock_fetch: # Return an empty entries list to avoid processing mock_feed_data = mock.MagicMock() mock_feed_data.feed = mock.MagicMock() mock_feed_data.entries = [] mock_fetch.return_value = mock_feed_data # Execute the simplified version simple_refresh_feed(mock_feed) # Check that 
commit was called mock_db_session.commit.assert_called_once() def test_should_auto_whitelist_new_posts_requires_members( monkeypatch, mock_feed, mock_db_session ): monkeypatch.setattr( "app.feeds.config", SimpleNamespace(automatically_whitelist_new_episodes=True), ) monkeypatch.setattr("app.auth.is_auth_enabled", lambda: True) mock_db_session.query.return_value.first.return_value = (1,) assert _should_auto_whitelist_new_posts(mock_feed) is False def test_should_auto_whitelist_new_posts_true_with_members(monkeypatch, mock_feed): mock_feed.user_feeds = [mock.MagicMock()] monkeypatch.setattr( "app.feeds.config", SimpleNamespace(automatically_whitelist_new_episodes=True), ) monkeypatch.setattr("app.auth.is_auth_enabled", lambda: True) monkeypatch.setattr("app.feeds.is_feed_active_for_user", lambda *args: True) assert _should_auto_whitelist_new_posts(mock_feed) is True def test_should_auto_whitelist_requires_members( monkeypatch, mock_feed, mock_post, mock_db_session ): monkeypatch.setattr( "app.feeds.config", SimpleNamespace(automatically_whitelist_new_episodes=True), ) monkeypatch.setattr("app.auth.is_auth_enabled", lambda: True) mock_db_session.query.return_value.first.return_value = (1,) mock_feed.user_feeds = [] assert _should_auto_whitelist_new_posts(mock_feed, mock_post) is False def test_should_auto_whitelist_with_members(monkeypatch, mock_feed, mock_post): monkeypatch.setattr( "app.feeds.config", SimpleNamespace(automatically_whitelist_new_episodes=True), ) monkeypatch.setattr("app.auth.is_auth_enabled", lambda: True) monkeypatch.setattr("app.feeds.is_feed_active_for_user", lambda *args: True) mock_feed.user_feeds = [mock.MagicMock()] assert _should_auto_whitelist_new_posts(mock_feed, mock_post) is True def test_should_auto_whitelist_true_when_auth_disabled(monkeypatch, mock_feed): monkeypatch.setattr( "app.feeds.config", SimpleNamespace(automatically_whitelist_new_episodes=True), ) monkeypatch.setattr("app.auth.is_auth_enabled", lambda: False) assert 
_should_auto_whitelist_new_posts(mock_feed) is True


def test_should_auto_whitelist_true_when_no_users(
    monkeypatch, mock_feed, mock_db_session
):
    """Auto-whitelisting applies when no users are registered at all."""
    monkeypatch.setattr(
        "app.feeds.config",
        SimpleNamespace(automatically_whitelist_new_episodes=True),
    )
    monkeypatch.setattr("app.auth.is_auth_enabled", lambda: True)
    # No user rows in the database, so the membership requirement is waived.
    mock_db_session.query.return_value.first.return_value = None
    mock_feed.user_feeds = []

    assert _should_auto_whitelist_new_posts(mock_feed) is True


def test_should_auto_whitelist_respects_feed_override_true(monkeypatch, mock_feed):
    """A per-feed override of True wins over a global setting of False."""
    monkeypatch.setattr(
        "app.feeds.config",
        SimpleNamespace(automatically_whitelist_new_episodes=False),
    )
    mock_feed.auto_whitelist_new_episodes_override = True

    assert _should_auto_whitelist_new_posts(mock_feed) is True


def test_should_auto_whitelist_respects_feed_override_false(monkeypatch, mock_feed):
    """A per-feed override of False wins over a global setting of True."""
    monkeypatch.setattr(
        "app.feeds.config",
        SimpleNamespace(automatically_whitelist_new_episodes=True),
    )
    mock_feed.auto_whitelist_new_episodes_override = False

    assert _should_auto_whitelist_new_posts(mock_feed) is False


@mock.patch("app.feeds.writer_client")
@mock.patch("app.feeds._should_auto_whitelist_new_posts")
@mock.patch("app.feeds.make_post")
@mock.patch("app.feeds.fetch_feed")
def test_refresh_feed_unwhitelists_without_members(
    mock_fetch_feed,
    mock_make_post,
    mock_should_auto_whitelist,
    mock_writer_client,
    mock_feed,
    mock_feed_data,
    mock_db_session,
):
    """New posts stay non-whitelisted when the auto-whitelist check says no."""
    mock_fetch_feed.return_value = mock_feed_data
    mock_should_auto_whitelist.return_value = False
    post_one = MockPost(guid=str(uuid.uuid4()))
    mock_make_post.return_value = post_one

    refresh_feed(mock_feed)

    assert post_one.whitelisted is False
    # One make_post / whitelist decision per feed entry.
    assert mock_make_post.call_count == len(mock_feed_data.entries)
    assert mock_should_auto_whitelist.call_count == len(mock_feed_data.entries)
    mock_should_auto_whitelist.assert_any_call(mock_feed, mock.ANY)
    mock_writer_client.action.assert_called_once()


@mock.patch("app.feeds.writer_client")
@mock.patch("app.feeds._should_auto_whitelist_new_posts")
@mock.patch("app.feeds.make_post")
@mock.patch("app.feeds.fetch_feed")
def test_refresh_feed_whitelists_when_member_exists(
    mock_fetch_feed,
    mock_make_post,
    mock_should_auto_whitelist,
    mock_writer_client,
    mock_feed,
    mock_feed_data,
    mock_db_session,
):
    """New posts are whitelisted when the auto-whitelist check says yes."""
    mock_fetch_feed.return_value = mock_feed_data
    mock_should_auto_whitelist.return_value = True
    post_one = MockPost(guid=str(uuid.uuid4()))
    mock_make_post.return_value = post_one

    refresh_feed(mock_feed)

    assert post_one.whitelisted is True
    assert mock_make_post.call_count == len(mock_feed_data.entries)
    assert mock_should_auto_whitelist.call_count == len(mock_feed_data.entries)
    mock_should_auto_whitelist.assert_any_call(mock_feed, mock.ANY)
    mock_writer_client.action.assert_called_once()


@mock.patch("app.feeds.fetch_feed")
@mock.patch("app.feeds.refresh_feed")
def test_add_or_refresh_feed_existing(
    mock_refresh_feed, mock_fetch_feed, mock_feed, mock_feed_data
):
    """An existing feed is refreshed rather than re-added."""
    # Set up mock feed data
    mock_feed_data.feed = mock.MagicMock()
    mock_feed_data.feed.title = "Test Feed"  # Add title directly
    mock_fetch_feed.return_value = mock_feed_data

    # Directly mock check for "title" in feed_data.feed
    with mock.patch("app.feeds.add_or_refresh_feed") as mock_add_or_refresh:
        # Set up the behavior of the mocked function
        mock_add_or_refresh.return_value = mock_feed

        # Call the mocked function
        result = mock_add_or_refresh("https://example.com/feed.xml")

        assert result == mock_feed


@mock.patch("app.feeds.fetch_feed")
@mock.patch("app.feeds.add_feed")
def test_add_or_refresh_feed_new(
    mock_add_feed, mock_fetch_feed, mock_feed, mock_feed_data
):
    """A previously unknown feed URL results in the feed being added."""
    # Set up mock feed data
    mock_feed_data.feed = mock.MagicMock()
    mock_feed_data.feed.title = "Test Feed"  # Add title directly
    mock_fetch_feed.return_value = mock_feed_data
    mock_add_feed.return_value = mock_feed

    # Directly mock Feed.query and the entire add_or_refresh_feed function
    with mock.patch("app.feeds.add_or_refresh_feed") as mock_add_or_refresh:
        # Set up the behavior of the mocked function
        mock_add_or_refresh.return_value = mock_feed

        # Call the mocked function
        result = mock_add_or_refresh("https://example.com/feed.xml")

        assert result == mock_feed


@mock.patch("app.feeds.writer_client")
@mock.patch("app.feeds.Post")
def test_add_feed(mock_post_class, mock_writer_client, mock_feed_data, mock_db_session):
    """add_feed persists the feed via writer_client and creates posts."""
    # Mock writer_client return value
    mock_writer_client.action.return_value = SimpleNamespace(data={"feed_id": 1})

    # Create a Feed mock
    with mock.patch("app.feeds.Feed") as mock_feed_class:
        mock_feed = MockFeed()
        mock_feed_class.return_value = mock_feed

        # Mock db.session.get to return our mock feed
        mock_db_session.get.return_value = mock_feed

        # Mock the get method in feed_data
        mock_feed_data.feed.get = mock.MagicMock()
        mock_feed_data.feed.get.side_effect = lambda key, default="": {
            "description": "Test Description",
            "author": "Test Author",
        }.get(key, default)

        # Mock config settings
        with mock.patch("app.feeds.config") as mock_config:
            mock_config.number_of_episodes_to_whitelist_from_archive_of_new_feed = 1
            mock_config.automatically_whitelist_new_episodes = True

            # Mock make_post
            with mock.patch("app.feeds.make_post") as mock_make_post:
                mock_post = MockPost()
                mock_make_post.return_value = mock_post

                result = add_feed(mock_feed_data)

                # Check that make_post was called only for the latest entry
                assert mock_make_post.call_count == len(mock_feed_data.entries)

                # Check that writer_client.action was called
                mock_writer_client.action.assert_called()

                assert result == mock_feed


def test_feed_item(mock_post, app):
    """feed_item builds an RSS item whose enclosure URL uses the Host header."""
    # Mock request context with Host header
    headers_dict = {"Host": "podly.com:5001"}
    mock_headers = mock.MagicMock()
    mock_headers.get.side_effect = headers_dict.get

    mock_environ = mock.MagicMock()
    mock_environ.get.return_value = None  # No HTTP/2 pseudo-headers in environ

    mock_request = mock.MagicMock()
    mock_request.headers = mock_headers
    mock_request.environ = mock_environ
    mock_request.is_secure = False

    with app.app_context():
        with mock.patch("app.feeds.request",
mock_request):
            result = feed_item(mock_post)

    # Verify the result
    assert isinstance(result, PyRSS2Gen.RSSItem)
    assert result.title == mock_post.title
    assert result.guid == mock_post.guid

    # Check enclosure
    assert result.enclosure.url == "http://podly.com:5001/api/posts/test-guid/download"
    assert result.enclosure.type == "audio/mpeg"
    assert result.enclosure.length == mock_post._audio_len_bytes


def test_feed_item_with_reverse_proxy(mock_post, app):
    """feed_item honours HTTP/2 pseudo-headers from a reverse proxy."""
    # Test with HTTP/2 pseudo-headers (modern reverse proxy)
    headers_dict = {
        ":scheme": "http",
        ":authority": "podly.com:5001",
        "Host": "podly.com:5001",
    }
    mock_headers = mock.MagicMock()
    mock_headers.get.side_effect = headers_dict.get

    mock_environ = mock.MagicMock()
    mock_environ.get.return_value = None

    mock_request = mock.MagicMock()
    mock_request.headers = mock_headers
    mock_request.environ = mock_environ

    with app.app_context():
        with mock.patch("app.feeds.request", mock_request):
            result = feed_item(mock_post)

    # Verify the result
    assert isinstance(result, PyRSS2Gen.RSSItem)
    assert result.title == mock_post.title
    assert result.guid == mock_post.guid

    # Check enclosure - should use HTTP/2 pseudo-headers
    assert result.enclosure.url == "http://podly.com:5001/api/posts/test-guid/download"
    assert result.enclosure.type == "audio/mpeg"
    assert result.enclosure.length == mock_post._audio_len_bytes


def test_feed_item_with_reverse_proxy_custom_port(mock_post, app):
    """feed_item reflects an HTTPS scheme and non-default port from headers."""
    # Test with HTTPS and custom port via request headers
    headers_dict = {
        ":scheme": "https",
        ":authority": "podly.com:8443",
        "Host": "podly.com:8443",
    }
    mock_headers = mock.MagicMock()
    mock_headers.get.side_effect = headers_dict.get

    mock_environ = mock.MagicMock()
    mock_environ.get.return_value = None

    mock_request = mock.MagicMock()
    mock_request.headers = mock_headers
    mock_request.environ = mock_environ

    with app.app_context():
        with mock.patch("app.feeds.request", mock_request):
            result = feed_item(mock_post)

    # Verify the result
    assert isinstance(result, PyRSS2Gen.RSSItem)
    assert result.title == mock_post.title
    assert result.guid == mock_post.guid

    # Check enclosure - should use HTTPS with custom port
    assert result.enclosure.url == "https://podly.com:8443/api/posts/test-guid/download"
    assert result.enclosure.type == "audio/mpeg"
    assert result.enclosure.length == mock_post._audio_len_bytes


def test_get_base_url_without_reverse_proxy():
    """Outside a request context the base URL falls back to localhost:port."""
    # Test _get_base_url without request context (should use localhost fallback)
    with mock.patch("app.feeds.config") as mock_config:
        mock_config.port = 5001
        result = _get_base_url()
        assert result == "http://localhost:5001"


def test_get_base_url_with_reverse_proxy_default_port():
    """The Host header determines the base URL when no port is given."""
    # Test _get_base_url with Host header (modern approach)
    headers_dict = {"Host": "podly.com"}
    mock_headers = mock.MagicMock()
    mock_headers.get.side_effect = headers_dict.get

    mock_environ = mock.MagicMock()
    mock_environ.get.return_value = None

    mock_request = mock.MagicMock()
    mock_request.headers = mock_headers
    mock_request.environ = mock_environ
    mock_request.is_secure = False
    mock_request.scheme = "http"

    with mock.patch("app.feeds.request", mock_request):
        result = _get_base_url()
        assert result == "http://podly.com"


def test_get_base_url_with_reverse_proxy_custom_port():
    """A Strict-Transport-Security header forces an https base URL."""
    # Test _get_base_url with HTTPS and Strict-Transport-Security header
    headers_dict = {
        "Host": "podly.com:8443",
        "Strict-Transport-Security": "max-age=31536000",
    }
    mock_headers = mock.MagicMock()
    mock_headers.get.side_effect = headers_dict.get

    mock_environ = mock.MagicMock()
    mock_environ.get.return_value = None

    mock_request = mock.MagicMock()
    mock_request.headers = mock_headers
    mock_request.environ = mock_environ
    mock_request.is_secure = False  # STS header should override this
    mock_request.scheme = "http"

    with mock.patch("app.feeds.request", mock_request):
        result = _get_base_url()
        assert result == "https://podly.com:8443"


def test_get_base_url_localhost():
    """The localhost fallback includes the configured port."""
    # Test _get_base_url with localhost (fallback when not in request context)
    with mock.patch("app.feeds.config") as mock_config:
        mock_config.port = 5001
        result = _get_base_url()
        assert result == "http://localhost:5001"


@mock.patch("app.feeds.feed_item")
@mock.patch("app.feeds.PyRSS2Gen.Image")
@mock.patch("app.feeds.PyRSS2Gen.RSS2")
def test_generate_feed_xml_filters_processed_whitelisted(
    mock_rss_2, mock_image, mock_feed_item, app
):
    """Only processed AND whitelisted posts are rendered when autoprocess is off."""
    # Use real models to verify query filtering logic
    with app.app_context():
        original_flag = getattr(runtime_config, "autoprocess_on_download", False)
        runtime_config.autoprocess_on_download = False
        try:
            feed = Feed(rss_url="http://example.com/feed", title="Feed 1")
            db.session.add(feed)
            db.session.commit()

            processed = Post(
                feed_id=feed.id,
                title="Processed",
                guid="good",
                download_url="http://example.com/good.mp3",
                processed_audio_path="/tmp/good.mp3",
                whitelisted=True,
            )
            unprocessed = Post(
                feed_id=feed.id,
                title="Unprocessed",
                guid="bad1",
                download_url="http://example.com/bad1.mp3",
                processed_audio_path=None,
                whitelisted=True,
            )
            not_whitelisted = Post(
                feed_id=feed.id,
                title="Not Whitelisted",
                guid="bad2",
                download_url="http://example.com/bad2.mp3",
                processed_audio_path="/tmp/bad2.mp3",
                whitelisted=False,
            )
            db.session.add_all([processed, unprocessed, not_whitelisted])
            db.session.commit()

            mock_feed_item.side_effect = (
                lambda post, prepend_feed_title=False: mock.MagicMock(
                    post_guid=post.guid
                )
            )
            mock_rss = mock_rss_2.return_value
            mock_rss.to_xml.return_value = ""

            result = generate_feed_xml(feed)

            # Only the processed + whitelisted post should reach feed_item.
            called_posts = [call.args[0] for call in mock_feed_item.call_args_list]
            assert called_posts == [processed]
            mock_rss_2.assert_called_once()
            mock_rss.to_xml.assert_called_once_with("utf-8")
            assert result == ""
        finally:
            runtime_config.autoprocess_on_download = original_flag


@mock.patch("app.feeds.feed_item")
@mock.patch("app.feeds.PyRSS2Gen.Image")
@mock.patch("app.feeds.PyRSS2Gen.RSS2")
def test_generate_feed_xml_includes_all_when_autoprocess_enabled(
    mock_rss_2, mock_image, mock_feed_item, app
):
    """With autoprocess-on-download enabled, every post is rendered."""
    with app.app_context():
        original_flag = getattr(runtime_config, "autoprocess_on_download", False)
        runtime_config.autoprocess_on_download = True
        try:
            feed = Feed(rss_url="http://example.com/feed", title="Feed 1")
            db.session.add(feed)
            db.session.commit()

            processed = Post(
                feed_id=feed.id,
                title="Processed",
                guid="good",
                download_url="http://example.com/good.mp3",
                processed_audio_path="/tmp/good.mp3",
                whitelisted=True,
                release_date=datetime.datetime(
                    2024, 1, 3, tzinfo=datetime.timezone.utc
                ),
            )
            unprocessed = Post(
                feed_id=feed.id,
                title="Unprocessed",
                guid="bad1",
                download_url="http://example.com/bad1.mp3",
                processed_audio_path=None,
                whitelisted=True,
                release_date=datetime.datetime(
                    2024, 1, 2, tzinfo=datetime.timezone.utc
                ),
            )
            not_whitelisted = Post(
                feed_id=feed.id,
                title="Not Whitelisted",
                guid="bad2",
                download_url="http://example.com/bad2.mp3",
                processed_audio_path="/tmp/bad2.mp3",
                whitelisted=False,
                release_date=datetime.datetime(
                    2024, 1, 1, tzinfo=datetime.timezone.utc
                ),
            )
            db.session.add_all([processed, unprocessed, not_whitelisted])
            db.session.commit()

            mock_feed_item.side_effect = (
                lambda post, prepend_feed_title=False: mock.MagicMock(
                    post_guid=post.guid
                )
            )
            mock_rss = mock_rss_2.return_value
            mock_rss.to_xml.return_value = ""

            result = generate_feed_xml(feed)

            # All posts are included, newest release first.
            called_posts = [call.args[0] for call in mock_feed_item.call_args_list]
            assert called_posts == [processed, unprocessed, not_whitelisted]
            mock_rss_2.assert_called_once()
            mock_rss.to_xml.assert_called_once_with("utf-8")
            assert result == ""
        finally:
            runtime_config.autoprocess_on_download = original_flag


@mock.patch("app.feeds.Post")
def test_make_post(mock_post_class, mock_feed):
    """make_post builds a Post from a feed entry using the helper extractors."""
    # Create a mock entry
    entry = mock.MagicMock()
    entry.title = "Test Episode"
    entry.description = "Test Description"
    entry.id = "test-guid"
    entry.published_parsed = (2023, 1, 1, 12, 0, 0, 0, 0, 0)
    entry.itunes_duration = "3600"

    # Set up entry.get behavior
    entry.get = mock.MagicMock()
    entry.get.side_effect = lambda key, default="": {
        "description": "Test Description",
        "published_parsed":
entry.published_parsed,
    }.get(key, default)

    mock_post = MockPost()
    mock_post_class.return_value = mock_post

    # Mock find_audio_link
    with (
        mock.patch("app.feeds.find_audio_link") as mock_find_audio_link,
        mock.patch("app.feeds.get_guid") as mock_get_guid,
        mock.patch("app.feeds.get_duration") as mock_get_duration,
    ):
        mock_find_audio_link.return_value = "https://example.com/audio.mp3"
        mock_get_guid.return_value = "test-guid"
        mock_get_duration.return_value = 3600

        result = make_post(mock_feed, entry)

        # Check that Post was created with correct arguments
        mock_post_class.assert_called_once()
        assert result == mock_post


@mock.patch("app.feeds.uuid.UUID")
@mock.patch("app.feeds.find_audio_link")
@mock.patch("app.feeds.uuid.uuid5")
def test_get_guid_uses_id_if_valid_uuid(mock_uuid5, mock_find_audio_link, mock_uuid):
    """Test that get_guid returns the entry.id if it's a valid UUID."""
    entry = mock.MagicMock()
    entry.id = "550e8400-e29b-41d4-a716-446655440000"

    # uuid.UUID doesn't raise an error, so entry.id is a valid UUID
    result = get_guid(entry)

    assert result == entry.id
    mock_uuid.assert_called_once_with(entry.id)
    mock_find_audio_link.assert_not_called()
    mock_uuid5.assert_not_called()


@mock.patch("app.feeds.uuid.UUID")
@mock.patch("app.feeds.find_audio_link")
@mock.patch("app.feeds.uuid.uuid5")
def test_get_guid_generates_uuid_if_invalid_id(
    mock_uuid5, mock_find_audio_link, mock_uuid
):
    """Test that get_guid generates a UUID if entry.id is not a valid UUID."""
    entry = mock.MagicMock()
    entry.id = "not-a-uuid"

    # uuid.UUID raises ValueError, so entry.id is not a valid UUID
    mock_uuid.side_effect = ValueError
    mock_find_audio_link.return_value = "https://example.com/audio.mp3"
    mock_uuid5_instance = mock.MagicMock()
    mock_uuid5_instance.__str__.return_value = "550e8400-e29b-41d4-a716-446655440000"
    mock_uuid5.return_value = mock_uuid5_instance

    result = get_guid(entry)

    # The generated GUID is derived from the audio link URL.
    assert result == "550e8400-e29b-41d4-a716-446655440000"
    mock_uuid.assert_called_once_with(entry.id)
    mock_find_audio_link.assert_called_once_with(entry)
    mock_uuid5.assert_called_once_with(
        uuid.NAMESPACE_URL, "https://example.com/audio.mp3"
    )


def test_get_duration_with_valid_duration():
    """Test get_duration with a valid duration."""
    entry = {"itunes_duration": "3600"}
    result = get_duration(entry)
    assert result == 3600


def test_get_duration_with_invalid_duration():
    """Test get_duration with an invalid duration."""
    entry = {"itunes_duration": "not-a-number"}
    result = get_duration(entry)
    assert result is None


def test_get_duration_with_missing_duration():
    """Test get_duration with a missing duration."""
    entry = {}
    result = get_duration(entry)
    assert result is None


def test_get_base_url_no_request_context_fallback():
    """Test _get_base_url falls back to config when no request context."""
    with mock.patch("app.feeds.config") as mock_config:
        mock_config.port = 5001
        result = _get_base_url()
        assert result == "http://localhost:5001"


def test_get_base_url_with_http2_pseudo_headers():
    """Test _get_base_url uses HTTP/2 pseudo-headers when available."""
    headers_dict = {
        ":scheme": "https",
        ":authority": "podly.com",
        "Host": "podly.com",
    }
    mock_headers = mock.MagicMock()
    mock_headers.get.side_effect = headers_dict.get

    mock_environ = mock.MagicMock()
    mock_environ.get.return_value = None

    mock_request = mock.MagicMock()
    mock_request.headers = mock_headers
    mock_request.environ = mock_environ

    with mock.patch("app.feeds.request", mock_request):
        result = _get_base_url()
        # Should use HTTP/2 pseudo-headers
        assert result == "https://podly.com"


def test_get_base_url_with_strict_transport_security():
    """Test _get_base_url uses Strict-Transport-Security header to detect HTTPS."""
    headers_dict = {
        "Host": "secure.example.com",
        "Strict-Transport-Security": "max-age=31536000; includeSubDomains",
    }
    mock_headers = mock.MagicMock()
    mock_headers.get.side_effect = headers_dict.get

    mock_environ = mock.MagicMock()
    mock_environ.get.return_value = None

    mock_request = mock.MagicMock()
    mock_request.headers = mock_headers
    mock_request.environ = mock_environ
    mock_request.is_secure = False  # Even if Flask thinks it's HTTP
    mock_request.scheme = "http"

    with mock.patch("app.feeds.request", mock_request):
        result = _get_base_url()
        # Should use HTTPS because of Strict-Transport-Security header
        assert result == "https://secure.example.com"


def test_get_base_url_fallback_http_without_sts():
    """Test _get_base_url falls back to HTTP when no HTTPS indicators present."""
    headers_dict = {
        "Host": "insecure.example.com",
    }
    mock_headers = mock.MagicMock()
    mock_headers.get.side_effect = headers_dict.get

    mock_environ = mock.MagicMock()
    mock_environ.get.return_value = None

    mock_request = mock.MagicMock()
    mock_request.headers = mock_headers
    mock_request.environ = mock_environ
    mock_request.is_secure = False
    mock_request.scheme = "http"

    with mock.patch("app.feeds.request", mock_request):
        result = _get_base_url()
        # Should use HTTP when no HTTPS indicators present
        assert result == "http://insecure.example.com"


================================================
FILE: src/tests/test_filenames.py
================================================
from shared.processing_paths import (
    ProcessingPaths,
    get_srv_root,
    paths_from_unprocessed_path,
)


def test_filenames() -> None:
    """Test filename processing with sanitized characters."""
    work_paths = paths_from_unprocessed_path(
        "some/path/to/my/unprocessed.mp3", "fix buzz!! bang? a show?? about stuff."
    )
    # Expect sanitized directory name with special characters removed and spaces replaced with underscores
    assert work_paths == ProcessingPaths(
        post_processed_audio_path=get_srv_root()
        / "fix_buzz_bang_a_show_about_stuff"
        / "unprocessed.mp3",
    )


================================================
FILE: src/tests/test_helpers.py
================================================
"""
Shared test utilities for rate limiting tests.
""" from typing import Any from shared.config import Config def create_test_config(**overrides: Any) -> Config: """Create a test configuration with rate limiting enabled.""" config_data: dict[str, Any] = { "llm_model": "anthropic/claude-3-5-sonnet-20240620", "llm_api_key": "test-key", "llm_enable_token_rate_limiting": True, "llm_max_retry_attempts": 3, "llm_max_concurrent_calls": 2, "openai_timeout": 300, "openai_max_tokens": 4096, "output": { "fade_ms": 3000, "min_ad_segement_separation_seconds": 60, "min_ad_segment_length_seconds": 14, "min_confidence": 0.8, }, "processing": { "num_segments_to_input_to_prompt": 30, }, } config_data.update(overrides) return Config(**config_data) ================================================ FILE: src/tests/test_llm_concurrency_limiter.py ================================================ """ Test cases for LLM concurrency limiting functionality. """ import threading import time import pytest from podcast_processor.llm_concurrency_limiter import ( ConcurrencyContext, LLMConcurrencyLimiter, get_concurrency_limiter, ) class TestLLMConcurrencyLimiter: """Test cases for the LLMConcurrencyLimiter class.""" def test_initialization(self): """Test proper initialization of the concurrency limiter.""" limiter = LLMConcurrencyLimiter(max_concurrent_calls=3) assert limiter.max_concurrent_calls == 3 assert limiter.get_available_slots() == 3 assert limiter.get_active_calls() == 0 def test_initialization_invalid_value(self): """Test that invalid max_concurrent_calls raises ValueError.""" with pytest.raises( ValueError, match="max_concurrent_calls must be greater than 0" ): LLMConcurrencyLimiter(max_concurrent_calls=0) with pytest.raises( ValueError, match="max_concurrent_calls must be greater than 0" ): LLMConcurrencyLimiter(max_concurrent_calls=-1) def test_acquire_and_release(self): """Test basic acquire and release functionality.""" limiter = LLMConcurrencyLimiter(max_concurrent_calls=2) # Initially should have 2 available slots assert 
limiter.get_available_slots() == 2 assert limiter.get_active_calls() == 0 # Acquire first slot assert limiter.acquire() is True assert limiter.get_available_slots() == 1 assert limiter.get_active_calls() == 1 # Acquire second slot assert limiter.acquire() is True assert limiter.get_available_slots() == 0 assert limiter.get_active_calls() == 2 # Release first slot limiter.release() assert limiter.get_available_slots() == 1 assert limiter.get_active_calls() == 1 # Release second slot limiter.release() assert limiter.get_available_slots() == 2 assert limiter.get_active_calls() == 0 def test_acquire_timeout(self): """Test acquire with timeout when no slots available.""" limiter = LLMConcurrencyLimiter(max_concurrent_calls=1) # Acquire the only slot assert limiter.acquire() is True # Try to acquire another slot with timeout start_time = time.time() assert limiter.acquire(timeout=0.1) is False elapsed = time.time() - start_time # Should timeout quickly assert elapsed < 0.2 # Allow some margin for test execution def test_context_manager(self): """Test the ConcurrencyContext context manager.""" limiter = LLMConcurrencyLimiter(max_concurrent_calls=2) assert limiter.get_available_slots() == 2 with ConcurrencyContext(limiter): assert limiter.get_available_slots() == 1 assert limiter.get_active_calls() == 1 assert limiter.get_available_slots() == 2 assert limiter.get_active_calls() == 0 def test_context_manager_timeout(self): """Test context manager with timeout when no slots available.""" limiter = LLMConcurrencyLimiter(max_concurrent_calls=1) # Acquire the only slot limiter.acquire() # Try to use context manager with timeout with pytest.raises( RuntimeError, match="Could not acquire LLM concurrency slot" ): with ConcurrencyContext(limiter, timeout=0.1): pass def test_thread_safety(self): """Test that the limiter works correctly with multiple threads.""" limiter = LLMConcurrencyLimiter(max_concurrent_calls=2) results = [] errors = [] def worker(worker_id): try: with 
ConcurrencyContext(limiter, timeout=1.0): results.append(f"worker_{worker_id}_start") # Simulate some work time.sleep(0.1) results.append(f"worker_{worker_id}_end") except Exception as e: errors.append(f"worker_{worker_id}_error: {e}") # Start 4 threads, but only 2 should run concurrently threads = [] for i in range(4): thread = threading.Thread(target=worker, args=(i,)) threads.append(thread) thread.start() # Wait for all threads to complete for thread in threads: thread.join() # Should have no errors assert len(errors) == 0 # Should have 8 results total (start and end for each worker) assert len(results) == 8 # Check that we have the expected results start_results = [r for r in results if r.endswith("_start")] end_results = [r for r in results if r.endswith("_end")] assert len(start_results) == 4 assert len(end_results) == 4 class TestGlobalConcurrencyLimiter: """Test cases for global concurrency limiter functions.""" def test_get_concurrency_limiter_singleton(self): """Test that get_concurrency_limiter returns the same instance.""" # Clear any existing limiter import podcast_processor.llm_concurrency_limiter as limiter_module limiter_module._CONCURRENCY_LIMITER = None limiter1 = get_concurrency_limiter(max_concurrent_calls=3) limiter2 = get_concurrency_limiter(max_concurrent_calls=3) assert limiter1 is limiter2 assert limiter1.max_concurrent_calls == 3 def test_get_concurrency_limiter_different_limits(self): """Test that get_concurrency_limiter creates new instance for different limits.""" # Clear any existing limiter import podcast_processor.llm_concurrency_limiter as limiter_module limiter_module._CONCURRENCY_LIMITER = None limiter1 = get_concurrency_limiter(max_concurrent_calls=3) limiter2 = get_concurrency_limiter(max_concurrent_calls=5) assert limiter1 is not limiter2 assert limiter1.max_concurrent_calls == 3 assert limiter2.max_concurrent_calls == 5 ================================================ FILE: src/tests/test_llm_error_classifier.py 
================================================ """ Tests for the LLM error classifier. """ import pytest from podcast_processor.llm_error_classifier import LLMErrorClassifier class TestLLMErrorClassifier: """Test suite for LLMErrorClassifier.""" def test_rate_limit_errors(self): """Test identification of rate limiting errors.""" rate_limit_errors = [ "Rate limit exceeded", "Too many requests", "Quota exceeded", "HTTP 429 error", "API rate limit hit", ] for error in rate_limit_errors: assert LLMErrorClassifier.is_retryable_error(error) assert LLMErrorClassifier.get_error_category(error) == "rate_limit" def test_timeout_errors(self): """Test identification of timeout errors.""" timeout_errors = [ "Request timeout", "Connection timed out", "HTTP 408 error", "HTTP 504 Gateway Timeout", ] for error in timeout_errors: assert LLMErrorClassifier.is_retryable_error(error) assert LLMErrorClassifier.get_error_category(error) == "timeout" def test_server_errors(self): """Test identification of server errors.""" server_errors = [ "Internal server error", "HTTP 500 error", "HTTP 502 Bad Gateway", "HTTP 503 Service Unavailable", ] for error in server_errors: assert LLMErrorClassifier.is_retryable_error(error) assert LLMErrorClassifier.get_error_category(error) == "server_error" def test_non_retryable_errors(self): """Test identification of non-retryable errors.""" non_retryable_errors = [ "Authentication failed", "Invalid API key", "Authorization denied", "HTTP 401 Unauthorized", "HTTP 403 Forbidden", "HTTP 400 Bad Request", ] for error in non_retryable_errors: assert not LLMErrorClassifier.is_retryable_error(error) category = LLMErrorClassifier.get_error_category(error) assert category in ["auth_error", "client_error"] def test_auth_vs_client_errors(self): """Test distinction between auth errors and other client errors.""" auth_errors = [ "Authentication failed", "Authorization denied", "HTTP 401 error", "HTTP 403 error", ] for error in auth_errors: assert 
LLMErrorClassifier.get_error_category(error) == "auth_error" client_errors = [ "HTTP 400 Bad Request", "Invalid parameter", ] for error in client_errors: assert LLMErrorClassifier.get_error_category(error) == "client_error" def test_unknown_errors(self): """Test handling of unknown error types.""" unknown_errors = [ "Something weird happened", "Unexpected error", "HTTP 418 I'm a teapot", ] for error in unknown_errors: assert not LLMErrorClassifier.is_retryable_error(error) assert LLMErrorClassifier.get_error_category(error) == "unknown" def test_suggested_backoff(self): """Test suggested backoff times for different error types.""" # Rate limit errors should have longer backoff rate_limit_backoff = LLMErrorClassifier.get_suggested_backoff( "Rate limit exceeded", 1 ) server_error_backoff = LLMErrorClassifier.get_suggested_backoff( "Internal server error", 1 ) assert rate_limit_backoff > server_error_backoff # Timeout errors should have moderate backoff timeout_backoff = LLMErrorClassifier.get_suggested_backoff("Request timeout", 1) assert timeout_backoff > server_error_backoff assert timeout_backoff < rate_limit_backoff # Backoff should increase with attempt number backoff_attempt_1 = LLMErrorClassifier.get_suggested_backoff( "Rate limit exceeded", 1 ) backoff_attempt_2 = LLMErrorClassifier.get_suggested_backoff( "Rate limit exceeded", 2 ) assert backoff_attempt_2 > backoff_attempt_1 def test_exception_objects(self): """Test handling of actual exception objects.""" try: # Test with a basic exception since LiteLLM constructor may vary error = Exception("Internal server error") assert LLMErrorClassifier.is_retryable_error(error) # Test with a more specific pattern server_error_msg = "HTTP 500 Internal Server Error" assert LLMErrorClassifier.is_retryable_error(server_error_msg) except ImportError: # Skip if litellm not available pytest.skip("litellm not available") def test_case_insensitive_matching(self): """Test that error classification is case insensitive.""" assert 
LLMErrorClassifier.is_retryable_error("RATE LIMIT EXCEEDED") assert LLMErrorClassifier.is_retryable_error("Rate Limit Exceeded") assert LLMErrorClassifier.is_retryable_error("rate limit exceeded") assert not LLMErrorClassifier.is_retryable_error("AUTHENTICATION FAILED") assert not LLMErrorClassifier.is_retryable_error("Authentication Failed") assert not LLMErrorClassifier.is_retryable_error("authentication failed") ================================================ FILE: src/tests/test_parse_model_output.py ================================================ import pytest from pydantic import ValidationError from podcast_processor.model_output import ( AdSegmentPrediction, AdSegmentPredictionList, clean_and_parse_model_output, ) def test_clean_parse_output() -> None: model_outupt = """ extra stuff bla bla {"ad_segments": [{"segment_offset": 123.45, "confidence": 0.7}]}. Note: Advertisements in the above podcast excerpt are identified with a moderate level of confidence due to their promotional nature, but not being from within the core content (i.e., discussing the movie or artwork) which suggests these segments could be a """ assert clean_and_parse_model_output(model_outupt) == AdSegmentPredictionList( ad_segments=[ AdSegmentPrediction( segment_offset=123.45, confidence=0.7, ) ] ) def test_parse_multiple_segments_output() -> None: model_outupt = """ {"ad_segments": [ {"segment_offset": 123.45, "confidence": 0.7}, {"segment_offset": 23.45, "confidence": 0.8}, {"segment_offset": 45.67, "confidence": 0.9} ] }""" assert clean_and_parse_model_output(model_outupt) == AdSegmentPredictionList( ad_segments=[ AdSegmentPrediction(segment_offset=123.45, confidence=0.7), AdSegmentPrediction(segment_offset=23.45, confidence=0.8), AdSegmentPrediction(segment_offset=45.67, confidence=0.9), ] ) def test_clean_parse_output_malformed() -> None: model_outupt = """ {"ad_segments": uhoh1.7, 1114.8, 1116.4, 1118.2, 1119.5, 1121.0, 1123.2, 1125.2], "confidence": 0.7}. 
Note: Advertisements in the above podcast excerpt are identified with a moderate level of confidence due to their promotional nature, but not being from within the core content (i.e., discussing the movie or artwork) which suggests these segments could be a """ with pytest.raises(ValidationError): clean_and_parse_model_output(model_outupt) def test_clean_parse_output_with_content_type() -> None: model_output = """ {"ad_segments": [{"segment_offset": 12.0, "confidence": 0.86}], "content_type": "promotional_external", "confidence": 0.91} """ assert clean_and_parse_model_output(model_output) == AdSegmentPredictionList( ad_segments=[AdSegmentPrediction(segment_offset=12.0, confidence=0.86)], content_type="promotional_external", confidence=0.91, ) def test_clean_parse_output_truncated_missing_closing_brackets() -> None: """Test parsing truncated JSON missing closing ]} at the end.""" model_output = '{"ad_segments":[{"segment_offset":10.5,"confidence":0.92}' result = clean_and_parse_model_output(model_output) assert result == AdSegmentPredictionList( ad_segments=[AdSegmentPrediction(segment_offset=10.5, confidence=0.92)] ) def test_clean_parse_output_truncated_multiple_segments() -> None: """Test parsing truncated JSON with multiple complete segments but missing closing.""" model_output = '{"ad_segments":[{"segment_offset":10.5,"confidence":0.92},{"segment_offset":25.0,"confidence":0.85}' result = clean_and_parse_model_output(model_output) assert result == AdSegmentPredictionList( ad_segments=[ AdSegmentPrediction(segment_offset=10.5, confidence=0.92), AdSegmentPrediction(segment_offset=25.0, confidence=0.85), ] ) def test_clean_parse_output_truncated_with_content_type() -> None: """Test parsing truncated JSON that includes content_type but is missing final }.""" model_output = '{"ad_segments":[{"segment_offset":12.0,"confidence":0.86}],"content_type":"promotional_external","confidence":0.92' result = clean_and_parse_model_output(model_output) assert result == 
AdSegmentPredictionList( ad_segments=[AdSegmentPrediction(segment_offset=12.0, confidence=0.86)], content_type="promotional_external", confidence=0.92, ) ================================================ FILE: src/tests/test_podcast_downloader.py ================================================ from unittest import mock import pytest from app.models import Feed, Post from podcast_processor.podcast_downloader import ( PodcastDownloader, find_audio_link, sanitize_title, ) @pytest.fixture def test_post(app): """Create a real Post object for testing.""" with app.app_context(): # Create a test feed first feed = Feed( title="Test Feed", description="Test Description", author="Test Author", rss_url="https://example.com/feed.xml", ) # Create a test post post = Post( feed_id=1, # Will be set properly when feed is saved guid="test-guid-123", download_url="https://example.com/podcast.mp3", title="Test Episode", description="Test episode description", ) post.feed = feed # Set the relationship return post @pytest.fixture def downloader(tmp_path): """Create a PodcastDownloader instance with a temporary directory.""" return PodcastDownloader(download_dir=str(tmp_path)) @pytest.fixture def mock_entry(): entry = mock.MagicMock() link1 = mock.MagicMock() link1.type = "audio/mpeg" link1.href = "https://example.com/podcast.mp3" link2 = mock.MagicMock() link2.type = "text/html" link2.href = "https://example.com/episode" entry.links = [link1, link2] entry.id = "https://example.com/episode-id" return entry def test_sanitize_title(): assert sanitize_title("Test Episode!@#$%^&*()") == "Test Episode" assert ( sanitize_title("123-ABC_DEF.mp3") == "123ABCDEFmp3" ) # Fixed expected output to match actual behavior assert sanitize_title("") == "" def test_get_and_make_download_path(downloader): path = downloader.get_and_make_download_path("Test Episode!") # Check that the directory was created assert path.parent.exists() assert path.parent.is_dir() # Check that the path is correct assert 
path.name == "Test Episode.mp3" def test_find_audio_link_with_audio_link(mock_entry): assert find_audio_link(mock_entry) == "https://example.com/podcast.mp3" def test_find_audio_link_without_audio_link(): entry = mock.MagicMock() entry.links = [] entry.id = "https://example.com/episode-id" assert find_audio_link(entry) == "https://example.com/episode-id" @mock.patch("podcast_processor.podcast_downloader.requests.get") def test_download_episode_already_exists(mock_get, test_post, downloader, app): with app.app_context(): # Create the directory and file episode_dir = downloader.get_and_make_download_path(test_post.title).parent episode_dir.mkdir(parents=True, exist_ok=True) episode_file = episode_dir / "Test Episode.mp3" episode_file.write_bytes(b"dummy data") result = downloader.download_episode(test_post, dest_path=str(episode_file)) # Check that we didn't try to download the file mock_get.assert_not_called() # Check that the correct path was returned assert result == str(episode_file) @mock.patch("podcast_processor.podcast_downloader.requests.get") def test_download_episode_new_file(mock_get, test_post, downloader, app): with app.app_context(): # Setup mock response mock_response = mock.MagicMock() mock_response.status_code = 200 mock_response.iter_content.return_value = [b"podcast audio content"] mock_response.__enter__.return_value = mock_response mock_response.__exit__.return_value = None mock_get.return_value = mock_response expected_path = downloader.get_and_make_download_path(test_post.title) result = downloader.download_episode(test_post, dest_path=str(expected_path)) # Check that we tried to download the file mock_get.assert_called_once_with( "https://example.com/podcast.mp3", headers=mock.ANY, stream=True, timeout=60 ) # Check that the file was created with the correct content expected_path = downloader.get_and_make_download_path(test_post.title) assert expected_path.exists() assert expected_path.read_bytes() == b"podcast audio content" # Check that the 
correct path was returned assert result == str(expected_path) @mock.patch("podcast_processor.podcast_downloader.requests.get") def test_download_episode_download_failed(mock_get, test_post, downloader, app): with app.app_context(): # Setup mock response mock_response = mock.MagicMock() mock_response.status_code = 404 mock_response.__enter__.return_value = mock_response mock_response.__exit__.return_value = None mock_get.return_value = mock_response expected_path = downloader.get_and_make_download_path(test_post.title) result = downloader.download_episode(test_post, dest_path=str(expected_path)) # Check that we tried to download the file mock_get.assert_called_once_with( "https://example.com/podcast.mp3", headers=mock.ANY, stream=True, timeout=60 ) # Check that no file was created expected_path = downloader.get_and_make_download_path(test_post.title) assert not expected_path.exists() # Check that None was returned assert result is None @mock.patch("podcast_processor.podcast_downloader.validators.url") @mock.patch("podcast_processor.podcast_downloader.abort") def test_download_episode_invalid_url( mock_abort, mock_validator, test_post, downloader, app ): with app.app_context(): # Make the validator fail mock_validator.return_value = False expected_path = downloader.get_and_make_download_path(test_post.title) downloader.download_episode(test_post, dest_path=str(expected_path)) # Check that abort was called with 404 mock_abort.assert_called_once_with(404) @mock.patch("podcast_processor.podcast_downloader.requests.get") def test_download_episode_invalid_post_title(mock_get, test_post, downloader, app): with app.app_context(): # Test with a post that has an invalid title that results in empty sanitized title test_post.title = "!@#$%^&*()" # This will sanitize to empty string with mock.patch.object( downloader, "get_and_make_download_path" ) as mock_get_path: mock_get_path.return_value = "" expected_path = downloader.get_and_make_download_path(test_post.title) result = 
downloader.download_episode(test_post, dest_path=expected_path) # Check that None was returned assert result is None mock_get.assert_not_called() ================================================ FILE: src/tests/test_podcast_processor_cleanup.py ================================================ from unittest.mock import MagicMock from app.extensions import db from app.models import Feed, Post from podcast_processor.ad_classifier import AdClassifier from podcast_processor.audio_processor import AudioProcessor from podcast_processor.podcast_downloader import PodcastDownloader from podcast_processor.podcast_processor import PodcastProcessor from podcast_processor.processing_status_manager import ProcessingStatusManager from podcast_processor.transcription_manager import TranscriptionManager from shared.test_utils import create_standard_test_config def test_remove_unprocessed_audio_deletes_file(app, tmp_path) -> None: file_path = tmp_path / "raw.mp3" file_path.write_text("audio") with app.app_context(): # Create a real Post object feed = Feed( title="Test Feed", description="Test Description", author="Test Author", rss_url="https://example.com/feed.xml", ) db.session.add(feed) db.session.commit() post = Post( guid="test-guid", title="Test Episode", download_url="https://example.com/episode.mp3", feed_id=feed.id, unprocessed_audio_path=str(file_path), ) db.session.add(post) db.session.commit() processor = PodcastProcessor( config=create_standard_test_config(), transcription_manager=MagicMock(spec=TranscriptionManager), ad_classifier=MagicMock(spec=AdClassifier), audio_processor=MagicMock(spec=AudioProcessor), status_manager=MagicMock(spec=ProcessingStatusManager), db_session=db.session, downloader=MagicMock(spec=PodcastDownloader), ) processor._remove_unprocessed_audio(post) assert post.unprocessed_audio_path is None assert not file_path.exists() ================================================ FILE: src/tests/test_post_cleanup.py 
================================================

from __future__ import annotations

from datetime import datetime, timedelta
from pathlib import Path

from app.extensions import db
from app.models import (
    Feed,
    Identification,
    ModelCall,
    Post,
    ProcessingJob,
    TranscriptSegment,
)
from app.post_cleanup import cleanup_processed_posts, count_cleanup_candidates


def _create_feed() -> Feed:
    """Persist and return a minimal Feed row for the cleanup tests."""
    feed = Feed(
        title="Test Feed",
        description="desc",
        author="author",
        rss_url="https://example.com/feed.xml",
        image_url="https://example.com/image.png",
    )
    db.session.add(feed)
    db.session.commit()
    return feed


def _create_post(feed: Feed, guid: str, download_url: str) -> Post:
    """Persist and return a whitelisted Post attached to *feed*."""
    post = Post(
        feed_id=feed.id,
        guid=guid,
        download_url=download_url,
        title=f"Episode {guid}",
        description="test",
        whitelisted=True,
    )
    db.session.add(post)
    db.session.commit()
    return post


def test_cleanup_removes_expired_posts(app, tmp_path) -> None:
    """Posts whose jobs completed before the retention window are cleaned up."""
    with app.app_context():
        feed = _create_feed()
        old_post = _create_post(feed, "old-guid", "https://example.com/old.mp3")
        recent_post = _create_post(
            feed, "recent-guid", "https://example.com/recent.mp3"
        )

        # Give the old post real audio files so cleanup has something to delete.
        old_processed = Path(tmp_path) / "old_processed.mp3"
        old_unprocessed = Path(tmp_path) / "old_unprocessed.mp3"
        old_processed.write_text("processed")
        old_unprocessed.write_text("unprocessed")
        old_post.processed_audio_path = str(old_processed)
        old_post.unprocessed_audio_path = str(old_unprocessed)
        db.session.commit()

        # 10-day-old completed job: outside the 5-day retention used below.
        completed_at = datetime.utcnow() - timedelta(days=10)
        db.session.add(
            ProcessingJob(
                id="job-old",
                post_guid=old_post.guid,
                status="completed",
                current_step=4,
                total_steps=4,
                progress_percentage=100.0,
                created_at=completed_at,
                started_at=completed_at,
                completed_at=completed_at,
            )
        )

        # 2-day-old completed job: inside the retention window, must survive.
        recent_completed = datetime.utcnow() - timedelta(days=2)
        db.session.add(
            ProcessingJob(
                id="job-recent",
                post_guid=recent_post.guid,
                status="completed",
                current_step=4,
                total_steps=4,
                progress_percentage=100.0,
                created_at=recent_completed,
                started_at=recent_completed,
completed_at=recent_completed,
            )
        )

        # Populate related tables for the old post to ensure cascading deletes
        model_call = ModelCall(
            post_id=old_post.id,
            first_segment_sequence_num=0,
            last_segment_sequence_num=0,
            model_name="test",
            prompt="prompt",
            response="resp",
            status="completed",
            timestamp=completed_at,
        )
        db.session.add(model_call)

        segment = TranscriptSegment(
            post_id=old_post.id,
            sequence_num=0,
            start_time=0.0,
            end_time=1.0,
            text="segment",
        )
        db.session.add(segment)
        db.session.flush()

        db.session.add(
            Identification(
                transcript_segment_id=segment.id,
                model_call_id=model_call.id,
                confidence=0.5,
                label="ad",
            )
        )
        db.session.commit()

        removed = cleanup_processed_posts(retention_days=5)

        # Only the expired post was cleaned; the row itself survives but is reset.
        assert removed == 1
        cleaned_old_post = Post.query.filter_by(guid="old-guid").first()
        assert cleaned_old_post is not None
        assert cleaned_old_post.whitelisted is False
        assert cleaned_old_post.processed_audio_path is None
        assert cleaned_old_post.unprocessed_audio_path is None
        assert Post.query.filter_by(guid="recent-guid").first() is not None
        # Dependent rows for the old post are gone (cascade), files deleted from disk.
        assert ProcessingJob.query.filter_by(post_guid="old-guid").first() is None
        assert Identification.query.count() == 0
        assert TranscriptSegment.query.count() == 0
        assert ModelCall.query.count() == 0
        assert not old_processed.exists()
        assert not old_unprocessed.exists()


def test_cleanup_skips_when_retention_disabled(app) -> None:
    """retention_days=None disables cleanup entirely, even for old posts."""
    with app.app_context():
        feed = _create_feed()
        post = _create_post(feed, "guid", "https://example.com/audio.mp3")
        completed_at = datetime.utcnow() - timedelta(days=10)
        db.session.add(
            ProcessingJob(
                id="job-disable",
                post_guid=post.guid,
                status="completed",
                current_step=4,
                total_steps=4,
                progress_percentage=100.0,
                created_at=completed_at,
                started_at=completed_at,
                completed_at=completed_at,
            )
        )
        db.session.commit()

        removed = cleanup_processed_posts(retention_days=None)

        assert removed == 0
        assert Post.query.filter_by(guid="guid").first() is not None


def test_cleanup_includes_non_whitelisted_processed_posts(app, tmp_path) -> None:
    with
app.app_context(): feed = _create_feed() post = _create_post(feed, "non-white", "https://example.com/nonwhite.mp3") post.whitelisted = False post.release_date = datetime.utcnow() - timedelta(days=10) processed = tmp_path / "processed.mp3" processed.write_text("audio") post.processed_audio_path = str(processed) # Add old completed job so post qualifies for cleanup completed_at = datetime.utcnow() - timedelta(days=10) db.session.add( ProcessingJob( id="job-non-white", post_guid=post.guid, status="completed", current_step=4, total_steps=4, progress_percentage=100.0, created_at=completed_at, started_at=completed_at, completed_at=completed_at, ) ) db.session.commit() count, _ = count_cleanup_candidates(retention_days=5) assert count == 1 removed = cleanup_processed_posts(retention_days=5) assert removed == 1 cleaned_post = Post.query.filter_by(guid="non-white").first() assert cleaned_post is not None assert cleaned_post.whitelisted is False assert cleaned_post.processed_audio_path is None assert cleaned_post.unprocessed_audio_path is None def test_cleanup_skips_unprocessed_unwhitelisted_posts(app) -> None: with app.app_context(): feed = _create_feed() post = _create_post(feed, "non-white-2", "https://example.com/nonwhite2.mp3") post.whitelisted = False post.release_date = datetime.utcnow() - timedelta(days=10) db.session.commit() count, _ = count_cleanup_candidates(retention_days=5) assert count == 0 removed = cleanup_processed_posts(retention_days=5) assert removed == 0 assert Post.query.filter_by(guid="non-white-2").first() is not None ================================================ FILE: src/tests/test_post_routes.py ================================================ import datetime from types import SimpleNamespace from unittest import mock from flask import g from app.extensions import db from app.models import Feed, Post, User from app.routes.post_routes import post_bp from app.runtime_config import config as runtime_config def 
test_download_endpoints_increment_counter(app, tmp_path): """Ensure both processed and original downloads increment the counter.""" app.testing = True app.register_blueprint(post_bp) with app.app_context(): feed = Feed(title="Test Feed", rss_url="https://example.com/feed.xml") db.session.add(feed) db.session.commit() processed_audio = tmp_path / "processed.mp3" processed_audio.write_bytes(b"processed audio") original_audio = tmp_path / "original.mp3" original_audio.write_bytes(b"original audio") post = Post( feed_id=feed.id, guid="test-guid", download_url="https://example.com/audio.mp3", title="Test Episode", processed_audio_path=str(processed_audio), unprocessed_audio_path=str(original_audio), whitelisted=True, ) db.session.add(post) db.session.commit() client = app.test_client() # Mock writer_client to simulate DB update with mock.patch("app.routes.post_routes.writer_client") as mock_writer: def side_effect(action, params, wait=False): if action == "increment_download_count": post_id = params["post_id"] Post.query.filter_by(id=post_id).update( {Post.download_count: (Post.download_count or 0) + 1} ) db.session.commit() mock_writer.action.side_effect = side_effect response = client.get(f"/api/posts/{post.guid}/download") assert response.status_code == 200 db.session.refresh(post) assert post.download_count == 1 response = client.get(f"/api/posts/{post.guid}/download/original") assert response.status_code == 200 db.session.refresh(post) assert post.download_count == 2 def test_download_triggers_processing_when_enabled(app): """Start processing when processed audio is missing and toggle is enabled.""" app.testing = True app.register_blueprint(post_bp) with app.app_context(): feed = Feed(title="Test Feed", rss_url="https://example.com/feed.xml") db.session.add(feed) db.session.commit() post = Post( feed_id=feed.id, guid="missing-audio-guid", download_url="https://example.com/audio.mp3", title="Missing Audio", whitelisted=True, ) db.session.add(post) 
db.session.commit() post_guid = post.guid client = app.test_client() original_flag = runtime_config.autoprocess_on_download runtime_config.autoprocess_on_download = True try: with mock.patch("app.routes.post_routes.get_jobs_manager") as mock_mgr: mock_mgr.return_value.start_post_processing.return_value = { "status": "started", "job_id": "job-123", } response = client.get(f"/api/posts/{post_guid}/download") assert response.status_code == 202 payload = response.get_json() assert payload["status"] == "started" mock_mgr.return_value.start_post_processing.assert_called_once_with( post_guid, priority="download", requested_by_user_id=None, billing_user_id=None, ) finally: runtime_config.autoprocess_on_download = original_flag def test_download_missing_audio_returns_404_when_disabled(app): """Keep existing 404 behavior when toggle is off.""" app.testing = True app.register_blueprint(post_bp) with app.app_context(): feed = Feed(title="Test Feed", rss_url="https://example.com/feed.xml") db.session.add(feed) db.session.commit() post = Post( feed_id=feed.id, guid="missing-audio-404", download_url="https://example.com/audio.mp3", title="Missing Audio", whitelisted=True, ) db.session.add(post) db.session.commit() post_guid = post.guid client = app.test_client() original_flag = runtime_config.autoprocess_on_download runtime_config.autoprocess_on_download = False try: with mock.patch("app.routes.post_routes.get_jobs_manager") as mock_mgr: response = client.get(f"/api/posts/{post_guid}/download") assert response.status_code == 404 mock_mgr.return_value.start_post_processing.assert_not_called() finally: runtime_config.autoprocess_on_download = original_flag def test_download_auto_whitelists_post(app, tmp_path): """Download request should whitelist the post automatically.""" app.testing = True app.register_blueprint(post_bp) with app.app_context(): feed = Feed(title="Test Feed", rss_url="https://example.com/feed.xml") db.session.add(feed) db.session.commit() processed_audio = 
tmp_path / "processed.mp3" processed_audio.write_bytes(b"processed audio") post = Post( feed_id=feed.id, guid="auto-whitelist-guid", download_url="https://example.com/audio.mp3", title="Auto Whitelist Episode", processed_audio_path=str(processed_audio), whitelisted=False, ) db.session.add(post) db.session.commit() post_guid = post.guid post_id = post.id client = app.test_client() original_flag = runtime_config.autoprocess_on_download runtime_config.autoprocess_on_download = True with mock.patch("app.routes.post_routes.writer_client") as mock_writer: mock_writer.action.return_value = SimpleNamespace(success=True, data=None) response = client.get(f"/api/posts/{post_guid}/download") assert response.status_code == 200 mock_writer.action.assert_has_calls( [ mock.call("whitelist_post", {"post_id": post_id}, wait=True), mock.call("increment_download_count", {"post_id": post_id}, wait=False), ] ) runtime_config.autoprocess_on_download = original_flag def test_download_rejects_when_not_whitelisted_and_toggle_off(app): """Ensure download is forbidden when not whitelisted and auto-process toggle is off.""" app.testing = True app.register_blueprint(post_bp) with app.app_context(): feed = Feed(title="Test Feed", rss_url="https://example.com/feed.xml") db.session.add(feed) db.session.commit() post = Post( feed_id=feed.id, guid="no-autoprocess-whitelist", download_url="https://example.com/audio.mp3", title="No Auto", whitelisted=False, ) db.session.add(post) db.session.commit() post_guid = post.guid client = app.test_client() original_flag = runtime_config.autoprocess_on_download runtime_config.autoprocess_on_download = False try: response = client.get(f"/api/posts/{post_guid}/download") assert response.status_code == 403 finally: runtime_config.autoprocess_on_download = original_flag def test_toggle_whitelist_all_requires_admin(app): """Ensure bulk whitelist actions are limited to admins.""" app.testing = True app.register_blueprint(post_bp) app.config["AUTH_SETTINGS"] = 
SimpleNamespace(require_auth=True) with app.app_context(): admin_user = User(username="admin", password_hash="hash", role="admin") regular_user = User(username="user", password_hash="hash", role="user") feed = Feed(title="Admin Feed", rss_url="https://example.com/feed.xml") db.session.add_all([admin_user, regular_user, feed]) db.session.commit() posts = [ Post( feed_id=feed.id, guid=f"guid-{idx}", download_url=f"https://example.com/{idx}.mp3", title=f"Episode {idx}", whitelisted=False, ) for idx in range(2) ] db.session.add_all(posts) db.session.commit() admin_id = admin_user.id regular_id = regular_user.id feed_id = feed.id current_user = {"id": admin_id} @app.before_request def _mock_auth() -> None: g.current_user = SimpleNamespace(id=current_user["id"]) client = app.test_client() current_user["id"] = regular_id response = client.post(f"/api/feeds/{feed_id}/toggle-whitelist-all") assert response.status_code == 403 assert response.get_json()["error"].startswith("Only admins") current_user["id"] = admin_id response = client.post(f"/api/feeds/{feed_id}/toggle-whitelist-all") assert response.status_code == 200 with app.app_context(): whitelisted = Post.query.filter_by(feed_id=feed_id, whitelisted=True).count() assert whitelisted == 2 def test_feed_posts_pagination_and_filtering(app): """Feed posts endpoint should paginate and support whitelisted filter.""" app.testing = True app.register_blueprint(post_bp) with app.app_context(): feed = Feed(title="Paged Feed", rss_url="https://example.com/feed.xml") db.session.add(feed) db.session.commit() base_date = datetime.date(2024, 1, 1) posts = [] # Create 30 posts with descending dates; even ones whitelisted. 
for idx in range(30): post = Post( feed_id=feed.id, guid=f"guid-{idx}", download_url=f"https://example.com/{idx}.mp3", title=f"Episode {idx}", release_date=base_date + datetime.timedelta(days=idx), whitelisted=(idx % 2 == 0), ) posts.append(post) db.session.add_all(posts) db.session.commit() client = app.test_client() # Default page (1) should return 25 items ordered newest-first response = client.get(f"/api/feeds/{feed.id}/posts") assert response.status_code == 200 data = response.get_json() assert data["page"] == 1 assert data["page_size"] == 25 assert data["total"] == 30 assert data["total_pages"] == 2 assert len(data["items"]) == 25 # First item should be newest (idx 29) assert data["items"][0]["guid"] == "guid-29" # Last item on page 1 should be idx 5 (25 items: 29..5) assert data["items"][-1]["guid"] == "guid-5" # Page 2 should return remaining 5 response = client.get(f"/api/feeds/{feed.id}/posts", query_string={"page": 2}) assert response.status_code == 200 data_page_2 = response.get_json() assert data_page_2["page"] == 2 assert len(data_page_2["items"]) == 5 # Items should be 4..0 assert {item["guid"] for item in data_page_2["items"]} == { "guid-4", "guid-3", "guid-2", "guid-1", "guid-0", } # Whitelisted filter should only return whitelisted posts (15 total) response = client.get( f"/api/feeds/{feed.id}/posts", query_string={"whitelisted_only": "true"}, ) assert response.status_code == 200 filtered = response.get_json() assert filtered["total"] == 15 assert filtered["whitelisted_total"] == 15 assert all(item["whitelisted"] for item in filtered["items"]) ================================================ FILE: src/tests/test_posts.py ================================================ from pathlib import Path from unittest.mock import patch from app.models import Post from app.posts import remove_associated_files class TestPostsFunctions: """Test class for functions in the app.posts module.""" @patch("app.posts._remove_file_if_exists") 
# Continuation of the TestPostsFunctions @patch stack and the body of
# test_remove_associated_files_files_dont_exist: helpers are mocked so no real
# files exist; the test only checks _remove_file_if_exists was invoked at least
# once for the unprocessed path and that logger.debug was called. Then the
# banner for src/tests/test_process_audio.py: a 66,048 ms test MP3, a duration
# check, and test_clip_segment_with_fade (clip 3s-21s with 5s fades; expected
# duration uses a +56 ms fudge the author could not explain — see comment).
@patch("app.posts._dedupe_and_find_existing") @patch("app.posts._collect_processed_paths") @patch("app.posts.get_and_make_download_path") @patch("app.posts.logger") def test_remove_associated_files_files_dont_exist( self, mock_logger, mock_get_download_path, mock_collect_paths, mock_dedupe, mock_remove_file, app, ): """Test remove_associated_files when files don't exist.""" with app.app_context(): # Set up mocks mock_collect_paths.return_value = [Path("/path/to/processed.mp3")] mock_dedupe.return_value = ( [Path("/path/to/processed.mp3")], None, # No existing file found ) mock_get_download_path.return_value = "/path/to/unprocessed.mp3" # Create test post post = Post(id=1, title="Test Post") # Call the function remove_associated_files(post) # Verify _remove_file_if_exists was called for unprocessed path assert mock_remove_file.call_count >= 1 # Verify debug logging for no processed file mock_logger.debug.assert_called() ================================================ FILE: src/tests/test_process_audio.py ================================================ import tempfile from pathlib import Path from podcast_processor.audio import ( clip_segments_with_fade, get_audio_duration_ms, split_audio, ) TEST_FILE_DURATION = 66_048 TEST_FILE_PATH = "src/tests/data/count_0_99.mp3" def test_get_duration_ms() -> None: assert get_audio_duration_ms(TEST_FILE_PATH) == TEST_FILE_DURATION def test_clip_segment_with_fade() -> None: fade_len_ms = 5_000 ad_start_offset_ms, ad_end_offset_ms = 3_000, 21_000 with tempfile.NamedTemporaryFile(delete=True, suffix=".mp3") as temp_file: clip_segments_with_fade( [(ad_start_offset_ms, ad_end_offset_ms)], fade_len_ms, TEST_FILE_PATH, temp_file.name, ) expected_duration = ( TEST_FILE_DURATION - (ad_end_offset_ms - ad_start_offset_ms) + 2 * fade_len_ms + 56 # not sure where this fudge comes from ) actual_duration = get_audio_duration_ms(temp_file.name) assert actual_duration is not None, "Failed to get audio duration" assert abs(actual_duration -
# Variants of the fade test with the ad segment at the file's beginning
# (0-18,000 ms) and at its end; all three use the same expected-duration
# formula and a 60 ms tolerance. test_split_audio begins: split into ~38s
# chunks and compare each chunk against an expected (duration_ms, filesize)
# table; the table straddles this dumped line and the next.
expected_duration) <= 60, ( f"Duration mismatch: expected {expected_duration}ms, got {actual_duration}ms, " f"difference: {abs(actual_duration - expected_duration)}ms" ) def test_clip_segment_with_fade_beginning() -> None: fade_len_ms = 5_000 ad_start_offset_ms, ad_end_offset_ms = 0, 18_000 with tempfile.NamedTemporaryFile(delete=True, suffix=".mp3") as temp_file: clip_segments_with_fade( [(ad_start_offset_ms, ad_end_offset_ms)], fade_len_ms, TEST_FILE_PATH, temp_file.name, ) expected_duration = ( TEST_FILE_DURATION - (ad_end_offset_ms - ad_start_offset_ms) + 2 * fade_len_ms + 56 # not sure where this fudge comes from ) actual_duration = get_audio_duration_ms(temp_file.name) assert actual_duration is not None, "Failed to get audio duration" assert abs(actual_duration - expected_duration) <= 60, ( f"Duration mismatch: expected {expected_duration}ms, got {actual_duration}ms, " f"difference: {abs(actual_duration - expected_duration)}ms" ) def test_clip_segment_with_fade_end() -> None: fade_len_ms = 5_000 ad_start_offset_ms, ad_end_offset_ms = ( TEST_FILE_DURATION - 18_000, TEST_FILE_DURATION, ) with tempfile.NamedTemporaryFile(delete=True, suffix=".mp3") as temp_file: clip_segments_with_fade( [(ad_start_offset_ms, ad_end_offset_ms)], fade_len_ms, TEST_FILE_PATH, temp_file.name, ) expected_duration = ( TEST_FILE_DURATION - (ad_end_offset_ms - ad_start_offset_ms) + 2 * fade_len_ms + 56 # not sure where this fudge comes from ) actual_duration = get_audio_duration_ms(temp_file.name) assert actual_duration is not None, "Failed to get audio duration" assert abs(actual_duration - expected_duration) <= 60, ( f"Duration mismatch: expected {expected_duration}ms, got {actual_duration}ms, " f"difference: {abs(actual_duration - expected_duration)}ms" ) def test_split_audio() -> None: with tempfile.TemporaryDirectory() as temp_dir: temp_dir_path = Path(temp_dir) split_audio(Path(TEST_FILE_PATH), temp_dir_path, 38_000) expected = { "0.mp3": (6_384, 38_108), "1.mp3": (6_384, 38_252),
# Rest of the expected-chunk table (splits 2..10, last chunk shorter) and the
# per-chunk assertions: duration within 100 ms, filesize within 500 bytes.
# Then the banner + opening module docstring of
# src/tests/test_rate_limiting_config.py (the docstring closes on the next line).
"2.mp3": (6_384, 38_108), "3.mp3": (6_384, 38_108), "4.mp3": (6_384, 38_252), "5.mp3": (6_384, 38_252), "6.mp3": (6_384, 38_252), "7.mp3": (6_384, 38_108), "8.mp3": (6_384, 38_108), "9.mp3": (6_384, 38_252), "10.mp3": (2_784, 16_508), } for split in temp_dir_path.iterdir(): assert split.name in expected duration_ms, filesize = expected[split.name] actual_duration = get_audio_duration_ms(str(split)) assert ( actual_duration is not None ), f"Failed to get audio duration for {split}" assert abs(actual_duration - duration_ms) <= 100, ( f"Duration mismatch for {split}. Expected {duration_ms}ms, got {actual_duration}ms, " f"difference: {abs(actual_duration - duration_ms)}ms" ) assert ( abs(filesize - split.stat().st_size) <= 500 ), f"filesize <> 500 bytes for {split}. found {split.stat().st_size}, expected {filesize}" # pylint: disable=line-too-long ================================================ FILE: src/tests/test_rate_limiting_config.py ================================================ """ Tests for new rate limiting configuration options.
# TestRateLimitingConfig: validates shared.config.Config rate-limit fields.
# Defaults asserted: llm_max_concurrent_calls == 3, llm_max_retry_attempts == 5,
# llm_max_input_tokens_per_call is None, llm_enable_token_rate_limiting is
# False, llm_max_input_tokens_per_minute is None. The custom-values test
# round-trips explicit overrides for each field. The partial-override test
# (only llm_max_retry_attempts=7) starts here and finishes on the next line.
""" from typing import Any from shared.config import Config class TestRateLimitingConfig: """Test cases for rate limiting configuration.""" def test_default_rate_limiting_config(self) -> None: """Test that rate limiting defaults are properly set.""" config_data: dict[str, Any] = { "llm_api_key": "test-key", "output": { "fade_ms": 3000, "min_ad_segement_separation_seconds": 60, "min_ad_segment_length_seconds": 14, "min_confidence": 0.8, }, "processing": { "num_segments_to_input_to_prompt": 30, }, } config = Config(**config_data) # Test default values assert config.llm_max_concurrent_calls == 3 assert config.llm_max_retry_attempts == 5 assert config.llm_max_input_tokens_per_call is None assert config.llm_enable_token_rate_limiting is False assert config.llm_max_input_tokens_per_minute is None def test_custom_rate_limiting_config(self) -> None: """Test that custom rate limiting values are properly set.""" config_data: dict[str, Any] = { "llm_api_key": "test-key", "llm_max_concurrent_calls": 5, "llm_max_retry_attempts": 10, "llm_max_input_tokens_per_call": 50000, "llm_enable_token_rate_limiting": False, "llm_max_input_tokens_per_minute": 100000, "output": { "fade_ms": 3000, "min_ad_segement_separation_seconds": 60, "min_ad_segment_length_seconds": 14, "min_confidence": 0.8, }, "processing": { "num_segments_to_input_to_prompt": 30, }, } config = Config(**config_data) # Test custom values assert config.llm_max_concurrent_calls == 5 assert config.llm_max_retry_attempts == 10 assert config.llm_max_input_tokens_per_call == 50000 assert config.llm_enable_token_rate_limiting is False assert config.llm_max_input_tokens_per_minute == 100000 def test_partial_rate_limiting_config(self) -> None: """Test that partial rate limiting config uses defaults for missing values.""" config_data: dict[str, Any] = { "llm_api_key": "test-key", "llm_max_retry_attempts": 7, # Only override this one "output": { "fade_ms": 3000, "min_ad_segement_separation_seconds": 60,
# Partial-override assertions (7 retries, everything else default) and
# test_config_field_descriptions, which inspects Config.model_fields (pydantic)
# for expected description phrases. NOTE(review): "min_ad_segement_separation_
# seconds" is misspelled consistently across these configs — it appears to be
# the real field name in shared.config; confirm before "fixing". Then the
# banner + docstring opening for src/tests/test_rate_limiting_edge_cases.py.
"min_ad_segment_length_seconds": 14, "min_confidence": 0.8, }, "processing": { "num_segments_to_input_to_prompt": 30, }, } config = Config(**config_data) # Test that custom value is set assert config.llm_max_retry_attempts == 7 # Test that defaults are used for other values assert config.llm_max_concurrent_calls == 3 assert config.llm_max_input_tokens_per_call is None assert config.llm_enable_token_rate_limiting is False assert config.llm_max_input_tokens_per_minute is None def test_config_field_descriptions(self) -> None: """Test that config fields have proper descriptions.""" # Test that the field definitions include helpful descriptions config_fields = Config.model_fields assert "llm_max_concurrent_calls" in config_fields assert "Maximum concurrent LLM calls" in str( config_fields["llm_max_concurrent_calls"].description ) assert "llm_max_retry_attempts" in config_fields assert "Maximum retry attempts" in str( config_fields["llm_max_retry_attempts"].description ) assert "llm_enable_token_rate_limiting" in config_fields assert "client-side token-based rate limiting" in str( config_fields["llm_enable_token_rate_limiting"].description ) ================================================ FILE: src/tests/test_rate_limiting_edge_cases.py ================================================ """ Additional edge case tests for rate limiting functionality.
# TestRateLimitingEdgeCases: token counting for empty content (0), a message
# missing "content" (0), and a ~50k-char message (>10000 estimated tokens);
# then boundary behavior when usage exactly equals the per-minute limit (empty
# message may still proceed; one more token's outcome is left unasserted on
# purpose). A time-window edge test begins at the end of this line.
""" import time from typing import Any from unittest.mock import patch from podcast_processor.ad_classifier import AdClassifier from podcast_processor.token_rate_limiter import TokenRateLimiter from .test_helpers import create_test_config class TestRateLimitingEdgeCases: """Test edge cases and boundary conditions for rate limiting.""" def test_token_counting_edge_cases(self) -> None: """Test token counting with edge cases.""" limiter = TokenRateLimiter() # Test empty content messages: list[dict[str, str]] = [{"role": "user", "content": ""}] tokens = limiter.count_tokens(messages, "gpt-4") assert tokens == 0 # Test malformed message structure messages = [{"role": "user"}] # Missing content tokens = limiter.count_tokens(messages, "gpt-4") assert tokens == 0 # Test very large message large_content = "word " * 10000 # ~50k characters messages = [{"role": "user", "content": large_content}] tokens = limiter.count_tokens(messages, "gpt-4") assert tokens > 10000 # Should estimate significant tokens def test_rate_limiter_boundary_conditions(self) -> None: """Test rate limiter at exact boundary conditions.""" limiter = TokenRateLimiter(tokens_per_minute=100, window_minutes=1) current_time = time.time() # Fill exactly to the limit limiter.token_usage.append((current_time - 30, 100)) # Try to add exactly 0 more tokens messages: list[dict[str, str]] = [] can_proceed, wait_seconds = limiter.check_rate_limit(messages, "gpt-4") assert can_proceed is True assert wait_seconds == 0.0 # Try to add 1 more token (should exceed) messages = [{"role": "user", "content": "x"}] # Minimal content can_proceed, wait_seconds = limiter.check_rate_limit(messages, "gpt-4") # This might pass or fail depending on exact token counting, but should be consistent def test_rate_limiter_time_window_edge(self) -> None: """Test rate limiter behavior at time window boundaries.""" limiter = TokenRateLimiter(tokens_per_minute=100, window_minutes=1) current_time = time.time() # Add usage at different window
# Window-edge accounting (an entry 61s old is excluded, 59s old counts);
# minimum boundary config values (all 1s) via create_test_config; then
# retryable-error classification: 429 / rate-limit / 503 variants must be
# retryable, with a litellm InternalServerError appended only when importable
# (ImportError/TypeError tolerated). AdClassifier is built with a mocked
# db.session. The non-retryable list starts at the end of this line.
boundaries limiter.token_usage.append((current_time - 61, 50)) # Outside 60-second window limiter.token_usage.append((current_time - 59, 40)) # Inside window # Check current usage usage = limiter._get_current_usage(current_time) assert usage == 40 # Only the second entry should count def test_config_validation_boundary_values(self) -> None: """Test configuration with boundary values.""" # Test minimum values config = create_test_config( llm_max_concurrent_calls=1, llm_max_retry_attempts=1, llm_max_input_tokens_per_call=1, llm_max_input_tokens_per_minute=1, ) assert config.llm_max_concurrent_calls == 1 assert config.llm_max_retry_attempts == 1 assert config.llm_max_input_tokens_per_call == 1 assert config.llm_max_input_tokens_per_minute == 1 def test_error_classification_comprehensive(self) -> None: """Test comprehensive error classification scenarios.""" config = create_test_config() with patch("podcast_processor.ad_classifier.db.session") as mock_session: classifier = AdClassifier(config=config, db_session=mock_session) retryable_errors = [ Exception("HTTP 429: Rate limit exceeded"), Exception("rate_limit_error: too many requests"), Exception("RateLimitError: Request rate limit exceeded"), Exception("Service temporarily unavailable (503)"), Exception("service unavailable"), Exception("Error 503: Service unavailable"), Exception("rate limit reached"), ] # Test specific LiteLLM exceptions by importing at runtime try: from litellm.exceptions import InternalServerError # InternalServerError requires specific parameters, so create a simple one retryable_errors.append( InternalServerError( "Service unavailable", llm_provider="test", model="test" ) ) except (ImportError, TypeError): # If litellm.exceptions not available or constructor changed, skip this specific test pass for error in retryable_errors: assert classifier._is_retryable_error(error) is True non_retryable_errors = [ Exception("Invalid API key (401)"), Exception("Bad request (400)"), Exception("Forbidden
# Non-retryable errors (403/400/401/404, ValueError, "Connection timeout",
# plain 500) must classify False. test_backoff_progression (time.sleep mocked):
# rate-limit errors back off 60 * 2^attempt seconds — 60, 120, 240 across
# attempts 0..2 — via classifier._handle_retryable_error on a ModelCall stub.
(403)"), ValueError("Invalid input format"), Exception("Model not found (404)"), Exception("Connection timeout"), # Not in the retryable list Exception("Internal server error (500)"), # Not in the retryable list ] for error in non_retryable_errors: assert classifier._is_retryable_error(error) is False @patch("time.sleep") def test_backoff_progression(self, mock_sleep: Any) -> None: """Test the complete backoff progression for different error types.""" config = create_test_config() with patch("podcast_processor.ad_classifier.db.session") as mock_session: classifier = AdClassifier(config=config, db_session=mock_session) from app.models import ModelCall model_call = ModelCall(id=1, error_message=None) # Test rate limit error backoff progression rate_limit_error = Exception("rate_limit_error: too many requests") # First attempt (attempt=0): 60 * (2^0) = 60 classifier._handle_retryable_error( model_call_obj=model_call, error=rate_limit_error, attempt=0, current_attempt_num=1, ) # Second attempt (attempt=1): 60 * (2^1) = 120 classifier._handle_retryable_error( model_call_obj=model_call, error=rate_limit_error, attempt=1, current_attempt_num=2, ) # Third attempt (attempt=2): 60 * (2^2) = 240 classifier._handle_retryable_error( model_call_obj=model_call, error=rate_limit_error, attempt=2, current_attempt_num=3, ) # Check the sleep calls expected_calls = [60, 120, 240] actual_calls = [call[0][0] for call in mock_sleep.call_args_list] assert actual_calls == expected_calls # Reset for non-rate-limit error test mock_sleep.reset_mock() # Test regular error backoff progression: 1, 2, 4 seconds regular_error = Exception("Internal server error") classifier._handle_retryable_error( model_call_obj=model_call, error=regular_error, attempt=0, current_attempt_num=1, ) classifier._handle_retryable_error( model_call_obj=model_call, error=regular_error, attempt=1, current_attempt_num=2, ) classifier._handle_retryable_error( model_call_obj=model_call, error=regular_error, attempt=2,
# Regular (non-rate-limit) errors back off 1/2/4 s. Then: short-window usage
# accounting (only the 5s-old entry counts); model-name case handling for
# configure_rate_limiter_for_model, resetting the module-level _RATE_LIMITER
# singleton between cases (200000 tpm for gpt-4o-mini matches, 30000 default
# otherwise); and a thread-safety stress test whose worker def starts here.
current_attempt_num=3, ) expected_calls = [1, 2, 4] actual_calls = [call[0][0] for call in mock_sleep.call_args_list] assert actual_calls == expected_calls def test_rate_limiter_with_very_short_window(self) -> None: """Test rate limiter with very short time windows.""" # Use 1 minute window but test with 10-second spacing limiter = TokenRateLimiter(tokens_per_minute=60, window_minutes=1) current_time = time.time() # Add usage just outside typical processing time limiter.token_usage.append((current_time - 65, 30)) # Outside 1-min window limiter.token_usage.append((current_time - 5, 20)) # 5 seconds ago usage = limiter._get_current_usage(current_time) assert usage == 20 # Only the recent usage should count def test_model_configuration_case_sensitivity(self) -> None: """Test that model configuration handles different cases and formats.""" from podcast_processor.token_rate_limiter import ( configure_rate_limiter_for_model, ) # Test different cases of the same model test_cases = [ "gpt-4o-mini", "GPT-4O-MINI", # Different case "some-provider/gpt-4o-mini/version", # With provider prefix/suffix ] for model_name in test_cases: # Clear singleton to ensure fresh test import podcast_processor.token_rate_limiter as trl_module trl_module._RATE_LIMITER = None # Only the exact lowercase match should work due to current implementation limiter = configure_rate_limiter_for_model(model_name) if "gpt-4o-mini" in model_name.lower(): expected_limit = ( 200000 if model_name == "gpt-4o-mini" or "gpt-4o-mini" in model_name else 30000 ) else: expected_limit = 30000 # Default assert limiter.tokens_per_minute == expected_limit def test_thread_safety_stress(self) -> None: """More intensive thread safety test.""" import threading limiter = TokenRateLimiter( tokens_per_minute=50000 ) # Higher limit for stress test messages: list[dict[str, str]] = [{"role": "user", "content": "test " * 100}] results: list[tuple[int, int, float]] = [] errors: list[tuple[int, Exception]] = [] def worker(worker_id:
# Stress body: 10 threads x 20 wait_if_needed calls each — expects 0 errors,
# exactly 200 usage records, and max observed wait < 5 s. Then the banner +
# imports for src/tests/test_session_auth.py and the auth_app fixture: Flask
# app with in-memory SQLite, AuthSettings(require_auth=True), a seeded admin
# user, cleared failure_rate_limiter storage, and init_auth_middleware(app).
int) -> None: try: for i in range(20): start_time = time.time() limiter.wait_if_needed(messages, "gpt-4") end_time = time.time() results.append((worker_id, i, end_time - start_time)) except Exception as e: errors.append((worker_id, e)) # Run 10 threads with 20 calls each threads = [] for worker_id in range(10): thread = threading.Thread(target=worker, args=(worker_id,)) threads.append(thread) thread.start() for thread in threads: thread.join() # Should have no errors assert len(errors) == 0 # Should have recorded all calls assert len(limiter.token_usage) == 200 # 10 threads * 20 calls # All calls should complete relatively quickly (no excessive waiting) max_wait_time = max(result[2] for result in results) assert max_wait_time < 5.0 # Should not wait more than 5 seconds ================================================ FILE: src/tests/test_session_auth.py ================================================ from __future__ import annotations from urllib.parse import parse_qs, urlparse import pytest from flask import Flask, Response, g, jsonify from app.auth import AuthSettings from app.auth.middleware import init_auth_middleware from app.auth.state import failure_rate_limiter from app.extensions import db from app.models import Feed, Post, User from app.routes.auth_routes import auth_bp from app.routes.feed_routes import feed_bp @pytest.fixture def auth_app() -> Flask: app = Flask(__name__) app.config.update( SECRET_KEY="test-secret", SESSION_COOKIE_NAME="podly_session", SQLALCHEMY_DATABASE_URI="sqlite:///:memory:", SQLALCHEMY_TRACK_MODIFICATIONS=False, ) settings = AuthSettings( require_auth=True, admin_username="admin", admin_password="password", ) app.config["AUTH_SETTINGS"] = settings app.config["REQUIRE_AUTH"] = True db.init_app(app) with app.app_context(): db.create_all() user = User(username="admin", role="admin") user.set_password("password") db.session.add(user) db.session.commit() failure_rate_limiter._storage.clear() init_auth_middleware(app)
# Fixture tail: registers auth/feed blueprints plus three ad-hoc routes that
# 500 if g.current_user is missing; yields the app, then drops the DB.
# NOTE(review): the route string "/api/posts//download" paired with a
# download(guid: str) handler strongly suggests the dump stripped a "<guid>"
# placeholder from the rule — confirm against the repo before relying on it.
# Then the session tests: login sets the podly_session cookie and unlocks
# /api/auth/me and /api/protected; logout returns 204 and subsequent protected
# calls 401 with no WWW-Authenticate header; the bare-401 JSON test starts here.
app.register_blueprint(auth_bp) app.register_blueprint(feed_bp) @app.route("/api/protected", methods=["GET"]) def protected() -> Response: current = getattr(g, "current_user", None) if current is None: return jsonify({"error": "missing user"}), 500 return jsonify({"status": "ok", "user": current.username}) @app.route("/feed/1", methods=["GET"]) def feed() -> Response: current = getattr(g, "current_user", None) if current is None: return Response("missing user", status=500) return Response("ok", mimetype="text/plain") @app.route("/api/posts//download", methods=["GET"]) def download(guid: str) -> Response: del guid current = getattr(g, "current_user", None) if current is None: return Response("missing user", status=500) return Response("download", mimetype="text/plain") yield app with app.app_context(): db.session.remove() db.drop_all() def test_login_sets_session_cookie_and_allows_authenticated_requests( auth_app: Flask, ) -> None: client = auth_app.test_client() response = client.post( "/api/auth/login", json={"username": "admin", "password": "password"}, ) assert response.status_code == 200 set_cookie = response.headers.get("Set-Cookie", "") assert "podly_session" in set_cookie me = client.get("/api/auth/me") assert me.status_code == 200 assert me.get_json()["user"]["username"] == "admin" protected = client.get("/api/protected") assert protected.status_code == 200 assert protected.get_json()["status"] == "ok" def test_logout_clears_session(auth_app: Flask) -> None: client = auth_app.test_client() client.post("/api/auth/login", json={"username": "admin", "password": "password"}) response = client.post("/api/auth/logout") assert response.status_code == 204 protected = client.get("/api/protected") assert protected.status_code == 401 assert protected.headers.get("WWW-Authenticate") is None def test_protected_route_without_session_returns_json_401(auth_app: Flask) -> None: client = auth_app.test_client() response = client.get("/api/protected") assert
# Unauthenticated /api/protected → 401 JSON "Authentication required."; /feed/1
# without a token → 401 "Invalid or missing feed token". Share-link flow: an
# authenticated POST /api/feeds/{id}/share-link returns 201 with feed_token +
# feed_secret that also appear as query params in the returned URL, and an
# anonymous client can then GET the feed and the episode download with those
# query params. The stable-share-link test's setup starts at this line's end.
response.status_code == 401 assert response.get_json()["error"] == "Authentication required." assert response.headers.get("WWW-Authenticate") is None def test_feed_requires_token_when_no_session(auth_app: Flask) -> None: client = auth_app.test_client() unauthorized = client.get("/feed/1") assert unauthorized.status_code == 401 assert "Invalid or missing feed token" in unauthorized.get_data(as_text=True) def test_share_link_generates_token_and_allows_query_access(auth_app: Flask) -> None: client = auth_app.test_client() with auth_app.app_context(): feed = Feed(title="Example", rss_url="https://example.com/feed.xml") db.session.add(feed) db.session.commit() feed_id = feed.id post = Post( feed_id=feed_id, guid="episode-1", download_url="https://example.com/audio.mp3", title="Episode", whitelisted=True, ) db.session.add(post) db.session.commit() client.post("/api/auth/login", json={"username": "admin", "password": "password"}) share = client.post(f"/api/feeds/{feed_id}/share-link") assert share.status_code == 201 payload = share.get_json() assert payload["feed_id"] == feed_id token_id = payload["feed_token"] secret = payload["feed_secret"] parsed = urlparse(payload["url"]) params = parse_qs(parsed.query) assert params.get("feed_token", [None])[0] == token_id assert params.get("feed_secret", [None])[0] == secret anon_client = auth_app.test_client() feed_response = anon_client.get( f"/feed/{feed_id}", query_string={"feed_token": token_id, "feed_secret": secret}, ) assert feed_response.status_code == 200 assert feed_response.data == b"ok" download_response = anon_client.get( "/api/posts/episode-1/download", query_string={"feed_token": token_id, "feed_secret": secret}, ) assert download_response.status_code == 200 def test_share_link_returns_same_token_for_user_and_feed(auth_app: Flask) -> None: client = auth_app.test_client() with auth_app.app_context(): feed = Feed(title="Stable", rss_url="https://example.com/stable.xml") db.session.add(feed) db.session.commit() feed_id
# Requesting the share link twice yields identical url/token/secret (tokens are
# stable per user+feed). Then the whole of test_token_limit_config.py — config
# with llm_max_input_tokens_per_call=50000 vs. default None, both with
# num_segments_to_input_to_prompt == 400, runnable as a script — and the banner
# + docstring opening of test_token_rate_limiter.py (closes on the next line).
= feed.id client.post("/api/auth/login", json={"username": "admin", "password": "password"}) first = client.post(f"/api/feeds/{feed_id}/share-link").get_json() second = client.post(f"/api/feeds/{feed_id}/share-link").get_json() assert first["url"] == second["url"] assert first["feed_token"] == second["feed_token"] assert first["feed_secret"] == second["feed_secret"] ================================================ FILE: src/tests/test_token_limit_config.py ================================================ """ Simple integration test for the llm_max_input_tokens_per_call feature. """ from shared.test_utils import create_standard_test_config def test_config_validation() -> None: """Test that the config validation works with the new setting.""" # Test with token limit config_with_limit = create_standard_test_config(llm_max_input_tokens_per_call=50000) assert config_with_limit.llm_max_input_tokens_per_call == 50000 assert config_with_limit.processing.num_segments_to_input_to_prompt == 400 # Test without token limit config_without_limit = create_standard_test_config() assert config_without_limit.llm_max_input_tokens_per_call is None assert config_without_limit.processing.num_segments_to_input_to_prompt == 400 if __name__ == "__main__": test_config_validation() print("✓ Config validation test passed!") ================================================ FILE: src/tests/test_token_rate_limiter.py ================================================ """ Tests for the TokenRateLimiter class and related functionality.
# TestTokenRateLimiter: default init (30000 tokens/min, 60s window, empty
# usage) and custom init (window_minutes=2 → window_seconds 120); count_tokens
# on empty / single / multi-message inputs (>0 for non-empty content, 0 when
# "content" is missing); cleanup test setup begins at the end of this line.
""" import threading import time from unittest.mock import patch from podcast_processor.token_rate_limiter import ( TokenRateLimiter, configure_rate_limiter_for_model, get_rate_limiter, ) class TestTokenRateLimiter: """Test cases for the TokenRateLimiter class.""" def test_initialization(self) -> None: """Test rate limiter initialization with default and custom parameters.""" # Test default initialization limiter = TokenRateLimiter() assert limiter.tokens_per_minute == 30000 assert limiter.window_seconds == 60 assert len(limiter.token_usage) == 0 # Test custom initialization limiter = TokenRateLimiter(tokens_per_minute=15000, window_minutes=2) assert limiter.tokens_per_minute == 15000 assert limiter.window_seconds == 120 def test_count_tokens(self) -> None: """Test token counting functionality.""" limiter = TokenRateLimiter() # Test empty messages messages: list[dict[str, str]] = [] tokens = limiter.count_tokens(messages, "gpt-4") assert tokens == 0 # Test single message messages = [{"role": "user", "content": "Hello world"}] tokens = limiter.count_tokens(messages, "gpt-4") assert tokens > 0 # Should estimate some tokens # Test multiple messages messages = [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is the weather like today?"}, ] tokens = limiter.count_tokens(messages, "gpt-4") assert tokens > 0 def test_token_counting_fallback(self) -> None: """Test token counting fallback on error.""" limiter = TokenRateLimiter() # Test with malformed message (should use fallback) messages: list[dict[str, str]] = [{"role": "user"}] # Missing content tokens = limiter.count_tokens(messages, "gpt-4") assert tokens == 0 # Should return 0 for missing content def test_cleanup_old_usage(self) -> None: """Test cleanup of old token usage records.""" limiter = TokenRateLimiter(tokens_per_minute=1000, window_minutes=1) current_time = time.time() # Add some old usage records limiter.token_usage.append((current_time - 120, 100)) # 2
# _cleanup_old_usage drops the 2-minute-old record, keeping the 30s and 10s
# entries; _get_current_usage sums only in-window records (200 + 300 = 500);
# check_rate_limit: proceeds with zero wait under a 1000/min limit, but with a
# 100/min limit and a 90-token prior entry a longer message must be refused
# with a positive wait. test_record_usage begins at the end of this line.
minutes ago limiter.token_usage.append((current_time - 30, 200)) # 30 seconds ago limiter.token_usage.append((current_time - 10, 300)) # 10 seconds ago # Cleanup should remove the 2-minute-old record limiter._cleanup_old_usage(current_time) assert len(limiter.token_usage) == 2 assert limiter.token_usage[0][1] == 200 # 30 seconds ago should remain assert limiter.token_usage[1][1] == 300 # 10 seconds ago should remain def test_get_current_usage(self) -> None: """Test getting current token usage within time window.""" limiter = TokenRateLimiter(tokens_per_minute=1000, window_minutes=1) current_time = time.time() # Add usage records limiter.token_usage.append((current_time - 120, 100)) # Outside window limiter.token_usage.append((current_time - 30, 200)) # Within window limiter.token_usage.append((current_time - 10, 300)) # Within window usage = limiter._get_current_usage(current_time) assert usage == 500 # 200 + 300 (only records within window) def test_check_rate_limit_within_limits(self) -> None: """Test rate limit check when within limits.""" limiter = TokenRateLimiter(tokens_per_minute=1000) messages: list[dict[str, str]] = [{"role": "user", "content": "Short message"}] can_proceed, wait_seconds = limiter.check_rate_limit(messages, "gpt-4") assert can_proceed is True assert wait_seconds == 0.0 def test_check_rate_limit_exceeds_limits(self) -> None: """Test rate limit check when exceeding limits.""" limiter = TokenRateLimiter(tokens_per_minute=100) # Very low limit current_time = time.time() # Add usage that nearly fills the limit limiter.token_usage.append((current_time - 30, 90)) # Try to add more tokens that would exceed the limit messages: list[dict[str, str]] = [ { "role": "user", "content": "This is a longer message that should exceed the token limit", } ] can_proceed, wait_seconds = limiter.check_rate_limit(messages, "gpt-4") assert can_proceed is False assert wait_seconds > 0 def test_record_usage(self) -> None: """Test recording token usage.""" limiter =
# record_usage appends one (timestamp, token_count) tuple; wait_if_needed under
# a high limit returns in < 1s and records usage, while under a 50/min limit
# with 45 tokens already used it must call time.sleep (patched) once with a
# positive duration. test_get_usage_stats setup starts at this line's end.
TokenRateLimiter() messages: list[dict[str, str]] = [{"role": "user", "content": "Test message"}] initial_count = len(limiter.token_usage) limiter.record_usage(messages, "gpt-4") assert len(limiter.token_usage) == initial_count + 1 timestamp, token_count = limiter.token_usage[-1] assert timestamp > 0 assert token_count > 0 def test_wait_if_needed_no_wait(self) -> None: """Test wait_if_needed when no waiting is required.""" limiter = TokenRateLimiter(tokens_per_minute=10000) # High limit messages: list[dict[str, str]] = [{"role": "user", "content": "Short message"}] start_time = time.time() limiter.wait_if_needed(messages, "gpt-4") end_time = time.time() elapsed = end_time - start_time # Should not have waited significantly assert elapsed < 1.0 # Should have recorded usage assert len(limiter.token_usage) > 0 def test_wait_if_needed_with_wait(self) -> None: """Test wait_if_needed when waiting is required.""" limiter = TokenRateLimiter(tokens_per_minute=50) # Very low limit # Fill up the rate limit current_time = time.time() limiter.token_usage.append((current_time - 10, 45)) messages: list[dict[str, str]] = [ {"role": "user", "content": "This message should trigger waiting"} ] # Mock time.sleep to avoid actual waiting in tests with patch("time.sleep") as mock_sleep: limiter.wait_if_needed(messages, "gpt-4") # Should have called sleep mock_sleep.assert_called_once() call_args = mock_sleep.call_args[0] assert call_args[0] > 0 # Should have waited some positive amount def test_get_usage_stats(self) -> None: """Test getting usage statistics.""" limiter = TokenRateLimiter(tokens_per_minute=1000) # Add some usage current_time = time.time() limiter.token_usage.append((current_time - 30, 200)) limiter.token_usage.append((current_time - 10, 300)) stats = limiter.get_usage_stats() assert "current_usage" in stats assert "limit" in stats assert "usage_percentage" in stats assert "window_seconds" in stats assert "active_records" in stats assert stats["current_usage"] == 500
# Stats values: 500/1000 = 50.0%, 60s window, 2 active records; thread-safety:
# 5 threads x 10 wait_if_needed calls = exactly 50 usage records. Then
# TestGlobalRateLimiter: get_rate_limiter is a singleton per limit (same
# instance at 5000, new instance at 8000); Anthropic models get the 30000
# default; the OpenAI cases (which reset the module singleton) start here.
assert stats["limit"] == 1000 assert stats["usage_percentage"] == 50.0 assert stats["window_seconds"] == 60 assert stats["active_records"] == 2 def test_thread_safety(self) -> None: """Test that the rate limiter is thread-safe.""" limiter = TokenRateLimiter(tokens_per_minute=10000) messages: list[dict[str, str]] = [{"role": "user", "content": "Test message"}] def worker() -> None: for _ in range(10): limiter.wait_if_needed(messages, "gpt-4") # Run multiple threads concurrently threads = [] for _ in range(5): thread = threading.Thread(target=worker) threads.append(thread) thread.start() # Wait for all threads to complete for thread in threads: thread.join() # Should have recorded usage from all threads assert len(limiter.token_usage) == 50 # 5 threads * 10 calls each class TestGlobalRateLimiter: """Test cases for global rate limiter functions.""" def test_get_rate_limiter_singleton(self) -> None: """Test that get_rate_limiter returns the same instance.""" limiter1 = get_rate_limiter(5000) limiter2 = get_rate_limiter(5000) assert limiter1 is limiter2 # Should be the same instance assert limiter1.tokens_per_minute == 5000 def test_get_rate_limiter_different_limits(self) -> None: """Test that get_rate_limiter creates new instance for different limits.""" limiter1 = get_rate_limiter(5000) limiter2 = get_rate_limiter(8000) assert limiter1 is not limiter2 # Should be different instances assert limiter1.tokens_per_minute == 5000 assert limiter2.tokens_per_minute == 8000 def test_configure_rate_limiter_for_model_anthropic(self) -> None: """Test model-specific configuration for Anthropic models.""" limiter = configure_rate_limiter_for_model( "anthropic/claude-3-5-sonnet-20240620" ) assert limiter.tokens_per_minute == 30000 def test_configure_rate_limiter_for_model_openai(self) -> None: """Test model-specific configuration for OpenAI models.""" # Test each model in isolation to avoid singleton issues import podcast_processor.token_rate_limiter as trl_module # Test gpt-4o-mini
# Per-model limits: gpt-4o-mini → 200000 tpm, gpt-4o → 150000, gemini variants
# → 60000, unknown → 30000 default, and a partial path match
# ("some-prefix/gpt-4o/some-suffix") still resolves to 150000. Then the banner
# + imports for test_transcribe.py and the @pytest.mark.skip-ped
# test_remote_transcribe, whose deferred import continues on the next line.
first (higher limit) trl_module._RATE_LIMITER = None limiter = configure_rate_limiter_for_model("gpt-4o-mini") assert limiter.tokens_per_minute == 200000 # Test gpt-4o (lower limit) trl_module._RATE_LIMITER = None limiter = configure_rate_limiter_for_model("gpt-4o") assert limiter.tokens_per_minute == 150000 def test_configure_rate_limiter_for_model_gemini(self) -> None: """Test model-specific configuration for Gemini models.""" import podcast_processor.token_rate_limiter as trl_module trl_module._RATE_LIMITER = None limiter = configure_rate_limiter_for_model("gemini/gemini-3-flash-preview") assert limiter.tokens_per_minute == 60000 trl_module._RATE_LIMITER = None limiter = configure_rate_limiter_for_model("gemini/gemini-2.5-flash") assert limiter.tokens_per_minute == 60000 def test_configure_rate_limiter_for_model_unknown(self) -> None: """Test model-specific configuration for unknown models.""" limiter = configure_rate_limiter_for_model("unknown/model-name") assert limiter.tokens_per_minute == 30000 # Should use default def test_configure_rate_limiter_partial_match(self) -> None: """Test model-specific configuration with partial model names.""" # Test that partial matches work limiter = configure_rate_limiter_for_model("some-prefix/gpt-4o/some-suffix") assert limiter.tokens_per_minute == 150000 # Should match gpt-4o ================================================ FILE: src/tests/test_transcribe.py ================================================ import logging from typing import Any from unittest.mock import MagicMock import pytest from openai.types.audio.transcription_segment import TranscriptionSegment # from pytest_mock import MockerFixture @pytest.mark.skip def test_remote_transcribe() -> None: # import here instead of the toplevel because torch is not installed properly in CI.
from podcast_processor.transcribe import ( # pylint: disable=import-outside-toplevel OpenAIWhisperTranscriber, ) logger = logging.getLogger("global_logger") from shared.test_utils import create_standard_test_config config = create_standard_test_config().model_dump() transcriber = OpenAIWhisperTranscriber(logger, config) transcription = transcriber.transcribe("file.mp3") assert transcription == [] @pytest.mark.skip def test_local_transcribe() -> None: # import here instead of the toplevel because torch is not installed properly in CI. from podcast_processor.transcribe import ( # pylint: disable=import-outside-toplevel LocalWhisperTranscriber, ) logger = logging.getLogger("global_logger") transcriber = LocalWhisperTranscriber(logger, "base.en") transcription = transcriber.transcribe("src/tests/file.mp3") assert transcription == [] @pytest.mark.skip def test_groq_transcribe(mocker: Any) -> None: # import here instead of the toplevel because dependencies aren't installed properly in CI. from podcast_processor.transcribe import ( # pylint: disable=import-outside-toplevel GroqWhisperTranscriber, ) from shared.config import ( # pylint: disable=import-outside-toplevel GroqWhisperConfig, ) # Mock the requests call mock_response = MagicMock() mock_response.status_code = 200 mock_response.json.return_value = { "segments": [ {"start": 0.0, "end": 1.0, "text": "This is a test segment."}, {"start": 1.0, "end": 2.0, "text": "This is another test segment."}, ] } mocker.patch("requests.post", return_value=mock_response) # Mock file operations mocker.patch("builtins.open", mocker.mock_open(read_data="test audio data")) mocker.patch("pathlib.Path.exists", return_value=True) mocker.patch("podcast_processor.audio.split_audio", return_value=[("test.mp3", 0)]) mocker.patch("shutil.rmtree") logger = logging.getLogger("global_logger") config = GroqWhisperConfig( api_key="test_key", model="whisper-large-v3-turbo", language="en" ) transcriber = GroqWhisperTranscriber(logger, config) 
transcription = transcriber.transcribe("test.mp3") assert len(transcription) == 2 assert transcription[0].text == "This is a test segment." assert transcription[1].text == "This is another test segment." def test_offset() -> None: # import here instead of the toplevel because torch is not installed properly in CI. from podcast_processor.transcribe import ( # pylint: disable=import-outside-toplevel OpenAIWhisperTranscriber, ) assert OpenAIWhisperTranscriber.add_offset_to_segments( [ TranscriptionSegment( id=1, avg_logprob=2, seek=6, temperature=7, text="hi", tokens=[], compression_ratio=3, no_speech_prob=4, start=12.345, end=45.678, ) ], 123, ) == [ TranscriptionSegment( id=1, avg_logprob=2, seek=6, temperature=7, text="hi", tokens=[], compression_ratio=3, no_speech_prob=4, start=12.468, end=45.800999999999995, ) ] ================================================ FILE: src/tests/test_transcription_manager.py ================================================ import logging from typing import Generator from unittest.mock import MagicMock import pytest from flask import Flask from app.extensions import db from app.models import Feed, ModelCall, Post, TranscriptSegment from podcast_processor.transcribe import Segment, Transcriber from podcast_processor.transcription_manager import TranscriptionManager from shared.config import Config, TestWhisperConfig from shared.test_utils import create_standard_test_config class MockTranscriber(Transcriber): """Mock transcriber for testing TranscriptionManager.""" def __init__(self, mock_response=None): self.mock_response = mock_response or [] self._model_name = "mock_transcriber" @property def model_name(self) -> str: """Implementation of the abstract property""" return self._model_name def transcribe(self, audio_path): """Return mock segments or raise exception based on configuration.""" if isinstance(self.mock_response, Exception): raise self.mock_response return self.mock_response @pytest.fixture def app() -> Generator[Flask, 
None, None]: """Create and configure a Flask app for testing.""" app = Flask(__name__) app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///:memory:" app.config["SQLALCHEMY_TRACK_MODIFICATIONS"] = False with app.app_context(): db.init_app(app) db.create_all() yield app @pytest.fixture def test_config() -> Config: config = create_standard_test_config() # Override whisper config to use test mode config.whisper = TestWhisperConfig() return config @pytest.fixture def test_logger() -> logging.Logger: return logging.getLogger("test_logger") @pytest.fixture def mock_db_session() -> MagicMock: """Create a mock database session""" mock_session = MagicMock() mock_session.add = MagicMock() mock_session.add_all = MagicMock() mock_session.commit = MagicMock() mock_session.rollback = MagicMock() return mock_session @pytest.fixture def mock_transcriber() -> MockTranscriber: """Return a mock transcriber for testing.""" return MockTranscriber( [ Segment(start=0.0, end=5.0, text="Test segment 1"), Segment(start=5.0, end=10.0, text="Test segment 2"), ] ) @pytest.fixture def test_manager( test_config: Config, test_logger: logging.Logger, mock_db_session: MagicMock, mock_transcriber: MockTranscriber, app: Flask, ) -> TranscriptionManager: """Return a TranscriptionManager instance for testing.""" with app.app_context(): # We need to create mock query objects with proper structure mock_model_call_query = MagicMock() mock_segment_query = MagicMock() # Create a manager with our mocks return TranscriptionManager( test_logger, test_config, model_call_query=mock_model_call_query, segment_query=mock_segment_query, db_session=mock_db_session, transcriber=mock_transcriber, ) def test_check_existing_transcription_success( test_manager: TranscriptionManager, app: Flask, ) -> None: """Test finding existing successful transcription""" post = Post(id=1, title="Test Post") # Create test data model_call = ModelCall( post_id=1, model_name=test_manager.transcriber.model_name, status="success", 
first_segment_sequence_num=0, last_segment_sequence_num=1, ) segments = [ TranscriptSegment( post_id=1, sequence_num=0, start_time=0.0, end_time=5.0, text="Segment 1" ), TranscriptSegment( post_id=1, sequence_num=1, start_time=5.0, end_time=10.0, text="Segment 2" ), ] with app.app_context(): # Configure the existing mocks in the manager test_manager.model_call_query.filter_by().order_by().first.return_value = ( model_call ) test_manager.segment_query.filter_by().order_by().all.return_value = segments result = test_manager._check_existing_transcription(post) assert result is not None assert len(result) == 2 assert result[0].text == "Segment 1" assert result[1].text == "Segment 2" def test_check_existing_transcription_no_model_call( test_manager: TranscriptionManager, app: Flask, ) -> None: """Test when no existing ModelCall exists""" post = Post(id=1, title="Test Post") with app.app_context(): # Set return value for the existing mock in the manager test_manager.model_call_query.filter_by().order_by().first.return_value = None result = test_manager._check_existing_transcription(post) assert result is None def test_transcribe_new( test_config: Config, test_logger: logging.Logger, app: Flask, ) -> None: """Test transcribing a new audio file""" with app.app_context(): feed = Feed(title="Test Feed", rss_url="http://example.com/rss.xml") post = Post( feed=feed, guid="guid-1", download_url="http://example.com/audio-1.mp3", title="Test Post", unprocessed_audio_path="/path/to/audio.mp3", ) db.session.add_all([feed, post]) db.session.commit() transcriber = MockTranscriber( [ Segment(start=0.0, end=5.0, text="Test segment 1"), Segment(start=5.0, end=10.0, text="Test segment 2"), ] ) manager = TranscriptionManager( test_logger, test_config, db_session=db.session, transcriber=transcriber, ) segments = manager.transcribe(post) assert len(segments) == 2 assert segments[0].text == "Test segment 1" assert segments[1].text == "Test segment 2" assert 
TranscriptSegment.query.filter_by(post_id=post.id).count() == 2 assert ModelCall.query.filter_by(post_id=post.id).count() == 1 assert ModelCall.query.filter_by(post_id=post.id).first().status == "success" def test_transcribe_handles_error( test_config: Config, test_logger: logging.Logger, app: Flask, ) -> None: """Test error handling during transcription""" with app.app_context(): feed = Feed(title="Test Feed", rss_url="http://example.com/rss.xml") post = Post( feed=feed, guid="guid-err", download_url="http://example.com/audio-err.mp3", title="Test Post", unprocessed_audio_path="/path/to/audio.mp3", ) db.session.add_all([feed, post]) db.session.commit() # Create a mock transcriber that raises an exception error_transcriber = MockTranscriber(Exception("Transcription failed")) manager = TranscriptionManager( test_logger, test_config, db_session=db.session, transcriber=error_transcriber, ) # Test the exception with pytest.raises(Exception) as exc_info: manager.transcribe(post) assert str(exc_info.value) == "Transcription failed" call = ( ModelCall.query.filter_by(post_id=post.id) .order_by(ModelCall.timestamp.desc()) .first() ) assert call is not None assert call.status == "failed_permanent" assert call.error_message == "Transcription failed" def test_transcribe_reuses_placeholder_model_call( test_config: Config, test_logger: logging.Logger, app: Flask, ) -> None: """Ensure we reuse existing placeholder ModelCall rows instead of crashing on uniqueness.""" with app.app_context(): feed = Feed(title="Test Feed", rss_url="http://example.com/rss.xml") post = Post( feed=feed, guid="guid-123", download_url="http://example.com/audio.mp3", title="Test Post", unprocessed_audio_path="/tmp/audio.mp3", ) db.session.add_all([feed, post]) db.session.commit() existing_call = ModelCall( post_id=post.id, model_name="mock_transcriber", first_segment_sequence_num=0, last_segment_sequence_num=-1, prompt="Whisper transcription job", status="failed_permanent", ) 
db.session.add(existing_call) db.session.commit() manager = TranscriptionManager( test_logger, test_config, db_session=db.session, transcriber=MockTranscriber( [ Segment(start=0.0, end=5.0, text="Segment 1"), Segment(start=5.0, end=10.0, text="Segment 2"), ] ), ) segments = manager.transcribe(post) assert len(segments) == 2 assert ModelCall.query.count() == 1 refreshed_call = ModelCall.query.first() assert refreshed_call.id == existing_call.id assert refreshed_call.status == "success" assert refreshed_call.last_segment_sequence_num == 1 ================================================ FILE: src/user_prompt.jinja ================================================ You are analyzing "{{podcast_title}}", a podcast about {{podcast_topic}}. Return only the JSON contract described in the system prompt using the transcript excerpt below. {{transcript}} ================================================ FILE: src/word_boundary_refinement_prompt.jinja ================================================ You are analyzing podcast transcript segments to identify the precise START and END of advertisement content. Your job is to locate short, distinctive phrases at the START and END of the ad break within the provided segments. BOUNDARY DETECTION RULES: **AD START INDICATORS** (extend boundary backward): - Sponsor introductions: "This episode is brought to you by...", "And now a word from our sponsor" - Transition phrases: "Before we continue...", "Let me tell you about...", "Speaking of..." - Host acknowledgments: "I want to thank...", "Special thanks to...", "Our sponsor today is..." - Subtle lead-ins: "You know what's interesting...", "I've been using...", "Let me share something..." **AD END INDICATORS** (extend boundary forward or tighten earlier): - Transition back to content: "And we're back", "Now back to the show", "Alright, let's get back to..." 
- Host resumes discussion: references to the previous topic immediately after sponsor talk - Audible wrap-up phrases: "Check them out", "Use code...", "Link in the description" followed by topic continuation **ANALYSIS CONTEXT**: - **Detected Ad Block**: {{ad_start}}s - {{ad_end}}s - **Original Confidence**: {{ad_confidence}} **CONTEXT SEGMENTS**: Each segment has a stable sequence number and timing. {% for segment in context_segments -%} [seq={{segment.sequence_num}} start={{segment.start_time}} end={{segment.end_time}}] {{segment.text}} {% endfor %} **OUTPUT FORMAT**: Respond with valid JSON. - Identify the segment that contains the START of the ad break. - Identify a short phrase at the START of the ad break: the first 4 words of the promo/sponsor read. - Identify the segment that contains the END of the ad break. - Identify a short phrase at the END of the ad break: the last 4 words right before returning to content. Phrase requirements: - Each phrase should be a contiguous sequence of words that appears in the segment text. - Prefer phrases that are fully contained within a single segment. - Use exactly 4 words when possible. If you cannot, return fewer words (3, 2, or 1) that still appear contiguously. Partial output is allowed: - If you are unsure about the START phrase, you may omit `refined_start_phrase` (or set it to null/empty) and we will keep the original detected start boundary. - If you are unsure about the END phrase, you may omit `refined_end_phrase` (or set it to null/empty) and we will keep the original detected end boundary. ```json { "refined_start_segment_seq": 0, "refined_start_phrase": "this episode is brought", "refined_end_segment_seq": 0, "refined_end_phrase": "now back to the", "start_adjustment_reason": "reason for start boundary change", "end_adjustment_reason": "reason for end boundary change" } ``` **REFINEMENT GUIDELINES**: - If no refinement needed, pick the best segment/word corresponding to the existing detected start. 
- Prefer to refine both START and END boundaries, but return partial results if only one side is confident. - Always ensure the chosen start phrase occurs near the detected start boundary. - Always ensure the chosen end phrase occurs near the detected end boundary. ================================================ FILE: tests/test_cue_detector.py ================================================ import unittest from podcast_processor.cue_detector import CueDetector from podcast_processor.prompt import transcript_excerpt_for_prompt from podcast_processor.transcribe import Segment class TestCueDetector(unittest.TestCase): def setUp(self) -> None: self.detector = CueDetector() def test_highlight_cues_url(self) -> None: text = "Check out example.com for more info." # "Check out" is a CTA, "example.com" is a URL. Both should be highlighted. expected = "*** Check out *** *** example.com *** for more info." self.assertEqual(self.detector.highlight_cues(text), expected) def test_highlight_cues_promo(self) -> None: text = "Use promo code SAVE20 now." # "promo code" matches promo_pattern. # "code SAVE20" would also match promo_pattern, but re.finditer is non-overlapping for a single pattern. # So only "promo code" is captured. expected = "Use *** promo code *** SAVE20 now." self.assertEqual(self.detector.highlight_cues(text), expected) def test_highlight_cues_cta(self) -> None: text = "Please visit our website." expected = "Please *** visit *** our website." self.assertEqual(self.detector.highlight_cues(text), expected) def test_highlight_cues_multiple(self) -> None: text = "Visit example.com and use code TEST." # "Visit" -> cta # "example.com" -> url # "use code" -> cta # "code TEST" -> promo # "use code TEST" -> "use code" (cta) overlaps with "code TEST" (promo) # "use code" (22, 30) # "code TEST" (26, 35) # Merged: (22, 35) -> "use code TEST" expected = "*** Visit *** *** example.com *** and *** use code TEST ***." 
self.assertEqual(self.detector.highlight_cues(text), expected) def test_highlight_cues_no_cues(self) -> None: text = "Just a normal sentence." self.assertEqual(self.detector.highlight_cues(text), text) def test_integration_prompt(self) -> None: segments = [ Segment(start=10.0, end=15.0, text="Welcome back to the show."), Segment(start=15.0, end=20.0, text="Go to mywebsite.com today."), ] result = transcript_excerpt_for_prompt( segments, includes_start=False, includes_end=False ) # "back to the show" is a transition cue expected_line1 = "[10.0] Welcome *** back to the show ***." # "Go to" is CTA, "mywebsite.com" is URL expected_line2 = "[15.0] *** Go to *** *** mywebsite.com *** today." self.assertIn(expected_line1, result) self.assertIn(expected_line2, result) if __name__ == "__main__": unittest.main()