Repository: alexferrari88/sbstck-dl Branch: main Commit: 775085259f25 Files: 35 Total size: 309.7 KB Directory structure: gitextract_tn_9uzpl/ ├── .github/ │ └── workflows/ │ ├── build-release.yml │ └── test.yml ├── .gitignore ├── .serena/ │ ├── .gitignore │ ├── memories/ │ │ ├── code_style_conventions.md │ │ ├── files_feature_overview.md │ │ ├── project_overview.md │ │ ├── project_structure.md │ │ ├── suggested_commands.md │ │ ├── task_completion_checklist.md │ │ └── testing_patterns.md │ └── project.yml ├── CLAUDE.md ├── LICENSE ├── README.md ├── cmd/ │ ├── cmd_test.go │ ├── download.go │ ├── integration_test.go │ ├── list.go │ ├── main.go │ ├── root.go │ └── version.go ├── go.mod ├── go.sum ├── lib/ │ ├── extractor.go │ ├── extractor_test.go │ ├── fetcher.go │ ├── fetcher_test.go │ ├── files.go │ ├── files_test.go │ ├── images.go │ └── images_test.go ├── main.go └── specs/ ├── archive-index-page.md └── file-attachment-download.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/build-release.yml ================================================ name: Manual Build and Release on: workflow_dispatch: inputs: branch: description: 'Branch to build' required: true default: 'main' release: types: [created] jobs: test: name: Run Tests runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] go-version: [1.24.1] steps: - name: Check out code uses: actions/checkout@v4 with: ref: ${{ github.event.inputs.branch || github.ref }} - name: Set up Go uses: actions/setup-go@v4 with: go-version: ${{ matrix.go-version }} - name: Run tests run: go test -v -timeout=10m ./... 
build: name: Build needs: test if: success() runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] go-version: [1.24.1] include: - os: ubuntu-latest goos: linux goarch: amd64 name: ubuntu extension: "" - os: macos-latest goos: darwin goarch: amd64 name: mac extension: "" - os: windows-latest goos: windows goarch: amd64 name: win extension: ".exe" steps: - name: Check out code uses: actions/checkout@v4 with: ref: ${{ github.event.inputs.branch || github.ref }} - name: Set up Go uses: actions/setup-go@v4 with: go-version: ${{ matrix.go-version }} - name: Build run: | env GOOS=${{ matrix.goos }} GOARCH=${{ matrix.goarch }} go build -v -o sbstck-dl-${{ matrix.name }}-${{ matrix.goarch }}${{ matrix.extension }} - name: Upload artifact uses: actions/upload-artifact@v4 with: name: sbstck-dl-${{ matrix.name }}-${{ matrix.goarch }} path: sbstck-dl-${{ matrix.name }}-${{ matrix.goarch }}${{ matrix.extension }} release-upload: name: Attach Artifacts to Release if: github.event_name == 'release' needs: build runs-on: ubuntu-latest permissions: contents: write # This is needed for release uploads steps: - name: Debug event info run: | echo "Event name: ${{ github.event_name }}" echo "Event type: ${{ github.event.action }}" echo "Release tag: ${{ github.event.release.tag_name }}" - name: Download all artifacts uses: actions/download-artifact@v4 with: path: artifacts - name: List artifacts run: find artifacts -type f | sort - name: Upload artifacts to release uses: softprops/action-gh-release@v1 with: files: artifacts/**/* # GitHub automatically provides this token token: ${{ github.token }} ================================================ FILE: .github/workflows/test.yml ================================================ name: Run Tests on: pull_request: branches: [main] jobs: test: name: Run Tests runs-on: ${{ matrix.os }} strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] go-version: [1.24.1] steps: - name: Check out code 
uses: actions/checkout@v4 - name: Set up Go uses: actions/setup-go@v4 with: go-version: ${{ matrix.go-version }} - name: Run tests run: go test -v ./... ================================================ FILE: .gitignore ================================================ # If you prefer the allow list template instead of the deny list, see community template: # https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore # # Binaries for programs and plugins *.exe *.exe~ *.dll *.so *.dylib bin/ # Test binary, built with `go test -c` *.test # Output of the go coverage tool, specifically when used with LiteIDE *.out # Dependency directories (remove the comment below to include it) # vendor/ # Go workspace file go.work # Directories containing scraped content scraped/ test-download/ # vscode .vscode/ # serena .serena/cache/ ================================================ FILE: .serena/.gitignore ================================================ /cache ================================================ FILE: .serena/memories/code_style_conventions.md ================================================ # Code Style and Conventions ## Go Style Guidelines - Follows standard Go conventions and formatting - Uses `gofmt` for code formatting - Package naming: lowercase, single words when possible - Function naming: PascalCase for exported, camelCase for unexported - Variable naming: camelCase, descriptive names ## Code Organization - **Separation of Concerns**: CLI logic in `cmd/`, core business logic in `lib/` - **Error Handling**: Explicit error returns, wrapping with context using `fmt.Errorf` - **Testing**: Table-driven tests, benchmarks for performance-critical code - **Concurrency**: Uses errgroup for managed goroutines, context for cancellation ## Naming Conventions - **Structs**: PascalCase (e.g., `FileDownloader`, `ImageInfo`) - **Interfaces**: Usually end with -er, derived from the method name (e.g., `Reader`) - **Constants**: PascalCase for exported, camelCase for 
unexported - **Files**: snake_case for test files (`*_test.go`) ## Function Design Patterns - **Constructor Pattern**: `NewXxx()` functions for creating instances - **Options Pattern**: Used in fetcher with `FetcherOption` functional options - **Context Propagation**: All network operations accept `context.Context` - **Resource Management**: Proper `defer` usage for cleanup (file handles, HTTP responses) ## Documentation - **Godoc Comments**: All exported functions, types, and constants have comments - **README**: Comprehensive usage examples and feature documentation - **Code Comments**: Explain complex logic, especially in parsing and URL manipulation ================================================ FILE: .serena/memories/files_feature_overview.md ================================================ # File Attachment Download Feature ## Implementation Overview New feature added in `lib/files.go` that allows downloading file attachments from Substack posts. ## Key Components ### FileDownloader struct - Manages file downloads with rate limiting via Fetcher - Configurable output directory and file extensions filter - Integrates with existing image download workflow ### CSS Selector Detection - Uses `.file-embed-button.wide` to find file attachment links - Extracts download URLs from `href` attributes ### Core Functions - `DownloadFiles()` - Main entry point, returns FileDownloadResult - `extractFileElements()` - Finds file links in HTML using CSS selector - `downloadSingleFile()` - Downloads individual files with error handling - `updateHTMLWithLocalPaths()` - Replaces URLs with local paths ### Features - Extension filtering via `--file-extensions` flag - Custom output directory via `--files-dir` flag - Filename extraction from URLs and query parameters - Safe filename sanitization (removes unsafe characters) - File existence checking (skip if already downloaded) - Relative path conversion for HTML references ## CLI Integration - New flags in `cmd/download.go`: - 
`--download-files` - Enable file downloading - `--file-extensions` - Filter by extensions (comma-separated) - `--files-dir` - Custom files directory name ## Integration with Extractor - Extended `WriteToFileWithImages()` to also handle file downloads - Unified workflow for both images and files ================================================ FILE: .serena/memories/project_overview.md ================================================ # Project Overview ## Purpose sbstck-dl is a Go CLI tool for downloading posts from Substack blogs. It supports downloading individual posts or entire archives, with features for private newsletters (via cookies), rate limiting, and format conversion (HTML/Markdown/Text). The tool also supports downloading images and file attachments locally. ## Tech Stack - **Language**: Go 1.20+ - **CLI Framework**: Cobra (github.com/spf13/cobra) - **HTML Parsing**: goquery (github.com/PuerkitoBio/goquery) - **HTML to Markdown**: html-to-markdown (github.com/JohannesKaufmann/html-to-markdown) - **HTML to Text**: html2text (github.com/k3a/html2text) - **Retry Logic**: backoff (github.com/cenkalti/backoff/v4) - **Rate Limiting**: golang.org/x/time/rate - **Concurrency**: golang.org/x/sync/errgroup - **Progress Bar**: progressbar (github.com/schollz/progressbar/v3) - **Testing**: testify (github.com/stretchr/testify) ## Repository Structure - `main.go`: Entry point - `cmd/`: Cobra CLI commands (root.go, download.go, list.go, version.go) - `lib/`: Core library components - `fetcher.go`: HTTP client with rate limiting, retries, and cookie support - `extractor.go`: Post extraction and format conversion (HTML→Markdown/Text) - `images.go`: Image downloading and local path management - `files.go`: File attachment downloading and local path management - `.github/workflows/`: CI/CD workflows for testing and releases - Tests are co-located with source files (e.g., `lib/fetcher_test.go`) ================================================ FILE: 
.serena/memories/project_structure.md ================================================ # Project Structure - sbstck-dl ## Overview Go CLI tool for downloading posts from Substack blogs with support for private newsletters, rate limiting, and format conversion. ## Directory Structure ``` ├── main.go # Entry point ├── cmd/ # Cobra CLI commands │ ├── root.go │ ├── download.go # Main download functionality │ ├── list.go │ ├── version.go │ ├── cmd_test.go # Command tests │ └── integration_test.go ├── lib/ # Core library │ ├── fetcher.go # HTTP client with rate limiting/retries │ ├── fetcher_test.go # Comprehensive HTTP client tests │ ├── extractor.go # Post extraction and format conversion │ ├── extractor_test.go # Extractor tests │ ├── images.go # Image downloader │ ├── images_test.go # Comprehensive image tests │ └── files.go # NEW: File attachment downloader └── go.mod # Dependencies ``` ## Key Dependencies - `github.com/spf13/cobra` - CLI framework - `github.com/PuerkitoBio/goquery` - HTML parsing - `github.com/stretchr/testify` - Testing framework - `github.com/cenkalti/backoff/v4` - Exponential backoff - `golang.org/x/time/rate` - Rate limiting ================================================ FILE: .serena/memories/suggested_commands.md ================================================ # Suggested Commands ## Development Commands ### Building ```bash go build -o sbstck-dl . ``` ### Running ```bash go run . [command] [flags] ``` ### Testing ```bash # Run all tests go test ./... # Run tests with verbose output go test -v ./... # Run tests for specific package go test ./lib go test ./cmd ``` ### Module Management ```bash # Clean up dependencies go mod tidy # Download dependencies go mod download # Verify dependencies go mod verify ``` ### Running the CLI Locally ```bash # Download single post go run . download --url https://example.substack.com/p/post-title --output ./downloads # Download entire archive go run . 
download --url https://example.substack.com --output ./downloads # Download with images go run . download --url https://example.substack.com --download-images --output ./downloads # Download with file attachments go run . download --url https://example.substack.com --download-files --output ./downloads # Download with both images and files go run . download --url https://example.substack.com --download-images --download-files --output ./downloads # Test with dry run and verbose output go run . download --url https://example.substack.com --verbose --dry-run ``` ### System Commands (Linux) - `rg` (ripgrep) for searching instead of grep - Standard Linux commands: `ls`, `cd`, `find`, `git` ================================================ FILE: .serena/memories/task_completion_checklist.md ================================================ # Task Completion Checklist ## After Completing Development Tasks ### Testing 1. **Run Unit Tests**: `go test ./...` 2. **Run Integration Tests**: `go test -v ./...` 3. **Test CLI Commands**: Manual testing with real Substack URLs 4. **Test Edge Cases**: Error conditions, malformed URLs, network failures ### Code Quality 1. **Format Code**: `gofmt -w .` (usually handled by editor) 2. **Lint Code**: Use `golint` or `go vet` if available 3. **Verify Dependencies**: `go mod tidy && go mod verify` ### Documentation Updates 1. **Update CLAUDE.md**: Add new features, commands, architectural changes 2. **Update README.md**: Add usage examples for new features 3. **Update Help Text**: Ensure CLI help reflects new flags and options 4. **Update Comments**: Ensure godoc comments are current ### Version Control 1. **Stage Changes**: `git add` only relevant files 2. **Commit**: Use conventional commits format - `feat: add new feature` - `fix: resolve bug` - `docs: update documentation` - `test: add tests` - `refactor: improve code structure` 3. **Clean Up**: Remove any temporary files or test artifacts ### Build Verification 1. 
**Build Binary**: `go build -o sbstck-dl .` 2. **Test Binary**: Run basic commands to ensure it works 3. **Cross-Platform Check**: Ensure no platform-specific code issues ================================================ FILE: .serena/memories/testing_patterns.md ================================================ # Testing Patterns in sbstck-dl ## Test Structure - All tests use `github.com/stretchr/testify` with `assert` and `require` - Tests organized in table-driven style where appropriate - Each major component has comprehensive test coverage ## Common Patterns ### HTTP Server Tests - Use `httptest.NewServer()` for mock servers - Test various response scenarios (success, errors, timeouts) - Handle concurrent requests and rate limiting ### File I/O Tests - Use `os.MkdirTemp()` for temporary directories - Always clean up with `defer os.RemoveAll(tempDir)` - Test file creation, existence, and content validation ### HTML Parsing Tests - Use `goquery.NewDocumentFromReader(strings.NewReader(html))` - Test various HTML structures and edge cases - Validate URL extraction and replacement ### Error Handling Tests - Test both success and failure scenarios - Use specific error assertions and error message checking - Test context cancellation and timeouts ### Benchmark Tests - Include performance benchmarks for critical paths - Use `b.ResetTimer()` appropriately - Test both single operations and concurrent scenarios ## Test Organization - Unit tests for individual functions - Integration tests for complete workflows - Regression tests for specific bug fixes - Real-world data tests (when sample data available) ================================================ FILE: .serena/project.yml ================================================ # language of the project (csharp, python, rust, java, typescript, go, cpp, or ruby) # * For C, use cpp # * For JavaScript, use typescript # Special requirements: # * csharp: Requires the presence of a .sln file in the project folder. 
language: go # whether to use the project's gitignore file to ignore files # Added on 2025-04-07 ignore_all_files_in_gitignore: true # list of additional paths to ignore # same syntax as gitignore, so you can use * and ** # Was previously called `ignored_dirs`, please update your config if you are using that. # Added (renamed) on 2025-04-07 ignored_paths: [] # whether the project is in read-only mode # If set to true, all editing tools will be disabled and attempts to use them will result in an error # Added on 2025-04-18 read_only: false # list of tool names to exclude. We recommend not excluding any tools, see the readme for more details. # Below is the complete list of tools for convenience. # To make sure you have the latest list of tools, and to view their descriptions, # execute `uv run scripts/print_tool_overview.py`. # # * `activate_project`: Activates a project by name. # * `check_onboarding_performed`: Checks whether project onboarding was already performed. # * `create_text_file`: Creates/overwrites a file in the project directory. # * `delete_lines`: Deletes a range of lines within a file. # * `delete_memory`: Deletes a memory from Serena's project-specific memory store. # * `execute_shell_command`: Executes a shell command. # * `find_referencing_code_snippets`: Finds code snippets in which the symbol at the given location is referenced. # * `find_referencing_symbols`: Finds symbols that reference the symbol at the given location (optionally filtered by type). # * `find_symbol`: Performs a global (or local) search for symbols with/containing a given name/substring (optionally filtered by type). # * `get_current_config`: Prints the current configuration of the agent, including the active and available projects, tools, contexts, and modes. # * `get_symbols_overview`: Gets an overview of the top-level symbols defined in a given file or directory. # * `initial_instructions`: Gets the initial instructions for the current project. 
# Should only be used in settings where the system prompt cannot be set, # e.g. in clients you have no control over, like Claude Desktop. # * `insert_after_symbol`: Inserts content after the end of the definition of a given symbol. # * `insert_at_line`: Inserts content at a given line in a file. # * `insert_before_symbol`: Inserts content before the beginning of the definition of a given symbol. # * `list_dir`: Lists files and directories in the given directory (optionally with recursion). # * `list_memories`: Lists memories in Serena's project-specific memory store. # * `onboarding`: Performs onboarding (identifying the project structure and essential tasks, e.g. for testing or building). # * `prepare_for_new_conversation`: Provides instructions for preparing for a new conversation (in order to continue with the necessary context). # * `read_file`: Reads a file within the project directory. # * `read_memory`: Reads the memory with the given name from Serena's project-specific memory store. # * `remove_project`: Removes a project from the Serena configuration. # * `replace_lines`: Replaces a range of lines within a file with new content. # * `replace_symbol_body`: Replaces the full definition of a symbol. # * `restart_language_server`: Restarts the language server, may be necessary when edits not through Serena happen. # * `search_for_pattern`: Performs a search for a pattern in the project. # * `summarize_changes`: Provides instructions for summarizing the changes made to the codebase. # * `switch_modes`: Activates modes by providing a list of their names # * `think_about_collected_information`: Thinking tool for pondering the completeness of collected information. # * `think_about_task_adherence`: Thinking tool for determining whether the agent is still on track with the current task. # * `think_about_whether_you_are_done`: Thinking tool for determining whether the task is truly completed. 
# * `write_memory`: Writes a named memory (for future reference) to Serena's project-specific memory store. excluded_tools: [] # initial prompt for the project. It will always be given to the LLM upon activating the project # (contrary to the memories, which are loaded on demand). initial_prompt: "" project_name: "sbstck-dl" ================================================ FILE: CLAUDE.md ================================================ # CLAUDE.md This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. ## Project Overview This is a Go CLI tool for downloading posts from Substack blogs. It supports downloading individual posts or entire archives, with features for private newsletters (via cookies), rate limiting, format conversion (HTML/Markdown/Text), downloading of images and file attachments locally, and creating archive index pages that link all downloaded posts with their metadata. ## Architecture The project follows a standard Go CLI structure: - `main.go`: Entry point - `cmd/`: Contains Cobra CLI commands (`root.go`, `download.go`, `list.go`, `version.go`) - `lib/`: Core library with four main components: - `fetcher.go`: HTTP client with rate limiting, retries, and cookie support - `extractor.go`: Post extraction and format conversion (HTML→Markdown/Text) - `images.go`: Image downloading and local path management - `files.go`: File attachment downloading and local path management ## Build and Development Commands ### Building ```bash go build -o sbstck-dl . ``` ### Running ```bash go run . [command] [flags] ``` ### Testing ```bash go test ./... 
go test ./lib ``` ### Module management ```bash go mod tidy go mod download ``` ## Key Components ### Fetcher (`lib/fetcher.go`) - Handles HTTP requests with exponential backoff retry - Rate limiting (default: 2 requests/second) - Cookie support for private newsletters - Proxy support ### Extractor (`lib/extractor.go`) - Parses Substack post JSON from HTML - Extracts post metadata including subtitle (.subtitle CSS selector) and cover image (og:image meta tag) - Converts HTML to Markdown/Text using external libraries - Handles file writing with different formats - Provides archive page generation functionality (HTML/Markdown/Text formats) - Manages archive entries with automatic sorting by publication date (newest first) ### Image Downloader (`lib/images.go`) - Downloads images locally from Substack posts - Supports multiple image quality levels (high/medium/low) - Handles various Substack CDN URL patterns - Updates HTML/Markdown content to reference local image paths - Creates organized directory structure for downloaded images ### File Downloader (`lib/files.go`) - Downloads file attachments from Substack posts using CSS selector `.file-embed-button.wide` - Supports file extension filtering (optional) - Creates organized directory structure for downloaded files - Updates HTML content to reference local file paths - Handles filename sanitization and collision avoidance - Integrates with existing image download workflow ### Archive Page Generator (`lib/extractor.go`) - Creates index pages linking all downloaded posts with metadata - Supports HTML, Markdown, and Text formats matching the selected output format - Includes post titles (linked to downloaded files with relative paths) - Shows publication dates and download timestamps - Displays post descriptions/subtitles and cover images when available - Automatically sorts posts by publication date (newest first) - Generates `index.{format}` in the output directory root ### Commands Structure Uses Cobra framework: - 
`download`: Main functionality for downloading posts - `list`: Lists available posts from a Substack - `version`: Shows version information ## Dependencies - `github.com/spf13/cobra`: CLI framework - `github.com/PuerkitoBio/goquery`: HTML parsing - `github.com/JohannesKaufmann/html-to-markdown`: HTML to Markdown conversion - `github.com/cenkalti/backoff/v4`: Exponential backoff for retries - `golang.org/x/time/rate`: Rate limiting - `golang.org/x/sync/errgroup`: Concurrent processing ## Common Development Tasks ### Running the CLI locally ```bash go run . download --url https://example.substack.com --output ./downloads ``` ### Testing with verbose output ```bash go run . download --url https://example.substack.com --verbose --dry-run ``` ### Downloading posts with images ```bash # Download posts with high-quality images go run . download --url https://example.substack.com --download-images --image-quality high --output ./downloads # Download with medium quality images and custom images directory go run . download --url https://example.substack.com --download-images --image-quality medium --images-dir assets --output ./downloads # Download single post with images in markdown format go run . download --url https://example.substack.com/p/post-title --download-images --format md --output ./downloads ``` ### Downloading posts with file attachments ```bash # Download posts with file attachments go run . download --url https://example.substack.com --download-files --output ./downloads # Download with specific file extensions only go run . download --url https://example.substack.com --download-files --file-extensions "pdf,docx,txt" --output ./downloads # Download with custom files directory name go run . download --url https://example.substack.com --download-files --files-dir attachments --output ./downloads # Download single post with both images and file attachments go run . 
download --url https://example.substack.com/p/post-title --download-images --download-files --output ./downloads ``` ### Creating archive index pages ```bash # Download posts and create an archive index page go run . download --url https://example.substack.com --create-archive --output ./downloads # Download entire archive with archive index in markdown format go run . download --url https://example.substack.com --create-archive --format md --output ./downloads # Download single post with archive page (useful for building up an archive over time) go run . download --url https://example.substack.com/p/post-title --create-archive --output ./downloads # Download with all features: images, files, and archive page go run . download --url https://example.substack.com --download-images --download-files --create-archive --output ./downloads # Download archive with specific format and custom directories go run . download --url https://example.substack.com --create-archive --format html --images-dir assets --files-dir attachments --output ./downloads ``` ### Building for release ```bash go build -ldflags="-s -w" -o sbstck-dl . ``` ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright © 2023 Alex Ferrari alex@thealexferrari.com Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # Substack Downloader A simple CLI tool to download one or all of the posts from a Substack blog. ## Installation ### Downloading the binary Check the [releases](https://github.com/alexferrari88/sbstck-dl/releases) page for the latest version of the binary for your platform. We provide binaries for Linux, macOS and Windows. ### Using Go ```bash go install github.com/alexferrari88/sbstck-dl@latest ``` Your Go bin directory must be in your PATH. You can add it by adding the following line to your `.bashrc` or `.zshrc`: ```bash export PATH=$PATH:$(go env GOPATH)/bin ``` ## Usage ```bash Usage: sbstck-dl [command] Available Commands: download Download individual posts or the entire public archive help Help about any command list List the posts of a Substack version Print the version number of sbstck-dl Flags: --after string Download posts published after this date (format: YYYY-MM-DD) --before string Download posts published before this date (format: YYYY-MM-DD) --cookie_name cookieName Either substack.sid or connect.sid, based on your cookie (required for private newsletters) --cookie_val string The substack.sid/connect.sid cookie value (required for private newsletters) -h, --help help for sbstck-dl -x, --proxy string Specify the proxy url -r, --rate int Specify the rate of requests per second (default 2) -v, --verbose Enable verbose output Use "sbstck-dl [command] --help" for more information about a command. 
``` ### Downloading posts You can provide the url of a single post or the main url of the Substack you want to download. By providing the main URL of a Substack, the downloader will download all the posts of the archive. When downloading the full archive, if the downloader is interrupted, at the next execution it will resume the download of the remaining posts. ```bash Usage: sbstck-dl download [flags] Flags: --add-source-url Add the original post URL at the end of the downloaded file --create-archive Create an archive index page linking all downloaded posts --download-files Download file attachments locally and update content to reference local files --download-images Download images locally and update content to reference local files -d, --dry-run Enable dry run --file-extensions string Comma-separated list of file extensions to download (e.g., 'pdf,docx,txt'). If empty, downloads all file types --files-dir string Directory name for downloaded file attachments (default "files") -f, --format string Specify the output format (options: "html", "md", "txt" (default "html") -h, --help help for download --image-quality string Image quality to download (options: "high", "medium", "low") (default "high") --images-dir string Directory name for downloaded images (default "images") -o, --output string Specify the download directory (default ".") -u, --url string Specify the Substack url Global Flags: --after string Download posts published after this date (format: YYYY-MM-DD) --before string Download posts published before this date (format: YYYY-MM-DD) --cookie_name cookieName Either substack.sid or connect.sid, based on your cookie (required for private newsletters) --cookie_val string The substack.sid/connect.sid cookie value (required for private newsletters) -x, --proxy string Specify the proxy url -r, --rate int Specify the rate of requests per second (default 2) -v, --verbose Enable verbose output ``` #### Adding Source URL If you use the `--add-source-url` flag, 
each downloaded file will have the following line appended to its content: `original content: POST_URL` Where `POST_URL` is the canonical URL of the downloaded post. For HTML format, this will be wrapped in a small paragraph with a link. #### Downloading Images Use the `--download-images` flag to download all images from Substack posts locally. This ensures posts remain accessible even if images are deleted from Substack's CDN. **Features:** - Downloads images at optimal quality (high/medium/low) - Creates organized directory structure: `{output}/images/{post-slug}/` - Updates HTML/Markdown content to reference local image paths - Handles all Substack image formats and CDN patterns - Graceful error handling for individual image failures **Examples:** ```bash # Download posts with high-quality images (default) sbstck-dl download --url https://example.substack.com --download-images # Download with medium quality images sbstck-dl download --url https://example.substack.com --download-images --image-quality medium # Download with custom images directory name sbstck-dl download --url https://example.substack.com --download-images --images-dir assets # Download single post with images in markdown format sbstck-dl download --url https://example.substack.com/p/post-title --download-images --format md ``` **Image Quality Options:** - `high`: 1456px width (best quality, larger files) - `medium`: 848px width (balanced quality/size) - `low`: 424px width (smaller files, mobile-optimized) **Directory Structure:** ``` output/ ├── 20231201_120000_post-title.html └── images/ └── post-title/ ├── image1_1456x819.jpeg ├── image2_848x636.png └── image3_1272x720.webp ``` #### Downloading File Attachments Use the `--download-files` flag to download all file attachments from Substack posts locally. This ensures posts remain accessible even if files are removed from Substack's servers. 
**Features:** - Downloads file attachments using CSS selector `.file-embed-button.wide` - Optional file extension filtering (e.g., only PDFs and Word documents) - Creates organized directory structure: `{output}/files/{post-slug}/` - Updates HTML content to reference local file paths - Handles filename sanitization and collision avoidance - Graceful error handling for individual file download failures **Examples:** ```bash # Download posts with all file attachments sbstck-dl download --url https://example.substack.com --download-files # Download only specific file types sbstck-dl download --url https://example.substack.com --download-files --file-extensions "pdf,docx,txt" # Download with custom files directory name sbstck-dl download --url https://example.substack.com --download-files --files-dir attachments # Download single post with both images and file attachments sbstck-dl download --url https://example.substack.com/p/post-title --download-images --download-files --format md ``` **File Extension Filtering:** - Specify extensions without dots: `pdf,docx,txt` - Case insensitive matching - If no extensions specified, downloads all file types **Directory Structure with Files:** ``` output/ ├── 20231201_120000_post-title.html ├── images/ │ └── post-title/ │ ├── image1_1456x819.jpeg │ └── image2_848x636.png └── files/ └── post-title/ ├── document.pdf ├── spreadsheet.xlsx └── presentation.pptx ``` #### Creating Archive Index Pages Use the `--create-archive` flag to generate an organized index page that links all downloaded posts with their metadata. This creates a beautiful overview of your downloaded content, making it easy to browse and access your Substack archive. 
**Features:** - Creates `index.{format}` file matching your selected output format (HTML/Markdown/Text) - Links to all downloaded posts using relative file paths - Displays post titles, publication dates, and download timestamps - Shows post descriptions/subtitles and cover images when available - Automatically sorts posts by publication date (newest first) - Works with both single post and bulk downloads **Examples:** ```bash # Download entire archive and create index page sbstck-dl download --url https://example.substack.com --create-archive # Create archive index in Markdown format sbstck-dl download --url https://example.substack.com --create-archive --format md # Build archive over time with single posts sbstck-dl download --url https://example.substack.com/p/post-title --create-archive # Complete download with all features sbstck-dl download --url https://example.substack.com --download-images --download-files --create-archive # Custom directory structure with archive sbstck-dl download --url https://example.substack.com --create-archive --images-dir assets --files-dir attachments ``` **Archive Content Per Post:** - **Title**: Clickable link to the downloaded post file - **Publication Date**: When the post was originally published on Substack - **Download Date**: When you downloaded the post locally - **Description**: Post subtitle or description (when available) - **Cover Image**: Featured image from the post (when available) **Archive Format Examples:** *HTML Format:* Styled webpage with images, organized post cards, and hover effects *Markdown Format:* Clean markdown with headers, links, and image references *Text Format:* Plain text listing with all metadata for maximum compatibility **Directory Structure with Archive:** ``` output/ ├── index.html # Archive index page ├── 20231201_120000_post-title.html ├── 20231115_090000_another-post.html ├── images/ │ ├── post-title/ │ │ └── image1_1456x819.jpeg │ └── another-post/ │ └── image2_848x636.png └── files/ 
├── post-title/ │ └── document.pdf └── another-post/ └── spreadsheet.xlsx ``` ### Listing posts ```bash Usage: sbstck-dl list [flags] Flags: -h, --help help for list -u, --url string Specify the Substack url Global Flags: --after string Download posts published after this date (format: YYYY-MM-DD) --before string Download posts published before this date (format: YYYY-MM-DD) --cookie_name cookieName Either substack.sid or connect.sid, based on your cookie (required for private newsletters) --cookie_val string The substack.sid/connect.sid cookie value (required for private newsletters) -x, --proxy string Specify the proxy url -r, --rate int Specify the rate of requests per second (default 2) -v, --verbose Enable verbose output ``` ### Private Newsletters In order to download the full text of private newsletters you need to provide the cookie name and value of your session. The cookie name is either `substack.sid` or `connect.sid`, based on your cookie. To get the cookie value you can use the developer tools of your browser. Once you have the cookie name and value, you can pass them to the downloader using the `--cookie_name` and `--cookie_val` flags. 
#### Example ```bash sbstck-dl download --url https://example.substack.com --cookie_name substack.sid --cookie_val COOKIE_VALUE ``` ## Thanks - [wemoveon2](https://github.com/wemoveon2) and [lenzj](https://github.com/lenzj) for the discussion and help implementing the support for private newsletters ## TODO - [x] Improve retry logic - [ ] Implement loading from config file - [x] Add support for downloading images - [x] Add support for downloading file attachments - [x] Add archive index page functionality - [x] Add tests - [x] Add CI - [x] Add documentation - [x] Add support for private newsletters - [x] Implement filtering by date - [x] Implement resuming downloads ================================================ FILE: cmd/cmd_test.go ================================================ package cmd import ( "net/url" "os" "testing" "github.com/alexferrari88/sbstck-dl/lib" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) // Test parseURL function func TestParseURL(t *testing.T) { tests := []struct { name string input string expectError bool expectedURL *url.URL }{ { name: "valid https URL", input: "https://example.substack.com", expectError: false, expectedURL: &url.URL{ Scheme: "https", Host: "example.substack.com", }, }, { name: "valid http URL", input: "http://example.substack.com", expectError: false, expectedURL: &url.URL{ Scheme: "http", Host: "example.substack.com", }, }, { name: "URL with path", input: "https://example.substack.com/p/test-post", expectError: false, expectedURL: &url.URL{ Scheme: "https", Host: "example.substack.com", Path: "/p/test-post", }, }, { name: "invalid URL - no scheme", input: "example.substack.com", expectError: true, }, { name: "invalid URL - no host", input: "https://", expectError: true, // parseURL returns nil, nil for this case }, { name: "invalid URL - malformed", input: "not-a-url", expectError: true, }, { name: "empty string", input: "", expectError: true, }, } for _, tt := range tests { 
t.Run(tt.name, func(t *testing.T) { result, err := parseURL(tt.input) if tt.expectError { // For this specific case, parseURL returns nil, nil which means no error but also no result if result == nil { assert.True(t, true) // This is the expected behavior for invalid URLs } else { assert.Error(t, err) } } else { require.NoError(t, err) require.NotNil(t, result) assert.Equal(t, tt.expectedURL.Scheme, result.Scheme) assert.Equal(t, tt.expectedURL.Host, result.Host) if tt.expectedURL.Path != "" { assert.Equal(t, tt.expectedURL.Path, result.Path) } } }) } } // Test makeDateFilterFunc function func TestMakeDateFilterFunc(t *testing.T) { tests := []struct { name string beforeDate string afterDate string testDates map[string]bool // date -> expected result }{ { name: "no filters", beforeDate: "", afterDate: "", testDates: map[string]bool{ "2023-01-01": true, "2023-06-15": true, "2023-12-31": true, }, }, { name: "before filter only", beforeDate: "2023-06-15", afterDate: "", testDates: map[string]bool{ "2023-01-01": true, "2023-06-14": true, "2023-06-15": false, "2023-06-16": false, "2023-12-31": false, }, }, { name: "after filter only", beforeDate: "", afterDate: "2023-06-15", testDates: map[string]bool{ "2023-01-01": false, "2023-06-14": false, "2023-06-15": false, "2023-06-16": true, "2023-12-31": true, }, }, { name: "both filters", beforeDate: "2023-12-31", afterDate: "2023-01-01", testDates: map[string]bool{ "2022-12-31": false, "2023-01-01": false, "2023-06-15": true, "2023-12-30": true, "2023-12-31": false, "2024-01-01": false, }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { filterFunc := makeDateFilterFunc(tt.beforeDate, tt.afterDate) if tt.beforeDate == "" && tt.afterDate == "" { // No filter should return nil assert.Nil(t, filterFunc) } else { require.NotNil(t, filterFunc) for date, expected := range tt.testDates { result := filterFunc(date) assert.Equal(t, expected, result, "Date %s should return %v", date, expected) } } }) } } // Test 
makePath function
// TestMakePath checks that makePath joins the output folder, the
// timestamp derived from the post date, the slug and the format
// extension into a single "/"-separated path.
func TestMakePath(t *testing.T) {
	// One post fixture is shared by every case; only the folder and the
	// format vary.
	post := lib.Post{
		PostDate: "2023-01-01T10:30:00.000Z", // Use RFC3339 format
		Slug:     "test-post",
	}

	tests := []struct {
		name         string
		post         lib.Post
		outputFolder string
		format       string
		expected     string
	}{
		{
			name:         "basic path",
			post:         post,
			outputFolder: "/tmp/downloads",
			format:       "html",
			expected:     "/tmp/downloads/20230101_103000_test-post.html",
		},
		{
			name:         "markdown format",
			post:         post,
			outputFolder: "/tmp/downloads",
			format:       "md",
			expected:     "/tmp/downloads/20230101_103000_test-post.md",
		},
		{
			name:         "text format",
			post:         post,
			outputFolder: "/tmp/downloads",
			format:       "txt",
			expected:     "/tmp/downloads/20230101_103000_test-post.txt",
		},
		{
			// An empty folder still produces the leading separator.
			name:         "no output folder",
			post:         post,
			outputFolder: "",
			format:       "html",
			expected:     "/20230101_103000_test-post.html",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := makePath(tt.post, tt.outputFolder, tt.format)
			assert.Equal(t, tt.expected, result)
		})
	}
}

// TestConvertDateTime checks that convertDateTime turns a full
// date-and-time string into the YYYYMMDD_HHMMSS form and returns an empty
// string for input it cannot parse (date-only or empty).
func TestConvertDateTime(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected string
	}{
		{
			name:     "basic date",
			input:    "2023-01-01",
			expected: "", // Invalid format, should return empty string
		},
		{
			name:     "date with time",
			input:    "2023-01-01T10:30:00.000Z",
			expected: "20230101_103000",
		},
		{
			name:     "different date format",
			input:    "2023-12-31T23:59:59.999Z",
			expected: "20231231_235959",
		},
		{
			name:     "empty string",
			input:    "",
			expected: "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := convertDateTime(tt.input)
			assert.Equal(t, tt.expected, result)
		})
	}
}

// TestExtractSlug pins the current behaviour of extractSlug: it returns
// whatever follows the last "/" in the input, so query strings and anchors
// are kept and a trailing slash yields an empty slug.
func TestExtractSlug(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected string
	}{
		{
			name:     "basic substack URL",
			input:    "https://example.substack.com/p/test-post",
			expected: "test-post",
		},
		{
			name:     "URL with query parameters",
			input:    "https://example.substack.com/p/test-post?utm_source=newsletter",
			expected: "test-post?utm_source=newsletter", // extractSlug doesn't handle query params
		},
		{
			name:     "URL with anchor",
			input:    "https://example.substack.com/p/test-post#comments",
			expected: "test-post#comments", // extractSlug doesn't handle anchors
		},
		{
			name:     "URL with trailing slash",
			input:    "https://example.substack.com/p/test-post/",
			expected: "", // extractSlug returns empty string for trailing slash
		},
		{
			name:     "complex slug with dashes",
			input:    "https://example.substack.com/p/this-is-a-very-long-post-title",
			expected: "this-is-a-very-long-post-title",
		},
		{
			name:     "no /p/ in URL",
			input:    "https://example.substack.com/test-post",
			expected: "test-post", // extractSlug just returns the last segment
		},
		{
			name:     "empty string",
			input:    "",
			expected: "",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := extractSlug(tt.input)
			assert.Equal(t, tt.expected, result)
		})
	}
}

// TestCookieName exercises the String, Type and Set methods of the
// cookieName flag type. Set accepts only "substack.sid" and "connect.sid".
func TestCookieName(t *testing.T) {
	t.Run("String method", func(t *testing.T) {
		cn := cookieName("test-cookie")
		assert.Equal(t, "test-cookie", cn.String())
	})

	t.Run("Type method", func(t *testing.T) {
		cn := cookieName("")
		assert.Equal(t, "cookieName", cn.Type())
	})

	t.Run("Set method - valid values", func(t *testing.T) {
		validNames := []string{"substack.sid", "connect.sid"}
		for _, name := range validNames {
			cn := cookieName("")
			err := cn.Set(name)
			assert.NoError(t, err)
			assert.Equal(t, name, cn.String())
		}
	})

	t.Run("Set method - invalid values", func(t *testing.T) {
		// The empty string is rejected along with unknown names.
		invalidNames := []string{"invalid", "session", "auth", ""}
		for _, name := range invalidNames {
			cn := cookieName("")
			err := cn.Set(name)
			assert.Error(t, err)
			assert.Contains(t, err.Error(), "invalid cookie name")
		}
	})
}

// TestFileHandling writes a post into a temporary directory, confirms the
// file exists, and checks that makePath builds the expected name inside
// that directory.
func TestFileHandling(t *testing.T) {
	// Create a temporary directory for testing
	tempDir := t.TempDir()

	// Create a test file
	existingFile := tempDir + "/existing.html"
	// NOTE(review): the BodyHTML literal below spans several lines as
	// extracted; it presumably carried HTML markup around "Test content"
	// that was lost -- confirm against the repository.
	post := lib.Post{Title: "Test", BodyHTML: "
Test content
"}
	err := post.WriteToFile(existingFile, "html", false)
	require.NoError(t, err)

	// Test that file was created successfully
	_, err = os.Stat(existingFile)
	assert.NoError(t, err)

	// Test path creation
	testPost := lib.Post{PostDate: "2023-01-01T10:30:00.000Z", Slug: "test-post"}
	path := makePath(testPost, tempDir, "html")
	expectedPath := tempDir + "/20230101_103000_test-post.html"
	assert.Equal(t, expectedPath, path)
}

// TestTimeFormatting checks convertDateTime against timestamps with and
// without fractional seconds, plus a date-only string that must be
// rejected.
func TestTimeFormatting(t *testing.T) {
	t.Run("convertDateTime with various formats", func(t *testing.T) {
		// Test the actual time parsing logic
		testCases := []struct {
			input    string
			expected string
		}{
			{"2023-01-01T10:30:00.000Z", "20230101_103000"},
			{"2023-01-01T10:30:00Z", "20230101_103000"},
			{"2023-01-01", ""}, // Invalid format, should return empty string
			{"2023-12-31T23:59:59.999Z", "20231231_235959"},
		}

		for _, tc := range testCases {
			result := convertDateTime(tc.input)
			assert.Equal(t, tc.expected, result)
		}
	})
}

// TestDateFilteringIntegration checks a filter built with both bounds set:
// dates strictly between the bounds pass, the bounds themselves and
// anything outside them do not.
func TestDateFilteringIntegration(t *testing.T) {
	t.Run("date filter with actual dates", func(t *testing.T) {
		// Test the interaction between date filtering and URL processing
		beforeDate := "2023-06-15"
		afterDate := "2023-01-01"

		filterFunc := makeDateFilterFunc(beforeDate, afterDate)
		require.NotNil(t, filterFunc)

		// Test dates within range
		assert.True(t, filterFunc("2023-03-15"))
		assert.True(t, filterFunc("2023-06-14"))

		// Test dates outside range
		assert.False(t, filterFunc("2022-12-31"))
		assert.False(t, filterFunc("2023-01-01"))
		assert.False(t, filterFunc("2023-06-15"))
		assert.False(t, filterFunc("2023-12-31"))
	})
}

// TestConstants pins the string values of the two cookie-name constants.
func TestConstants(t *testing.T) {
	t.Run("cookie name constants", func(t *testing.T) {
		assert.Equal(t, "substack.sid", string(substackSid))
		assert.Equal(t, "connect.sid", string(connectSid))
	})
}

================================================ FILE: cmd/download.go ================================================
package cmd

import (
"fmt" "log" "net/url" "path/filepath" "strings" "time" "github.com/alexferrari88/sbstck-dl/lib" "github.com/schollz/progressbar/v3" "github.com/spf13/cobra" ) // downloadCmd represents the download command var ( downloadUrl string format string outputFolder string dryRun bool addSourceURL bool downloadImages bool imageQuality string imagesDir string downloadFiles bool fileExtensions string filesDir string createArchive bool downloadCmd = &cobra.Command{ Use: "download", Short: "Download individual posts or the entire public archive", Long: `You can provide the url of a single post or the main url of the Substack you want to download.`, Run: func(cmd *cobra.Command, args []string) { startTime := time.Now() // Create archive instance if flag is set var archive *lib.Archive if createArchive { archive = lib.NewArchive() } // if url contains "/p/", we are downloading a single post if strings.Contains(downloadUrl, "/p/") { if verbose { fmt.Printf("Downloading post %s\n", downloadUrl) } if dryRun { fmt.Println("Dry run, exiting...") return } if (beforeDate != "" || afterDate != "") && verbose { fmt.Println("Warning: --before and --after flags are ignored when downloading a single post") } post, err := extractor.ExtractPost(ctx, downloadUrl) if err != nil { log.Fatalln(err) } downloadTime := time.Since(startTime) if verbose { fmt.Printf("Downloaded post %s in %s\n", downloadUrl, downloadTime) } path := makePath(post, outputFolder, format) if verbose { fmt.Printf("Writing post to file %s\n", path) } if downloadImages || downloadFiles { imageQualityEnum := lib.ImageQuality(imageQuality) // Parse file extensions if specified var fileExtensionsSlice []string if fileExtensions != "" { fileExtensionsSlice = strings.Split(strings.ReplaceAll(fileExtensions, " ", ""), ",") } imageResult, err := post.WriteToFileWithImages(ctx, path, format, addSourceURL, downloadImages, imageQualityEnum, imagesDir, downloadFiles, fileExtensionsSlice, filesDir, fetcher) if err != nil { 
log.Printf("Error writing file %s: %v\n", path, err) } else if verbose && imageResult.Success > 0 { fmt.Printf("Downloaded %d images (%d failed) for post %s\n", imageResult.Success, imageResult.Failed, post.Slug) } } else { err = post.WriteToFile(path, format, addSourceURL) if err != nil { log.Printf("Error writing file %s: %v\n", path, err) } } // Add to archive if enabled if archive != nil { archive.AddEntry(post, path, startTime) } if verbose { fmt.Println("Done in ", time.Since(startTime)) } } else { // we are downloading the entire archive var downloadedPostsCount int dateFilterfunc := makeDateFilterFunc(beforeDate, afterDate) urls, err := extractor.GetAllPostsURLs(ctx, downloadUrl, dateFilterfunc) urlsCount := len(urls) if err != nil { log.Fatalln(err) } if urlsCount == 0 { if verbose { fmt.Println("No posts found, exiting...") } return } if verbose { fmt.Printf("Found %d posts\n", urlsCount) } if dryRun { fmt.Printf("Found %d posts\n", urlsCount) fmt.Println("Dry run, exiting...") return } urls, err = filterExistingPosts(urls, outputFolder, format) if err != nil { if verbose { fmt.Println("Error filtering existing posts:", err) } } if len(urls) == 0 { if verbose { fmt.Println("No new posts found, exiting...") } return } bar := progressbar.NewOptions(len(urls), progressbar.OptionSetWidth(25), progressbar.OptionSetDescription("downloading"), progressbar.OptionShowBytes(true)) for result := range extractor.ExtractAllPosts(ctx, urls) { select { case <-ctx.Done(): log.Fatalln("context cancelled") default: } if result.Err != nil { if verbose { fmt.Printf("Error downloading post %s: %s\n", result.Post.CanonicalUrl, result.Err) fmt.Println("Skipping...") } continue } bar.Add(1) downloadedPostsCount++ if verbose { fmt.Printf("Downloading post %s\n", result.Post.CanonicalUrl) } post := result.Post path := makePath(post, outputFolder, format) if verbose { fmt.Printf("Writing post to file %s\n", path) } if downloadImages || downloadFiles { imageQualityEnum := 
lib.ImageQuality(imageQuality) // Parse file extensions if specified var fileExtensionsSlice []string if fileExtensions != "" { fileExtensionsSlice = strings.Split(strings.ReplaceAll(fileExtensions, " ", ""), ",") } imageResult, err := post.WriteToFileWithImages(ctx, path, format, addSourceURL, downloadImages, imageQualityEnum, imagesDir, downloadFiles, fileExtensionsSlice, filesDir, fetcher) if err != nil { log.Printf("Error writing file %s: %v\n", path, err) } else if verbose && imageResult.Success > 0 { fmt.Printf("Downloaded %d images (%d failed) for post %s\n", imageResult.Success, imageResult.Failed, post.Slug) } } else { err = post.WriteToFile(path, format, addSourceURL) if err != nil { log.Printf("Error writing file %s: %v\n", path, err) } } // Add to archive if enabled and post was successfully written if archive != nil { archive.AddEntry(post, path, time.Now()) } } if verbose { fmt.Println("Downloaded", downloadedPostsCount, "posts, out of", len(urls)) fmt.Println("Done in ", time.Since(startTime)) } } // Generate archive page if enabled if archive != nil && len(archive.Entries) > 0 { if verbose { fmt.Printf("Generating archive page in %s format...\n", format) } var archiveErr error switch format { case "html": archiveErr = archive.GenerateHTML(outputFolder) case "md": archiveErr = archive.GenerateMarkdown(outputFolder) case "txt": archiveErr = archive.GenerateText(outputFolder) default: archiveErr = fmt.Errorf("unknown format for archive: %s", format) } if archiveErr != nil { log.Printf("Error generating archive page: %v\n", archiveErr) } else if verbose { fmt.Printf("Archive page generated: %s/index.%s\n", outputFolder, format) } } }, } ) func init() { downloadCmd.Flags().StringVarP(&downloadUrl, "url", "u", "", "Specify the Substack url") downloadCmd.Flags().StringVarP(&format, "format", "f", "html", "Specify the output format (options: \"html\", \"md\", \"txt\"") downloadCmd.Flags().StringVarP(&outputFolder, "output", "o", ".", "Specify the download 
directory") downloadCmd.Flags().BoolVarP(&dryRun, "dry-run", "d", false, "Enable dry run") downloadCmd.Flags().BoolVar(&addSourceURL, "add-source-url", false, "Add the original post URL at the end of the downloaded file") downloadCmd.Flags().BoolVar(&downloadImages, "download-images", false, "Download images locally and update content to reference local files") downloadCmd.Flags().StringVar(&imageQuality, "image-quality", "high", "Image quality to download (options: \"high\", \"medium\", \"low\")") downloadCmd.Flags().StringVar(&imagesDir, "images-dir", "images", "Directory name for downloaded images") downloadCmd.Flags().BoolVar(&downloadFiles, "download-files", false, "Download file attachments locally and update content to reference local files") downloadCmd.Flags().StringVar(&fileExtensions, "file-extensions", "", "Comma-separated list of file extensions to download (e.g., 'pdf,docx,txt'). If empty, downloads all file types") downloadCmd.Flags().StringVar(&filesDir, "files-dir", "files", "Directory name for downloaded file attachments") downloadCmd.Flags().BoolVar(&createArchive, "create-archive", false, "Create an archive index page linking all downloaded posts") downloadCmd.MarkFlagRequired("url") } func convertDateTime(datetime string) string { // Parse the datetime string parsedTime, err := time.Parse(time.RFC3339, datetime) if err != nil { // Return an empty string or an error message if parsing fails return "" } // Format the datetime to the desired format formattedDateTime := fmt.Sprintf("%d%02d%02d_%02d%02d%02d", parsedTime.Year(), parsedTime.Month(), parsedTime.Day(), parsedTime.Hour(), parsedTime.Minute(), parsedTime.Second()) return formattedDateTime } func parseURL(toTest string) (*url.URL, error) { _, err := url.ParseRequestURI(toTest) if err != nil { return nil, err } u, err := url.Parse(toTest) if err != nil || u.Scheme == "" || u.Host == "" { return nil, err } return u, err } func makePath(post lib.Post, outputFolder string, format string) 
string { return fmt.Sprintf("%s/%s_%s.%s", outputFolder, convertDateTime(post.PostDate), post.Slug, format) } // extractSlug extracts the slug from a Substack post URL // e.g. https://example.substack.com/p/this-is-the-post-title -> this-is-the-post-title func extractSlug(url string) string { split := strings.Split(url, "/") return split[len(split)-1] } // filterExistingPosts filters out posts that already exist in the output folder. // It looks for files whose name ends with the post slug. func filterExistingPosts(urls []string, outputFolder string, format string) ([]string, error) { var filtered []string for _, url := range urls { slug := extractSlug(url) path := fmt.Sprintf("%s/%s_%s.%s", outputFolder, "*", slug, format) matches, err := filepath.Glob(path) if err != nil { return urls, err } if len(matches) == 0 { filtered = append(filtered, url) } } return filtered, nil } ================================================ FILE: cmd/integration_test.go ================================================ package cmd import ( "bytes" "context" "encoding/json" "fmt" "net/http" "net/http/httptest" "os" "path/filepath" "strings" "testing" "time" "github.com/alexferrari88/sbstck-dl/lib" "github.com/spf13/cobra" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) // Test command execution in isolation func TestCommandExecution(t *testing.T) { // Skip in short test mode if testing.Short() { t.Skip("Skipping integration test in short mode") } // Create a mock server that serves a simple post mockPost := lib.Post{ Id: 123, Title: "Test Post", Slug: "test-post", PostDate: "2023-01-01", BodyHTML: "This is a test post
", CanonicalUrl: "https://example.substack.com/p/test-post", } // Create sitemap XML sitemapXML := `Test
", } // Try to write to a file with invalid character (null byte forbidden on both Windows and Unix) err := post.WriteToFile("invalid\x00filename.html", "html", false) assert.Error(t, err) }) } // Test with different configurations func TestConfigurations(t *testing.T) { t.Run("with proxy configuration", func(t *testing.T) { // Test that proxy URL parsing works proxyURL := "http://proxy.example.com:8080" parsed, err := parseURL(proxyURL) assert.NoError(t, err) assert.Equal(t, "proxy.example.com:8080", parsed.Host) assert.Equal(t, "http", parsed.Scheme) }) t.Run("with cookie configuration", func(t *testing.T) { // Test cookie creation tests := []struct { name string cookieName cookieName cookieVal string expected string }{ { name: "substack.sid cookie", cookieName: substackSid, cookieVal: "test-value", expected: "substack.sid", }, { name: "connect.sid cookie", cookieName: connectSid, cookieVal: "test-value", expected: "connect.sid", }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { assert.Equal(t, tt.expected, tt.cookieName.String()) }) } }) t.Run("with rate limiting", func(t *testing.T) { // Test that different rate limits are handled rates := []int{1, 2, 5, 10} for _, rate := range rates { fetcher := lib.NewFetcher(lib.WithRatePerSecond(rate)) assert.NotNil(t, fetcher) assert.Equal(t, rate, int(fetcher.RateLimiter.Limit())) } }) } // Test real-world scenarios func TestRealWorldScenarios(t *testing.T) { // Skip in short test mode if testing.Short() { t.Skip("Skipping real-world scenario tests in short mode") } t.Run("large number of URLs", func(t *testing.T) { // Test performance with many URLs urls := make([]string, 100) for i := range urls { urls[i] = fmt.Sprintf("https://example.substack.com/p/post-%d", i) } // Test URL parsing performance start := time.Now() // Test parsing all URLs validUrls := 0 for _, url := range urls { if _, err := parseURL(url); err == nil { validUrls++ } } duration := time.Since(start) assert.Equal(t, len(urls), 
validUrls) // All should be valid assert.Less(t, duration, 1*time.Second) // Should be fast }) t.Run("concurrent processing", func(t *testing.T) { // Test that concurrent processing works correctly tempDir := t.TempDir() // Create multiple posts concurrently posts := make([]lib.Post, 5) for i := range posts { posts[i] = lib.Post{ Title: fmt.Sprintf("Post %d", i), Slug: fmt.Sprintf("post-%d", i), PostDate: "2023-01-01", BodyHTML: fmt.Sprintf("Content for post %d
", i), } } // Write all posts concurrently start := time.Now() for i, post := range posts { filePath := filepath.Join(tempDir, fmt.Sprintf("post-%d.html", i)) err := post.WriteToFile(filePath, "html", false) assert.NoError(t, err) } duration := time.Since(start) // Verify all files were created for i := range posts { filePath := filepath.Join(tempDir, fmt.Sprintf("post-%d.html", i)) _, err := os.Stat(filePath) assert.NoError(t, err) } assert.Less(t, duration, 1*time.Second) // Should be fast }) } // Test archive functionality end-to-end func TestArchiveWorkflow(t *testing.T) { t.Run("single post with archive", func(t *testing.T) { tempDir := t.TempDir() // Create a mock post with enhanced fields post := lib.Post{ Id: 123, Title: "Test Archive Post", Slug: "test-archive-post", PostDate: "2023-01-01T10:30:00Z", Subtitle: "This is a test subtitle", Description: "Test description", CoverImage: "https://example.com/cover.jpg", CanonicalUrl: "https://example.substack.com/p/test-archive-post", BodyHTML: "This is a test post for archive functionality.
", } // Simulate the archive workflow archive := lib.NewArchive() // Write the post to file (similar to what download command does) filePath := filepath.Join(tempDir, "20230101_103000_test-archive-post.html") err := post.WriteToFile(filePath, "html", false) require.NoError(t, err) // Add entry to archive (similar to what download command does) downloadTime, _ := time.Parse(time.RFC3339, "2023-01-10T12:00:00Z") archive.AddEntry(post, filePath, downloadTime) // Generate archive in all formats err = archive.GenerateHTML(tempDir) require.NoError(t, err) err = archive.GenerateMarkdown(tempDir) require.NoError(t, err) err = archive.GenerateText(tempDir) require.NoError(t, err) // Verify all archive files were created assert.FileExists(t, filepath.Join(tempDir, "index.html")) assert.FileExists(t, filepath.Join(tempDir, "index.md")) assert.FileExists(t, filepath.Join(tempDir, "index.txt")) // Verify HTML archive content htmlContent, err := os.ReadFile(filepath.Join(tempDir, "index.html")) require.NoError(t, err) htmlStr := string(htmlContent) assert.Contains(t, htmlStr, "Test Archive Post") assert.Contains(t, htmlStr, "This is a test subtitle") assert.Contains(t, htmlStr, "https://example.com/cover.jpg") assert.Contains(t, htmlStr, "20230101_103000_test-archive-post.html") // Relative path assert.Contains(t, htmlStr, "January 1, 2023") // Formatted date // Verify Markdown archive content mdContent, err := os.ReadFile(filepath.Join(tempDir, "index.md")) require.NoError(t, err) mdStr := string(mdContent) assert.Contains(t, mdStr, "# Substack Archive") assert.Contains(t, mdStr, "## [Test Archive Post](20230101_103000_test-archive-post.html)") assert.Contains(t, mdStr, "*This is a test subtitle*") assert.Contains(t, mdStr, "") // Verify Text archive content txtContent, err := os.ReadFile(filepath.Join(tempDir, "index.txt")) require.NoError(t, err) txtStr := string(txtContent) assert.Contains(t, txtStr, "SUBSTACK ARCHIVE") assert.Contains(t, txtStr, "Title: Test Archive Post") 
assert.Contains(t, txtStr, "File: 20230101_103000_test-archive-post.html") assert.Contains(t, txtStr, "Description: This is a test subtitle") }) t.Run("multiple posts with archive", func(t *testing.T) { tempDir := t.TempDir() archive := lib.NewArchive() downloadTime := time.Now() // Create multiple posts with different dates posts := []lib.Post{ { Id: 1, Title: "First Post", Slug: "first-post", PostDate: "2023-01-01T10:00:00Z", Subtitle: "First subtitle", CanonicalUrl: "https://example.substack.com/p/first-post", BodyHTML: "First post content
", }, { Id: 2, Title: "Second Post", Slug: "second-post", PostDate: "2023-01-02T10:00:00Z", Description: "Second description", CoverImage: "https://example.com/cover2.jpg", CanonicalUrl: "https://example.substack.com/p/second-post", BodyHTML: "Second post content
", }, { Id: 3, Title: "Third Post", Slug: "third-post", PostDate: "2023-01-03T10:00:00Z", Subtitle: "Third subtitle", CanonicalUrl: "https://example.substack.com/p/third-post", BodyHTML: "Third post content
", }, } // Write posts and add to archive for i, post := range posts { filePath := filepath.Join(tempDir, fmt.Sprintf("post-%d.html", i+1)) err := post.WriteToFile(filePath, "html", false) require.NoError(t, err) archive.AddEntry(post, filePath, downloadTime.Add(time.Duration(i)*time.Hour)) } // Generate archive err := archive.GenerateHTML(tempDir) require.NoError(t, err) // Verify content ordering (newest first) htmlContent, err := os.ReadFile(filepath.Join(tempDir, "index.html")) require.NoError(t, err) htmlStr := string(htmlContent) // Find positions of post titles to verify ordering thirdPos := strings.Index(htmlStr, "Third Post") secondPos := strings.Index(htmlStr, "Second Post") firstPos := strings.Index(htmlStr, "First Post") assert.True(t, thirdPos < secondPos, "Third Post should appear before Second Post") assert.True(t, secondPos < firstPos, "Second Post should appear before First Post") // Verify all posts are included assert.Contains(t, htmlStr, "First subtitle") assert.Contains(t, htmlStr, "Second description") // Fallback to description assert.Contains(t, htmlStr, "Third subtitle") assert.Contains(t, htmlStr, "https://example.com/cover2.jpg") }) t.Run("archive with different formats", func(t *testing.T) { tempDir := t.TempDir() post := lib.Post{ Id: 100, Title: "Format Test Post", Slug: "format-test-post", PostDate: "2023-01-01T10:00:00Z", Subtitle: "Testing different formats", CanonicalUrl: "https://example.substack.com/p/format-test-post", BodyHTML: "Testing different formats.
", } // Test with different output formats formats := []string{"html", "md", "txt"} for _, format := range formats { t.Run(fmt.Sprintf("format_%s", format), func(t *testing.T) { formatDir := filepath.Join(tempDir, format) err := os.MkdirAll(formatDir, 0755) require.NoError(t, err) archive := lib.NewArchive() // Write post in the specified format filePath := filepath.Join(formatDir, fmt.Sprintf("post.%s", format)) err = post.WriteToFile(filePath, format, false) require.NoError(t, err) // Add to archive and generate archive.AddEntry(post, filePath, time.Now()) switch format { case "html": err = archive.GenerateHTML(formatDir) case "md": err = archive.GenerateMarkdown(formatDir) case "txt": err = archive.GenerateText(formatDir) } require.NoError(t, err) // Verify archive file exists indexPath := filepath.Join(formatDir, fmt.Sprintf("index.%s", format)) assert.FileExists(t, indexPath) // Verify content contains the post content, err := os.ReadFile(indexPath) require.NoError(t, err) assert.Contains(t, string(content), "Format Test Post") assert.Contains(t, string(content), "Testing different formats") }) } }) } ================================================ FILE: cmd/list.go ================================================ package cmd import ( "fmt" "log" "github.com/spf13/cobra" ) // listCmd represents the list command var ( pubUrl string listCmd = &cobra.Command{ Use: "list", Short: "List the posts of a Substack", Long: `List the posts of a Substack`, Run: func(cmd *cobra.Command, args []string) { parsedURL, err := parseURL(pubUrl) if err != nil { log.Fatal(err) } mainWebsite := fmt.Sprintf("%s://%s", parsedURL.Scheme, parsedURL.Host) if verbose { fmt.Printf("Main website: %s\n", mainWebsite) fmt.Println("Getting all posts URLs...") } dateFilterfunc := makeDateFilterFunc(beforeDate, afterDate) urls, err := extractor.GetAllPostsURLs(ctx, mainWebsite, dateFilterfunc) if err != nil { log.Fatal(err) } if verbose { fmt.Printf("Found %d posts.\n", len(urls)) } for _, url 
:= range urls { fmt.Println(url) } }, } ) func init() { listCmd.Flags().StringVarP(&pubUrl, "url", "u", "", "Specify the Substack url") listCmd.MarkFlagRequired("url") } ================================================ FILE: cmd/main.go ================================================ package cmd ================================================ FILE: cmd/root.go ================================================ package cmd import ( "context" "errors" "log" "net/http" "net/url" "os" "github.com/alexferrari88/sbstck-dl/lib" "github.com/spf13/cobra" ) // rootCmd represents the base command when called without any subcommands type cookieName string const ( substackSid cookieName = "substack.sid" connectSid cookieName = "connect.sid" ) func (c *cookieName) String() string { return string(*c) } func (c *cookieName) Set(val string) error { switch val { case "substack.sid", "connect.sid": *c = cookieName(val) default: return errors.New("invalid cookie name: must be either substack.sid or connect.sid") } return nil } func (c *cookieName) Type() string { return "cookieName" } var ( proxyURL string verbose bool ratePerSecond int beforeDate string afterDate string idCookieName cookieName idCookieVal string ctx = context.Background() parsedProxyURL *url.URL fetcher *lib.Fetcher extractor *lib.Extractor rootCmd = &cobra.Command{ Use: "sbstck-dl", Short: "Substack Downloader", Long: `sbstck-dl is a command line tool for downloading Substack newsletters for archival purposes, offline reading, or data analysis.`, PersistentPreRun: func(cmd *cobra.Command, args []string) { var cookie *http.Cookie if proxyURL != "" { var err error parsedProxyURL, err = parseURL(proxyURL) if err != nil { log.Fatal(err) } } if ratePerSecond == 0 { log.Fatal("rate must be greater than 0") } if idCookieVal != "" && idCookieName != "" { if idCookieName == substackSid { cookie = &http.Cookie{ Name: "substack.sid", Value: idCookieVal, } } else if idCookieName == connectSid { cookie = &http.Cookie{ Name: 
"connect.sid", Value: idCookieVal, } } } fetcher = lib.NewFetcher(lib.WithRatePerSecond(ratePerSecond), lib.WithProxyURL(parsedProxyURL), lib.WithCookie(cookie)) extractor = lib.NewExtractor(fetcher) }, } ) // Execute adds all child commands to the root command and sets flags appropriately. // This is called by main.main(). It only needs to happen once to the rootCmd. func Execute() { err := rootCmd.Execute() if err != nil { os.Exit(1) } } func init() { rootCmd.PersistentFlags().StringVarP(&proxyURL, "proxy", "x", "", "Specify the proxy url") rootCmd.PersistentFlags().Var(&idCookieName, "cookie_name", "Either \"substack.sid\" or \"connect.sid\", based on the cookie you have (required for private newsletters)") rootCmd.PersistentFlags().StringVar(&idCookieVal, "cookie_val", "", "The substack.sid/connect.sid cookie value (required for private newsletters)") rootCmd.PersistentFlags().BoolVarP(&verbose, "verbose", "v", false, "Enable verbose output") rootCmd.PersistentFlags().IntVarP(&ratePerSecond, "rate", "r", lib.DefaultRatePerSecond, "Specify the rate of requests per second") rootCmd.PersistentFlags().StringVar(&beforeDate, "before", "", "Download posts published before this date (format: YYYY-MM-DD)") rootCmd.PersistentFlags().StringVar(&afterDate, "after", "", "Download posts published after this date (format: YYYY-MM-DD)") rootCmd.MarkFlagsRequiredTogether("cookie_name", "cookie_val") rootCmd.AddCommand(downloadCmd) rootCmd.AddCommand(listCmd) rootCmd.AddCommand(versionCmd) } func makeDateFilterFunc(beforeDate string, afterDate string) lib.DateFilterFunc { var dateFilterFunc lib.DateFilterFunc if beforeDate != "" && afterDate != "" { dateFilterFunc = func(date string) bool { return date > afterDate && date < beforeDate } } else if beforeDate != "" { dateFilterFunc = func(date string) bool { return date < beforeDate } } else if afterDate != "" { dateFilterFunc = func(date string) bool { return date > afterDate } } return dateFilterFunc } 
================================================
FILE: cmd/version.go
================================================
package cmd

import (
	"fmt"

	"github.com/spf13/cobra"
)

// versionCmd represents the version command
// It prints a fixed version string; the value is hard-coded in the Run
// function below rather than injected at build time.
var versionCmd = &cobra.Command{
	Use:   "version",
	Short: "Print the version number of sbstck-dl",
	Long:  `Display the current version of the app.`,
	Run: func(cmd *cobra.Command, args []string) {
		fmt.Println("sbstck-dl v0.7")
	},
}

// init is intentionally empty: versionCmd defines no flags of its own, and it
// is attached to rootCmd by the init function in root.go.
func init() {
}

================================================
FILE: go.mod
================================================
module github.com/alexferrari88/sbstck-dl

go 1.20

require (
	github.com/JohannesKaufmann/html-to-markdown v1.5.0
	github.com/PuerkitoBio/goquery v1.8.1
	github.com/cenkalti/backoff/v4 v4.2.1
	github.com/k3a/html2text v1.2.1
	github.com/schollz/progressbar/v3 v3.14.1
	github.com/spf13/cobra v1.8.0
	github.com/stretchr/testify v1.8.4
	golang.org/x/sync v0.6.0
	golang.org/x/time v0.5.0
)

require (
	github.com/andybalholm/cascadia v1.3.2 // indirect
	github.com/davecgh/go-spew v1.1.1 // indirect
	github.com/inconshreveable/mousetrap v1.1.0 // indirect
	github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
	github.com/pmezard/go-difflib v1.0.0 // indirect
	github.com/rivo/uniseg v0.4.4 // indirect
	github.com/spf13/pflag v1.0.5 // indirect
	golang.org/x/net v0.20.0 // indirect
	golang.org/x/sys v0.16.0 // indirect
	golang.org/x/term v0.16.0 // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
)

================================================
FILE: go.sum
================================================
github.com/JohannesKaufmann/html-to-markdown v1.5.0 h1:cEAcqpxk0hUJOXEVGrgILGW76d1GpyGY7PCnAaWQyAI=
github.com/JohannesKaufmann/html-to-markdown v1.5.0/go.mod h1:QTO/aTyEDukulzu269jY0xiHeAGsNxmuUBo2Q0hPsK8=
github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM=
github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ=
github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/cenkalti/backoff/v4 v4.2.1 h1:y4OZtCnogmCPw98Zjyt5a6+QwPLGkiQsYW5oUqylYbM= github.com/cenkalti/backoff/v4 v4.2.1/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cpuguy83/go-md2man/v2 v2.0.3/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jtolds/gls v4.20.0+incompatible h1:xdiiI2gbIgH/gLH7ADydsJ1uDOEzR8yvV7C0MuV77Wo= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw= github.com/k3a/html2text v1.2.1 h1:nvnKgBvBR/myqrwfLuiqecUtaK1lB9hGziIJKatNFVY= github.com/k3a/html2text v1.2.1/go.mod h1:ieEXykM67iT8lTvEWBh6fhpH4B23kB9OMKPdIBmgUqA= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0 
h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rivo/uniseg v0.4.4 h1:8TfxU8dW6PdqD27gjM8MVNuicgxIjxpm4K7x4jp8sis= github.com/rivo/uniseg v0.4.4/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/schollz/progressbar/v3 v3.14.1 h1:VD+MJPCr4s3wdhTc7OEJ/Z3dAeBzJ7yKH/P4lC5yRTI= github.com/schollz/progressbar/v3 v3.14.1/go.mod h1:Zc9xXneTzWXF81TGoqL71u0sBPjULtEHYtj/WVgVy8E= github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y= github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s= github.com/smartystreets/goconvey v1.6.4/go.mod 
h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/spf13/cobra v1.8.0 h1:7aJaZx1B85qltLMc546zn58BxxfZdR/W22ej9CFoEf0= github.com/spf13/cobra v1.8.0/go.mod h1:WXLWApfZ71AjXPya3WOlMsY9yMs7YeiHhFVlvLyhcho= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= github.com/yuin/goldmark v1.6.0 h1:boZcn2GTjpsynOsC0iJHnBWa4Bi0qzfJjthwauItG68= github.com/yuin/goldmark v1.6.0/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.16.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= 
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U= golang.org/x/net v0.20.0 h1:aCL9BSgETF1k+blQaYUBx9hJ9LOGP3gAVemcZlf1Kpo= golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.6.0 h1:5BMeUDZ7vkXGfEr1x9B4bRcTH4lpkTkpdh0T/J+qjbQ= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= golang.org/x/term v0.14.0/go.mod h1:TySc+nGkYR6qt8km8wUhuFRTVSMIX3XPR58y2lC8vww= golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= golang.org/x/term v0.16.0 h1:m+B6fahuftsE9qjo0VWp2FW0mB3MTJvR0BaMQrq0pmE= golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 
golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= ================================================ FILE: lib/extractor.go ================================================ package lib import ( "context" "encoding/json" "errors" "fmt" "net/url" "os" "path/filepath" "sort" "strings" "sync" "time" md "github.com/JohannesKaufmann/html-to-markdown" "github.com/PuerkitoBio/goquery" "github.com/k3a/html2text" ) // RawPost represents a raw Substack post in string format. type RawPost struct { str string } // ToPost converts the RawPost to a structured Post object. 
func (r *RawPost) ToPost() (Post, error) { var wrapper PostWrapper err := json.Unmarshal([]byte(r.str), &wrapper) if err != nil { return Post{}, err } return wrapper.Post, nil } // Post represents a structured Substack post with various fields. type Post struct { Id int `json:"id"` PublicationId int `json:"publication_id"` Type string `json:"type"` Slug string `json:"slug"` PostDate string `json:"post_date"` CanonicalUrl string `json:"canonical_url"` PreviousPostSlug string `json:"previous_post_slug"` NextPostSlug string `json:"next_post_slug"` CoverImage string `json:"cover_image"` Description string `json:"description"` Subtitle string `json:"subtitle,omitempty"` WordCount int `json:"wordcount"` Title string `json:"title"` BodyHTML string `json:"body_html"` } // Static converter instance to avoid recreating it for each conversion var mdConverter = md.NewConverter("", true, nil) // ToMD converts the Post's HTML body to Markdown format. func (p *Post) ToMD(withTitle bool) (string, error) { if withTitle { body, err := mdConverter.ConvertString(p.BodyHTML) if err != nil { return "", err } return fmt.Sprintf("# %s\n\n%s", p.Title, body), nil } return mdConverter.ConvertString(p.BodyHTML) } // ToText converts the Post's HTML body to plain text format. func (p *Post) ToText(withTitle bool) string { if withTitle { return p.Title + "\n\n" + html2text.HTML2Text(p.BodyHTML) } return html2text.HTML2Text(p.BodyHTML) } // ToHTML returns the Post's HTML body as-is or with an optional title header. func (p *Post) ToHTML(withTitle bool) string { if withTitle { return fmt.Sprintf("original content: %s
", p.CanonicalUrl, p.CanonicalUrl) } content += sourceLine } return os.WriteFile(path, []byte(content), 0644) } // WriteToFileWithImages writes the Post's content to a file with optional image downloading func (p *Post) WriteToFileWithImages(ctx context.Context, path string, format string, addSourceURL bool, downloadImages bool, imageQuality ImageQuality, imagesDir string, downloadFiles bool, fileExtensions []string, filesDir string, fetcher *Fetcher) (*ImageDownloadResult, error) { if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { return nil, err } content, err := p.contentForFormat(format, true) if err != nil { return nil, err } var imageResult *ImageDownloadResult // Download images if requested and format supports it if downloadImages && (format == "html" || format == "md") { outputDir := filepath.Dir(path) imageDownloader := NewImageDownloader(fetcher, outputDir, imagesDir, imageQuality) // Only process HTML content for image downloading htmlContent := content if format == "md" { // For markdown, we need to work with the original HTML htmlContent = p.BodyHTML } imageResult, err = imageDownloader.DownloadImages(ctx, htmlContent, p.Slug) if err != nil { return nil, fmt.Errorf("failed to download images: %w", err) } // Update content based on format if format == "html" { content = imageResult.UpdatedHTML // Re-add title if needed if strings.HasPrefix(content, "original content: %s
", p.CanonicalUrl, p.CanonicalUrl) } content += sourceLine } // Write the file if err := os.WriteFile(path, []byte(content), 0644); err != nil { return imageResult, err } // Return empty result if no image downloading was performed if imageResult == nil { imageResult = &ImageDownloadResult{ Images: []ImageInfo{}, UpdatedHTML: content, Success: 0, Failed: 0, } } return imageResult, nil } // PostWrapper wraps a Post object for JSON unmarshaling. type PostWrapper struct { Post Post `json:"post"` } // Extractor is a utility for extracting Substack posts from URLs. type Extractor struct { fetcher *Fetcher } // ArchiveEntry represents a single entry in the archive page type ArchiveEntry struct { Post Post FilePath string DownloadTime time.Time } // Archive represents a collection of posts for the archive page type Archive struct { Entries []ArchiveEntry } // NewExtractor creates a new Extractor with the provided Fetcher. // If the Fetcher is nil, a default Fetcher will be used. func NewExtractor(f *Fetcher) *Extractor { if f == nil { f = NewFetcher() } return &Extractor{fetcher: f} } // extractJSONString finds and extracts the JSON data from script content. // This optimized version reduces string operations. 
func extractJSONString(doc *goquery.Document) (string, error) { var jsonString string var found bool doc.Find("script").EachWithBreak(func(i int, s *goquery.Selection) bool { content := s.Text() if strings.Contains(content, "window._preloads") && strings.Contains(content, "JSON.parse(") { start := strings.Index(content, "JSON.parse(\"") if start == -1 { return true } start += len("JSON.parse(\"") end := strings.LastIndex(content, "\")") if end == -1 || start >= end { return true } jsonString = content[start:end] found = true return false } return true }) if !found { return "", errors.New("failed to extract JSON string") } return jsonString, nil } func (e *Extractor) ExtractPost(ctx context.Context, pageUrl string) (Post, error) { // fetch page HTML content body, err := e.fetcher.FetchURL(ctx, pageUrl) if err != nil { return Post{}, fmt.Errorf("failed to fetch page: %w", err) } defer body.Close() doc, err := goquery.NewDocumentFromReader(body) if err != nil { return Post{}, fmt.Errorf("failed to parse HTML: %w", err) } jsonString, err := extractJSONString(doc) if err != nil { return Post{}, fmt.Errorf("failed to extract post data: %w", err) } // Unescape the JSON string directly var rawJSON RawPost err = json.Unmarshal([]byte("\""+jsonString+"\""), &rawJSON.str) if err != nil { return Post{}, fmt.Errorf("failed to unescape JSON: %w", err) } // Convert to a Go object p, err := rawJSON.ToPost() if err != nil { return Post{}, fmt.Errorf("failed to parse post data: %w", err) } // Extract additional metadata from HTML // Extract subtitle from .subtitle element if subtitle := doc.Find(".subtitle").First().Text(); subtitle != "" { p.Subtitle = strings.TrimSpace(subtitle) } // Extract cover image from og:image meta tag if not already set if p.CoverImage == "" { if ogImage, exists := doc.Find("meta[property='og:image']").Attr("content"); exists && ogImage != "" { p.CoverImage = ogImage } } return p, nil } type DateFilterFunc func(string) bool func (e *Extractor) 
GetAllPostsURLs(ctx context.Context, pubUrl string, f DateFilterFunc) ([]string, error) { u, err := url.Parse(pubUrl) if err != nil { return nil, err } u.Path, err = url.JoinPath(u.Path, "sitemap.xml") if err != nil { return nil, err } // fetch the sitemap of the publication body, err := e.fetcher.FetchURL(ctx, u.String()) if err != nil { return nil, err } defer body.Close() // Parse the XML doc, err := goquery.NewDocumentFromReader(body) if err != nil { return nil, err } // Pre-allocate a reasonable size for URLs // This avoids multiple slice reallocations as we append urls := make([]string, 0, 100) doc.Find("url").EachWithBreak(func(i int, s *goquery.Selection) bool { // Check if the context has been cancelled select { case <-ctx.Done(): return false default: } urlSel := s.Find("loc") url := urlSel.Text() if !strings.Contains(url, "/p/") { return true } // Only find lastmod if we have a filter if f != nil { lastmod := s.Find("lastmod").Text() if !f(lastmod) { return true } } urls = append(urls, url) return true }) return urls, nil } type ExtractResult struct { Post Post Err error } // ExtractAllPosts extracts all posts from the given URLs using a worker pool pattern // to limit concurrency and avoid overwhelming system resources. 
func (e *Extractor) ExtractAllPosts(ctx context.Context, urls []string) <-chan ExtractResult { resultCh := make(chan ExtractResult, len(urls)) go func() { defer close(resultCh) // Create a channel for the URLs urlCh := make(chan string, len(urls)) // Fill the URL channel for _, u := range urls { urlCh <- u } close(urlCh) // Limit concurrency - the number of workers is capped at 10 or the number of URLs, whichever is smaller workerCount := 10 if len(urls) < workerCount { workerCount = len(urls) } // Create a WaitGroup to wait for all workers to finish var wg sync.WaitGroup wg.Add(workerCount) // Start the workers for i := 0; i < workerCount; i++ { go func() { defer wg.Done() for url := range urlCh { select { case <-ctx.Done(): // Context cancelled, stop processing return default: post, err := e.ExtractPost(ctx, url) resultCh <- ExtractResult{Post: post, Err: err} } } }() } // Wait for all workers to finish wg.Wait() }() return resultCh } // NewArchive creates a new Archive instance func NewArchive() *Archive { return &Archive{ Entries: make([]ArchiveEntry, 0), } } // AddEntry adds a new entry to the archive, sorted by publication date (newest first) func (a *Archive) AddEntry(post Post, filePath string, downloadTime time.Time) { entry := ArchiveEntry{ Post: post, FilePath: filePath, DownloadTime: downloadTime, } a.Entries = append(a.Entries, entry) a.sortEntries() } // sortEntries sorts archive entries by publication date (newest first) func (a *Archive) sortEntries() { sort.Slice(a.Entries, func(i, j int) bool { // Parse post dates and compare (newest first) dateI, errI := time.Parse(time.RFC3339, a.Entries[i].Post.PostDate) dateJ, errJ := time.Parse(time.RFC3339, a.Entries[j].Post.PostDate) if errI != nil || errJ != nil { // If parsing fails, sort by title return a.Entries[i].Post.Title < a.Entries[j].Post.Title } return dateI.After(dateJ) // newest first }) } // GenerateHTML creates an HTML archive page func (a *Archive) GenerateHTML(outputDir string) error { 
archivePath := filepath.Join(outputDir, "index.html") html := `This is a test post.
", } } // Helper function to create a mock HTML page with embedded JSON func createMockSubstackHTML(post Post) string { // Create a wrapper and marshal it to JSON wrapper := PostWrapper{Post: post} jsonBytes, _ := json.Marshal(wrapper) // Escape quotes for embedding in JavaScript escapedJSON := strings.ReplaceAll(string(jsonBytes), `"`, `\"`) return fmt.Sprintf(`This is a test post.
") htmlNoTitle := post.ToHTML(false) assert.NotContains(t, htmlNoTitle, "This is a test post.
") }) t.Run("ToMD", func(t *testing.T) { md, err := post.ToMD(true) require.NoError(t, err) assert.Contains(t, md, "# Test Post") assert.Contains(t, md, "This is a **test** post.") mdNoTitle, err := post.ToMD(false) require.NoError(t, err) assert.NotContains(t, mdNoTitle, "# Test Post") assert.Contains(t, mdNoTitle, "This is a **test** post.") }) t.Run("ToText", func(t *testing.T) { text := post.ToText(true) assert.Contains(t, text, "Test Post") assert.Contains(t, text, "This is a test post.") textNoTitle := post.ToText(false) assert.NotContains(t, textNoTitle, "Test Post\n\n") assert.Contains(t, textNoTitle, "This is a test post.") }) t.Run("ToJSON", func(t *testing.T) { jsonStr, err := post.ToJSON() require.NoError(t, err) assert.Contains(t, jsonStr, `"id":123`) assert.Contains(t, jsonStr, `"title":"Test Post"`) }) t.Run("contentForFormat", func(t *testing.T) { // Test valid formats for _, format := range []string{"html", "md", "txt"} { content, err := post.contentForFormat(format, true) assert.NoError(t, err) assert.NotEmpty(t, content) } // Test invalid format _, err := post.contentForFormat("invalid", true) assert.Error(t, err) assert.Contains(t, err.Error(), "unknown format") }) // Test error handling for format conversions t.Run("ToMD error handling", func(t *testing.T) { // Create a post with problematic HTML for markdown conversion // Note: html-to-markdown library is quite robust, so we test with extremely malformed HTML problemPost := createSamplePost() problemPost.BodyHTML = "Nested without closing
This is a test post.
") case "md": assert.Contains(t, string(content), "# Test Post") assert.Contains(t, string(content), "This is a **test** post.") case "txt": assert.Contains(t, string(content), "Test Post") assert.Contains(t, string(content), "This is a test post.") } }) } // Test writing to a non-existent directory t.Run("creating directory", func(t *testing.T) { newDir := filepath.Join(tempDir, "subdir", "nested") filePath := filepath.Join(newDir, "test.html") err := post.WriteToFile(filePath, "html", false) assert.NoError(t, err) // Verify directory was created _, err = os.Stat(newDir) assert.NoError(t, err) }) // Test invalid format t.Run("invalid format", func(t *testing.T) { filePath := filepath.Join(tempDir, "test.invalid") err := post.WriteToFile(filePath, "invalid", false) assert.Error(t, err) assert.Contains(t, err.Error(), "unknown format") }) // Test with addSourceURL enabled t.Run("with source URL", func(t *testing.T) { formats := []string{"html", "md", "txt"} for _, format := range formats { t.Run(format, func(t *testing.T) { filePath := filepath.Join(tempDir, fmt.Sprintf("test-with-source.%s", format)) err := post.WriteToFile(filePath, format, true) require.NoError(t, err) // Read file content content, err := os.ReadFile(filePath) require.NoError(t, err) contentStr := string(content) // Check that source URL is included assert.Contains(t, contentStr, post.CanonicalUrl) assert.Contains(t, contentStr, "original content") // Check format-specific source URL formatting if format == "html" { assert.Contains(t, contentStr, "No script here