Repository: ksanjeev284/reddit-universal-scraper Branch: main Commit: c416fef6aae0 Files: 34 Total size: 203.4 KB Directory structure: gitextract__3x2rp91/ ├── .github/ │ └── workflows/ │ └── docker-publish.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── alerts/ │ ├── __init__.py │ └── notifications.py ├── analytics/ │ ├── __init__.py │ ├── sentiment.py │ └── subreddit_stats.py ├── api/ │ ├── __init__.py │ └── server.py ├── config.py ├── dashboard/ │ ├── __init__.py │ └── app.py ├── docker-compose.yml ├── docs/ │ ├── BLOG.md │ └── INTEGRATION.md ├── export/ │ ├── __init__.py │ ├── cloud.py │ ├── database.py │ └── parquet.py ├── main.py ├── plugins/ │ ├── __init__.py │ ├── deduplicator.py │ ├── keyword_extractor.py │ └── sentiment_tagger.py ├── requirements.txt ├── scheduler/ │ ├── __init__.py │ └── cron.py ├── scraper/ │ ├── __init__.py │ └── async_scraper.py └── search/ ├── __init__.py └── query.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/docker-publish.yml ================================================ name: Docker Build & Publish # This tells GitHub: "Run this every time I push code to the main branch" on: push: branches: [ "main" ] # Also run if I create a Release tag (e.g., v1.0) tags: [ 'v*.*.*' ] pull_request: branches: [ "main" ] env: # Use GitHub's built-in registry (ghcr.io) REGISTRY: ghcr.io # Use the repository name as the image name IMAGE_NAME: ${{ github.repository }} jobs: build: runs-on: ubuntu-latest permissions: contents: read packages: write # Needed to push the image to GHCR steps: - name: Checkout repository uses: actions/checkout@v3 # Set up Docker Buildx (The builder engine) - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2 # Login to GitHub Container Registry using the automatic GitHub Token - name: Log into registry ${{ env.REGISTRY }} if: github.event_name != 
'pull_request' uses: docker/login-action@v2 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} # Generate tags (e.g., :latest, :v1.0, :main) - name: Extract Docker metadata id: meta uses: docker/metadata-action@v4 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | type=raw,value=latest,enable={{is_default_branch}} type=ref,event=branch type=semver,pattern={{version}} # Build the image and push it to the registry - name: Build and push Docker image uses: docker/build-push-action@v4 with: context: . push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} ================================================ FILE: .gitignore ================================================ data/ __pycache__/ .env ================================================ FILE: Dockerfile ================================================ FROM python:3.11-slim # Set environment variables ENV PYTHONUNBUFFERED=1 ENV PYTHONDONTWRITEBYTECODE=1 WORKDIR /app # Install system dependencies (for some Python packages) RUN apt-get update && apt-get install -y --no-install-recommends \ curl \ ffmpeg \ && rm -rf /var/lib/apt/lists/* # Copy requirements first for better caching COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt # Copy all source code COPY main.py . COPY config.py . 
COPY analytics/ ./analytics/ COPY alerts/ ./alerts/ COPY dashboard/ ./dashboard/ COPY export/ ./export/ COPY scheduler/ ./scheduler/ COPY scraper/ ./scraper/ COPY search/ ./search/ COPY plugins/ ./plugins/ COPY api/ ./api/ COPY docs/ ./docs/ # Create data directory with subdirectories RUN mkdir -p data/backups data/parquet # Expose ports # 8501 = Streamlit Dashboard # 8000 = REST API EXPOSE 8501 8000 # Health check for API mode HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ CMD curl -f http://localhost:8000/health || exit 1 # Default: show help ENTRYPOINT ["python", "main.py"] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ # 🤖 Universal Reddit Scraper Suite [![Docker Build & Publish](https://github.com/ksanjeev284/reddit-universal-scraper/actions/workflows/docker-publish.yml/badge.svg)](https://github.com/ksanjeev284/reddit-universal-scraper/actions/workflows/docker-publish.yml) A **full-featured** Reddit scraper with analytics dashboard, REST API, scheduled scraping, plugins, and more. **No API keys required!** image ## ✨ Features | Feature | Description | |---------|-------------| | 📊 **Full Scraping** | Posts, comments, images, videos, galleries | | 📈 **Web Dashboard** | Beautiful Streamlit UI with 7 tabs | | 🚀 **REST API** | Connect Metabase, Grafana, DuckDB | | 🔌 **Plugin System** | Extensible post-processing (sentiment, dedupe, keywords) | | 📋 **Job Tracking** | Full history with status, duration, errors | | 🧪 **Dry Run Mode** | Test scrape rules without saving data | | 📦 **Parquet Export** | Analytics-ready format for DuckDB/warehouses | | 😀 **Sentiment Analysis** | Analyze post/comment sentiment | | 📅 **Scheduled Scraping** | Cron-style job scheduling | | 📧 **Notifications** | Discord & Telegram alerts | | 🗄️ **SQLite Database** | Structured storage with auto-backup | --- ## 🚀 Quick Start ```bash # Install dependencies pip install -r requirements.txt # Scrape a subreddit python main.py python --mode full --limit 100 # Launch dashboard python main.py --dashboard # Opens at http://localhost:8501 ``` ### 📋 Requirements - **Python 3.8+** - **ffmpeg** (optional, for video with audio) ```bash # Windows (via chocolatey) choco install ffmpeg # macOS brew install ffmpeg # Ubuntu/Debian sudo apt install ffmpeg ``` --- ## 📖 All Commands ### 🔄 Scraping ```bash # Full scrape (posts + media + comments) python main.py delhi --mode full --limit 100 # Fast history-only (no media/comments) python main.py delhi --mode history --limit 500 # Live monitor (checks every 5 min) python 
main.py delhi --mode monitor # Scrape a user's posts python main.py spez --user --mode full --limit 50 # Skip media or comments python main.py delhi --no-media --limit 200 python main.py delhi --no-comments --limit 200 ``` ### 🧪 Dry Run Mode Test scrape rules without saving any data: ```bash python main.py python --mode full --limit 50 --dry-run ``` Output: ``` 🧪 DRY RUN MODE - No data will be saved 🧪 DRY RUN COMPLETE! 📊 Would scrape: 50 posts 💬 Would scrape: 245 comments ``` ### 🔌 Plugins Enable post-processing plugins: ```bash # List available plugins python main.py --list-plugins # Run with plugins enabled python main.py python --mode full --plugins ``` **Built-in Plugins:** | Plugin | Description | |--------|-------------| | `sentiment_tagger` | Adds sentiment scores to posts | | `deduplicator` | Removes duplicate posts | | `keyword_extractor` | Extracts top keywords | Create custom plugins in `plugins/` folder. ### 📊 Dashboard ```bash python main.py --dashboard # Opens at http://localhost:8501 ``` **Dashboard Tabs:** - 📊 Overview - Stats & charts - 📈 Analytics - Sentiment & keywords - 🔍 Search - Query scraped data - 💬 Comments - Comment analysis - ⚙️ Scraper - Start new scrapes - 📋 Job History - View all jobs - 🔌 Integrations - API, export, plugins ### 🚀 REST API ```bash python main.py --api # API at http://localhost:8000 # Docs at http://localhost:8000/docs ``` **Endpoints:** | Endpoint | Description | |----------|-------------| | `GET /posts` | List posts with filters | | `GET /comments` | List comments | | `GET /subreddits` | All scraped subreddits | | `GET /jobs` | Job history | | `GET /query?sql=...` | Raw SQL queries | | `POST /grafana/query` | Grafana time-series | ### 📦 Export & Maintenance ```bash # Export to Parquet (for DuckDB/warehouses) python main.py --export-parquet python # View job history python main.py --job-history # Backup database python main.py --backup # Optimize database python main.py --vacuum ``` ### 📅 Scheduled Scraping ```bash #
Scrape every 60 minutes python main.py --schedule delhi --every 60 # With options python main.py --schedule delhi --every 30 --mode full --limit 50 ``` ### 🔍 Search & Analytics ```bash # Search scraped data python main.py --search "credit card" --min-score 100 # Run sentiment analysis python main.py --analyze delhi --sentiment # Extract keywords python main.py --analyze delhi --keywords ``` --- ## 🐳 Docker ### Quick Start ```bash # Build docker build -t reddit-scraper . # Run scrape docker run -v ./data:/app/data reddit-scraper python --limit 100 # Run with plugins docker run -v ./data:/app/data reddit-scraper python --plugins ``` ### Docker Compose (Full Stack) ```bash # Start API + Dashboard docker-compose up -d # Access: # Dashboard: http://localhost:8501 # API: http://localhost:8000/docs ``` ### Deploy to AWS/VPS ```bash # SSH into your server ssh user@your-server-ip # Clone repo git clone https://github.com/ksanjeev284/reddit-universal-scraper.git cd reddit-universal-scraper # Start services docker-compose up -d # Open firewall ports sudo ufw allow 8000 sudo ufw allow 8501 ``` Access: - `http://your-server-ip:8501` → Dashboard - `http://your-server-ip:8000/docs` → API --- ## 🔗 External Integrations ### Metabase 1. Start API: `python main.py --api` 2. Add HTTP datasource: `http://localhost:8000` 3. Query: `/posts?subreddit=python&limit=100` ### Grafana 1. Install "JSON API" or "Infinity" plugin 2. Add datasource: `http://localhost:8000` 3. 
Use `/grafana/query` for time-series ### DuckDB ```python import duckdb # Export to Parquet first # python main.py --export-parquet python # Query directly duckdb.query("SELECT * FROM 'data/parquet/*.parquet'").df() ``` --- ## 📁 Project Structure ``` reddit-scraper/ ├── main.py # CLI entry point ├── config.py # Settings ├── analytics/ # Sentiment & keywords ├── alerts/ # Discord/Telegram ├── api/ # REST API server ├── dashboard/ # Streamlit UI ├── export/ # Database & exports ├── plugins/ # Post-processing plugins ├── scheduler/ # Cron scheduling ├── search/ # Search engine └── data/ ├── r_subreddit/ # Scraped data ├── backups/ # DB backups └── parquet/ # Parquet exports ``` --- ## 📊 Data Output ### posts.csv | Column | Description | |--------|-------------| | id | Reddit post ID | | title | Post title | | author | Username | | score | Net upvotes | | num_comments | Comment count | | post_type | text/image/video/gallery | | selftext | Post body | | sentiment_score | -1.0 to 1.0 (with plugins) | ### comments.csv | Column | Description | |--------|-------------| | comment_id | Comment ID | | post_permalink | Parent post | | author | Username | | body | Comment text | | score | Upvotes | --- ## ⚙️ Environment Variables ```bash # Notifications export DISCORD_WEBHOOK_URL="https://discord.com/api/webhooks/..." export TELEGRAM_BOT_TOKEN="123456:ABC..." export TELEGRAM_CHAT_ID="987654321" ``` --- ## 📜 License MIT License - Feel free to use, modify, and distribute. ## 🤝 Contributing Pull requests welcome! For major changes, please open an issue first. 
def send_discord_alert(webhook_url, title, message, posts=None, color=0x5865F2):
    """
    Send alert to Discord via webhook.

    Args:
        webhook_url: Discord webhook URL; a falsy value disables the alert
        title: Alert title
        message: Alert message (used as the embed description)
        posts: Optional list of post dicts; at most the first 5 are previewed
        color: Embed color (default: Discord blue)

    Returns:
        True if Discord accepted the message; False if the webhook is not
        configured, the request failed, or Discord answered with an error.
    """
    if not webhook_url:
        print("⚠️ Discord webhook URL not configured")
        return False

    embed = {
        "title": f"🤖 {title}",
        "description": message,
        "color": color,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "footer": {"text": "Reddit Scraper Alert"},
    }

    # Add post previews
    if posts:
        embed["fields"] = [
            {
                # A None title cannot be sliced and an empty field name is
                # rejected by Discord, so fall back to a placeholder.
                "name": str(post.get('title') or 'No Title')[:100],
                "value": (
                    f"Score: {post.get('score', 0)} | "
                    f"Comments: {post.get('num_comments', 0)}\n"
                    f"[View Post](https://reddit.com{post.get('permalink') or ''})"
                ),
                "inline": False,
            }
            for post in posts[:5]  # Max 5 posts
        ]

    payload = {"embeds": [embed]}

    try:
        response = requests.post(
            webhook_url,
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=10,
        )
    except Exception as e:
        print(f"❌ Discord error: {e}")
        return False

    # 204 is the normal success reply; 200 is returned instead when the
    # webhook URL carries ?wait=true.
    if response.status_code in (200, 204):
        print("✅ Discord alert sent!")
        return True

    print(f"❌ Discord error: {response.status_code}")
    return False
def check_keyword_alerts(posts, keywords, webhook_url=None, telegram_token=None, telegram_chat=None):
    """
    Check posts for keyword matches and send alerts.

    Matching is a case-insensitive substring search over each post's title
    and selftext. Every matching post dict is annotated in place with a
    'matched_keywords' list.

    Args:
        posts: List of posts to check
        keywords: List of keywords to monitor
        webhook_url: Discord webhook URL
        telegram_token: Telegram bot token
        telegram_chat: Telegram chat ID

    Returns:
        List of matching posts
    """
    if not keywords:
        return []

    # Lower-case once, drop blank/non-string entries (an empty keyword is a
    # substring of every text) and de-duplicate while keeping caller order.
    keywords_lower = list(dict.fromkeys(
        k.lower() for k in keywords if isinstance(k, str) and k.strip()
    ))
    if not keywords_lower:
        return []

    def _as_text(value):
        # None would otherwise be rendered as "None" by the f-string below
        # and falsely match the keyword "none".
        return '' if value is None else str(value)

    matching_posts = []
    for post in posts:
        text = f"{_as_text(post.get('title'))} {_as_text(post.get('selftext'))}".lower()
        matched_keywords = [keyword for keyword in keywords_lower if keyword in text]
        if matched_keywords:
            post['matched_keywords'] = matched_keywords
            matching_posts.append(post)

    if matching_posts:
        title = f"Keyword Alert: {len(matching_posts)} matches!"
        # dict.fromkeys de-duplicates like set() but keeps a stable order,
        # so the alert text is deterministic.
        unique_hits = dict.fromkeys(
            k for p in matching_posts for k in p.get('matched_keywords', [])
        )
        message = f"Found posts matching: {', '.join(unique_hits)}"

        if webhook_url:
            send_discord_alert(webhook_url, title, message, matching_posts, color=0xFF6B6B)
        if telegram_token and telegram_chat:
            send_telegram_alert(telegram_token, telegram_chat, title, message, matching_posts)

    return matching_posts
# Simple sentiment analysis without external dependencies
POSITIVE_WORDS = {
    'good', 'great', 'awesome', 'excellent', 'amazing', 'love', 'best', 'perfect',
    'nice', 'wonderful', 'fantastic', 'brilliant', 'superb', 'outstanding', 'happy',
    'beautiful', 'helpful', 'thanks', 'thank', 'appreciate', 'recommend', 'interesting',
    'useful', 'cool', 'fun', 'enjoy', 'like', 'loved', 'impressive', 'incredible'
}

NEGATIVE_WORDS = {
    'bad', 'terrible', 'awful', 'horrible', 'hate', 'worst', 'poor', 'disappointing',
    'useless', 'waste', 'annoying', 'boring', 'ugly', 'stupid', 'dumb', 'fail',
    'wrong', 'broken', 'sad', 'angry', 'frustrated', 'scam', 'fake', 'trash',
    'pathetic', 'ridiculous', 'disgusting', 'overpriced', 'avoid', 'never'
}

# A word from this set boosts the weight of the word immediately after it.
INTENSIFIERS = {'very', 'really', 'extremely', 'absolutely', 'totally', 'completely'}


def analyze_sentiment(text):
    """
    Simple lexicon-based sentiment analysis.

    Args:
        text: Text to score. Empty or non-string values (None, numbers,
            float NaN, ...) are treated as neutral instead of raising.

    Returns:
        (score, label)
        - score: -1.0 to 1.0, rounded to 3 decimals
        - label: 'positive', 'negative', or 'neutral'
    """
    if not text or not isinstance(text, str):
        return 0.0, 'neutral'

    # Clean and tokenize: lower-case runs of ASCII letters only
    words = re.findall(r'\b[a-z]+\b', text.lower())
    if not words:
        return 0.0, 'neutral'

    positive_count = 0
    negative_count = 0
    intensifier_next = False

    for word in words:
        # A sentiment word directly after an intensifier counts 1.5x.
        multiplier = 1.5 if intensifier_next else 1.0
        if word in POSITIVE_WORDS:
            positive_count += multiplier
        elif word in NEGATIVE_WORDS:
            negative_count += multiplier
        intensifier_next = word in INTENSIFIERS

    if positive_count + negative_count == 0:
        return 0.0, 'neutral'

    # Net sentiment per word, scaled by 5 and clamped to [-1, 1].
    score = (positive_count - negative_count) / len(words)
    score = max(-1.0, min(1.0, score * 5))  # Normalize

    if score > 0.1:
        label = 'positive'
    elif score < -0.1:
        label = 'negative'
    else:
        label = 'neutral'

    return round(score, 3), label
def generate_wordcloud_data(texts, top_n=100):
    """
    Build word-cloud entries from the most frequent keywords in texts.

    Args:
        texts: Iterable of text strings, passed through to extract_keywords
        top_n: Maximum number of keywords to include

    Returns:
        List of {"text", "value", "size"} dicts, where "value" is the raw
        count and "size" scales linearly from 10 up to 100 for the most
        frequent word. Empty list when no keywords were found.
    """
    ranked = extract_keywords(texts, top_n)
    cloud = []
    if ranked:
        # extract_keywords returns most-common-first, so the first entry
        # carries the largest count.
        top_count = ranked[0][1]
        for term, freq in ranked:
            scaled = 10 + (freq / top_count) * 90
            cloud.append({"text": term, "value": freq, "size": int(scaled)})
    return cloud
def find_best_posting_times(posts):
    """
    Analyze best times to post based on engagement.

    Posts are bucketed by hour of day and by weekday name of their
    'created_utc' ISO-8601 timestamp (a trailing 'Z' is accepted). Posts
    whose timestamp is missing or unparsable, or whose score is not a
    number, are skipped entirely; a score of None counts as 0.

    Args:
        posts: Iterable of post dicts with 'created_utc' and 'score'

    Returns:
        Dict with:
        - 'hourly_stats': {hour: {'count', 'total_score', 'avg_score'}}
        - 'daily_stats': {weekday name: {'count', 'total_score', 'avg_score'}}
        - 'best_hours': up to 5 (hour, avg_score) pairs, best first
        - 'best_days': up to 3 (weekday, avg_score) pairs, best first
    """
    from datetime import datetime  # hoisted out of the per-post loop

    hourly_stats = {}
    daily_stats = {}

    for post in posts:
        created = post.get('created_utc', '')
        if not created:
            continue

        # Only the parse is guarded, so real bugs are no longer swallowed.
        try:
            dt = datetime.fromisoformat(created.replace('Z', '+00:00'))
        except (AttributeError, TypeError, ValueError):
            continue

        # Validate the score *before* touching any bucket so that a bad
        # value can never leave one bucket updated and the other not.
        score = post.get('score', 0)
        if score is None:
            score = 0
        elif not isinstance(score, (int, float)):
            continue

        for bucket, key in ((hourly_stats, dt.hour), (daily_stats, dt.strftime('%A'))):
            entry = bucket.setdefault(key, {'count': 0, 'total_score': 0})
            entry['count'] += 1
            entry['total_score'] += score

    # Calculate averages
    for bucket in (hourly_stats, daily_stats):
        for entry in bucket.values():
            entry['avg_score'] = entry['total_score'] / entry['count']

    # Find best times
    best_hours = sorted(hourly_stats.items(), key=lambda x: x[1]['avg_score'], reverse=True)[:5]
    best_days = sorted(daily_stats.items(), key=lambda x: x[1]['avg_score'], reverse=True)[:3]

    return {
        'hourly_stats': hourly_stats,
        'daily_stats': daily_stats,
        'best_hours': [(h, s['avg_score']) for h, s in best_hours],
        'best_days': [(d, s['avg_score']) for d, s in best_days]
    }
def get_subreddit_rules(subreddit):
    """
    Fetch subreddit rules.

    Args:
        subreddit: Subreddit name

    Returns:
        List of rule dictionaries with 'short_name', 'description',
        'priority', 'kind' and 'created_utc' (naive ISO-8601 string in UTC).
        An empty list is returned on any HTTP or parsing error.
    """
    from datetime import timezone  # the module only imports `datetime` itself

    url = f"https://old.reddit.com/r/{subreddit}/about/rules.json"

    try:
        response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=15)
        if response.status_code != 200:
            return []

        rules = []
        for rule in response.json().get('rules', []):
            # `or 0` also covers an explicit null, which would otherwise
            # raise and make the broad except below discard every rule.
            created_ts = rule.get('created_utc') or 0
            # fromtimestamp() without tz yields *local* time; convert in UTC
            # so the value matches its 'created_utc' key, then drop tzinfo to
            # keep the original naive ISO string format.
            created = datetime.fromtimestamp(created_ts, tz=timezone.utc).replace(tzinfo=None)
            rules.append({
                "short_name": rule.get('short_name'),
                "description": rule.get('description'),
                "priority": rule.get('priority'),
                "kind": rule.get('kind'),  # link, comment, all
                "created_utc": created.isoformat()
            })
        return rules

    except Exception as e:
        print(f"❌ Error fetching rules: {e}")
        return []
def get_full_subreddit_stats(subreddit):
    """
    Get comprehensive subreddit statistics.

    Combines the results of get_subreddit_about, get_subreddit_rules,
    get_subreddit_mods and get_subreddit_flairs into one dictionary and
    prints a short summary.

    Args:
        subreddit: Subreddit name

    Returns:
        Dictionary with all stats, or None when the about-info could not
        be fetched
    """
    print(f"📊 Fetching stats for r/{subreddit}...")

    about = get_subreddit_about(subreddit)
    if not about:
        return None

    rules = get_subreddit_rules(subreddit)
    mods = get_subreddit_mods(subreddit)
    flairs = get_subreddit_flairs(subreddit)

    stats = {
        **about,
        "rules": rules,
        "rules_count": len(rules),
        "moderators": mods,
        "moderator_count": len(mods),
        "flairs": flairs,
        "flair_count": len(flairs),
        "fetched_at": datetime.now().isoformat()
    }

    # The counts come from dict.get(key, 0) on the JSON payload, which still
    # yields None for an explicit null; None does not support the ',' format
    # spec, so fall back to 0 for display only (stats itself is untouched).
    subscribers = stats.get('subscribers') or 0
    active_users = stats.get('active_users') or 0
    created = stats.get('created_utc') or ''

    # Print summary
    print(f"\n📊 r/{subreddit} Statistics:")
    print(f" 👥 Subscribers: {subscribers:,}")
    print(f" 🟢 Active Users: {active_users:,}")
    print(f" 📜 Rules: {stats['rules_count']}")
    print(f" 👮 Moderators: {stats['moderator_count']}")
    print(f" 🏷️ Flairs: {stats['flair_count']}")
    print(f" 📅 Created: {created[:10]}")
    print(f" 🔞 NSFW: {stats.get('over_18')}")

    return stats
# CLI for testing
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Subreddit Statistics")
    parser.add_argument("subreddit", help="Subreddit name")
    parser.add_argument("--save", action="store_true", help="Save to JSON")
    args = parser.parse_args()

    if args.save:
        # Fetch, print the summary and write subreddit_stats.json
        save_subreddit_stats(args.subreddit)
    else:
        stats = get_full_subreddit_stats(args.subreddit)
        description = stats.get('description') if stats else None
        # Print nothing (rather than a blank line) when there is no
        # description, and only add the ellipsis when text was cut off.
        if description:
            suffix = "..." if len(description) > 200 else ""
            print(f"\n📝 Description: {description[:200]}{suffix}")
@app.get("/health", tags=["Info"])
def health_check():
    """
    Health check endpoint.

    Returns:
        200 with {"status": "healthy", "database": <info>} when the database
        info can be read; 503 with {"status": "unhealthy", "error": <msg>}
        otherwise.
    """
    from fastapi.responses import JSONResponse

    try:
        info = get_database_info()
    except Exception as e:
        # Probes such as `curl -f` only fail on HTTP >= 400, so an unhealthy
        # state must not be reported with a 200. The JSON body is unchanged.
        return JSONResponse(
            status_code=503,
            content={"status": "unhealthy", "error": str(e)},
        )
    return {"status": "healthy", "database": info}
@app.get("/posts/{post_id}", tags=["Posts"])
def get_post(post_id: str):
    """
    Get a single post by ID.

    Args:
        post_id: Value matched against the `id` column of the posts table

    Returns:
        The matching row as a dict

    Raises:
        HTTPException: 404 when no post with that ID exists
    """
    conn = get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT * FROM posts WHERE id = ?", (post_id,))
        row = cursor.fetchone()
    finally:
        # Close even when the query raises, so the connection never leaks.
        conn.close()

    if not row:
        raise HTTPException(status_code=404, detail="Post not found")

    return dict(row)
@app.get("/query", tags=["Advanced"]) def raw_query( sql: str = Query(..., description="SQL SELECT query"), limit: int = Query(100, ge=1, le=1000) ): """ Execute a raw SQL SELECT query. ⚠️ Only SELECT queries allowed. Use for custom Grafana/Metabase queries. Example: /query?sql=SELECT title, score FROM posts ORDER BY score DESC """ # Security: Only allow SELECT if not sql.strip().upper().startswith("SELECT"): raise HTTPException(status_code=400, detail="Only SELECT queries allowed") # Add limit if not present if "LIMIT" not in sql.upper(): sql = f"{sql} LIMIT {limit}" try: conn = get_connection() cursor = conn.cursor() cursor.execute(sql) results = [dict(row) for row in cursor.fetchall()] conn.close() return {"query": sql, "count": len(results), "results": results} except Exception as e: raise HTTPException(status_code=400, detail=f"Query error: {e}") # --- GRAFANA COMPATIBLE ENDPOINTS --- @app.get("/grafana/search", tags=["Grafana"]) def grafana_search(): """Grafana SimpleJSON datasource - search endpoint.""" subs = get_all_subreddits() return [s['subreddit'] for s in subs] @app.post("/grafana/query", tags=["Grafana"]) def grafana_query(body: dict): """Grafana SimpleJSON datasource - query endpoint.""" # Return time series data for Grafana results = [] for target in body.get('targets', []): subreddit = target.get('target') if subreddit: conn = get_connection() cursor = conn.cursor() cursor.execute(""" SELECT date(created_utc) as time, COUNT(*) as value FROM posts WHERE subreddit = ? 
GROUP BY date(created_utc) ORDER BY time """, (subreddit,)) datapoints = [[row['value'], row['time']] for row in cursor.fetchall()] conn.close() results.append({ "target": subreddit, "datapoints": datapoints }) return results # --- CLI --- if __name__ == "__main__": import uvicorn print("🚀 Starting Reddit Scraper API...") print(" 📖 Docs: http://localhost:8000/docs") print(" 📊 Use with Metabase, Grafana, or any REST client") uvicorn.run(app, host="0.0.0.0", port=8000) ================================================ FILE: config.py ================================================ """ Reddit Scraper Suite - Configuration """ import os from pathlib import Path # --- PATHS --- BASE_DIR = Path(__file__).parent DATA_DIR = BASE_DIR / "data" DB_PATH = DATA_DIR / "reddit_scraper.db" # --- SCRAPER SETTINGS --- USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" # Sources: old.reddit.com for residential IPs, mirrors for data centers MIRRORS = [ "https://old.reddit.com", "https://redlib.catsarch.com", "https://redlib.vsls.cz", "https://r.nf", "https://libreddit.northboot.xyz", "https://redlib.tux.pizza" ] # Rate limiting REQUEST_TIMEOUT = 15 COOLDOWN_SECONDS = 3 RETRY_WAIT = 30 # Media settings MAX_IMAGES_PER_POST = 10 MAX_VIDEOS_PER_POST = 2 MAX_GALLERY_IMAGES = 15 # Comment settings MAX_COMMENT_DEPTH = 5 # --- ASYNC SETTINGS --- ASYNC_MAX_CONCURRENT = 10 ASYNC_BATCH_SIZE = 50 # --- NOTIFICATION SETTINGS --- DISCORD_WEBHOOK_URL = os.getenv("DISCORD_WEBHOOK_URL", "") TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "") TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "") # --- DASHBOARD SETTINGS --- DASHBOARD_HOST = "0.0.0.0" DASHBOARD_PORT = 8501 # --- SCHEDULER SETTINGS --- SCHEDULER_TIMEZONE = "Asia/Kolkata" # --- DATABASE SETTINGS --- DATABASE_URL = os.getenv("DATABASE_URL", f"sqlite:///{DB_PATH}") # Ensure data directory exists DATA_DIR.mkdir(exist_ok=True) 
================================================ FILE: dashboard/__init__.py ================================================ # Dashboard module ================================================ FILE: dashboard/app.py ================================================ """ Reddit Scraper Dashboard - Streamlit Web UI Run with: streamlit run dashboard/app.py """ import streamlit as st import pandas as pd from pathlib import Path import sys from datetime import datetime import time import os import json import signal # Add parent to path sys.path.insert(0, str(Path(__file__).parent.parent)) from analytics.sentiment import ( analyze_posts_sentiment, extract_keywords, calculate_engagement_metrics, find_best_posting_times ) from search.query import search_all_data, advanced_search, get_top_posts # Page config st.set_page_config( page_title="Reddit Scraper Dashboard", page_icon="🤖", layout="wide", initial_sidebar_state="expanded" ) # Custom CSS st.markdown(""" """, unsafe_allow_html=True) def load_subreddit_data(subreddit_path): """Load all data for a subreddit.""" data = {} posts_file = subreddit_path / 'posts.csv' if posts_file.exists(): data['posts'] = pd.read_csv(posts_file) comments_file = subreddit_path / 'comments.csv' if comments_file.exists(): data['comments'] = pd.read_csv(comments_file) return data def get_available_data(): """Get list of scraped subreddits and users.""" data_dir = Path(__file__).parent.parent / 'data' data = {'subreddits': [], 'users': []} if data_dir.exists(): for sub_dir in data_dir.iterdir(): if sub_dir.is_dir(): # Check for r_ or u_ prefix (standard scraper format) # We allow folders even without posts.csv so users can see empty scrapes if sub_dir.name.startswith('u_'): data['users'].append(sub_dir.name) elif sub_dir.name.startswith('r_'): data['subreddits'].append(sub_dir.name) elif (sub_dir / 'posts.csv').exists(): # Fallback for old/other folders that have data data['subreddits'].append(sub_dir.name) # Sort lists data['subreddits'].sort() 
data['users'].sort() return data def main(): # Header st.markdown('

🤖 Reddit Scraper Dashboard

', unsafe_allow_html=True) # Sidebar st.sidebar.title("📊 Navigation") if st.sidebar.button("🔄 Refresh List"): st.rerun() # Get available data available_data = get_available_data() # Source Selector source_type = st.sidebar.radio( "Source Type", ["Subreddits", "Users"], horizontal=True ) # Filter list based on type if source_type == "Users": options = available_data['users'] prefix_len = 2 # 'u_' empty_msg = "No scraped users found." icon = "👤" else: options = available_data['subreddits'] prefix_len = 2 # 'r_' is 2 chars, but some might not have it if legacy? # Actually standard scraper uses r_. empty_msg = "No scraped subreddits found." icon = "📁" selected_sub = None if not options: st.sidebar.warning(empty_msg) if source_type == "Subreddits": st.sidebar.info("Go to '⚙️ Scraper' tab to start scraping.") else: st.sidebar.info("Go to '⚙️ Scraper' tab to start scraping users.") else: # Selector selected_sub = st.sidebar.selectbox( f"Select {source_type[:-1]}", # "Select Subreddit" or "Select User" options, format_func=lambda x: f"{icon} {x[2:] if x.startswith(('r_', 'u_')) else x}" ) # Load data if selected posts_df = pd.DataFrame() comments_df = pd.DataFrame() data_loaded = False if selected_sub: data_dir = Path(__file__).parent.parent / 'data' sub_path = data_dir / selected_sub data = load_subreddit_data(sub_path) if 'posts' in data: posts_df = data['posts'] comments_df = data.get('comments', pd.DataFrame()) data_loaded = True else: st.error("No posts data found for selected item!") # Define Tabs # Data tabs only if data loaded tab_list = [] if data_loaded: tab_list.extend(["📊 Overview", "📈 Analytics", "🔍 Search", "💬 Comments"]) # Always present tabs tab_list.extend(["⚙️ Scraper", "📋 Job History", "🔌 Integrations"]) # Create tabs tabs = st.tabs(tab_list) # Map tabs to variables for easy access tab_map = {name: tabs[i] for i, name in enumerate(tab_list)} # --- RENDER TABS --- if data_loaded: with tab_map["📊 Overview"]: st.header(f"📊 Overview: {selected_sub}") # 
Metrics row col1, col2, col3, col4, col5 = st.columns(5) with col1: st.metric("Total Posts", len(posts_df)) with col2: st.metric("Total Comments", len(comments_df)) with col3: total_score = posts_df['score'].sum() if 'score' in posts_df else 0 st.metric("Total Score", f"{total_score:,}") with col4: avg_score = posts_df['score'].mean() if 'score' in posts_df else 0 st.metric("Avg Score", f"{avg_score:.1f}") with col5: media_count = posts_df['has_media'].sum() if 'has_media' in posts_df else 0 st.metric("Media Posts", int(media_count)) st.divider() # Post type distribution col1, col2 = st.columns(2) with col1: st.subheader("📝 Post Types") if 'post_type' in posts_df: type_counts = posts_df['post_type'].value_counts() st.bar_chart(type_counts) with col2: st.subheader("📅 Posts Over Time") if 'created_utc' in posts_df: posts_df['date'] = pd.to_datetime(posts_df['created_utc']).dt.date daily = posts_df.groupby('date').size() st.line_chart(daily) st.divider() # Top posts st.subheader("🔥 Top Posts by Score") if 'score' in posts_df: top_posts = posts_df.nlargest(10, 'score')[['title', 'score', 'num_comments', 'post_type', 'created_utc']] st.dataframe(top_posts) with tab_map["📈 Analytics"]: st.header("📈 Analytics") # Sentiment Analysis st.subheader("😀 Sentiment Analysis") if st.button("Run Sentiment Analysis"): with st.spinner("Analyzing sentiment..."): posts_list = posts_df.to_dict('records') analyzed_posts, sentiment_counts = analyze_posts_sentiment(posts_list) col1, col2, col3 = st.columns(3) col1.metric("Positive", sentiment_counts['positive'], delta=None) col2.metric("Neutral", sentiment_counts['neutral'], delta=None) col3.metric("Negative", sentiment_counts['negative'], delta=None) # Pie chart sentiment_df = pd.DataFrame({ 'Sentiment': ['Positive', 'Neutral', 'Negative'], 'Count': [sentiment_counts['positive'], sentiment_counts['neutral'], sentiment_counts['negative']] }) st.bar_chart(sentiment_df.set_index('Sentiment')) st.divider() # Keywords st.subheader("☁️ Top 
Keywords") texts = posts_df['title'].tolist() if 'selftext' in posts_df: texts.extend(posts_df['selftext'].dropna().tolist()) keywords = extract_keywords(texts, top_n=30) if keywords: kw_df = pd.DataFrame(keywords, columns=['Word', 'Count']) st.bar_chart(kw_df.set_index('Word').head(20)) st.divider() # Best posting times st.subheader("⏰ Best Posting Times") if 'created_utc' in posts_df: timing_data = find_best_posting_times(posts_df.to_dict('records')) if timing_data['best_hours']: st.write("**Best Hours to Post:**") for hour, avg_score in timing_data['best_hours']: st.write(f"• {hour}:00 - Avg Score: {avg_score:.1f}") if timing_data['best_days']: st.write("**Best Days to Post:**") for day, avg_score in timing_data['best_days']: st.write(f"• {day} - Avg Score: {avg_score:.1f}") with tab_map["🔍 Search"]: st.header("🔍 Search Posts") # Search form col1, col2 = st.columns([3, 1]) with col1: search_query = st.text_input("Search query", placeholder="Enter keywords...") with col2: min_score = st.number_input("Min Score", min_value=0, value=0) col3, col4, col5 = st.columns(3) with col3: if 'post_type' in posts_df: post_types = ['All'] + posts_df['post_type'].dropna().unique().tolist() selected_type = st.selectbox("Post Type", post_types) with col4: if 'author' in posts_df: authors = ['All'] + posts_df['author'].dropna().unique().tolist()[:50] selected_author = st.selectbox("Author", authors) with col5: sort_by = st.selectbox("Sort by", ['score', 'num_comments', 'created_utc']) # Search button if st.button("🔍 Search"): filtered = posts_df.copy() if search_query: mask = filtered['title'].str.contains(search_query, case=False, na=False) if 'selftext' in filtered: mask |= filtered['selftext'].str.contains(search_query, case=False, na=False) filtered = filtered[mask] if min_score > 0: filtered = filtered[filtered['score'] >= min_score] if selected_type != 'All' and 'post_type' in filtered: filtered = filtered[filtered['post_type'] == selected_type] if selected_author != 'All' 
and 'author' in filtered: filtered = filtered[filtered['author'] == selected_author] filtered = filtered.sort_values(sort_by, ascending=False) st.write(f"Found {len(filtered)} results") st.dataframe(filtered[['title', 'score', 'num_comments', 'post_type', 'author', 'created_utc']].head(50)) with tab_map["💬 Comments"]: st.header("💬 Comments Analysis") if len(comments_df) == 0: st.warning("No comments data found for this subreddit") else: col1, col2, col3 = st.columns(3) with col1: st.metric("Total Comments", len(comments_df)) with col2: avg_score = comments_df['score'].mean() if 'score' in comments_df else 0 st.metric("Avg Score", f"{avg_score:.1f}") with col3: unique_authors = comments_df['author'].nunique() if 'author' in comments_df else 0 st.metric("Unique Commenters", unique_authors) st.divider() # Top comments st.subheader("🔥 Top Comments by Score") if 'score' in comments_df: top_comments = comments_df.nlargest(10, 'score')[['body', 'score', 'author', 'created_utc']] for _, row in top_comments.iterrows(): with st.expander(f"⬆️ {row['score']} - by u/{row['author']}"): st.write(row['body'][:500]) st.divider() # Top commenters st.subheader("👥 Top Commenters") if 'author' in comments_df: top_authors = comments_df['author'].value_counts().head(10) st.bar_chart(top_authors) # Scraper Tab (Always visible) with tab_map["⚙️ Scraper"]: st.header("⚙️ Scraper Controls") # Persistence logic import json import signal JOB_FILE = Path("active_job.json") LOG_DIR = Path("logs") LOG_DIR.mkdir(exist_ok=True) def get_active_job(): if JOB_FILE.exists(): try: with open(JOB_FILE, "r") as f: return json.load(f) except: return None return None # Check for active job active_job = get_active_job() # Auto-detect if process is dead if active_job: try: import psutil if not psutil.pid_exists(active_job['pid']): # Process is dead if JOB_FILE.exists(): JOB_FILE.unlink() active_job = None st.rerun() except ImportError: # Fallback for systems without psutil try: os.kill(active_job['pid'], 0) 
except OSError: # PID doesn't exist (Process dead) if JOB_FILE.exists(): JOB_FILE.unlink() active_job = None st.rerun() # Monitor Section (Always visible if job exists) if active_job: st.info(f"🔄 **Scraping in Progress**: {active_job.get('target', 'Unknown')} (PID: {active_job.get('pid')})") # Stop button if st.button("🛑 Stop Scraping"): try: import signal os.kill(active_job['pid'], signal.SIGTERM) st.warning("Stopped process.") except: st.warning("Process already stopped.") if JOB_FILE.exists(): JOB_FILE.unlink() st.rerun() # Read logs log_file = Path(active_job['log_file']) if log_file.exists(): with open(log_file, "r", encoding="utf-8", errors="replace") as f: lines = f.readlines() # Parse metrics from lines posts_saved = 0 comments_count = 0 images_count = 0 videos_count = 0 found_posts = 0 processed_posts = 0 for line in lines: import re # Progress: X/Y (Saved posts) m = re.search(r'Progress: (\d+)/(\d+)', line) if m: posts_saved = int(m.group(1)) # Saved X posts m = re.search(r'Saved (\d+)', line) if m: posts_saved += int(m.group(1)) # Found X posts m = re.search(r'Found (\d+) posts', line) if m: found_posts += int(m.group(1)) # Processed posts (Fetching comments) if "Fetching comments for:" in line: processed_posts += 1 # Comments: X (Summary) m = re.search(r'Comments:\s*(\d+)', line) if m: comments_count = int(m.group(1)) else: # Incremental comments m = re.search(r'\+ Scraped (\d+) comments', line) if m: comments_count += int(m.group(1)) # Images/Videos (Summary line) m = re.search(r'Images:\s*(\d+).*Videos:\s*(\d+)', line) if m: images_count = int(m.group(1)) videos_count = int(m.group(2)) # Images/Videos (Real-time line) m = re.search(r'\+ Downloaded: (\d+) images, (\d+) videos', line) if m: images_count += int(m.group(1)) videos_count += int(m.group(2)) # Display Metrics col1, col2, col3, col4 = st.columns(4) # Posts Metric Logic if posts_saved > 0: col1.metric("📊 Posts", f"{posts_saved} (Found {found_posts})") elif found_posts > 0: col1.metric("📊 
Posts", f"Processing: {processed_posts}/{found_posts}") else: col1.metric("📊 Posts", "0") col2.metric("💬 Comments", comments_count) col3.metric("🖼️ Images", images_count) col4.metric("🎬 Videos", videos_count) # Show latest logs st.code("".join(lines[-20:]), language="text") # Auto-refresh time.sleep(1) st.rerun() else: st.warning("Log file not found.") else: # Start New Scrape UI st.subheader("🚀 Start New Scrape") col1, col2 = st.columns(2) with col1: new_sub = st.text_input("Subreddit/User name", placeholder="e.g. python") is_user = st.checkbox("Is a User (not subreddit)") with col2: limit = st.number_input("Post Limit", min_value=10, max_value=5000, value=100) mode = st.selectbox("Mode", ['full', 'history']) no_media = st.checkbox("Skip media download") no_comments = st.checkbox("Skip comments") if st.button("🚀 Start Scraping"): if not new_sub: st.error("Please enter a subreddit/user name!") else: target_cmd = ["python", "-u", "main.py", new_sub, "--mode", mode, "--limit", str(limit)] if is_user: target_cmd.append("--user") if no_media: target_cmd.append("--no-media") if no_comments: target_cmd.append("--no-comments") # Start background process import subprocess job_id = f"job_{int(time.time())}" log_file = LOG_DIR / f"{job_id}.log" try: with open(log_file, "w", encoding="utf-8") as f: env = os.environ.copy() env['PYTHONIOENCODING'] = 'utf-8' env['PYTHONUNBUFFERED'] = '1' process = subprocess.Popen( target_cmd, stdout=f, stderr=subprocess.STDOUT, cwd=str(Path(__file__).parent.parent), env=env ) # Save job state job_info = { "job_id": job_id, "pid": process.pid, "target": new_sub, "log_file": str(log_file.absolute()), "start_time": time.time() } with open(JOB_FILE, "w") as f: json.dump(job_info, f) st.success(f"Started job {job_id}!") st.rerun() except Exception as e: st.error(f"Failed to start: {e}") st.divider() if selected_sub: # Export options st.subheader("📤 Export Data") export_format = st.selectbox("Format", ['CSV', 'JSON', 'Excel']) if st.button("📥 
Download Posts"): if export_format == 'CSV': csv = posts_df.to_csv(index=False) st.download_button( "Download CSV", csv, f"{selected_sub}_posts.csv", "text/csv" ) elif export_format == 'JSON': json_data = posts_df.to_json(orient='records', indent=2) st.download_button( "Download JSON", json_data, f"{selected_sub}_posts.json", "application/json" ) st.divider() # Media Export st.subheader("🖼️ Media Export") media_dir = Path(f"data/{selected_sub}/media") if media_dir.exists(): images_dir = media_dir / "images" videos_dir = media_dir / "videos" images = list(images_dir.glob("*")) if images_dir.exists() else [] videos = list(videos_dir.glob("*")) if videos_dir.exists() else [] col1, col2, col3 = st.columns(3) with col1: st.metric("📷 Images", len(images)) with col2: st.metric("🎬 Videos", len(videos)) with col3: total_size = sum(f.stat().st_size for f in images + videos) / (1024 * 1024) st.metric("💾 Total Size", f"{total_size:.1f} MB") if images or videos: if st.button("📦 Download All Media (ZIP)"): import zipfile import io zip_buffer = io.BytesIO() with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zf: for img in images: zf.write(img, f"images/{img.name}") for vid in videos: zf.write(vid, f"videos/{vid.name}") st.download_button( "💾 Download ZIP", zip_buffer.getvalue(), f"{selected_sub}_media.zip", "application/zip" ) st.success(f"✅ ZIP ready: {len(images)} images, {len(videos)} videos") # Preview recent images if images: st.write("**Recent Images:**") preview_cols = st.columns(min(5, len(images))) for i, img in enumerate(images[:5]): with preview_cols[i]: try: st.image(str(img), width=100) except: st.text(img.name[:15]) else: st.info(f"No media found for {selected_sub}. 
Run with `--mode full` to download media.") with tab_map["📋 Job History"]: st.header("📋 Job History") try: from export.database import get_job_history, get_job_stats # Job stats stats = get_job_stats() col1, col2, col3, col4 = st.columns(4) with col1: st.metric("Total Jobs", stats.get('total_jobs', 0)) with col2: st.metric("Completed", stats.get('completed', 0)) with col3: st.metric("Failed", stats.get('failed', 0)) with col4: avg_dur = stats.get('avg_duration') st.metric("Avg Duration", f"{avg_dur:.1f}s" if avg_dur else "-") st.divider() # Job history table st.subheader("Recent Jobs") col1, col2 = st.columns(2) with col1: filter_status = st.selectbox("Filter by Status", ['All', 'completed', 'failed', 'running']) with col2: limit = st.number_input("Show last N jobs", min_value=10, max_value=100, value=20) status_filter = None if filter_status == 'All' else filter_status jobs = get_job_history(limit=limit, status=status_filter) if jobs: jobs_df = pd.DataFrame(jobs) # Format for display display_cols = ['job_id', 'target', 'mode', 'status', 'posts_scraped', 'comments_scraped', 'duration_seconds', 'started_at', 'dry_run'] display_cols = [c for c in display_cols if c in jobs_df.columns] st.dataframe(jobs_df[display_cols]) # Success rate chart st.subheader("Success Rate") if 'status' in jobs_df.columns: status_counts = jobs_df['status'].value_counts() st.bar_chart(status_counts) else: st.info("No job history found. 
Run some scrapes first!") except Exception as e: st.error(f"Failed to load job history: {e}") st.info("Make sure the database is initialized.") with tab_map["🔌 Integrations"]: st.header("🔌 Integrations & Settings") # REST API Section st.subheader("🚀 REST API") col1, col2, col3 = st.columns(3) with col1: api_port = st.number_input("API Port", value=8000, min_value=1000, max_value=65535) with col2: if st.button("🚀 Start API Server"): st.info("Starting API server in background...") import subprocess try: # Start API in background (non-blocking) subprocess.Popen( ["python", "main.py", "--api"], cwd=str(Path(__file__).parent.parent), creationflags=subprocess.CREATE_NEW_CONSOLE if hasattr(subprocess, 'CREATE_NEW_CONSOLE') else 0 ) st.success(f"✅ API server starting on port {api_port}!") st.markdown(f"**Open:** [http://localhost:{api_port}/docs](http://localhost:{api_port}/docs)") except Exception as e: st.error(f"❌ Failed to start API: {e}") with col3: # Check if API is running import requests try: resp = requests.get(f"http://localhost:{api_port}/health", timeout=1) if resp.status_code == 200: st.success("🟢 API is running") else: st.warning("🟡 API responded but not healthy") except: st.info("🔴 API not running") st.markdown(""" **Available Endpoints:** | Endpoint | Description | |----------|-------------| | `/posts` | List posts with filters | | `/comments` | List comments | | `/subreddits` | All scraped subreddits | | `/jobs` | Job history | | `/query?sql=...` | Raw SQL queries | | `/docs` | Interactive Swagger UI | """) st.divider() # External Tools st.subheader("📊 External Tools Integration") tool_tabs = st.tabs(["📈 Metabase", "📊 Grafana", "🔗 DreamFactory", "🧦 DuckDB"]) with tool_tabs[0]: st.markdown(""" **Metabase Setup:** 1. Start API: `python main.py --api` 2. In Metabase: New Question → Native Query 3. Use HTTP datasource with `http://localhost:8000` 4. 
Query: `/posts?subreddit=python&limit=100` **Or use raw SQL:** ``` /query?sql=SELECT title, score FROM posts ORDER BY score DESC ``` """) with tool_tabs[1]: st.markdown(""" **Grafana Setup:** 1. Install "JSON API" or "Infinity" plugin 2. Add datasource: `http://localhost:8000` 3. Use `/grafana/query` for time-series **Example Panel Query:** ```sql SELECT date(created_utc) as time, COUNT(*) as posts FROM posts GROUP BY date(created_utc) ``` """) with tool_tabs[2]: st.markdown(""" **DreamFactory Setup:** 1. Point to SQLite file: `data/reddit_scraper.db` 2. Or use REST API: `http://localhost:8000` 3. Auto-generates API for all tables """) with tool_tabs[3]: st.markdown(""" **DuckDB (Analytics):** 1. Export to Parquet first (see below) 2. Query directly: ```python import duckdb duckdb.query("SELECT * FROM 'data/parquet/*.parquet'").df() ``` """) st.divider() # Parquet Export st.subheader("📦 Parquet Export") all_targets = available_data['subreddits'] + available_data['users'] col1, col2 = st.columns(2) with col1: export_sub = st.selectbox("Select target to export", all_targets, key="parquet_export") with col2: if st.button("📦 Export to Parquet"): if export_sub: target_name = export_sub.replace('r_', '').replace('u_', '') with st.spinner(f"Exporting {target_name} to Parquet..."): import subprocess result = subprocess.run( ["python", "main.py", "--export-parquet", target_name], capture_output=True, text=True, cwd=str(Path(__file__).parent.parent) ) if result.returncode == 0: st.success(f"✅ Exported {target_name} to Parquet!") st.code(result.stdout[-500:] if len(result.stdout) > 500 else result.stdout) else: st.error(f"❌ Export failed: {result.stderr}") else: st.error("Select a target first") # List existing parquet files parquet_dir = Path("data/parquet") if parquet_dir.exists(): parquet_files = list(parquet_dir.glob("*.parquet")) if parquet_files: st.write("**Existing Parquet files:**") for f in parquet_files[:10]: size_mb = f.stat().st_size / (1024 * 1024) st.text(f" • 
{f.name} ({size_mb:.2f} MB)") st.divider() # Database Maintenance st.subheader("🛠️ Database Maintenance") col1, col2, col3 = st.columns(3) with col1: if st.button("💾 Backup Database"): with st.spinner("Creating backup..."): import subprocess result = subprocess.run( ["python", "main.py", "--backup"], capture_output=True, text=True, cwd=str(Path(__file__).parent.parent) ) if result.returncode == 0: st.success("✅ Database backed up!") st.code(result.stdout[-300:] if len(result.stdout) > 300 else result.stdout) else: st.error(f"❌ Backup failed: {result.stderr}") with col2: if st.button("🧹 Vacuum/Optimize"): with st.spinner("Optimizing database..."): import subprocess result = subprocess.run( ["python", "main.py", "--vacuum"], capture_output=True, text=True, cwd=str(Path(__file__).parent.parent) ) if result.returncode == 0: st.success("✅ Database optimized!") st.code(result.stdout[-300:] if len(result.stdout) > 300 else result.stdout) else: st.error(f"❌ Vacuum failed: {result.stderr}") with col3: try: from export.database import get_database_info db_info = get_database_info() st.metric("DB Size", f"{db_info.get('size_mb', 0):.2f} MB") except: st.metric("DB Size", "N/A") # Show backup files backup_dir = Path("data/backups") if backup_dir.exists(): backups = sorted(backup_dir.glob("*.db"), reverse=True)[:5] if backups: st.write("**Recent Backups:**") for b in backups: size_mb = b.stat().st_size / (1024 * 1024) st.text(f" • {b.name} ({size_mb:.2f} MB)") st.divider() # Plugin Configuration st.subheader("🔌 Plugins") try: from plugins import load_plugins plugins = load_plugins() if plugins: st.write("**Available Plugins:**") for plugin in plugins: status = "✅" if plugin.enabled else "❌" st.markdown(f"{status} **{plugin.name}** - {plugin.description}") st.info("💡 Enable plugins when scraping: `python main.py --plugins`") else: st.warning("No plugins found in plugins/ directory") except Exception as e: st.error(f"Plugin loading error: {e}") st.divider() # Quick Commands 
Reference st.subheader("📋 Quick Commands") st.code(""" # Start REST API python main.py --api # Export to Parquet python main.py --export-parquet # Backup database python main.py --backup # Scrape with plugins python main.py --plugins # Dry run (test without saving) python main.py --dry-run """, language="bash") if __name__ == "__main__": main() ================================================ FILE: docker-compose.yml ================================================ version: '3.8' # Reddit Scraper Suite - Full Stack # Start with: docker-compose up -d services: # Main Scraper (run scrape jobs) scraper: build: . volumes: - ./data:/app/data # Persist scraped data command: ["--help"] # Override with your scrape command profiles: ["scrape"] # Only run when explicitly requested # REST API Server (for Metabase/Grafana integration) api: build: . ports: - "8000:8000" volumes: - ./data:/app/data command: ["--api"] restart: unless-stopped healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/health"] interval: 30s timeout: 10s retries: 3 # Streamlit Dashboard dashboard: build: . ports: - "8501:8501" volumes: - ./data:/app/data command: ["--dashboard"] restart: unless-stopped depends_on: - api # Scheduled Scraper (optional - uncomment and configure) # scheduler: # build: . 
#     volumes:
#       - ./data:/app/data
#     command: ["--schedule", "python", "--every", "60"]
#     restart: unless-stopped

# Optional: Add Metabase for data visualization
# Uncomment to enable
#
#   metabase:
#     image: metabase/metabase:latest
#     ports:
#       - "3000:3000"
#     environment:
#       MB_DB_TYPE: h2
#     volumes:
#       - metabase-data:/metabase-data
#     depends_on:
#       - api

# ===========================================
# PRODUCTION DEPLOYMENT (AWS/VPS)
# ===========================================
# Uncomment the nginx service below for:
# - HTTPS/SSL termination
# - Basic authentication
# - Single port exposure (80/443)

#   nginx:
#     image: nginx:alpine
#     ports:
#       - "80:80"
#       - "443:443"
#     volumes:
#       - ./nginx.conf:/etc/nginx/nginx.conf:ro
#       - ./ssl:/etc/nginx/ssl:ro  # Add your SSL certs
#     depends_on:
#       - api
#       - dashboard

# volumes:
#   metabase-data:

# ===========================================
# QUICK DEPLOY TO AWS/VPS:
# ===========================================
# 1. SSH into your server
# 2. git clone YOUR_REPO_URL
# 3. docker-compose up -d
# 4. Open firewall: ports 8000, 8501
#
# Access:
#   http://YOUR_SERVER_IP:8501  (Dashboard)
#   http://YOUR_SERVER_IP:8000  (API)
# ===========================================


================================================
FILE: docs/BLOG.md
================================================
# Building the Ultimate Reddit Scraper: A Full-Featured, API-Free Data Collection Suite

![Reddit Scraper](https://img.shields.io/badge/Reddit-Scraper-FF4500?style=for-the-badge&logo=reddit&logoColor=white)
![Python](https://img.shields.io/badge/Python-3.10+-3776AB?style=for-the-badge&logo=python&logoColor=white)
![Docker](https://img.shields.io/badge/Docker-Ready-2496ED?style=for-the-badge&logo=docker&logoColor=white)

**December 2024** | By Sanjeev Kumar

---

## TL;DR

I built a **complete Reddit scraper suite** that requires **zero API keys**.
It comes with a beautiful Streamlit dashboard, REST API for integration with tools like Grafana and Metabase, plugin system for post-processing, scheduled scraping, notifications, and much more. Best of all—it's completely open source.

🔗 **GitHub**: [reddit-universal-scraper](https://github.com/ksanjeev284/reddit-universal-scraper)

---

## The Problem

If you've ever tried to scrape Reddit data for analysis, research, or just personal projects, you know the pain:

1. **Reddit's API is heavily rate-limited** (especially after the 2023 API changes)
2. **API keys require approval** and are increasingly restricted
3. **Existing scrapers are often single-purpose** - scrape posts OR comments, not both
4. **No easy way to visualize or analyze the data** after scraping
5. **Running scrapes manually is tedious** - you want automation

I decided to solve all of these problems at once.

---

## The Solution: Universal Reddit Scraper Suite

After weeks of development, I created a full-featured scraper that:

| Feature | What It Does |
|---------|--------------|
| 📊 **Full Scraping** | Posts, comments, images, videos, galleries—everything |
| 🚫 **No API Keys** | Uses Reddit's public JSON endpoints and mirrors |
| 📈 **Web Dashboard** | Beautiful 7-tab Streamlit UI for analysis |
| 🚀 **REST API** | Connect Metabase, Grafana, DuckDB, and more |
| 🔌 **Plugin System** | Extensible post-processing (sentiment analysis, deduplication, keywords) |
| 📅 **Scheduled Scraping** | Cron-style automation |
| 📧 **Notifications** | Discord & Telegram alerts when scrapes complete |
| 🐳 **Docker Ready** | One command to deploy anywhere |

---

## Architecture Deep Dive

### How It Works Without API Keys

The secret sauce is in the approach. Instead of using Reddit's official (and restricted) API, I leverage:

1. **Reddit's public JSON endpoints**: Every Reddit page has a `.json` suffix that returns structured data
2. 
**Multiple mirror fallbacks**: When one source is rate-limited, the scraper automatically rotates through alternatives like Redlib instances
3. **Smart rate limiting**: Built-in delays and cool-down periods to stay under the radar

```python
MIRRORS = [
    "https://old.reddit.com",
    "https://redlib.catsarch.com",
    "https://redlib.vsls.cz",
    "https://r.nf",
    "https://libreddit.northboot.xyz",
    "https://redlib.tux.pizza"
]
```

When one source fails, it automatically tries the next. No manual intervention needed.

### The Core Scraping Engine

The scraper operates in three modes:

**1. Full Mode** - The complete package

```bash
python main.py python --mode full --limit 100
```

This scrapes posts, downloads all media (images, videos, galleries), and fetches comments with their full thread hierarchy.

**2. History Mode** - Fast metadata-only

```bash
python main.py python --mode history --limit 500
```

Perfect for quickly building a dataset of post metadata without the overhead of media downloads.

**3. Monitor Mode** - Live watching

```bash
python main.py python --mode monitor
```

Continuously checks for new posts every 5 minutes. Ideal for tracking breaking news or trending discussions.
--- ## The Dashboard Experience One of the standout features is the **7-tab Streamlit dashboard** that makes data exploration a joy: ### 📊 Overview Tab At a glance, see: - Total posts and comments - Cumulative score across all posts - Media post breakdown - Posts-over-time chart - Top 10 posts by score ### 📈 Analytics Tab This is where it gets interesting: - **Sentiment Analysis**: Run VADER-based sentiment scoring on your entire dataset - **Keyword Cloud**: See the most frequently used terms - **Best Posting Times**: Data-driven insights on when posts get the most engagement ### 🔍 Search Tab Full-text search across all scraped data with filters for: - Minimum score - Post type (text, image, video, gallery, link) - Author - Custom sorting ### 💬 Comments Analysis - View top-scoring comments - See who the most active commenters are - Track comment patterns over time ### ⚙️ Scraper Controls Start new scrapes right from the dashboard! Configure: - Target subreddit/user - Post limits - Mode (full/history) - Media and comment toggles ### 📋 Job History Full observability into every scrape job: - Status tracking (running, completed, failed) - Duration metrics - Post/comment/media counts - Error logging ### 🔌 Integrations Pre-configured instructions for connecting: - Metabase - Grafana - DreamFactory - DuckDB --- ## The Plugin Architecture I designed a plugin system to allow extensible post-processing. The architecture is simple but powerful: ```python class Plugin: """Base class for all plugins.""" name = "base" description = "Base plugin" enabled = True def process_posts(self, posts): return posts def process_comments(self, comments): return comments ``` ### Built-in Plugins **1. 
Sentiment Tagger** Analyzes the emotional tone of every post and comment using VADER sentiment analysis: ```python class SentimentTagger(Plugin): name = "sentiment_tagger" description = "Adds sentiment scores and labels to posts" def process_posts(self, posts): for post in posts: text = f"{post.get('title', '')} {post.get('selftext', '')}" score, label = analyze_sentiment(text) post['sentiment_score'] = score post['sentiment_label'] = label return posts ``` **2. Deduplicator** Removes duplicate posts that may appear across multiple scraping sessions. **3. Keyword Extractor** Pulls out the most significant terms from your scraped content for trend analysis. ### Creating Your Own Plugin Drop a new Python file in the `plugins/` directory: ```python from plugins import Plugin class MyCustomPlugin(Plugin): name = "my_plugin" description = "Does something cool" enabled = True def process_posts(self, posts): # Your logic here return posts ``` Enable plugins during scraping: ```bash python main.py python --mode full --plugins ``` --- ## REST API for External Integrations The REST API opens up the scraper to a whole ecosystem of tools: ```bash python main.py --api # API at http://localhost:8000 # Docs at http://localhost:8000/docs ``` ### Key Endpoints | Endpoint | Description | |----------|-------------| | `GET /posts` | List posts with filters (subreddit, limit, offset) | | `GET /comments` | List comments | | `GET /subreddits` | All scraped subreddits | | `GET /jobs` | Job history | | `GET /query?sql=...` | Raw SQL queries for power users | | `GET /grafana/query` | Grafana-compatible time-series data | ### Real-World Integration: Grafana Dashboard 1. Install the "JSON API" or "Infinity" plugin in Grafana 2. Add datasource pointing to `http://localhost:8000` 3. 
Use the `/grafana/query` endpoint for time-series panels ```sql SELECT date(created_utc) as time, COUNT(*) as posts FROM posts GROUP BY date(created_utc) ``` Now you have a real-time dashboard tracking Reddit activity! --- ## Scheduled Scraping & Notifications ### Automation Made Easy Set up recurring scrapes with cron-style scheduling: ```bash # Scrape every 60 minutes python main.py --schedule delhi --every 60 # With custom options python main.py --schedule delhi --every 30 --mode full --limit 50 ``` ### Get Notified Configure Discord or Telegram alerts when scrapes complete: ```bash # Environment variables export DISCORD_WEBHOOK_URL="https://discord.com/api/webhooks/..." export TELEGRAM_BOT_TOKEN="123456:ABC..." export TELEGRAM_CHAT_ID="987654321" ``` Now you get notified with scrape summaries directly in your preferred platform. --- ## Dry Run Mode: Test Before You Commit One of my favorite features is **dry run mode**. It simulates the entire scrape without saving any data: ```bash python main.py python --mode full --limit 50 --dry-run ``` Output: ``` 🧪 DRY RUN MODE - No data will be saved 🧪 DRY RUN COMPLETE! 📊 Would scrape: 50 posts 💬 Would scrape: 245 comments ``` Perfect for: - Testing your scrape configuration - Estimating data volume before committing - Debugging without cluttering your dataset --- ## Docker Deployment ### Quick Start ```bash # Build docker build -t reddit-scraper .
# Run a scrape docker run -v ./data:/app/data reddit-scraper python --limit 100 # Run with plugins docker run -v ./data:/app/data reddit-scraper python --plugins ``` ### Full Stack with Docker Compose ```bash docker-compose up -d ``` This spins up: - Dashboard at `http://localhost:8501` - REST API at `http://localhost:8000` ### Deploy to Any VPS ```bash ssh user@your-server-ip git clone https://github.com/ksanjeev284/reddit-universal-scraper.git cd reddit-universal-scraper docker-compose up -d ``` Open the firewall: ```bash sudo ufw allow 8000 sudo ufw allow 8501 ``` You now have a production-ready Reddit scraping platform! --- ## Data Export Options ### CSV (Default) All scraped data is saved as CSV files: - `data/r_<subreddit>/posts.csv` - `data/r_<subreddit>/comments.csv` ### Parquet (Analytics-Optimized) Export to columnar format for analytics tools: ```bash python main.py --export-parquet python ``` Query directly with DuckDB: ```python import duckdb duckdb.query("SELECT * FROM 'data/parquet/*.parquet'").df() ``` ### Database Maintenance ```bash # Backup python main.py --backup # Optimize/vacuum python main.py --vacuum # View job history python main.py --job-history ``` --- ## Data Schema ### Posts Table | Column | Description | |--------|-------------| | `id` | Reddit post ID | | `title` | Post title | | `author` | Username | | `score` | Net upvotes | | `num_comments` | Comment count | | `post_type` | text/image/video/gallery/link | | `selftext` | Post body (for text posts) | | `created_utc` | Timestamp | | `permalink` | Reddit URL | | `is_nsfw` | NSFW flag | | `flair` | Post flair | | `sentiment_score` | -1.0 to 1.0 (with plugins) | ### Comments Table | Column | Description | |--------|-------------| | `comment_id` | Comment ID | | `post_permalink` | Parent post URL | | `author` | Username | | `body` | Comment text | | `score` | Upvotes | | `depth` | Nesting level | | `is_submitter` | Whether author is OP | --- ## Use Cases ### 1.
Academic Research - Analyze subreddit community dynamics - Track sentiment over time during events - Study user engagement patterns ### 2. Market Research - Monitor brand mentions - Track product feedback - Identify emerging trends ### 3. Content Creation - Find popular topics in your niche - Analyze what makes posts go viral - Discover optimal posting times ### 4. Data Journalism - Archive discussions around breaking news - Analyze public sentiment during events - Track narrative evolution ### 5. Personal Projects - Build a dataset for ML training - Create Reddit-based recommendation systems - Archive communities you care about --- ## Performance Considerations ### Respect Reddit's Servers The scraper includes built-in delays: - **3 second cooldown** between API requests - **30 second wait** if all mirrors fail - **Automatic mirror rotation** to distribute load ### Optimize Your Scrapes - Use `--mode history` for faster metadata-only scrapes - Use `--no-media` if you don't need images/videos - Use `--no-comments` for post-only data ### Handle Large Datasets - Parquet export for analytics queries - SQLite database for structured storage - Automatic deduplication to avoid bloat --- ## What's Next? Roadmap I'm actively developing new features: - [ ] **Async scraping** for even faster data collection - [ ] **Multi-subreddit monitoring** in a single command - [ ] **Email notifications** in addition to Discord/Telegram - [ ] **Cloud deployment templates** (AWS, GCP, Azure) - [ ] **Web-based scraper configuration** (no CLI needed) --- ## Getting Started ### Prerequisites - Python 3.10+ - pip ### Installation ```bash # Clone the repo git clone https://github.com/ksanjeev284/reddit-universal-scraper.git cd reddit-universal-scraper # Install dependencies pip install -r requirements.txt # Your first scrape python main.py python --mode full --limit 50 # Launch the dashboard python main.py --dashboard ``` That's it! You're now scraping Reddit like a pro. 
--- ## Contributing This is an open-source project and contributions are welcome! Whether it's: - Bug fixes - New plugins - Documentation improvements - Feature suggestions Open an issue or submit a PR on [GitHub](https://github.com/ksanjeev284/reddit-universal-scraper). --- ## Conclusion The Universal Reddit Scraper Suite represents months of work solving a problem that many data enthusiasts face. By combining a robust scraping engine with analytics capabilities, a beautiful dashboard, and extensive integration options—all without requiring API keys—I hope this tool empowers you to unlock insights from Reddit's vast treasure trove of community discussions. **Happy scraping!** 🤖 --- *If you found this useful, consider giving the project a ⭐ on [GitHub](https://github.com/ksanjeev284/reddit-universal-scraper)!* --- ## Connect - **GitHub**: [@ksanjeev284](https://github.com/ksanjeev284) - **Project**: [reddit-universal-scraper](https://github.com/ksanjeev284/reddit-universal-scraper) --- *Tags: Reddit, Web Scraping, Python, Data Analysis, Streamlit, REST API, Docker, Open Source* ================================================ FILE: docs/INTEGRATION.md ================================================ # External Tools Integration Guide Connect Metabase, Grafana, DreamFactory, or any REST client to your Reddit scraper data. 
--- ## Quick Start ```powershell # Install dependencies pip install fastapi uvicorn # Start the API server python main.py --api ``` The API will be available at `http://localhost:8000` --- ## API Endpoints | Endpoint | Description | |----------|-------------| | `GET /posts` | List posts with filters (q, subreddit, author, min_score) | | `GET /posts/{id}` | Get single post | | `GET /comments` | List comments with filters | | `GET /subreddits` | List all scraped subreddits | | `GET /subreddits/{name}/stats` | Get subreddit statistics | | `GET /jobs` | View job history | | `GET /jobs/stats` | Job statistics | | `GET /query?sql=...` | Raw SQL SELECT queries | | `GET /docs` | Interactive API documentation | --- ## Metabase Setup 1. Start API: `python main.py --api` 2. In Metabase, add a new "HTTP" question 3. Use `http://localhost:8000/posts?limit=1000` 4. Or use `/query?sql=SELECT * FROM posts` for custom queries --- ## Grafana Setup 1. Install "JSON API" or "Infinity" datasource plugin 2. Add datasource with URL: `http://localhost:8000` 3. Use `/grafana/query` for time-series data 4. Or use `/query?sql=...` for custom queries Example Grafana query: ```sql SELECT date(created_utc) as time, COUNT(*) as posts FROM posts GROUP BY date(created_utc) ``` --- ## DreamFactory / REST Clients The API includes full CORS support. Connect any tool that speaks REST: ```bash # Get posts (quote the URL so the shell does not treat & as "run in background") curl "http://localhost:8000/posts?subreddit=python&limit=10" # Custom SQL query (-G + --data-urlencode encodes the spaces in the SQL) curl -G "http://localhost:8000/query" --data-urlencode "sql=SELECT title, score FROM posts ORDER BY score DESC LIMIT 5" ``` --- ## Docker Compose (All-in-One) ```yaml version: '3' services: scraper-api: build: .
"""
Cloud Upload Module - S3 and Google Drive integration
"""
import os
import json
from pathlib import Path
from datetime import datetime

# Try importing boto3 for S3
try:
    import boto3
    from botocore.exceptions import ClientError
    HAS_BOTO3 = True
except ImportError:
    HAS_BOTO3 = False

# Try importing Google Drive API
try:
    from google.oauth2.credentials import Credentials
    from googleapiclient.discovery import build
    from googleapiclient.http import MediaFileUpload
    HAS_GDRIVE = True
except ImportError:
    HAS_GDRIVE = False


class S3Uploader:
    """Upload scraped data to AWS S3."""

    def __init__(self, bucket_name, aws_access_key=None, aws_secret_key=None, region='us-east-1'):
        """
        Initialize S3 uploader.

        Args:
            bucket_name: S3 bucket name
            aws_access_key: Optional, falls back to AWS_ACCESS_KEY_ID env var
            aws_secret_key: Optional, falls back to AWS_SECRET_ACCESS_KEY env var
            region: AWS region

        Raises:
            ImportError: If boto3 is not installed.
        """
        if not HAS_BOTO3:
            raise ImportError("boto3 not installed. Run: pip install boto3")

        self.bucket_name = bucket_name
        self.region = region

        # Use provided credentials or fall back to env vars
        self.s3 = boto3.client(
            's3',
            aws_access_key_id=aws_access_key or os.getenv('AWS_ACCESS_KEY_ID'),
            aws_secret_access_key=aws_secret_key or os.getenv('AWS_SECRET_ACCESS_KEY'),
            region_name=region
        )

    def upload_file(self, local_path, s3_key=None):
        """
        Upload a single file to S3.

        Args:
            local_path: Local file path
            s3_key: S3 object key (default: same as filename)

        Returns:
            S3 URL or None on failure (missing file or S3 client error)
        """
        local_path = Path(local_path)
        if not local_path.exists():
            print(f"❌ File not found: {local_path}")
            return None

        s3_key = s3_key or local_path.name

        try:
            self.s3.upload_file(str(local_path), self.bucket_name, s3_key)
            url = f"https://{self.bucket_name}.s3.{self.region}.amazonaws.com/{s3_key}"
            print(f"✅ Uploaded: {s3_key}")
            return url
        except ClientError as e:
            print(f"❌ S3 upload failed: {e}")
            return None

    def upload_directory(self, local_dir, s3_prefix=""):
        """
        Upload entire directory (recursively) to S3.

        Args:
            local_dir: Local directory path
            s3_prefix: Prefix for S3 keys

        Returns:
            Dictionary mapping each uploaded file's path (relative to
            local_dir) to its S3 URL; files that failed are omitted.
        """
        local_dir = Path(local_dir)
        if not local_dir.exists():
            print(f"❌ Directory not found: {local_dir}")
            return {}

        uploaded = {}
        for file_path in local_dir.rglob('*'):
            if file_path.is_file():
                relative_path = file_path.relative_to(local_dir)
                s3_key = f"{s3_prefix}/{relative_path}" if s3_prefix else str(relative_path)
                s3_key = s3_key.replace('\\', '/')  # Windows path fix

                url = self.upload_file(file_path, s3_key)
                if url:
                    uploaded[str(relative_path)] = url

        print(f"\n📤 Uploaded {len(uploaded)} files to S3")
        return uploaded

    def upload_subreddit_data(self, subreddit, prefix="r"):
        """
        Upload all data for a subreddit.

        Args:
            subreddit: Subreddit name
            prefix: "r" for subreddit, "u" for user

        Returns:
            Upload results (empty dict when data/<prefix>_<subreddit> is missing)
        """
        # Default is "r" so that it matches the documented meaning of the
        # argument and GDriveUploader.upload_subreddit_data.
        data_dir = Path(f"data/{prefix}_{subreddit}")
        if not data_dir.exists():
            print(f"❌ Data not found for {prefix}/{subreddit}")
            return {}

        # Timestamped key prefix keeps successive uploads from overwriting each other
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        s3_prefix = f"reddit/{prefix}_{subreddit}/{timestamp}"

        return self.upload_directory(data_dir, s3_prefix)

    def list_uploads(self, prefix="reddit/"):
        """List uploaded objects under *prefix*; prints at most the first 50.

        Returns:
            The list of object dicts from the S3 response, or [] on client error.
        """
        try:
            response = self.s3.list_objects_v2(
                Bucket=self.bucket_name,
                Prefix=prefix
            )

            objects = response.get('Contents', [])
            print(f"\n📁 S3 Contents ({self.bucket_name}/{prefix}):")
            for obj in objects[:50]:  # Limit to 50
                size_kb = obj['Size'] / 1024
                print(f"   {obj['Key']} ({size_kb:.1f} KB)")

            if len(objects) > 50:
                print(f"   ... and {len(objects) - 50} more")

            return objects
        except ClientError as e:
            print(f"❌ S3 list failed: {e}")
            return []


class GDriveUploader:
    """Upload scraped data to Google Drive."""

    def __init__(self, credentials_file='credentials.json', token_file='token.json'):
        """
        Initialize Google Drive uploader and authenticate immediately.

        Args:
            credentials_file: Path to OAuth credentials JSON
            token_file: Path to token JSON

        Raises:
            ImportError: If the Google API client libraries are not installed.
        """
        if not HAS_GDRIVE:
            raise ImportError("Google API client not installed. Run: pip install google-api-python-client google-auth-oauthlib")

        self.credentials_file = credentials_file
        self.token_file = token_file
        self.service = None
        self._authenticate()

    def _authenticate(self):
        """Authenticate with Google Drive API and build the service client.

        Reuses the stored token when present, refreshes it when expired,
        and otherwise runs the local OAuth flow; the resulting token is
        written back to ``token_file``.
        """
        creds = None

        if os.path.exists(self.token_file):
            creds = Credentials.from_authorized_user_file(self.token_file)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                # `Request` was previously referenced without being imported,
                # which raised NameError on every token refresh.
                from google.auth.transport.requests import Request
                creds.refresh(Request())
            else:
                from google_auth_oauthlib.flow import InstalledAppFlow
                SCOPES = ['https://www.googleapis.com/auth/drive.file']
                flow = InstalledAppFlow.from_client_secrets_file(self.credentials_file, SCOPES)
                creds = flow.run_local_server(port=0)

            with open(self.token_file, 'w') as token:
                token.write(creds.to_json())

        self.service = build('drive', 'v3', credentials=creds)
        print("✅ Google Drive authenticated")

    def create_folder(self, name, parent_id=None):
        """Create a folder in Google Drive and return its file ID."""
        metadata = {
            'name': name,
            'mimeType': 'application/vnd.google-apps.folder'
        }
        if parent_id:
            metadata['parents'] = [parent_id]

        folder = self.service.files().create(body=metadata, fields='id').execute()
        return folder.get('id')

    def upload_file(self, local_path, folder_id=None):
        """Upload a file to Google Drive.

        Returns:
            The file's webViewLink, or None if the file is missing or the
            upload fails.
        """
        local_path = Path(local_path)
        if not local_path.exists():
            print(f"❌ File not found: {local_path}")
            return None

        metadata = {'name': local_path.name}
        if folder_id:
            metadata['parents'] = [folder_id]

        media = MediaFileUpload(str(local_path), resumable=True)

        try:
            file = self.service.files().create(
                body=metadata,
                media_body=media,
                fields='id,webViewLink'
            ).execute()

            print(f"✅ Uploaded: {local_path.name}")
            return file.get('webViewLink')
        except Exception as e:
            print(f"❌ Upload failed: {e}")
            return None

    def upload_subreddit_data(self, subreddit, prefix="r"):
        """Upload all data for a subreddit into a new dated Drive folder.

        Args:
            subreddit: Subreddit name
            prefix: "r" for subreddit, "u" for user

        Returns:
            Dictionary mapping each uploaded file's path (relative to the
            data directory) to its Drive link; empty if no data exists.
        """
        data_dir = Path(f"data/{prefix}_{subreddit}")
        if not data_dir.exists():
            print(f"❌ Data not found for {prefix}/{subreddit}")
            return {}

        # Create folder structure
        root_folder = self.create_folder(f"reddit_{prefix}_{subreddit}_{datetime.now().strftime('%Y%m%d')}")

        uploaded = {}
        for file_path in data_dir.rglob('*'):
            if file_path.is_file():
                url = self.upload_file(file_path, root_folder)
                if url:
                    # Key by relative path (as S3Uploader does): keying by bare
                    # filename let same-named files in different subfolders
                    # overwrite each other in the result.
                    uploaded[str(file_path.relative_to(data_dir))] = url

        print(f"\n📤 Uploaded {len(uploaded)} files to Google Drive")
        return uploaded


def upload_to_s3(subreddit, bucket_name, prefix="r"):
    """
    Convenience function to upload subreddit data to S3.

    Args:
        subreddit: Subreddit name
        bucket_name: S3 bucket name
        prefix: "r" or "u"

    Returns:
        Upload results
    """
    uploader = S3Uploader(bucket_name)
    return uploader.upload_subreddit_data(subreddit, prefix)


def upload_to_gdrive(subreddit, prefix="r"):
    """
    Convenience function to upload subreddit data to Google Drive.

    Args:
        subreddit: Subreddit name
        prefix: "r" or "u"

    Returns:
        Upload results
    """
    uploader = GDriveUploader()
    return uploader.upload_subreddit_data(subreddit, prefix)


# CLI for testing
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Cloud Upload")
    parser.add_argument("subreddit", help="Subreddit to upload")
    parser.add_argument("--s3-bucket", help="S3 bucket name")
    parser.add_argument("--gdrive", action="store_true", help="Upload to Google Drive")
    parser.add_argument("--user", action="store_true", help="Is a user profile")

    args = parser.parse_args()
    prefix = "u" if args.user else "r"

    if args.s3_bucket:
        upload_to_s3(args.subreddit, args.s3_bucket, prefix)
    elif args.gdrive:
        upload_to_gdrive(args.subreddit, prefix)
    else:
        print("Please specify --s3-bucket or --gdrive")
database connection.""" DATA_DIR.mkdir(exist_ok=True) conn = sqlite3.connect(DB_PATH) conn.row_factory = sqlite3.Row return conn def init_database(): """Initialize database tables.""" conn = get_connection() cursor = conn.cursor() # Posts table cursor.execute(""" CREATE TABLE IF NOT EXISTS posts ( id TEXT PRIMARY KEY, subreddit TEXT, title TEXT, author TEXT, created_utc TEXT, permalink TEXT UNIQUE, url TEXT, score INTEGER DEFAULT 0, upvote_ratio REAL DEFAULT 0, num_comments INTEGER DEFAULT 0, num_crossposts INTEGER DEFAULT 0, selftext TEXT, post_type TEXT, is_nsfw BOOLEAN DEFAULT 0, is_spoiler BOOLEAN DEFAULT 0, flair TEXT, total_awards INTEGER DEFAULT 0, has_media BOOLEAN DEFAULT 0, media_downloaded BOOLEAN DEFAULT 0, source TEXT, scraped_at TEXT DEFAULT CURRENT_TIMESTAMP, sentiment_score REAL, sentiment_label TEXT ) """) # Comments table cursor.execute(""" CREATE TABLE IF NOT EXISTS comments ( id INTEGER PRIMARY KEY AUTOINCREMENT, comment_id TEXT UNIQUE, post_id TEXT, post_permalink TEXT, parent_id TEXT, author TEXT, body TEXT, score INTEGER DEFAULT 0, created_utc TEXT, depth INTEGER DEFAULT 0, is_submitter BOOLEAN DEFAULT 0, scraped_at TEXT DEFAULT CURRENT_TIMESTAMP, sentiment_score REAL, sentiment_label TEXT, FOREIGN KEY (post_id) REFERENCES posts(id) ) """) # Subreddits table (for tracking) cursor.execute(""" CREATE TABLE IF NOT EXISTS subreddits ( name TEXT PRIMARY KEY, last_scraped TEXT, total_posts INTEGER DEFAULT 0, total_comments INTEGER DEFAULT 0, total_media INTEGER DEFAULT 0 ) """) # Scheduled jobs table cursor.execute(""" CREATE TABLE IF NOT EXISTS scheduled_jobs ( id INTEGER PRIMARY KEY AUTOINCREMENT, target TEXT, is_user BOOLEAN DEFAULT 0, mode TEXT DEFAULT 'full', limit_posts INTEGER DEFAULT 100, cron_expression TEXT, last_run TEXT, next_run TEXT, enabled BOOLEAN DEFAULT 1, created_at TEXT DEFAULT CURRENT_TIMESTAMP ) """) # Alerts table cursor.execute(""" CREATE TABLE IF NOT EXISTS alerts ( id INTEGER PRIMARY KEY AUTOINCREMENT, keyword TEXT, 
# Column order shared by every INSERT into `posts`. Keeping it in one place
# stops the SQL column list and the parameter tuple from drifting apart.
_POST_INSERT_COLUMNS = (
    "id", "subreddit", "title", "author", "created_utc", "permalink", "url",
    "score", "upvote_ratio", "num_comments", "num_crossposts", "selftext",
    "post_type", "is_nsfw", "is_spoiler", "flair", "total_awards",
    "has_media", "media_downloaded", "source",
)


def _post_insert_sql(verb):
    """Build the parameterized INSERT for `posts` using *verb* (e.g. 'INSERT OR IGNORE')."""
    columns = ", ".join(_POST_INSERT_COLUMNS)
    placeholders = ", ".join("?" for _ in _POST_INSERT_COLUMNS)
    return f"{verb} INTO posts ({columns}) VALUES ({placeholders})"


def _post_insert_params(post, subreddit):
    """Return the parameter tuple for one post, ordered like _POST_INSERT_COLUMNS."""
    return (
        post.get('id'),
        subreddit,
        post.get('title'),
        post.get('author'),
        post.get('created_utc'),
        post.get('permalink'),
        post.get('url'),
        post.get('score', 0),
        post.get('upvote_ratio', 0),
        post.get('num_comments', 0),
        post.get('num_crossposts', 0),
        post.get('selftext', ''),
        post.get('post_type'),
        post.get('is_nsfw', False),
        post.get('is_spoiler', False),
        post.get('flair', ''),
        post.get('total_awards', 0),
        post.get('has_media', False),
        post.get('media_downloaded', False),
        post.get('source', ''),
    )


def save_post(post_data, subreddit):
    """Save a single post to database, replacing any existing row.

    Args:
        post_data: Dict of post fields; missing keys fall back to defaults.
        subreddit: Value stored in the `subreddit` column.

    Returns:
        True on success, False if the insert raised (the error is printed).
    """
    conn = get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(
            _post_insert_sql("INSERT OR REPLACE"),
            _post_insert_params(post_data, subreddit),
        )
        conn.commit()
        return True
    except Exception as e:
        print(f"DB Error: {e}")
        return False
    finally:
        conn.close()


def save_posts_batch(posts, subreddit):
    """Save multiple posts in one transaction, ignoring rows that already exist.

    Args:
        posts: Iterable of post dicts.
        subreddit: Value stored in the `subreddit` column.

    Returns:
        Number of rows actually inserted (ignored duplicates and failed
        rows are not counted).
    """
    sql = _post_insert_sql("INSERT OR IGNORE")
    saved = 0
    conn = get_connection()
    try:
        cursor = conn.cursor()
        for post in posts:
            try:
                cursor.execute(sql, _post_insert_params(post, subreddit))
            except Exception as e:
                # Best-effort batch: one bad row must not abort the rest,
                # but report it instead of swallowing it silently.
                print(f"DB Error (post skipped): {e}")
                continue
            if cursor.rowcount > 0:
                saved += 1
        conn.commit()
    finally:
        # Always release the connection, even if iteration or commit fails
        conn.close()
    return saved


def save_comments_batch(comments, post_id):
    """Save multiple comments in one transaction, ignoring existing comment_ids.

    Args:
        comments: Iterable of comment dicts.
        post_id: Value stored in the `post_id` column for every comment.

    Returns:
        Number of rows actually inserted.
    """
    sql = """
        INSERT OR IGNORE INTO comments
        (comment_id, post_id, post_permalink, parent_id, author, body,
         score, created_utc, depth, is_submitter)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """
    saved = 0
    conn = get_connection()
    try:
        cursor = conn.cursor()
        for comment in comments:
            try:
                cursor.execute(sql, (
                    comment.get('comment_id'),
                    post_id,
                    comment.get('post_permalink'),
                    comment.get('parent_id'),
                    comment.get('author'),
                    comment.get('body'),
                    comment.get('score', 0),
                    comment.get('created_utc'),
                    comment.get('depth', 0),
                    comment.get('is_submitter', False),
                ))
            except Exception as e:
                # Best-effort batch: skip the bad row but make it visible
                print(f"DB Error (comment skipped): {e}")
                continue
            if cursor.rowcount > 0:
                saved += 1
        conn.commit()
    finally:
        conn.close()
    return saved


def search_posts(query=None, subreddit=None, author=None, min_score=None,
                 start_date=None, end_date=None, post_type=None, limit=100):
    """Search posts with filters.

    Every filter is optional and they are ANDed together. *query* is a
    substring match against title and selftext.

    Returns:
        List of post rows as dicts, newest `created_utc` first, at most
        *limit* rows.
    """
    sql = "SELECT * FROM posts WHERE 1=1"
    params = []

    if query:
        sql += " AND (title LIKE ? OR selftext LIKE ?)"
        params.extend([f"%{query}%", f"%{query}%"])

    if subreddit:
        sql += " AND subreddit = ?"
        params.append(subreddit)

    if author:
        sql += " AND author = ?"
        params.append(author)

    # `is not None` so an explicit min_score=0 still filters out
    # negative-score posts (a truthiness test dropped it).
    if min_score is not None:
        sql += " AND score >= ?"
        params.append(min_score)

    if start_date:
        sql += " AND created_utc >= ?"
        params.append(start_date)

    if end_date:
        sql += " AND created_utc <= ?"
        params.append(end_date)

    if post_type:
        sql += " AND post_type = ?"
        params.append(post_type)

    sql += " ORDER BY created_utc DESC LIMIT ?"
    params.append(limit)

    conn = get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(sql, params)
        return [dict(row) for row in cursor.fetchall()]
    finally:
        conn.close()


def search_comments(query=None, post_id=None, author=None, min_score=None, limit=100):
    """Search comments with filters.

    Every filter is optional and they are ANDed together. *query* is a
    substring match against the comment body.

    Returns:
        List of comment rows as dicts, highest score first, at most
        *limit* rows.
    """
    sql = "SELECT * FROM comments WHERE 1=1"
    params = []

    if query:
        sql += " AND body LIKE ?"
        params.append(f"%{query}%")

    if post_id:
        sql += " AND post_id = ?"
        params.append(post_id)

    if author:
        sql += " AND author = ?"
        params.append(author)

    # `is not None` so min_score=0 is honoured (see search_posts)
    if min_score is not None:
        sql += " AND score >= ?"
        params.append(min_score)

    sql += " ORDER BY score DESC LIMIT ?"
    params.append(limit)

    conn = get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(sql, params)
        return [dict(row) for row in cursor.fetchall()]
    finally:
        conn.close()
def complete_job_record(job_id, status, posts=0, comments=0, media=0, errors=None):
    """
    Complete a job record with results.

    Args:
        job_id: Job ID from start_job_record
        status: 'completed' or 'failed'
        posts: Number of posts scraped
        comments: Number of comments scraped
        media: Number of media files downloaded
        errors: Error message if failed
    """
    # One timestamp for both completed_at and the duration, so the stored
    # values always agree with each other.
    finished = datetime.now()
    completed_at = finished.isoformat()

    duration = 0
    error_count = 0

    conn = get_connection()
    try:
        cursor = conn.cursor()

        # Calculate duration from the stored start time
        cursor.execute("SELECT started_at FROM job_history WHERE job_id = ?", (job_id,))
        row = cursor.fetchone()

        if row:
            # Guard against a NULL started_at, which fromisoformat cannot parse
            if row['started_at']:
                started = datetime.fromisoformat(row['started_at'])
                duration = (finished - started).total_seconds()
            if errors:
                error_count = 1

        cursor.execute("""
            UPDATE job_history
            SET status = ?, completed_at = ?, duration_seconds = ?,
                posts_scraped = ?, comments_scraped = ?, media_downloaded = ?,
                errors = ?, error_count = ?
            WHERE job_id = ?
        """, (status, completed_at, duration, posts, comments, media, errors, error_count, job_id))

        conn.commit()
    finally:
        # Release the connection even if the lookup or update raises
        conn.close()

    if status == 'completed':
        print(f"✅ Job {job_id} completed: {posts} posts, {comments} comments in {duration:.1f}s")
    else:
        print(f"❌ Job {job_id} failed: {errors}")
params.append(limit) cursor.execute(sql, params) results = [dict(row) for row in cursor.fetchall()] conn.close() return results def get_job_stats(): """Get aggregated job statistics.""" conn = get_connection() cursor = conn.cursor() stats = {} # Overall counts cursor.execute(""" SELECT COUNT(*) as total_jobs, SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed, SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed, SUM(CASE WHEN status = 'running' THEN 1 ELSE 0 END) as running, AVG(duration_seconds) as avg_duration, SUM(posts_scraped) as total_posts, SUM(comments_scraped) as total_comments FROM job_history """) row = cursor.fetchone() if row: stats.update(dict(row)) # Recent jobs cursor.execute(""" SELECT target, status, duration_seconds, posts_scraped, started_at FROM job_history ORDER BY started_at DESC LIMIT 10 """) stats['recent_jobs'] = [dict(row) for row in cursor.fetchall()] conn.close() return stats def print_job_history(limit=20): """Pretty print job history.""" jobs = get_job_history(limit) print("\n📋 Job History") print("-" * 80) print(f"{'ID':<10} {'Target':<15} {'Status':<10} {'Posts':<8} {'Duration':<10} {'Started':<20}") print("-" * 80) for job in jobs: status_icon = "✅" if job['status'] == 'completed' else "❌" if job['status'] == 'failed' else "🔄" duration = f"{job['duration_seconds']:.1f}s" if job['duration_seconds'] else "-" started = job['started_at'][:19] if job['started_at'] else "-" dry = " (dry)" if job['dry_run'] else "" print(f"{status_icon} {job['job_id']:<8} {job['target']:<15} {job['status']:<10} " f"{job['posts_scraped']:<8} {duration:<10} {started}{dry}") print("-" * 80) stats = get_job_stats() success_rate = (stats['completed'] / stats['total_jobs'] * 100) if stats['total_jobs'] else 0 print(f"\n📊 Stats: {stats['total_jobs']} jobs | {success_rate:.0f}% success | " f"{stats['total_posts'] or 0} posts total") # --- SQLITE MAINTENANCE FUNCTIONS --- def enable_auto_vacuum(): """Enable incremental auto-vacuum on 
def backup_database(backup_path=None):
    """
    Create a backup of the SQLite database.

    The copy is made with SQLite's online backup API rather than a raw file
    copy, so the snapshot is consistent even if another connection is
    writing to the database while the backup runs.

    Args:
        backup_path: Optional custom backup path

    Returns:
        Path to the backup file

    Raises:
        FileNotFoundError: If the source database file does not exist
    """
    # Local imports: only this maintenance command needs them.
    import sqlite3
    from contextlib import closing

    if backup_path is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        backup_path = DATA_DIR / "backups" / f"reddit_scraper_{timestamp}.db"

    destination = Path(backup_path)
    # parents=True: the data directory itself may not exist yet, and a
    # custom path may point into a directory that has not been created.
    destination.parent.mkdir(parents=True, exist_ok=True)

    # sqlite3.connect() would silently create an empty database here, so
    # fail loudly instead of "backing up" nothing.
    if not DB_PATH.exists():
        raise FileNotFoundError(f"Database not found: {DB_PATH}")

    with closing(sqlite3.connect(str(DB_PATH))) as source, \
            closing(sqlite3.connect(str(destination))) as target:
        source.backup(target)

    # Get file size
    size_mb = destination.stat().st_size / (1024 * 1024)
    print(f"✅ Backup created: {backup_path} ({size_mb:.2f} MB)")

    return str(backup_path)
Args: subreddit: Subreddit name output_dir: Output directory (default: data/parquet) prefix: "r" for subreddit, "u" for user Returns: Dictionary with paths to exported files """ try: import pyarrow except ImportError: raise ImportError("pyarrow required for Parquet export. Run: pip install pyarrow") # Setup paths data_dir = Path(f"data/{prefix}_{subreddit}") output_path = Path(output_dir) if output_dir else Path("data/parquet") output_path.mkdir(parents=True, exist_ok=True) if not data_dir.exists(): print(f"❌ No data found for {prefix}/{subreddit}") return {} exported = {} timestamp = datetime.now().strftime("%Y%m%d") # Export posts posts_csv = data_dir / "posts.csv" if posts_csv.exists(): print(f"📦 Converting posts to Parquet...") df = pd.read_csv(posts_csv) # Convert datetime columns if 'created_utc' in df.columns: df['created_utc'] = pd.to_datetime(df['created_utc'], errors='coerce') # Optimize dtypes for col in ['score', 'num_comments', 'num_crossposts', 'total_awards']: if col in df.columns: df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype('int32') for col in ['is_nsfw', 'is_spoiler', 'has_media', 'media_downloaded']: if col in df.columns: df[col] = df[col].astype(bool) output_file = output_path / f"{subreddit}_posts_{timestamp}.parquet" df.to_parquet(output_file, engine="pyarrow", compression="snappy") size_mb = output_file.stat().st_size / (1024 * 1024) print(f" ✅ {output_file.name} ({len(df)} rows, {size_mb:.2f} MB)") exported['posts'] = str(output_file) # Export comments comments_csv = data_dir / "comments.csv" if comments_csv.exists(): print(f"📦 Converting comments to Parquet...") df = pd.read_csv(comments_csv) if 'created_utc' in df.columns: df['created_utc'] = pd.to_datetime(df['created_utc'], errors='coerce') if 'score' in df.columns: df['score'] = pd.to_numeric(df['score'], errors='coerce').fillna(0).astype('int32') output_file = output_path / f"{subreddit}_comments_{timestamp}.parquet" df.to_parquet(output_file, engine="pyarrow", 
def list_parquet_files(directory="data/parquet"):
    """
    List all Parquet files in directory.

    Prints a table with name, size and modification time of every
    ``*.parquet`` file directly inside the directory.

    Args:
        directory: Directory to scan (not recursive)

    Returns:
        List of file paths as strings, sorted by path; empty list if the
        directory does not exist
    """
    parquet_dir = Path(directory)

    if not parquet_dir.exists():
        print(f"📁 No Parquet directory found at {directory}")
        return []

    # glob() yields entries in arbitrary filesystem order; sort so both the
    # printed table and the returned list are stable across runs.
    files = sorted(parquet_dir.glob("*.parquet"))

    print(f"\n📁 Parquet Files in {directory}:")
    print("-" * 60)

    for f in files:
        file_stat = f.stat()  # one stat() call serves both columns
        size_mb = file_stat.st_size / (1024 * 1024)
        mtime = datetime.fromtimestamp(file_stat.st_mtime).strftime("%Y-%m-%d %H:%M")
        print(f" {f.name:<40} {size_mb:>6.2f} MB {mtime}")

    print("-" * 60)
    print(f"Total: {len(files)} files")

    return [str(f) for f in files]
def setup_directories(target, prefix):
    """
    Creates organized folder structure for scraped data.

    Args:
        target: Subreddit or username
        prefix: "r" for subreddit, "u" for user

    Returns:
        Dictionary mapping "base", "posts", "comments", "media", "images"
        and "videos" to their paths under data/<prefix>_<target>
    """
    base_dir = f"data/{prefix}_{target}"

    dirs = {
        "base": base_dir,
        "posts": f"{base_dir}/posts.csv",
        "comments": f"{base_dir}/comments.csv",
        "media": f"{base_dir}/media",
        "images": f"{base_dir}/media/images",
        "videos": f"{base_dir}/media/videos",
    }

    # exist_ok=True instead of "check, then create": two processes setting up
    # the same target at once can no longer trip over FileExistsError.
    for key in ("base", "media", "images", "videos"):
        os.makedirs(dirs[key], exist_ok=True)

    return dirs
def get_media_urls(post_data):
    """
    Extracts all media URLs from a post.

    Args:
        post_data: Raw post dictionary from Reddit's JSON listing

    Returns:
        Dictionary with "images", "videos" and "galleries" lists of URLs.
        Each list holds every URL at most once, in first-seen order.
    """
    media = {"images": [], "videos": [], "galleries": []}

    def add(kind, candidate):
        # A URL listed twice would be downloaded twice under two file names
        # and would use up a slot of the caller's per-post download cap.
        if candidate and candidate not in media[kind]:
            media[kind].append(candidate)

    # The key may be present with a None value, so .get() with a default is
    # not enough.
    url = post_data.get('url') or ''
    lowered = url.lower()

    image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp']
    if 'i.redd.it' in url or any(ext in lowered for ext in image_extensions):
        add("images", url)

    if post_data.get('is_video'):
        reddit_video = (post_data.get('media') or {}).get('reddit_video') or {}
        video_url = reddit_video.get('fallback_url', '')
        if video_url:
            # Drop the query string; only the bare stream URL is kept.
            add("videos", video_url.split('?')[0])

    preview = post_data.get('preview') or {}
    for img in preview.get('images', []):
        source_url = (img.get('source') or {}).get('url')
        if source_url:
            # Reddit HTML-escapes ampersands in these URLs unless raw_json=1
            # was requested; undo that so the signed URL stays valid.
            add("images", source_url.replace('&amp;', '&'))

    if post_data.get('is_gallery'):
        gallery_data = post_data.get('gallery_data') or {}
        media_metadata = post_data.get('media_metadata') or {}
        for item in gallery_data.get('items', []):
            meta = media_metadata.get(item.get('media_id')) or {}
            gallery_url = (meta.get('s') or {}).get('u')
            if gallery_url:
                add("galleries", gallery_url.replace('&amp;', '&'))

    if 'youtube.com' in url or 'youtu.be' in url:
        add("videos", url)

    return media
def _remove_quietly(path):
    """Delete the file at path if one is set and present; never raises."""
    if path:
        try:
            os.unlink(path)
        except OSError:
            pass


def _fetch_to_temp_file(url, suffix, timeout):
    """Stream url into a new temp file; return its path, or None on a non-200 reply."""
    response = SESSION.get(url, timeout=timeout, stream=True)
    if response.status_code != 200:
        return None

    handle = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    try:
        with handle:
            for chunk in response.iter_content(chunk_size=8192):
                handle.write(chunk)
    except Exception:
        # A half-written download is useless: remove it before re-raising.
        _remove_quietly(handle.name)
        raise
    return handle.name


def download_reddit_video_with_audio(video_url, save_path):
    """
    Downloads Reddit video with audio by fetching both streams and merging.
    Reddit stores video and audio separately - this combines them.

    If no audio stream is found, or ffmpeg is missing or fails, the video is
    saved without audio.

    Args:
        video_url: URL of the video stream
        save_path: Destination file path

    Returns:
        True if save_path already existed or a file was saved, False otherwise
    """
    # Local import: only needed here, for moves across filesystems.
    import shutil

    video_temp_path = None
    audio_temp_path = None
    try:
        if os.path.exists(save_path):
            return True

        # Try to find the audio URL by replacing video quality with audio
        # Reddit videos have audio at URLs like .../DASH_audio.mp4 or .../DASH_AUDIO_128.mp4
        base_url = video_url.rsplit('/', 1)[0]

        # Common audio URL patterns
        audio_urls = [
            f"{base_url}/DASH_audio.mp4",
            f"{base_url}/DASH_AUDIO_128.mp4",
            f"{base_url}/DASH_AUDIO_64.mp4",
            f"{base_url}/audio.mp4",
            f"{base_url}/audio"
        ]

        video_temp_path = _fetch_to_temp_file(video_url, '_video.mp4', 60)
        if video_temp_path is None:
            return False

        # Try each candidate until one answers; a failed candidate never
        # leaves a partial file behind to be fed to ffmpeg.
        for audio_url in audio_urls:
            try:
                audio_temp_path = _fetch_to_temp_file(audio_url, '_audio.mp4', 30)
            except Exception:
                audio_temp_path = None
                continue
            if audio_temp_path:
                break

        if audio_temp_path:
            # Merge video and audio using ffmpeg
            cmd = [
                'ffmpeg', '-y', '-hide_banner', '-loglevel', 'error',
                '-i', video_temp_path,
                '-i', audio_temp_path,
                '-c:v', 'copy', '-c:a', 'aac', '-shortest',
                save_path
            ]
            try:
                result = subprocess.run(cmd, capture_output=True, timeout=120)
                if result.returncode == 0:
                    return True
                print(f" ⚠️ ffmpeg merge failed, saving video without audio")
            except FileNotFoundError:
                print(f" ⚠️ ffmpeg not found, saving video without audio")
            except Exception:
                # e.g. a merge timeout: fall back to video only
                pass
            # ffmpeg may have left a truncated output file at save_path.
            _remove_quietly(save_path)

        # shutil.move, not os.rename: the temp directory and the data
        # directory are often on different filesystems, where rename fails.
        shutil.move(video_temp_path, save_path)
        video_temp_path = None
        return True
    except Exception as e:
        print(f" ⚠️ Video download failed: {e}")
        return False
    finally:
        # Whatever happened above, do not leave temp files behind.
        _remove_quietly(video_temp_path)
        _remove_quietly(audio_temp_path)
def parse_comments(comment_list, post_permalink, depth=0, max_depth=3):
    """
    Recursively flattens a comment tree into a list of rows.

    Args:
        comment_list: List of listing children (dicts with 'kind' and 'data')
        post_permalink: Permalink of the post, copied into every row
        depth: Nesting level of comment_list (0 for top-level comments)
        max_depth: Deepest nesting level that is still parsed

    Returns:
        List of comment dictionaries, each parent followed by its replies
    """
    if depth > max_depth:
        return []

    flattened = []
    for node in comment_list:
        # Only 't1' children are comments; anything else is skipped.
        if node['kind'] == 't1':
            payload = node['data']
            posted_at = datetime.datetime.fromtimestamp(payload.get('created_utc', 0))
            flattened.append({
                "post_permalink": post_permalink,
                "comment_id": payload.get('id'),
                "parent_id": payload.get('parent_id'),
                "author": payload.get('author'),
                "body": payload.get('body', ''),
                "score": payload.get('score', 0),
                "created_utc": posted_at.isoformat(),
                "depth": depth,
                "is_submitter": payload.get('is_submitter', False),
            })

            # Replies are only descended into when they are a non-empty dict.
            nested = payload.get('replies')
            if isinstance(nested, dict) and nested:
                children = nested.get('data', {}).get('children', [])
                flattened += parse_comments(children, post_permalink, depth + 1, max_depth)

    return flattened
p.get('url_overridden_by_dest', p.get('url')), "score": p.get('score', 0), "upvote_ratio": p.get('upvote_ratio', 0), "num_comments": p.get('num_comments', 0), "num_crossposts": p.get('num_crossposts', 0), "selftext": p.get('selftext', ''), "post_type": post_type, "is_nsfw": p.get('over_18', False), "is_spoiler": p.get('spoiler', False), "flair": p.get('link_flair_text', ''), "total_awards": p.get('total_awards_received', 0), "has_media": p.get('is_video', False) or p.get('is_gallery', False) or 'i.redd.it' in p.get('url', ''), "media_downloaded": False, "source": "History-Full" } # --- FULL HISTORY SCRAPE --- def run_full_history(target, limit, is_user=False, download_media_flag=True, scrape_comments_flag=True, dry_run=False, use_plugins=False): """ Full scrape with images, videos, and comments. Args: target: Subreddit or username limit: Maximum posts to scrape is_user: True if target is a user download_media_flag: Download images/videos scrape_comments_flag: Scrape comments dry_run: Simulate without saving data use_plugins: Run post-processing plugins """ prefix = "u" if is_user else "r" mode = "full" if download_media_flag and scrape_comments_flag else "history" # Display mode banner if dry_run: print("=" * 50) print("🧪 DRY RUN MODE - No data will be saved") print("=" * 50) print(f"🚀 Starting {'DRY RUN' if dry_run else 'FULL HISTORY'} scrape for {prefix}/{target}") print(f" 📊 Target posts: {limit}") print(f" 🖼️ Download media: {download_media_flag and not dry_run}") print(f" 💬 Scrape comments: {scrape_comments_flag}") print(f" 🔌 Plugins enabled: {use_plugins}") print("-" * 50) # Start job tracking job_id = None try: from export.database import start_job_record, complete_job_record job_id = start_job_record(target, mode, is_user, dry_run) except Exception as e: print(f"⚠️ Job tracking unavailable: {e}") # Setup directories (even for dry run, to check existing data) dirs = setup_directories(target, prefix) load_history(dirs["posts"]) after = None total_posts = 0 
total_media = {"images": 0, "videos": 0} total_comments = 0 all_scraped_posts = [] # For plugin processing all_scraped_comments = [] start_time = time.time() error_msg = None try: while total_posts < limit: random.shuffle(MIRRORS) success = False for base_url in MIRRORS: try: if is_user: path = f"/user/{target}/submitted.json" else: path = f"/r/{target}/new.json" # Use proper batch size - min of remaining posts needed or 100 (Reddit's max per request) batch_size = min(100, limit - total_posts) target_url = f"{base_url}{path}?limit={batch_size}&raw_json=1" if after: target_url += f"&after={after}" print(f"\n📡 Fetching from: {base_url}") response = SESSION.get(target_url, timeout=15) if response.status_code == 200: data = response.json() posts = [] batch_comments = [] children = data['data']['children'] print(f" Found {len(children)} posts in this batch") for child in children: p = child['data'] post = extract_post_data(p) if post['permalink'] in SEEN_URLS: continue # Download media (skip in dry run) if download_media_flag and not dry_run: downloaded = download_post_media(p, dirs, post['id']) post['media_downloaded'] = downloaded['images'] > 0 or downloaded['videos'] > 0 total_media['images'] += downloaded['images'] total_media['videos'] += downloaded['videos'] if downloaded['images'] > 0 or downloaded['videos'] > 0: print(f" + Downloaded: {downloaded['images']} images, {downloaded['videos']} videos") posts.append(post) # Scrape comments if scrape_comments_flag and post['num_comments'] > 0: print(f" 💬 Fetching comments for: {post['title'][:40]}...") comments = scrape_comments(post['permalink']) batch_comments.extend(comments) total_comments += len(comments) time.sleep(1) # Collect for plugins all_scraped_posts.extend(posts) all_scraped_comments.extend(batch_comments) # Save data (skip in dry run) if not dry_run: saved = save_posts_csv(posts, dirs["posts"]) total_posts += saved if batch_comments: save_comments_csv(batch_comments, dirs["comments"]) else: # In dry run, 
just count total_posts += len(posts) print(f" 🧪 [DRY RUN] Would save {len(posts)} posts") print(f"\n📊 Progress: {total_posts}/{limit} posts") print(f" 🖼️ Images: {total_media['images']} | 🎬 Videos: {total_media['videos']}") print(f" 💬 Comments: {total_comments}") after = data['data'].get('after') if not after: print("\n🏁 Reached end of available history.") break success = True break except Exception as e: print(f" ⚠️ Error with {base_url}: {e}") continue if not after: break if not success: print("\n❌ All sources failed. Waiting 30s...") time.sleep(30) else: print(f"\n⏸️ Cooling down (3s)...") time.sleep(3) # Run plugins on collected data if use_plugins and (all_scraped_posts or all_scraped_comments): print("\n🔌 Running post-processing plugins...") try: from plugins import load_plugins, run_plugins plugins = load_plugins() if plugins: all_scraped_posts, all_scraped_comments = run_plugins( all_scraped_posts, all_scraped_comments, plugins ) print(f" ✅ Processed {len(all_scraped_posts)} posts with {len(plugins)} plugins") else: print(" ⚠️ No plugins found") except Exception as e: print(f" ⚠️ Plugin error: {e}") except Exception as e: error_msg = str(e) print(f"\n❌ Scrape error: {e}") duration = time.time() - start_time # Complete job tracking if job_id: try: status = 'failed' if error_msg else 'completed' complete_job_record( job_id, status, total_posts, total_comments, total_media['images'] + total_media['videos'], error_msg ) except Exception as e: print(f"⚠️ Failed to complete job record: {e}") # Summary print("\n" + "=" * 50) if dry_run: print("🧪 DRY RUN COMPLETE!") print(f" 📊 Would scrape: {total_posts} posts") print(f" 💬 Would scrape: {total_comments} comments") else: print("✅ SCRAPE COMPLETE!") print(f" 📁 Data saved to: {dirs['base']}") print(f" 📊 Total posts: {total_posts}") print(f" 🖼️ Total images: {total_media['images']}") print(f" 🎬 Total videos: {total_media['videos']}") print(f" 💬 Total comments: {total_comments}") print(f" ⏱️ Duration: {duration:.1f}s") 
def run_monitor(target, is_user=False):
    """
    Checks the target's feed once and saves the posts it lists.

    Falls back to a 25-post JSON scrape (no media, no comments) when the
    feed request does not return HTTP 200. Errors are printed, not raised.

    Args:
        target: Subreddit or username
        is_user: True if target is a user
    """
    prefix = "u" if is_user else "r"
    if is_user:
        rss_url = f"https://www.reddit.com/user/{target}/submitted.rss?limit=100"
    else:
        rss_url = f"https://www.reddit.com/r/{target}/new.rss?limit=100"

    print(f"[{datetime.datetime.now()}] 📡 Checking RSS for {prefix}/{target}...")

    try:
        response = SESSION.get(rss_url, timeout=15)
        if response.status_code != 200:
            print(f"❌ RSS blocked (Status {response.status_code}), trying JSON...")
            run_full_history(target, 25, is_user, download_media_flag=False, scrape_comments_flag=False)
            return

        root = ET.fromstring(response.content)
        namespace = {'atom': 'http://www.w3.org/2005/Atom'}

        posts = []
        skipped = 0
        for entry in root.findall('atom:entry', namespace):
            link = entry.find('atom:link', namespace)
            href = link.get('href') if link is not None else None
            if not href:
                # Without a link there is nothing to de-duplicate on. Skip
                # this entry instead of letting it abort the whole batch.
                skipped += 1
                continue

            title = entry.find('atom:title', namespace)
            published = entry.find('atom:published', namespace)

            # NOTE(review): the feed's href is stored verbatim as the
            # permalink. If rows saved by the JSON scrape use site-relative
            # permalinks, the same post will not de-duplicate across the
            # two modes - confirm.
            posts.append({
                "id": "",
                "title": title.text if title is not None else "",
                "author": "",
                "created_utc": published.text if published is not None else "",
                "permalink": href,
                "url": href,
                "score": 0,
                "upvote_ratio": 0,
                "num_comments": 0,
                "num_crossposts": 0,
                "selftext": "",
                "post_type": "unknown",
                "is_nsfw": False,
                "is_spoiler": False,
                "flair": "",
                "total_awards": 0,
                "has_media": False,
                "media_downloaded": False,
                "source": "Monitor-RSS"
            })

        if skipped:
            print(f"⚠️ Skipped {skipped} feed entries without a link")

        dirs = setup_directories(target, prefix)
        save_posts_csv(posts, dirs["posts"])

    except Exception as e:
        print(f"❌ Monitor Error: {e}")
SEARCH: python main.py --search "keyword" --subreddit delhi python main.py --search "keyword" --min-score 100 DASHBOARD: python main.py --dashboard SCHEDULE: python main.py --schedule delhi --every 60 ANALYTICS: python main.py --analyze delhi --sentiment python main.py --analyze delhi --keywords MAINTENANCE: python main.py --job-history # View job history python main.py --backup # Backup database python main.py --vacuum # Optimize database python main.py --export-parquet python # Export to Parquet python main.py --list-plugins # List available plugins REST API: python main.py --api # Start REST API server """ ) # Scraping args parser.add_argument("target", nargs='?', help="Subreddit or username to scrape") parser.add_argument("--mode", choices=["monitor", "history", "full"], default="full") parser.add_argument("--user", action="store_true", help="Target is a user") parser.add_argument("--limit", type=int, default=100, help="Max posts to scrape") parser.add_argument("--no-media", action="store_true", help="Skip media download") parser.add_argument("--no-comments", action="store_true", help="Skip comments") # Dashboard parser.add_argument("--dashboard", action="store_true", help="Launch web dashboard") # Search parser.add_argument("--search", type=str, help="Search scraped data") parser.add_argument("--subreddit", type=str, help="Filter by subreddit") parser.add_argument("--min-score", type=int, help="Filter by minimum score") parser.add_argument("--author", type=str, help="Filter by author") # Analytics parser.add_argument("--analyze", type=str, help="Run analytics on subreddit") parser.add_argument("--sentiment", action="store_true", help="Run sentiment analysis") parser.add_argument("--keywords", action="store_true", help="Extract keywords") # Schedule parser.add_argument("--schedule", type=str, help="Schedule scraping for target") parser.add_argument("--every", type=int, help="Interval in minutes") # Alerts parser.add_argument("--alert", type=str, help="Set 
keyword alert") parser.add_argument("--discord-webhook", type=str, help="Discord webhook URL") parser.add_argument("--telegram-token", type=str, help="Telegram bot token") parser.add_argument("--telegram-chat", type=str, help="Telegram chat ID") # New: Observability & Maintenance parser.add_argument("--dry-run", action="store_true", help="Simulate scrape without saving data") parser.add_argument("--plugins", action="store_true", help="Enable post-processing plugins") parser.add_argument("--list-plugins", action="store_true", help="List available plugins") parser.add_argument("--job-history", action="store_true", help="View job history") parser.add_argument("--backup", action="store_true", help="Backup SQLite database") parser.add_argument("--vacuum", action="store_true", help="Optimize SQLite database") parser.add_argument("--export-parquet", type=str, help="Export subreddit to Parquet format") parser.add_argument("--api", action="store_true", help="Start REST API server (port 8000)") args = parser.parse_args() print("=" * 50) print("🤖 UNIVERSAL REDDIT SCRAPER SUITE") print("=" * 50) # Dashboard mode if args.dashboard: print("\n🌐 Launching Dashboard...") print(" Open: http://localhost:8501") os.system("streamlit run dashboard/app.py") return # REST API mode if args.api: print("\n🚀 Starting REST API server...") print(" 📖 Docs: http://localhost:8000/docs") print(" 📊 Connect Metabase/Grafana to http://localhost:8000") try: import uvicorn from api.server import app uvicorn.run(app, host="0.0.0.0", port=8000) except ImportError: print("❌ Install dependencies: pip install fastapi uvicorn") return # --- NEW: Maintenance & Observability Commands --- # Job history if args.job_history: from export.database import print_job_history print_job_history() return # Backup database if args.backup: from export.database import backup_database backup_database() return # Vacuum/optimize database if args.vacuum: from export.database import vacuum_database vacuum_database() return # 
Export to Parquet if args.export_parquet: from export.parquet import export_to_parquet prefix = "u" if args.user else "r" export_to_parquet(args.export_parquet, prefix=prefix) return # List plugins if args.list_plugins: from plugins import list_plugins list_plugins() return # Search mode if args.search: print(f"\n🔍 Searching for: {args.search}") from search.query import search_all_data, print_search_results results = search_all_data( query=args.search, min_score=args.min_score, author=args.author ) print_search_results(results) return # Analytics mode if args.analyze: print(f"\n📊 Analyzing: {args.analyze}") # Load data data_dir = Path(f"data/r_{args.analyze}") if not data_dir.exists(): print(f"❌ No data found for r/{args.analyze}") return posts_file = data_dir / "posts.csv" if not posts_file.exists(): print(f"❌ No posts data found") return import pandas as pd df = pd.read_csv(posts_file) posts = df.to_dict('records') if args.sentiment: from analytics.sentiment import analyze_posts_sentiment analyzed, counts = analyze_posts_sentiment(posts) print(f"\n😀 Sentiment Analysis:") print(f" Positive: {counts['positive']}") print(f" Neutral: {counts['neutral']}") print(f" Negative: {counts['negative']}") if args.keywords: from analytics.sentiment import extract_keywords texts = [str(p.get('title', '') or '') + ' ' + str(p.get('selftext', '') or '') for p in posts] keywords = extract_keywords(texts, top_n=20) print(f"\n☁️ Top Keywords:") for word, count in keywords: print(f" {word}: {count}") return # Schedule mode if args.schedule: if not args.every: print("❌ Please specify --every ") return from scheduler.cron import run_scheduled run_scheduled(args.schedule, args.every, args.mode, args.limit, args.user) return # Regular scraping mode if not args.target: parser.print_help() return if args.mode == "monitor": prefix = "u" if args.user else "r" dirs = setup_directories(args.target, prefix) load_history(dirs["posts"]) print(f"🔄 Monitoring {prefix}/{args.target} every 5 
class Plugin(ABC):
    """
    Base class for scraper plugins.

    To create a plugin:
    1. Create a new .py file in the plugins/ directory
    2. Create a class that inherits from Plugin
    3. Implement the process_posts() method
    4. Optionally implement process_comments()

    Example:
        class MyPlugin(Plugin):
            name = "my_plugin"
            description = "Does something cool"

            def process_posts(self, posts):
                for post in posts:
                    post['processed'] = True
                return posts
    """

    # Class-level metadata; subclasses override these.
    name = "base"
    description = "Base plugin"
    enabled = True

    @abstractmethod
    def process_posts(self, posts: list) -> list:
        """
        Process posts after scraping.

        Args:
            posts: List of post dictionaries

        Returns:
            Modified list of posts
        """

    def process_comments(self, comments: list) -> list:
        """
        Process comments after scraping (optional).

        The default implementation returns the comments unchanged.

        Args:
            comments: List of comment dictionaries

        Returns:
            Modified list of comments
        """
        return comments

    def __repr__(self):
        # Name the concrete class and the plugin so log lines and debugger
        # output identify which plugin this is.
        return f"{type(self).__name__}(name={self.name!r})"
Args: plugin_dir: Path to plugins directory Returns: List of plugin instances """ if plugin_dir is None: plugin_dir = Path(__file__).parent else: plugin_dir = Path(plugin_dir) plugins = [] for file in plugin_dir.glob("*.py"): # Skip __init__.py and base files if file.name.startswith("_"): continue try: # Load the module spec = importlib.util.spec_from_file_location(file.stem, file) module = importlib.util.module_from_spec(spec) sys.modules[file.stem] = module spec.loader.exec_module(module) # Find Plugin subclasses for attr_name in dir(module): attr = getattr(module, attr_name) if (isinstance(attr, type) and issubclass(attr, Plugin) and attr != Plugin and hasattr(attr, 'name')): plugin_instance = attr() if plugin_instance.enabled: plugins.append(plugin_instance) except Exception as e: print(f"⚠️ Failed to load plugin {file.name}: {e}") return plugins def run_plugins(posts, comments, plugins): """ Run all plugins on scraped data. Args: posts: List of posts comments: List of comments plugins: List of plugin instances Returns: Tuple of (processed_posts, processed_comments) """ for plugin in plugins: try: print(f"🔌 Running plugin: {plugin.name}") posts = plugin.process_posts(posts) comments = plugin.process_comments(comments) except Exception as e: print(f"⚠️ Plugin {plugin.name} failed: {e}") return posts, comments def list_plugins(plugin_dir=None): """List all available plugins.""" plugins = load_plugins(plugin_dir) print("\n🔌 Available Plugins:") print("-" * 50) if not plugins: print(" No plugins found") else: for plugin in plugins: status = "✅" if plugin.enabled else "❌" print(f" {status} {plugin.name:<20} {plugin.description}") print("-" * 50) return plugins ================================================ FILE: plugins/deduplicator.py ================================================ """ Deduplicator Plugin Removes duplicate posts based on permalink. 
class Deduplicator(Plugin):
    """Remove duplicate posts by permalink and duplicate comments by comment_id."""

    name = "deduplicator"
    description = "Removes duplicate posts by permalink"
    enabled = True

    @staticmethod
    def _unique_by(items, key_name):
        """
        Keep the first item seen for each value of ``key_name``.

        Items with a missing or empty key cannot be compared, so they are
        kept. Previously they were dropped and counted as duplicates, which
        could silently discard every record that lacked the field.

        Returns:
            Tuple of (unique_items, number_of_duplicates_removed)
        """
        seen = set()
        unique = []
        duplicates = 0
        for item in items:
            key = item.get(key_name)
            if not key:
                unique.append(item)
            elif key in seen:
                duplicates += 1
            else:
                seen.add(key)
                unique.append(item)
        return unique, duplicates

    def process_posts(self, posts):
        """Remove duplicate posts, reporting how many were dropped."""
        unique, duplicates = self._unique_by(posts, 'permalink')
        if duplicates > 0:
            print(f" 🔄 Removed {duplicates} duplicate posts")
        return unique

    def process_comments(self, comments):
        """Remove duplicate comments."""
        unique, _ = self._unique_by(comments, 'comment_id')
        return unique


class KeywordExtractor(Plugin):
    """Extract and add keywords to posts."""

    name = "keyword_extractor"
    description = "Adds top keywords to each post"
    enabled = True
    top_n = 5  # Number of keywords per post

    @staticmethod
    def _post_text(post):
        """Join title and selftext, treating missing/None fields as empty."""
        # `or ''` keeps a None field from becoming the literal word "None",
        # which would otherwise be counted as a keyword.
        return f"{post.get('title') or ''} {post.get('selftext') or ''}"

    def process_posts(self, posts):
        """
        Add a comma-separated ``keywords`` field to each post.

        Also prints the top keywords across the whole batch.

        Args:
            posts: List of post dictionaries (modified in place)

        Returns:
            The same list of posts
        """
        texts = [self._post_text(post) for post in posts]
        for post, text in zip(posts, texts):
            keywords = extract_keywords([text], top_n=self.top_n)
            post['keywords'] = ','.join(kw for kw, _count in keywords)

        # Also extract global keywords
        global_keywords = extract_keywords(texts, top_n=10)
        print(f" 🏷️ Top keywords: {', '.join([kw for kw, _ in global_keywords[:5]])}")
        return posts
class SentimentTagger(Plugin):
    """Plugin that annotates scraped posts and comments with sentiment."""

    name = "sentiment_tagger"
    description = "Adds sentiment scores and labels to posts"
    enabled = True

    def process_posts(self, posts):
        """
        Tag every post with ``sentiment_score`` and ``sentiment_label``.

        The title and selftext are analysed together. A one-line summary of
        the label distribution is printed afterwards.

        Args:
            posts: List of post dictionaries (modified in place)

        Returns:
            The same list of posts
        """
        for entry in posts:
            combined = f"{entry.get('title', '')} {entry.get('selftext', '')}"
            result = analyze_sentiment(combined)
            entry['sentiment_score'], entry['sentiment_label'] = result

        # Tally labels; anything that is neither positive nor negative
        # counts as neutral.
        positive = 0
        negative = 0
        for entry in posts:
            verdict = entry.get('sentiment_label')
            if verdict == 'positive':
                positive += 1
            elif verdict == 'negative':
                negative += 1
        neutral = len(posts) - positive - negative

        print(f" 📊 Sentiment: {positive} positive, {neutral} neutral, {negative} negative")
        return posts

    def process_comments(self, comments):
        """
        Tag every comment with ``sentiment_score`` and ``sentiment_label``.

        Args:
            comments: List of comment dictionaries (modified in place)

        Returns:
            The same list of comments
        """
        for entry in comments:
            result = analyze_sentiment(entry.get('body', ''))
            entry['sentiment_score'], entry['sentiment_label'] = result
        return comments
class CronScheduler:
    """Simple cron-style scheduler for Reddit scraping jobs."""

    def __init__(self):
        self.jobs = []
        self.running = False
        self.thread = None

    def add_job(self, target, mode='full', limit=100, is_user=False,
                interval_minutes=60, run_at_start=True):
        """
        Add a scheduled scraping job.

        Args:
            target: Subreddit or username
            mode: 'full', 'history', or 'monitor'
            limit: Post limit per run
            is_user: True if target is a user
            interval_minutes: Minutes between runs
            run_at_start: Run immediately when scheduler starts

        Returns:
            The id assigned to the new job
        """
        now = datetime.now()
        # Derive the id from the highest existing id rather than the list
        # length, so ids stay unique after remove_job() or load_jobs().
        job_id = max((j.get('id', 0) for j in self.jobs), default=0) + 1
        job = {
            'id': job_id,
            'target': target,
            'mode': mode,
            'limit': limit,
            'is_user': is_user,
            'interval_minutes': interval_minutes,
            'run_at_start': run_at_start,
            'last_run': None,
            'next_run': now if run_at_start else now + timedelta(minutes=interval_minutes),
            'enabled': True,
            'run_count': 0
        }
        self.jobs.append(job)
        print(f"📅 Added job #{job['id']}: {'u/' if is_user else 'r/'}{target} every {interval_minutes}min")
        return job['id']

    def remove_job(self, job_id):
        """Remove a scheduled job."""
        self.jobs = [j for j in self.jobs if j['id'] != job_id]
        print(f"🗑️ Removed job #{job_id}")

    def disable_job(self, job_id):
        """Temporarily disable a job."""
        for job in self.jobs:
            if job['id'] == job_id:
                job['enabled'] = False
                print(f"⏸️ Disabled job #{job_id}")

    def enable_job(self, job_id):
        """Enable a disabled job."""
        for job in self.jobs:
            if job['id'] == job_id:
                job['enabled'] = True
                print(f"▶️ Enabled job #{job_id}")

    def list_jobs(self):
        """Print all scheduled jobs and return the job list."""
        print("\n📋 Scheduled Jobs:")
        print("-" * 60)
        for job in self.jobs:
            status = "✅" if job['enabled'] else "⏸️"
            prefix = "u/" if job['is_user'] else "r/"
            next_run = job['next_run'].strftime("%H:%M:%S") if job['next_run'] else "Never"
            print(f"{status} #{job['id']} | {prefix}{job['target']} | "
                  f"Every {job['interval_minutes']}min | Next: {next_run} | "
                  f"Runs: {job['run_count']}")
        print()
        return self.jobs

    def _run_job(self, job):
        """Execute a single job; failures are reported, never raised."""
        try:
            # Import here to avoid circular imports
            from main import run_full_history

            prefix = "u/" if job['is_user'] else "r/"
            print(f"\n🚀 Running scheduled job: {prefix}{job['target']}")
            # Only 'full' mode downloads media and comments.
            run_full_history(
                job['target'],
                job['limit'],
                job['is_user'],
                download_media_flag=(job['mode'] == 'full'),
                scrape_comments_flag=(job['mode'] == 'full')
            )
            job['last_run'] = datetime.now()
            job['run_count'] += 1
            print(f"✅ Job completed: {prefix}{job['target']}")
        except Exception as e:
            print(f"❌ Job failed: {e}")

    def _scheduler_loop(self):
        """Main scheduler loop: run due jobs, then wait up to 30 seconds."""
        print("🔄 Scheduler started")
        while self.running:
            now = datetime.now()
            # Iterate over a snapshot: jobs may be added/removed concurrently.
            for job in list(self.jobs):
                if not job['enabled']:
                    continue
                if job['next_run'] and now >= job['next_run']:
                    self._run_job(job)
                    job['next_run'] = now + timedelta(minutes=job['interval_minutes'])

            # Check every 30 seconds, but sleep in 1-second slices so stop()
            # is noticed promptly (its join() only waits 5 seconds).
            for _ in range(30):
                if not self.running:
                    break
                time.sleep(1)
        print("🛑 Scheduler stopped")

    def start(self):
        """Start the scheduler in a background daemon thread."""
        if self.running:
            print("⚠️ Scheduler already running")
            return
        self.running = True
        self.thread = threading.Thread(target=self._scheduler_loop, daemon=True)
        self.thread.start()
        print("✅ Scheduler started in background")

    def stop(self):
        """Stop the scheduler and wait briefly for the thread to exit."""
        self.running = False
        if self.thread:
            self.thread.join(timeout=5)
        print("🛑 Scheduler stopped")

    def save_jobs(self, filepath='scheduler_jobs.json'):
        """Save jobs to a JSON file (datetimes stored as ISO strings)."""
        jobs_data = []
        for job in self.jobs:
            job_copy = job.copy()
            job_copy['last_run'] = job_copy['last_run'].isoformat() if job_copy['last_run'] else None
            job_copy['next_run'] = job_copy['next_run'].isoformat() if job_copy['next_run'] else None
            jobs_data.append(job_copy)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(jobs_data, f, indent=2)
        print(f"💾 Saved {len(self.jobs)} jobs to {filepath}")

    def load_jobs(self, filepath='scheduler_jobs.json'):
        """Load jobs from a JSON file and append them to the current jobs."""
        if not Path(filepath).exists():
            print("⚠️ No saved jobs found")
            return
        with open(filepath, 'r', encoding='utf-8') as f:
            jobs_data = json.load(f)
        for job_data in jobs_data:
            # .get() tolerates hand-edited files that omit the timestamps.
            for field in ('last_run', 'next_run'):
                raw = job_data.get(field)
                job_data[field] = datetime.fromisoformat(raw) if raw else None
            self.jobs.append(job_data)
        print(f"📂 Loaded {len(jobs_data)} jobs from {filepath}")


# Simple interval-based scheduler for CLI
def run_scheduled(target, interval_minutes, mode='full', limit=100, is_user=False):
    """
    Run a scrape job on a schedule until interrupted with Ctrl+C.

    Args:
        target: Subreddit or username
        interval_minutes: Minutes between runs (must be positive)
        mode: 'full', 'history', or 'monitor'; only 'full' downloads media
            and comments
        limit: Post limit per run
        is_user: True if target is a user

    Raises:
        ValueError: If interval_minutes is missing or not positive
    """
    # Without this guard a zero interval re-scrapes in a tight loop and a
    # None interval crashes in time.sleep() after the first run.
    if not interval_minutes or interval_minutes <= 0:
        raise ValueError("interval_minutes must be a positive number")

    from main import run_full_history

    prefix = "u/" if is_user else "r/"
    print(f"📅 Scheduled: {prefix}{target} every {interval_minutes} minutes")
    print("Press Ctrl+C to stop\n")

    run_count = 0
    try:
        while True:
            run_count += 1
            print(f"\n{'='*50}")
            print(f"🔄 Run #{run_count} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
            print(f"{'='*50}")
            try:
                run_full_history(
                    target,
                    limit,
                    is_user,
                    download_media_flag=(mode == 'full'),
                    scrape_comments_flag=(mode == 'full')
                )
            except Exception as e:
                # Match CronScheduler._run_job: one failed run (e.g. a
                # network error) must not end the whole schedule.
                print(f"❌ Job failed: {e}")
            print(f"\n⏰ Next run in {interval_minutes} minutes...")
            time.sleep(interval_minutes * 60)
    except KeyboardInterrupt:
        print(f"\n\n🛑 Scheduler stopped after {run_count} runs")
async def fetch_json(session, url, retries=3):
    """
    Fetch JSON with retry logic.

    Args:
        session: aiohttp client session
        url: URL to request
        retries: Maximum number of attempts

    Returns:
        Parsed JSON on an HTTP 200 response, otherwise None
    """
    for attempt in range(retries):
        delay = 0
        try:
            async with session.get(url, timeout=aiohttp.ClientTimeout(total=15)) as response:
                if response.status == 200:
                    return await response.json()
                if response.status == 429:
                    # Rate limited: back off longer on each attempt
                    delay = 5 * (attempt + 1)
        except Exception:
            # Best effort: any request/parse failure just triggers a retry.
            delay = 2
        # Sleep outside the response context (connection already released)
        # and never after the final attempt, where waiting gains nothing.
        if delay and attempt < retries - 1:
            await asyncio.sleep(delay)
    return None


async def fetch_posts_page(session, base_url, target, after=None, is_user=False, batch_size=100):
    """
    Fetch a single page of posts.

    Args:
        session: aiohttp client session
        base_url: Mirror base URL the listing path is appended to
        target: Subreddit or username
        after: Pagination token from the previous page, if any
        is_user: True to list a user's submissions instead of a subreddit
        batch_size: Number of posts to request

    Returns:
        Parsed listing JSON, or None if the request failed
    """
    if is_user:
        path = f"/user/{target}/submitted.json"
    else:
        path = f"/r/{target}/new.json"
    url = f"{base_url}{path}?limit={batch_size}&raw_json=1"
    if after:
        url += f"&after={after}"
    return await fetch_json(session, url)


def _remove_quietly(path):
    """Delete ``path`` if it exists; ignore filesystem errors."""
    try:
        os.unlink(path)
    except OSError:
        pass


def _new_temp_path(suffix):
    """Create an empty temporary file and return its path."""
    fd, path = tempfile.mkstemp(suffix=suffix)
    os.close(fd)
    return path


async def _stream_to_file(session, url, dest_path, total_timeout):
    """Stream ``url`` into ``dest_path``; True only for a fully written HTTP 200 body."""
    async with session.get(url, timeout=aiohttp.ClientTimeout(total=total_timeout)) as response:
        if response.status != 200:
            return False
        async with aiofiles.open(dest_path, 'wb') as f:
            async for chunk in response.content.iter_chunked(8192):
                await f.write(chunk)
    return True


async def _merge_with_ffmpeg(video_path, audio_path, out_path):
    """Mux video and audio into ``out_path`` with ffmpeg; True on success."""
    cmd = [
        'ffmpeg', '-y', '-hide_banner', '-loglevel', 'error',
        '-i', video_path,
        '-i', audio_path,
        '-c:v', 'copy',
        '-c:a', 'aac',
        '-shortest',
        out_path
    ]
    try:
        # Output is discarded: piping it without reading could block ffmpeg
        # once the pipe buffer fills.
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.DEVNULL
        )
    except OSError:
        # ffmpeg not installed (FileNotFoundError) or not executable
        return False
    try:
        await asyncio.wait_for(proc.wait(), timeout=120)
    except asyncio.TimeoutError:
        # Do not leave a runaway ffmpeg behind.
        proc.kill()
        await proc.wait()
        return False
    return proc.returncode == 0


async def download_media_async(session, url, save_path):
    """
    Download media file asynchronously.

    The data is written to ``<save_path>.part`` and renamed on success, so an
    interrupted download is never mistaken for a finished file on the next
    run (existing files are skipped).

    Returns:
        True if the file exists or was downloaded, False otherwise
    """
    global semaphore
    if os.path.exists(save_path):
        return True
    part_path = f"{save_path}.part"
    async with semaphore:
        try:
            if await _stream_to_file(session, url, part_path, 60):
                os.replace(part_path, save_path)
                return True
        except Exception:
            # Best effort: a failed media download must not abort the scrape.
            pass
        finally:
            _remove_quietly(part_path)
    return False


async def download_reddit_video_with_audio_async(session, video_url, save_path):
    """
    Downloads Reddit video with audio asynchronously.
    Reddit stores video and audio separately - this combines them using ffmpeg.

    If no audio track is found, or ffmpeg is missing/fails/times out, the
    video-only file is saved instead. Temporary files are always removed.

    Returns:
        True if the file exists or was saved, False if the video download failed
    """
    import shutil  # local import: shutil.move also works across filesystems

    global semaphore
    if os.path.exists(save_path):
        return True

    # Find audio URL by replacing video quality with audio
    base_url = video_url.rsplit('/', 1)[0]
    audio_urls = [
        f"{base_url}/DASH_audio.mp4",
        f"{base_url}/DASH_AUDIO_128.mp4",
        f"{base_url}/DASH_AUDIO_64.mp4",
        f"{base_url}/audio.mp4",
        f"{base_url}/audio"
    ]

    async with semaphore:
        video_temp_path = _new_temp_path('_video.mp4')
        audio_temp_path = _new_temp_path('_audio.mp4')
        try:
            if not await _stream_to_file(session, video_url, video_temp_path, 60):
                return False

            # Try each known audio location until one downloads.
            has_audio = False
            for audio_url in audio_urls:
                try:
                    if await _stream_to_file(session, audio_url, audio_temp_path, 30):
                        has_audio = True
                        break
                except Exception:
                    continue

            if has_audio and await _merge_with_ffmpeg(video_temp_path, audio_temp_path, save_path):
                return True

            # No audio or merge failed: drop any partial ffmpeg output and
            # keep the video-only file. The temp dir may be on a different
            # filesystem than save_path, where os.rename would fail.
            _remove_quietly(save_path)
            shutil.move(video_temp_path, save_path)
            return True
        except Exception:
            return False
        finally:
            _remove_quietly(video_temp_path)
            _remove_quietly(audio_temp_path)


async def fetch_comments_async(session, permalink):
    """
    Fetch comments asynchronously.

    Args:
        session: aiohttp client session
        permalink: Post permalink path (appended to old.reddit.com)

    Returns:
        Flat list of comment dictionaries; empty if the fetch failed
    """
    global semaphore
    async with semaphore:
        url = f"https://old.reddit.com{permalink}.json?limit=100"
        data = await fetch_json(session, url)
    # A successful response is a two-element list [post, comments]; error
    # responses are JSON objects and must not be indexed.
    if isinstance(data, list) and len(data) > 1:
        return parse_comments_sync(data[1]['data']['children'], permalink)
    return []


def parse_comments_sync(comment_list, post_permalink, depth=0, max_depth=3):
    """
    Parse comments (sync helper).

    Walks the comment tree depth-first and flattens it; only items of kind
    't1' are kept, and replies deeper than ``max_depth`` are ignored.

    Args:
        comment_list: List of listing children from the comments JSON
        post_permalink: Permalink stored on every returned comment
        depth: Depth of ``comment_list`` in the tree
        max_depth: Deepest level to include

    Returns:
        List of comment dictionaries
    """
    comments = []
    if depth > max_depth:
        return comments

    for item in comment_list:
        if item['kind'] != 't1':
            continue
        c = item['data']
        comments.append({
            "post_permalink": post_permalink,
            "comment_id": c.get('id'),
            "parent_id": c.get('parent_id'),
            "author": c.get('author'),
            "body": c.get('body', ''),
            "score": c.get('score', 0),
            # NOTE(review): fromtimestamp() yields local time although the
            # field is named created_utc - confirm against the other scrapers
            # before changing, stored CSVs rely on the current format.
            "created_utc": datetime.datetime.fromtimestamp(c.get('created_utc') or 0).isoformat(),
            "depth": depth,
            "is_submitter": c.get('is_submitter', False),
        })
        # `replies` is an empty string when a comment has no replies.
        replies = c.get('replies')
        if replies and isinstance(replies, dict):
            comments.extend(parse_comments_sync(
                replies.get('data', {}).get('children', []),
                post_permalink,
                depth + 1,
                max_depth
            ))
    return comments
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.webp')


def _looks_like_image(url):
    """True if the URL contains an image extension or is hosted on i.redd.it."""
    lowered = url.lower()
    return any(ext in lowered for ext in _IMAGE_EXTENSIONS) or 'i.redd.it' in url


def _unescape_url(url):
    """Decode the HTML-escaped ampersands Reddit puts in preview/gallery URLs."""
    return url.replace('&amp;', '&') if url else url


def extract_media_urls(post_data):
    """
    Extract all media URLs from a post.

    Each URL is listed once per category, so an i.redd.it image that also
    matches an image extension is not queued for download twice.

    Args:
        post_data: Raw post dictionary from the listing JSON

    Returns:
        Dict with 'images', 'videos' and 'galleries' URL lists
    """
    media = {"images": [], "videos": [], "galleries": []}

    def add(kind, candidate):
        if candidate and candidate not in media[kind]:
            media[kind].append(candidate)

    url = post_data.get('url') or ''
    if _looks_like_image(url):
        add("images", url)

    if post_data.get('is_video'):
        reddit_video = (post_data.get('media') or {}).get('reddit_video') or {}
        fallback = reddit_video.get('fallback_url', '')
        if fallback:
            # Drop the query string so the stored URL is the bare file.
            add("videos", fallback.split('?')[0])

    for img in (post_data.get('preview') or {}).get('images', []):
        add("images", _unescape_url((img.get('source') or {}).get('url')))

    if post_data.get('is_gallery'):
        items = (post_data.get('gallery_data') or {}).get('items', [])
        media_metadata = post_data.get('media_metadata') or {}
        for item in items:
            meta = media_metadata.get(item.get('media_id')) or {}
            add("galleries", _unescape_url((meta.get('s') or {}).get('u')))

    return media


def extract_post_data(p):
    """
    Extract post data from JSON.

    Args:
        p: Raw post dictionary from the listing JSON

    Returns:
        Flat dictionary with the fields stored in posts.csv
    """
    url = p.get('url') or ''  # tolerate a missing or null url

    if p.get('is_video'):
        post_type = "video"
    elif p.get('is_gallery'):
        post_type = "gallery"
    elif _looks_like_image(url):
        post_type = "image"
    elif p.get('is_self'):
        post_type = "text"
    else:
        post_type = "link"

    return {
        "id": p.get('id'),
        "title": p.get('title'),
        "author": p.get('author'),
        # NOTE(review): fromtimestamp() yields local time although the field
        # is named created_utc - confirm before changing the stored format.
        "created_utc": datetime.datetime.fromtimestamp(p.get('created_utc') or 0).isoformat(),
        "permalink": p.get('permalink'),
        "url": p.get('url_overridden_by_dest', p.get('url')),
        "score": p.get('score', 0),
        "upvote_ratio": p.get('upvote_ratio', 0),
        "num_comments": p.get('num_comments', 0),
        "num_crossposts": p.get('num_crossposts', 0),
        "selftext": p.get('selftext', ''),
        "post_type": post_type,
        "is_nsfw": p.get('over_18', False),
        "is_spoiler": p.get('spoiler', False),
        "flair": p.get('link_flair_text', ''),
        "total_awards": p.get('total_awards_received', 0),
        "has_media": p.get('is_video', False) or p.get('is_gallery', False) or 'i.redd.it' in url,
        "media_downloaded": False,
        "source": "Async-Scraper"
    }
async def scrape_async(target, limit=100, is_user=False, download_media=True, scrape_comments=True):
    """
    Main async scraping function.

    Pages through the newest posts via the configured mirrors, skips posts
    whose permalink is already in posts.csv, fetches comments per page,
    downloads all queued media at the end, and appends the results to
    ``data/<prefix>_<target>/posts.csv`` and ``comments.csv``.

    Args:
        target: Subreddit or username
        limit: Max posts to scrape
        is_user: True if scraping a user
        download_media: Download images/videos
        scrape_comments: Scrape comments

    Returns:
        Dict with 'posts' and 'comments' counts and 'duration' in seconds
    """
    global semaphore
    semaphore = asyncio.Semaphore(ASYNC_MAX_CONCURRENT)

    prefix = "u" if is_user else "r"
    print(f"🚀 ASYNC Scraper starting for {prefix}/{target}")
    print(f" Target: {limit} posts | Media: {download_media} | Comments: {scrape_comments}")
    print(f" Concurrency: {ASYNC_MAX_CONCURRENT} simultaneous requests")
    print("-" * 50)

    # Setup directories
    base_dir = f"data/{prefix}_{target}"
    media_dir = f"{base_dir}/media"
    images_dir = f"{media_dir}/images"
    videos_dir = f"{media_dir}/videos"
    for d in [base_dir, media_dir, images_dir, videos_dir]:
        os.makedirs(d, exist_ok=True)

    start_time = time.time()
    all_posts = []
    all_comments = []
    media_tasks = []
    seen_permalinks = set()

    # Load existing data
    posts_file = f"{base_dir}/posts.csv"
    if os.path.exists(posts_file):
        try:
            df = pd.read_csv(posts_file)
            seen_permalinks = set(df['permalink'].astype(str).tolist())
            print(f"📚 Loaded {len(seen_permalinks)} existing posts")
        except Exception as e:
            # Previously a bare `except: pass`: an unreadable file went
            # unnoticed and already-saved posts were appended again.
            print(f"⚠️ Could not read {posts_file} ({e}); existing posts may be saved again")

    async with aiohttp.ClientSession(headers={"User-Agent": USER_AGENT}) as session:
        after = None
        total_fetched = 0

        while total_fetched < limit:
            # Never request more than is still needed (max 100 per page)
            batch_size = min(100, limit - total_fetched)

            # Try mirrors in random order until one answers
            mirrors = MIRRORS.copy()
            random.shuffle(mirrors)
            data = None
            for mirror in mirrors:
                data = await fetch_posts_page(session, mirror, target, after, is_user, batch_size)
                if data:
                    print(f"✅ Fetched from {mirror}")
                    break

            if not data:
                print("❌ All mirrors failed")
                break

            children = data.get('data', {}).get('children', [])
            if not children:
                print("🏁 No more posts")
                break

            print(f" Processing {len(children)} posts...")

            # Process posts
            batch_posts = []
            comment_tasks = []
            for child in children:
                p = child['data']
                post = extract_post_data(p)
                if post['permalink'] in seen_permalinks:
                    continue
                seen_permalinks.add(post['permalink'])
                batch_posts.append(post)

                # Queue media downloads (awaited together after paging)
                if download_media:
                    media = extract_media_urls(p)
                    for i, img_url in enumerate(media['images'][:5]):
                        ext = os.path.splitext(urlparse(img_url).path)[1] or '.jpg'
                        save_path = f"{images_dir}/{post['id']}_{i}{ext}"
                        media_tasks.append(download_media_async(session, img_url, save_path))
                    for i, img_url in enumerate(media['galleries'][:10]):
                        save_path = f"{images_dir}/{post['id']}_gallery_{i}.jpg"
                        media_tasks.append(download_media_async(session, img_url, save_path))
                    for i, vid_url in enumerate(media['videos'][:2]):
                        if 'youtube' not in vid_url:
                            save_path = f"{videos_dir}/{post['id']}_{i}.mp4"
                            # Use enhanced download for Reddit videos (includes audio)
                            if 'v.redd.it' in vid_url or 'reddit.com' in vid_url:
                                media_tasks.append(
                                    download_reddit_video_with_audio_async(session, vid_url, save_path)
                                )
                            else:
                                media_tasks.append(download_media_async(session, vid_url, save_path))

                # Queue comment fetching
                if scrape_comments and post['num_comments'] > 0:
                    comment_tasks.append(fetch_comments_async(session, post['permalink']))

            all_posts.extend(batch_posts)
            total_fetched += len(batch_posts)

            # Fetch comments in parallel; failed fetches come back as
            # exception objects and are skipped.
            if comment_tasks:
                print(f" 💬 Fetching comments for {len(comment_tasks)} posts...")
                comment_results = await asyncio.gather(*comment_tasks, return_exceptions=True)
                for result in comment_results:
                    if isinstance(result, list):
                        all_comments.extend(result)

            print(f" 📊 Progress: {total_fetched}/{limit} posts | {len(all_comments)} comments")

            after = data.get('data', {}).get('after')
            if not after:
                print("🏁 Reached end of available posts")
                break

            await asyncio.sleep(1)  # Small delay between pages

        # Download all media in parallel (session must still be open)
        if media_tasks:
            print(f"\n🖼️ Downloading {len(media_tasks)} media files in parallel...")
            media_results = await asyncio.gather(*media_tasks, return_exceptions=True)
            downloaded = sum(1 for r in media_results if r is True)
            print(f" ✅ Downloaded {downloaded}/{len(media_tasks)} files")

    # Save data (append without header when the file already exists)
    if all_posts:
        df = pd.DataFrame(all_posts)
        if os.path.exists(posts_file):
            df.to_csv(posts_file, mode='a', header=False, index=False)
        else:
            df.to_csv(posts_file, index=False)
        print(f"\n💾 Saved {len(all_posts)} posts to {posts_file}")

    if all_comments:
        comments_file = f"{base_dir}/comments.csv"
        df = pd.DataFrame(all_comments)
        if os.path.exists(comments_file):
            df.to_csv(comments_file, mode='a', header=False, index=False)
        else:
            df.to_csv(comments_file, index=False)
        print(f"💾 Saved {len(all_comments)} comments")

    duration = time.time() - start_time
    print("\n" + "=" * 50)
    print("✅ ASYNC SCRAPE COMPLETE!")
    print(f" 📊 Posts: {len(all_posts)}")
    print(f" 💬 Comments: {len(all_comments)}")
    print(f" 🖼️ Media: {len(media_tasks)} queued")
    print(f" ⏱️ Duration: {duration:.1f}s")
    print(f" ⚡ Speed: {len(all_posts) / duration:.1f} posts/sec")

    return {
        'posts': len(all_posts),
        'comments': len(all_comments),
        'duration': duration
    }


def run_async_scraper(target, limit=100, is_user=False, download_media=True, scrape_comments=True):
    """Wrapper to run async scraper from sync code; returns scrape_async()'s summary dict."""
    return asyncio.run(scrape_async(target, limit, is_user, download_media, scrape_comments))


# CLI for testing
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Async Reddit Scraper")
    parser.add_argument("target", help="Subreddit or username")
    parser.add_argument("--limit", type=int, default=100)
    parser.add_argument("--user", action="store_true")
    parser.add_argument("--no-media", action="store_true")
    parser.add_argument("--no-comments", action="store_true")
    args = parser.parse_args()

    run_async_scraper(
        args.target,
        args.limit,
        args.user,
        not args.no_media,
        not args.no_comments
    )
def search_csv(filepath, query=None, column=None, min_score=None, max_score=None,
               start_date=None, end_date=None, post_type=None, author=None, limit=50):
    """
    Search within a CSV file with various filters.

    The text query is a case-insensitive literal substring match, so
    characters such as ``+`` or ``(`` are safe. Empty cells never match.

    Args:
        filepath: Path to CSV file
        query: Text to search for
        column: Specific column to search in (default: all text columns)
        min_score: Minimum score filter
        max_score: Maximum score filter
        start_date: Start date (YYYY-MM-DD), inclusive
        end_date: End date (YYYY-MM-DD), inclusive
        post_type: Filter by post type (image, video, text, etc.)
        author: Filter by author
        limit: Maximum results to return

    Returns:
        DataFrame with matching results
    """
    if not Path(filepath).exists():
        print(f"❌ File not found: {filepath}")
        return pd.DataFrame()

    df = pd.read_csv(filepath)

    # Text search
    if query:
        if column and column in df.columns:
            search_cols = [column]
        else:
            # Search in all text columns
            search_cols = [c for c in ('title', 'selftext', 'body') if c in df.columns]
        mask = pd.Series(False, index=df.index)
        for col in search_cols:
            # regex=False: treat the query literally (a query like "c++" is
            # not a valid regex). notna(): empty cells become the text "nan"
            # after astype(str) and must not match a query such as "nan".
            hits = df[col].astype(str).str.contains(query, case=False, na=False, regex=False)
            mask |= hits & df[col].notna()
        df = df[mask]

    # Score filter
    if min_score is not None and 'score' in df.columns:
        df = df[df['score'] >= min_score]
    if max_score is not None and 'score' in df.columns:
        df = df[df['score'] <= max_score]

    # Date filter: compare only the YYYY-MM-DD prefix of the ISO timestamp so
    # that posts made during end_date itself are included.
    if 'created_utc' in df.columns and (start_date or end_date):
        day = df['created_utc'].astype(str).str[:10]
        keep = pd.Series(True, index=df.index)
        if start_date:
            keep &= day >= str(start_date)
        if end_date:
            keep &= day <= str(end_date)
        df = df[keep]

    # Post type filter
    if post_type and 'post_type' in df.columns:
        df = df[df['post_type'] == post_type]

    # Author filter
    if author and 'author' in df.columns:
        df = df[df['author'] == author]

    return df.head(limit)
def search_all_data(data_dir='data', query=None, **kwargs):
    """
    Search across all scraped data.

    Args:
        data_dir: Data directory path
        query: Text to search for
        **kwargs: Additional filters passed to search_csv

    Returns:
        Dictionary with results from each subreddit
    """
    results = {}
    data_path = Path(data_dir)
    if not data_path.exists():
        print(f"❌ Data directory not found: {data_dir}")
        return results

    # Find all posts.csv files
    for sub_dir in data_path.iterdir():
        if sub_dir.is_dir():
            posts_file = sub_dir / 'posts.csv'
            if posts_file.exists():
                df = search_csv(str(posts_file), query=query, **kwargs)
                if len(df) > 0:
                    results[sub_dir.name] = df

    # Also check legacy format: flat CSV files directly in the data
    # directory, skipped when a per-target folder already produced results
    # under the same name.
    already_covered = {name.replace('r_', '').replace('u_', '') for name in results}
    for csv_file in data_path.glob('*.csv'):
        if csv_file.stem not in already_covered:
            df = search_csv(str(csv_file), query=query, **kwargs)
            if len(df) > 0:
                results[csv_file.stem] = df

    return results


def print_search_results(results, show_preview=True):
    """
    Pretty print search results.

    Args:
        results: Dict mapping source name to a DataFrame of matches
        show_preview: Also print the first 100 characters of selftext
    """
    total = sum(len(df) for df in results.values())
    print(f"\n🔍 Found {total} results across {len(results)} sources\n")
    print("=" * 70)

    for source, df in results.items():
        print(f"\n📁 {source} ({len(df)} matches)")
        print("-" * 50)
        for _, row in df.iterrows():
            # Prefer the title, fall back to a comment body; cells that are
            # empty in the CSV load as NaN and must not be shown as "nan".
            heading = 'N/A'
            for field in ('title', 'body'):
                value = row.get(field)
                if value is not None and pd.notna(value):
                    heading = value
                    break
            title = str(heading)[:60]
            score = row.get('score', 0)
            print(f" [{score:>4}⬆] {title}...")

            if show_preview and 'selftext' in row:
                selftext = row['selftext']
                # NaN is truthy, so check it explicitly before previewing.
                if pd.notna(selftext) and selftext:
                    preview = str(selftext)[:100].replace('\n', ' ')
                    print(f" └─ {preview}...")
            print()
def advanced_search(data_dir='data', query=None, regex=False, sort_by='score', ascending=False, **kwargs):
    """
    Advanced search with regex support and sorting.

    Args:
        data_dir: Data directory path
        query: Search query (text or regex pattern)
        regex: Treat query as regex pattern
        sort_by: Column to sort results by
        ascending: Sort ascending (default: descending)
        **kwargs: Additional filters: min_score, author, post_type,
            limit (default 100)

    Returns:
        Combined DataFrame of all results (empty if nothing was found or the
        data directory does not exist)
    """
    data_path = Path(data_dir)
    # Match search_all_data: a missing data directory is "no results".
    if not data_path.is_dir():
        return pd.DataFrame()

    all_results = []
    for sub_dir in sorted(data_path.iterdir()):
        posts_file = sub_dir / 'posts.csv'
        if sub_dir.is_dir() and posts_file.exists():
            df = pd.read_csv(posts_file)
            # Records which data folder each row came from (replaces any
            # 'source' column already present in the CSV).
            df['source'] = sub_dir.name
            all_results.append(df)

    if not all_results:
        return pd.DataFrame()

    combined = pd.concat(all_results, ignore_index=True)

    # Apply query
    if query:
        pattern = query if regex else re.escape(query)
        mask = pd.Series(False, index=combined.index)
        for col in ('title', 'selftext'):
            if col in combined.columns:
                hits = combined[col].astype(str).str.contains(pattern, case=False, na=False, regex=True)
                # Empty cells become the text "nan" after astype(str); they
                # must not match.
                mask |= hits & combined[col].notna()
        combined = combined[mask]

    # Apply other filters
    min_score = kwargs.get('min_score')
    # `is not None` so that min_score=0 still filters out negative scores.
    if min_score is not None and 'score' in combined.columns:
        combined = combined[combined['score'] >= min_score]
    if kwargs.get('author') and 'author' in combined.columns:
        combined = combined[combined['author'] == kwargs['author']]
    if kwargs.get('post_type') and 'post_type' in combined.columns:
        combined = combined[combined['post_type'] == kwargs['post_type']]

    # Sort
    if sort_by in combined.columns:
        combined = combined.sort_values(sort_by, ascending=ascending)

    limit = kwargs.get('limit', 100)
    return combined.head(limit)


def get_top_posts(data_dir='data', n=10, by='score'):
    """Get top N posts across all scraped data, ranked by column ``by``."""
    return advanced_search(data_dir, sort_by=by, ascending=False, limit=n)


def get_recent_posts(data_dir='data', n=10):
    """Get most recent posts across all scraped data."""
    return advanced_search(data_dir, sort_by='created_utc', ascending=False, limit=n)


def find_author_posts(data_dir='data', author=None):
    """Find all posts by a specific author (up to 1000)."""
    return advanced_search(data_dir, author=author, limit=1000)


def export_search_results(results, output_path, format='csv'):
    """
    Export search results to file.

    Args:
        results: DataFrame, or dict of DataFrames as returned by
            search_all_data (concatenated before export)
        output_path: Destination file path
        format: 'csv', 'json' or 'excel'

    Raises:
        ValueError: If format is not one of the supported values
    """
    if isinstance(results, dict):
        # pd.concat raises on an empty sequence; no matches is a valid result.
        frames = list(results.values())
        combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    else:
        combined = results

    if format == 'csv':
        combined.to_csv(output_path, index=False)
    elif format == 'json':
        combined.to_json(output_path, orient='records', indent=2)
    elif format == 'excel':
        combined.to_excel(output_path, index=False)
    else:
        # Previously an unknown format wrote nothing but still reported success.
        raise ValueError(f"Unsupported export format: {format!r}")

    print(f"💾 Exported {len(combined)} results to {output_path}")