Layer 1: Scrapers (Python scripts, each <200 lines)
Layer 2: Orchestration (GitHub Actions / cron on VPS)
Layer 3: Monitoring (dead simple: webhook → Telegram)
Layer 1: Scrapers (Python scripts, each <200 lines)
Layer 2: Orchestration (GitHub Actions / cron on VPS)
Layer 3: Monitoring (dead simple: webhook → Telegram)
Layer 1: Scrapers (Python scripts, each <200 lines)
Layer 2: Orchestration (GitHub Actions / cron on VPS)
Layer 3: Monitoring (dead simple: webhook → Telegram)
# scraper_hackernews.py — fetch the current HN top 30 stories to JSON.
import json
import os
from datetime import datetime

import httpx

HN_API = "https://hacker-news.firebaseio.com/v0"


def scrape():
    """Fetch the top 30 Hacker News stories and write them to output/hn_top30.json.

    Returns:
        int: number of stories written.

    Raises:
        httpx.HTTPError: on network failure or a non-2xx API response.
    """
    resp = httpx.get(f"{HN_API}/topstories.json", timeout=10)
    resp.raise_for_status()  # fail loudly instead of crashing inside .json()
    story_ids = resp.json()[:30]
    stories = []
    for sid in story_ids:
        item_resp = httpx.get(f"{HN_API}/item/{sid}.json", timeout=10)
        item_resp.raise_for_status()
        story = item_resp.json() or {}  # deleted/dead items return JSON null
        stories.append({
            "title": story.get("title"),
            "url": story.get("url"),
            "score": story.get("score"),
            "time": datetime.fromtimestamp(story.get("time", 0)).isoformat(),
        })
    os.makedirs("output", exist_ok=True)  # first run: directory may not exist yet
    with open("output/hn_top30.json", "w") as f:
        json.dump(stories, f, indent=2)
    return len(stories)


if __name__ == "__main__":
    count = scrape()
    print(f"Scraped {count} stories")
# scraper_hackernews.py — fetch the current HN top 30 stories to JSON.
import json
import os
from datetime import datetime

import httpx

HN_API = "https://hacker-news.firebaseio.com/v0"


def scrape():
    """Fetch the top 30 Hacker News stories and write them to output/hn_top30.json.

    Returns:
        int: number of stories written.

    Raises:
        httpx.HTTPError: on network failure or a non-2xx API response.
    """
    resp = httpx.get(f"{HN_API}/topstories.json", timeout=10)
    resp.raise_for_status()  # fail loudly instead of crashing inside .json()
    story_ids = resp.json()[:30]
    stories = []
    for sid in story_ids:
        item_resp = httpx.get(f"{HN_API}/item/{sid}.json", timeout=10)
        item_resp.raise_for_status()
        story = item_resp.json() or {}  # deleted/dead items return JSON null
        stories.append({
            "title": story.get("title"),
            "url": story.get("url"),
            "score": story.get("score"),
            "time": datetime.fromtimestamp(story.get("time", 0)).isoformat(),
        })
    os.makedirs("output", exist_ok=True)  # first run: directory may not exist yet
    with open("output/hn_top30.json", "w") as f:
        json.dump(stories, f, indent=2)
    return len(stories)


if __name__ == "__main__":
    count = scrape()
    print(f"Scraped {count} stories")
# scraper_hackernews.py — fetch the current HN top 30 stories to JSON.
import json
import os
from datetime import datetime

import httpx

HN_API = "https://hacker-news.firebaseio.com/v0"


def scrape():
    """Fetch the top 30 Hacker News stories and write them to output/hn_top30.json.

    Returns:
        int: number of stories written.

    Raises:
        httpx.HTTPError: on network failure or a non-2xx API response.
    """
    resp = httpx.get(f"{HN_API}/topstories.json", timeout=10)
    resp.raise_for_status()  # fail loudly instead of crashing inside .json()
    story_ids = resp.json()[:30]
    stories = []
    for sid in story_ids:
        item_resp = httpx.get(f"{HN_API}/item/{sid}.json", timeout=10)
        item_resp.raise_for_status()
        story = item_resp.json() or {}  # deleted/dead items return JSON null
        stories.append({
            "title": story.get("title"),
            "url": story.get("url"),
            "score": story.get("score"),
            "time": datetime.fromtimestamp(story.get("time", 0)).isoformat(),
        })
    os.makedirs("output", exist_ok=True)  # first run: directory may not exist yet
    with open("output/hn_top30.json", "w") as f:
        json.dump(stories, f, indent=2)
    return len(stories)


if __name__ == "__main__":
    count = scrape()
    print(f"Scraped {count} stories")
# .github/workflows/scrape-hn.yml
name: Scrape Hacker News

on:
  schedule:
    - cron: "0 */6 * * *"  # Every 6 hours
  workflow_dispatch:

# The default GITHUB_TOKEN is read-only; git push needs contents: write.
permissions:
  contents: write

jobs:
  scrape:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - run: pip install httpx
      - run: python scraper_hackernews.py
      - name: Commit results
        run: |
          git config user.name "Scraper Bot"
          git config user.email "[email protected]"
          git add output/
          git diff --cached --quiet || git commit -m "data: HN $(date -u +%Y-%m-%d)"
          git push
# .github/workflows/scrape-hn.yml
name: Scrape Hacker News

on:
  schedule:
    - cron: "0 */6 * * *"  # Every 6 hours
  workflow_dispatch:

# The default GITHUB_TOKEN is read-only; git push needs contents: write.
permissions:
  contents: write

jobs:
  scrape:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - run: pip install httpx
      - run: python scraper_hackernews.py
      - name: Commit results
        run: |
          git config user.name "Scraper Bot"
          git config user.email "[email protected]"
          git add output/
          git diff --cached --quiet || git commit -m "data: HN $(date -u +%Y-%m-%d)"
          git push
# .github/workflows/scrape-hn.yml
name: Scrape Hacker News

on:
  schedule:
    - cron: "0 */6 * * *"  # Every 6 hours
  workflow_dispatch:

# The default GITHUB_TOKEN is read-only; git push needs contents: write.
permissions:
  contents: write

jobs:
  scrape:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - run: pip install httpx
      - run: python scraper_hackernews.py
      - name: Commit results
        run: |
          git config user.name "Scraper Bot"
          git config user.email "[email protected]"
          git add output/
          git diff --cached --quiet || git commit -m "data: HN $(date -u +%Y-%m-%d)"
          git push
# monitor.py — send Telegram alerts when scraper output looks wrong.
import json
import os

import httpx

TELEGRAM_BOT = os.environ.get("TG_BOT_TOKEN")
CHAT_ID = os.environ.get("TG_CHAT_ID")


def alert(message: str):
    """Send *message* to the configured Telegram chat.

    No-op when TG_BOT_TOKEN / TG_CHAT_ID are unset; network errors are
    swallowed — the monitor must never crash the pipeline it watches.
    """
    if TELEGRAM_BOT and CHAT_ID:
        try:
            httpx.post(
                f"https://api.telegram.org/bot{TELEGRAM_BOT}/sendMessage",
                json={"chat_id": CHAT_ID, "text": f"🚨 {message}"},
                timeout=10,  # a hung Telegram call must not stall the run
            )
        except httpx.HTTPError:
            pass


def check_output(filepath: str, min_items: int = 1):
    """Alert if *filepath* is missing, unparsable, or has fewer than *min_items* items."""
    try:
        # Keep the try body minimal: only the read/parse can legitimately fail.
        with open(filepath) as f:
            data = json.load(f)
    except Exception as e:
        alert(f"Failed: {filepath} — {e}")
        return
    if len(data) < min_items:
        alert(f"Low data: {filepath} has {len(data)} items (expected {min_items}+)")
# monitor.py — send Telegram alerts when scraper output looks wrong.
import json
import os

import httpx

TELEGRAM_BOT = os.environ.get("TG_BOT_TOKEN")
CHAT_ID = os.environ.get("TG_CHAT_ID")


def alert(message: str):
    """Send *message* to the configured Telegram chat.

    No-op when TG_BOT_TOKEN / TG_CHAT_ID are unset; network errors are
    swallowed — the monitor must never crash the pipeline it watches.
    """
    if TELEGRAM_BOT and CHAT_ID:
        try:
            httpx.post(
                f"https://api.telegram.org/bot{TELEGRAM_BOT}/sendMessage",
                json={"chat_id": CHAT_ID, "text": f"🚨 {message}"},
                timeout=10,  # a hung Telegram call must not stall the run
            )
        except httpx.HTTPError:
            pass


def check_output(filepath: str, min_items: int = 1):
    """Alert if *filepath* is missing, unparsable, or has fewer than *min_items* items."""
    try:
        # Keep the try body minimal: only the read/parse can legitimately fail.
        with open(filepath) as f:
            data = json.load(f)
    except Exception as e:
        alert(f"Failed: {filepath} — {e}")
        return
    if len(data) < min_items:
        alert(f"Low data: {filepath} has {len(data)} items (expected {min_items}+)")
# monitor.py — send Telegram alerts when scraper output looks wrong.
import json
import os

import httpx

TELEGRAM_BOT = os.environ.get("TG_BOT_TOKEN")
CHAT_ID = os.environ.get("TG_CHAT_ID")


def alert(message: str):
    """Send *message* to the configured Telegram chat.

    No-op when TG_BOT_TOKEN / TG_CHAT_ID are unset; network errors are
    swallowed — the monitor must never crash the pipeline it watches.
    """
    if TELEGRAM_BOT and CHAT_ID:
        try:
            httpx.post(
                f"https://api.telegram.org/bot{TELEGRAM_BOT}/sendMessage",
                json={"chat_id": CHAT_ID, "text": f"🚨 {message}"},
                timeout=10,  # a hung Telegram call must not stall the run
            )
        except httpx.HTTPError:
            pass


def check_output(filepath: str, min_items: int = 1):
    """Alert if *filepath* is missing, unparsable, or has fewer than *min_items* items."""
    try:
        # Keep the try body minimal: only the read/parse can legitimately fail.
        with open(filepath) as f:
            data = json.load(f)
    except Exception as e:
        alert(f"Failed: {filepath} — {e}")
        return
    if len(data) < min_items:
        alert(f"Low data: {filepath} has {len(data)} items (expected {min_items}+)")
import time


def scrape_with_retry(func, max_retries=3):
    """Call *func*, retrying with exponential backoff on any exception.

    Args:
        func: Zero-argument callable to invoke.
        max_retries: Total number of attempts before giving up.

    Returns:
        Whatever *func* returns on its first successful attempt.

    Raises:
        Re-raises the last exception from *func* after all attempts fail
        (an alert is sent first).
    """
    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            if attempt == max_retries - 1:
                # Out of attempts: notify, then propagate the final error.
                alert(f"Final failure after {max_retries} attempts: {e}")
                raise
            # Backoff 1s, 2s for max_retries=3 — the last failed attempt
            # raises instead of sleeping, so 2**(max_retries-1) never occurs.
            time.sleep(2 ** attempt)
import time


def scrape_with_retry(func, max_retries=3):
    """Call *func*, retrying with exponential backoff on any exception.

    Args:
        func: Zero-argument callable to invoke.
        max_retries: Total number of attempts before giving up.

    Returns:
        Whatever *func* returns on its first successful attempt.

    Raises:
        Re-raises the last exception from *func* after all attempts fail
        (an alert is sent first).
    """
    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            if attempt == max_retries - 1:
                # Out of attempts: notify, then propagate the final error.
                alert(f"Final failure after {max_retries} attempts: {e}")
                raise
            # Backoff 1s, 2s for max_retries=3 — the last failed attempt
            # raises instead of sleeping, so 2**(max_retries-1) never occurs.
            time.sleep(2 ** attempt)
import time def scrape_with_retry(func, max_retries=3): for attempt in range(max_retries): try: return func() except Exception as e: if attempt == max_retries - 1: alert(f"Final failure after {max_retries} attempts: {e}") raise time.sleep(2 ** attempt) # 1s, 2s, 4s - What happens when the target site changes its HTML?
- How do you retry failed runs without duplicate data?
- How do you monitor 77 scrapers without going insane?

- Fetch data from ONE source
- Parse it into JSON
- Save to ONE output file

- Run failed → Telegram notification
- Data looks wrong → Telegram notification

- 10x more stable (no CSS selector breakage)
- 5x faster (JSON vs parsing DOM)
- Free (most APIs have generous free tiers)