forgot to include files

Yanxin Lu
2026-02-22 10:51:58 -08:00
parent 2bd9da6326
commit 20ee5c2211
4 changed files with 268 additions and 136 deletions


@@ -1,22 +1,90 @@
 #!/usr/bin/env python3
-"""RSS News Digest — fetch feeds, store articles with full content in SQLite, optionally summarize via Ollama."""
+"""RSS News Digest — fetch feeds, store articles with full content in SQLite, and summarize via Ollama during fetch.
+
+Recommended: run via ./run.sh, which uses `uv` to handle dependencies
+automatically (no manual venv or pip install needed).
+
+When an `ollama` key is present in config.json, each newly fetched article is
+automatically summarized and the result is stored in the database. Ollama
+latency provides natural rate limiting between HTTP requests; when Ollama is
+not configured, a 1-second sleep is used instead.
+
+Uses a requests.Session with automatic retries and browser-like headers to
+handle transient HTTP errors (429/5xx). A configurable per-feed article cap
+helps avoid overwhelming upstream servers.
+
+Use ``--test`` to smoke-test feed fetching and/or Ollama summarization without
+writing to the database.
+"""
 import argparse
 import json
 import logging
 import sqlite3
 import sys
+import time
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from time import mktime
-from urllib.request import Request, urlopen
-from urllib.error import URLError
 
 import feedparser
+import requests
 from bs4 import BeautifulSoup
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
 
 logger = logging.getLogger("news_digest")
 
+# Hardcoded test articles for --test summary (one English, one Chinese)
+_TEST_ARTICLES = [
+    {
+        "title": "Global Semiconductor Shortage Eases as New Factories Come Online",
+        "content": (
+            "The global chip shortage that disrupted industries from automotive to "
+            "consumer electronics is finally showing signs of relief. Major semiconductor "
+            "manufacturers including TSMC, Samsung, and Intel have begun production at new "
+            "fabrication plants in Arizona, Texas, and Japan. Industry analysts project that "
+            "global chip capacity will increase by 15% over the next 18 months, potentially "
+            "leading to a supply surplus in certain categories. The shift has already begun "
+            "to impact pricing, with memory chip costs dropping 12% in the last quarter."
+        ),
+    },
+    {
+        "title": "中国新能源汽车出口量首次突破年度600万辆大关",
+        "content": (
+            "据中国汽车工业协会最新数据,2025年中国新能源汽车出口量首次突破600万辆,"
+            "同比增长38%。比亚迪、上汽、蔚来等品牌在东南亚、欧洲和南美市场持续扩张。"
+            "分析人士指出,中国在电池技术和供应链方面的优势使其产品在全球市场具有较强"
+            "竞争力,但欧盟加征的反补贴关税可能对未来增长构成挑战。"
+        ),
+    },
+]
+
+
+def _build_session() -> requests.Session:
+    """Create a requests session with automatic retries and browser-like headers."""
+    session = requests.Session()
+    retry = Retry(
+        total=3,
+        backoff_factor=1,  # 1s, 2s, 4s between retries
+        status_forcelist=[429, 500, 502, 503, 504],
+        respect_retry_after_header=True,
+    )
+    adapter = HTTPAdapter(max_retries=retry)
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+    session.headers.update({
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                      "AppleWebKit/537.36 (KHTML, like Gecko) "
+                      "Chrome/131.0.0.0 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.9",
+    })
+    return session
+
+
+_session = _build_session()
+
+
 def load_config(path: str) -> dict:
     with open(path, encoding="utf-8") as f:
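For reference, a config.json exercising everything this script reads might look like the sketch below. The feed entry is hypothetical; the settings values restate the defaults hardcoded in later hunks (hours_lookback 24, retention_days 30, and max_articles_per_feed 0, which disables the cap), and the ollama block matches the fallbacks used in the code.

    {
      "settings": {
        "hours_lookback": 24,
        "retention_days": 30,
        "max_articles_per_feed": 20
      },
      "feeds": [
        {"name": "Example Feed", "url": "https://example.com/rss.xml", "enabled": true}
      ],
      "ollama": {
        "model": "kamekichi128/qwen3-4b-instruct-2507",
        "prompt": "Summarize the following news article in 2-3 concise sentences:"
      }
    }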
@@ -33,6 +101,7 @@ def init_db(db_path: str) -> sqlite3.Connection:
             title TEXT,
             description TEXT,
             content TEXT,
+            summary TEXT,
             published_date TEXT,
             fetched_date TEXT NOT NULL,
             feed_name TEXT,
@@ -41,11 +110,6 @@ def init_db(db_path: str) -> sqlite3.Connection:
             author TEXT
         )
     """)
-    # Migrate: add content column if missing (existing DBs)
-    try:
-        conn.execute("ALTER TABLE articles ADD COLUMN content TEXT")
-    except sqlite3.OperationalError:
-        pass
     conn.commit()
     return conn
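One side effect of dropping the defensive ALTER TABLE above: CREATE TABLE IF NOT EXISTS will not add the new summary column to a database created before this commit. If older databases need to keep working, a guard in the same style as the removed migration would cover it (a sketch, not part of this commit):

    try:
        conn.execute("ALTER TABLE articles ADD COLUMN summary TEXT")
    except sqlite3.OperationalError:
        pass  # column already exists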
@@ -65,15 +129,12 @@ def is_within_lookback(dt: datetime | None, hours: int) -> bool:
     return dt >= cutoff
 
 
-_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko)"
-
-
 def fetch_feed(url: str) -> list[dict]:
     try:
-        req = Request(url, headers={"User-Agent": _USER_AGENT})
-        with urlopen(req, timeout=30) as resp:
-            raw = resp.read()
-    except (URLError, OSError) as e:
+        resp = _session.get(url, timeout=30)
+        resp.raise_for_status()
+        raw = resp.content
+    except requests.RequestException as e:
         logger.warning("Failed to fetch %s: %s", url, e)
         return []
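With the Retry policy from _build_session, a 429 or 5xx response is retried up to three times with exponential backoff (honoring any Retry-After header) before requests.RequestException reaches this handler, so transient feed hiccups no longer surface as failures. A quick interactive check, with a hypothetical feed URL:

    entries = fetch_feed("https://example.com/rss.xml")  # hypothetical URL
    for entry in entries[:3]:
        # entries behave like dicts (title, link, published, author, ...)
        print(entry.get("title"), "->", entry.get("link"))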
@@ -86,10 +147,10 @@ def fetch_feed(url: str) -> list[dict]:
 def fetch_content(url: str) -> str | None:
     try:
-        req = Request(url, headers={"User-Agent": _USER_AGENT})
-        with urlopen(req, timeout=15) as resp:
-            html = resp.read()
-    except (URLError, OSError) as e:
+        resp = _session.get(url, timeout=15)
+        resp.raise_for_status()
+        html = resp.content
+    except requests.RequestException as e:
         logger.warning("Failed to fetch content from %s: %s", url, e)
         return None
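The remainder of fetch_content sits outside this hunk. Given the existing BeautifulSoup import, the extraction step presumably resembles the sketch below; this is an assumption, not the actual body:

    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style"]):
        tag.decompose()  # strip non-content markup
    text = soup.get_text(separator="\n", strip=True)
    return text or None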
@@ -157,15 +218,8 @@ def purge_old_articles(conn: sqlite3.Connection, days: int) -> int:
 def get_recent_articles(conn: sqlite3.Connection, hours: int) -> list[dict]:
     cutoff = (datetime.now(timezone.utc) - timedelta(hours=hours)).isoformat()
     rows = conn.execute(
-        "SELECT * FROM articles WHERE fetched_date >= ? ORDER BY id", (cutoff,)
-    ).fetchall()
-    return [dict(r) for r in rows]
-
-
-def get_articles_by_ids(conn: sqlite3.Connection, ids: list[int]) -> list[dict]:
-    placeholders = ",".join("?" for _ in ids)
-    rows = conn.execute(
-        f"SELECT * FROM articles WHERE id IN ({placeholders}) ORDER BY id", ids
+        "SELECT * FROM articles WHERE published_date >= ? OR published_date IS NULL ORDER BY id",
+        (cutoff,),
     ).fetchall()
     return [dict(r) for r in rows]
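The rewritten query changes two things at once: recency is now judged by published_date rather than fetched_date, and articles whose feed supplied no publication date are kept instead of silently dropped. The IS NULL branch matters because published_date >= ? is never true for a NULL value in SQLite. A self-contained check of the predicate:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE articles (id INTEGER PRIMARY KEY, published_date TEXT)")
    conn.executemany(
        "INSERT INTO articles (published_date) VALUES (?)",
        [("2026-02-22T00:00:00+00:00",), ("2020-01-01T00:00:00+00:00",), (None,)],
    )
    rows = conn.execute(
        "SELECT id FROM articles WHERE published_date >= ? OR published_date IS NULL ORDER BY id",
        ("2026-02-21T00:00:00+00:00",),
    ).fetchall()
    print([r[0] for r in rows])  # [1, 3]: the recent and the undated rows; stale row 2 is excluded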
@@ -194,16 +248,101 @@ def generate_summary(title: str, description: str | None, content: str | None, m
     return None
 
 
+def _run_test(mode: str, config: dict) -> None:
+    """Run smoke tests for feed fetching, summarization, or both.
+
+    All JSON results go to stdout; status messages go to stderr.
+    """
+    if mode not in ("", "feed", "summary"):
+        print(f"Unknown test mode: {mode!r} (use 'feed', 'summary', or omit)", file=sys.stderr)
+        sys.exit(1)
+
+    feed_article = None  # may be populated by feed test for use in full mode
+
+    # --- Feed test ---
+    if mode in ("", "feed"):
+        print("=== Feed test ===", file=sys.stderr)
+        feeds = config.get("feeds", [])
+        enabled = [f for f in feeds if f.get("enabled", True)]
+        if not enabled:
+            print("FAIL: no enabled feeds in config", file=sys.stderr)
+            sys.exit(1)
+        feed_cfg = enabled[0]
+        url = feed_cfg["url"]
+        name = feed_cfg.get("name", url)
+        print(f"Fetching feed: {name} ({url})", file=sys.stderr)
+        entries = fetch_feed(url)
+        if not entries:
+            print("FAIL: no entries returned from feed", file=sys.stderr)
+            sys.exit(1)
+        entry = entries[0]
+        link = entry.get("link", "")
+        title = entry.get("title", "")
+        print(f"Fetching content: {link}", file=sys.stderr)
+        content = fetch_content(link) if link else None
+        result = {
+            "feed": name,
+            "title": title,
+            "url": link,
+            "content_length": len(content) if content else 0,
+        }
+        print(json.dumps(result, ensure_ascii=False, indent=2))
+        if content:
+            print("PASS: feed fetch", file=sys.stderr)
+            feed_article = {"title": title, "content": content}
+        else:
+            print("FAIL: could not fetch article content", file=sys.stderr)
+            if mode == "feed":
+                sys.exit(1)
+
+    # --- Summary test ---
+    if mode in ("", "summary"):
+        print("=== Summary test ===", file=sys.stderr)
+        ollama_cfg = config.get("ollama")
+        if not ollama_cfg:
+            print("FAIL: no 'ollama' key in config", file=sys.stderr)
+            sys.exit(1)
+        model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507")
+        prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
+        # Build test inputs: hardcoded articles + fetched article (full mode only)
+        articles = list(_TEST_ARTICLES)
+        if feed_article:
+            articles.append(feed_article)
+        all_ok = True
+        for article in articles:
+            print(f"Summarizing: {article['title']}", file=sys.stderr)
+            summary = generate_summary(article["title"], None, article["content"], model, prompt)
+            result = {"title": article["title"], "summary": summary}
+            print(json.dumps(result, ensure_ascii=False, indent=2))
+            if not summary:
+                all_ok = False
+        if all_ok:
+            print("PASS: summary", file=sys.stderr)
+        else:
+            print("FAIL: one or more summaries failed", file=sys.stderr)
+            sys.exit(1)
+
+
 def main():
     parser = argparse.ArgumentParser(description="RSS News Digest")
     parser.add_argument("-c", "--config", default="config.json", help="Config file path")
     parser.add_argument("-d", "--database", default="news_digest.db", help="SQLite database path")
     parser.add_argument("--hours", type=int, help="Override lookback hours")
-    parser.add_argument("-f", "--fields", default="id,title,url", help="Comma-separated output fields")
-    parser.add_argument("--digest", help="Article IDs to summarize (comma-separated, e.g. 1,3,7)")
+    parser.add_argument("-f", "--fields", default="id,title,url,published_date,fetched_date,feed_name", help="Comma-separated output fields")
     parser.add_argument("--purge-only", action="store_true", help="Only purge old articles")
     parser.add_argument("--no-fetch", action="store_true", help="Skip fetching feeds, only query stored articles")
     parser.add_argument("-v", "--verbose", action="store_true", help="Debug logging to stderr")
+    parser.add_argument("--test", nargs="?", const="", metavar="MODE",
+                        help="Smoke test: 'feed', 'summary', or omit for full pipeline")
     args = parser.parse_args()
 
     logging.basicConfig(
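Because --test uses nargs="?" with const="", a bare --test selects the full pipeline while an explicit MODE narrows it. Assuming the script file is named news_digest.py (the file name is not shown in this view):

    ./news_digest.py --test feed       # fetch one feed entry and its article content
    ./news_digest.py --test summary    # summarize the hardcoded test articles via Ollama
    ./news_digest.py --test            # both; the fetched article is summarized as well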
@@ -218,39 +357,19 @@ def main():
         sys.exit(1)
     config = load_config(str(config_path))
 
+    # Handle --test before any DB operations
+    if args.test is not None:
+        _run_test(args.test, config)
+        return
+
     settings = config.get("settings", {})
     hours_lookback = args.hours or settings.get("hours_lookback", 24)
     retention_days = settings.get("retention_days", 30)
+    max_per_feed = settings.get("max_articles_per_feed", 0)
 
     conn = init_db(args.database)
 
-    # Digest mode — summarize specified articles, then exit
-    if args.digest:
-        ollama_cfg = config.get("ollama", {})
-        model = ollama_cfg.get("model", "qwen3")
-        prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
-        ids = [int(x.strip()) for x in args.digest.split(",")]
-        articles = get_articles_by_ids(conn, ids)
-        if not articles:
-            logger.warning("No articles found for IDs: %s", ids)
-        results = []
-        for article in articles:
-            logger.debug("Summarizing article %d: %s", article["id"], article["title"])
-            summary = generate_summary(article["title"], article.get("description"), article.get("content"), model, prompt)
-            results.append({
-                "id": article["id"],
-                "title": article["title"],
-                "url": article["url"],
-                "summary": summary,
-            })
-        print(json.dumps(results, ensure_ascii=False, indent=2))
-        conn.close()
-        return
-
     # Purge old articles
     deleted = purge_old_articles(conn, retention_days)
     if deleted:
@@ -266,6 +385,16 @@ def main():
     feeds = config.get("feeds", [])
     total_new = 0
 
+    # Read ollama config once for summarization during fetch
+    ollama_cfg = config.get("ollama")
+    if ollama_cfg:
+        ollama_model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507")
+        ollama_prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
+        logger.debug("Ollama summarization enabled (model: %s)", ollama_model)
+    else:
+        ollama_model = ollama_prompt = None
+        logger.debug("Ollama not configured; skipping summarization")
+
     for feed_cfg in feeds:
         if not feed_cfg.get("enabled", True):
             logger.debug("Skipping disabled feed: %s", feed_cfg.get("name"))
@@ -297,18 +426,37 @@ def main():
                 "author": entry.get("author"),
             })
 
+        # Cap articles per feed to avoid flooding the DB and downstream fetches
+        if max_per_feed > 0:
+            articles = articles[:max_per_feed]
+
         new_urls = save_articles(conn, articles)
         total_new += len(new_urls)
         logger.info("Feed '%s': %d new articles (of %d within lookback)",
                     feed_cfg.get("name", url), len(new_urls), len(articles))
 
-        # Fetch full content for newly inserted articles
-        for article_url in new_urls:
+        # Fetch full content and optionally summarize newly inserted articles
+        for i, article_url in enumerate(new_urls):
+            if i > 0 and not ollama_cfg:
+                time.sleep(1)  # rate limit when Ollama isn't providing natural delay
             logger.debug("Fetching content: %s", article_url)
             content = fetch_content(article_url)
             if content:
-                conn.execute("UPDATE articles SET content = ? WHERE url = ?", (content, article_url))
                 logger.debug("Saved content (%d chars) for %s", len(content), article_url)
+            summary = None
+            if ollama_cfg:
+                row = conn.execute(
+                    "SELECT title, description FROM articles WHERE url = ?", (article_url,)
+                ).fetchone()
+                if row:
+                    summary = generate_summary(row["title"], row["description"], content, ollama_model, ollama_prompt)
+                if summary:
+                    logger.debug("Generated summary for %s", article_url)
+                else:
+                    if i > 0:
+                        time.sleep(1)  # fallback rate limit on summary failure
+            conn.execute(
+                "UPDATE articles SET content = ?, summary = ? WHERE url = ?",
+                (content, summary, article_url),
+            )
         conn.commit()
 
     logger.info("Total new articles saved: %d", total_new)