comment from youlu

This commit is contained in:
Yanxin Lu
2026-02-22 11:03:00 -08:00
parent 20ee5c2211
commit c0d0a32c8c
2 changed files with 103 additions and 96 deletions

View File

@@ -159,3 +159,11 @@ HTTP requests are made through a shared `requests.Session` with:
- **Rate limiting** (when Ollama is configured, the latency of each summarization call naturally spaces out fetches; otherwise a 1-second sleep between fetches is used as a fallback)
Note that some sites (e.g. paywalled or bot-protected ones) may still return errors — in those cases the content field is left empty and the RSS description is used as a fallback for summaries.
## Design notes
- **Articles without dates are included by default.** `is_within_lookback` returns `True` when an article has no published date, and the query uses `OR published_date IS NULL`. This is intentional — silently dropping articles just because the feed omits a date would be worse than including them. If you only want dated articles, filter on `published_date` in the output.
- **`generate_summary` accepts both `description` and `content`.** The `description` parameter is not redundant — `body = content or description` uses the RSS description as a fallback when `fetch_content()` fails and returns `None`. This ensures articles still get summarized even when the full page can't be fetched.
- **`fetch_content` uses a chained ternary for element selection.** The expression `article if article else soup.body if soup.body else soup` picks the most specific container available. This is a common Python pattern and reads top-to-bottom as a priority list.

View File

@@ -369,105 +369,104 @@ def main():
max_per_feed = settings.get("max_articles_per_feed", 0)
conn = init_db(args.database)
try:
# Purge old articles
deleted = purge_old_articles(conn, retention_days)
if deleted:
logger.info("Purged %d articles older than %d days", deleted, retention_days)
# Purge old articles
deleted = purge_old_articles(conn, retention_days)
if deleted:
logger.info("Purged %d articles older than %d days", deleted, retention_days)
if args.purge_only:
logger.info("Purge-only mode; exiting")
return
if args.purge_only:
logger.info("Purge-only mode; exiting")
# Fetch feeds
if not args.no_fetch:
feeds = config.get("feeds", [])
total_new = 0
# Read ollama config once for summarization during fetch
ollama_cfg = config.get("ollama")
if ollama_cfg:
ollama_model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507")
ollama_prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
logger.debug("Ollama summarization enabled (model: %s)", ollama_model)
else:
ollama_model = ollama_prompt = None
logger.debug("Ollama not configured; skipping summarization")
for feed_cfg in feeds:
if not feed_cfg.get("enabled", True):
logger.debug("Skipping disabled feed: %s", feed_cfg.get("name"))
continue
url = feed_cfg["url"]
logger.debug("Fetching feed: %s (%s)", feed_cfg.get("name", url), url)
entries = fetch_feed(url)
logger.debug("Got %d entries from %s", len(entries), feed_cfg.get("name", url))
articles = []
for entry in entries:
pub_date = parse_article_date(entry)
if not is_within_lookback(pub_date, hours_lookback):
continue
link = entry.get("link", "")
if not link:
continue
articles.append({
"url": link,
"title": entry.get("title"),
"description": entry.get("summary"),
"published_date": pub_date.isoformat() if pub_date else None,
"feed_name": feed_cfg.get("name"),
"feed_url": url,
"category": feed_cfg.get("category"),
"author": entry.get("author"),
})
# Cap articles per feed to avoid flooding the DB and downstream fetches
if max_per_feed > 0:
articles = articles[:max_per_feed]
new_urls = save_articles(conn, articles)
total_new += len(new_urls)
logger.info("Feed '%s': %d new articles (of %d within lookback)",
feed_cfg.get("name", url), len(new_urls), len(articles))
# Fetch full content and optionally summarize newly inserted articles
for i, article_url in enumerate(new_urls):
if i > 0 and not ollama_cfg:
time.sleep(1) # rate limit when Ollama isn't providing natural delay
logger.debug("Fetching content: %s", article_url)
content = fetch_content(article_url)
summary = None
if ollama_cfg:
row = conn.execute(
"SELECT title, description FROM articles WHERE url = ?", (article_url,)
).fetchone()
if row:
summary = generate_summary(row["title"], row["description"], content, ollama_model, ollama_prompt)
if summary:
logger.debug("Generated summary for %s", article_url)
else:
if i > 0:
time.sleep(1) # fallback rate limit on summary failure
conn.execute(
"UPDATE articles SET content = ?, summary = ? WHERE url = ?",
(content, summary, article_url),
)
conn.commit()
logger.info("Total new articles saved: %d", total_new)
# Output recent articles
recent = get_recent_articles(conn, hours_lookback)
fields = [f.strip() for f in args.fields.split(",")]
output = [{k: article[k] for k in fields if k in article} for article in recent]
print(json.dumps(output, ensure_ascii=False, indent=2))
finally:
conn.close()
return
# Fetch feeds
if not args.no_fetch:
feeds = config.get("feeds", [])
total_new = 0
# Read ollama config once for summarization during fetch
ollama_cfg = config.get("ollama")
if ollama_cfg:
ollama_model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507")
ollama_prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
logger.debug("Ollama summarization enabled (model: %s)", ollama_model)
else:
ollama_model = ollama_prompt = None
logger.debug("Ollama not configured; skipping summarization")
for feed_cfg in feeds:
if not feed_cfg.get("enabled", True):
logger.debug("Skipping disabled feed: %s", feed_cfg.get("name"))
continue
url = feed_cfg["url"]
logger.debug("Fetching feed: %s (%s)", feed_cfg.get("name", url), url)
entries = fetch_feed(url)
logger.debug("Got %d entries from %s", len(entries), feed_cfg.get("name", url))
articles = []
for entry in entries:
pub_date = parse_article_date(entry)
if not is_within_lookback(pub_date, hours_lookback):
continue
link = entry.get("link", "")
if not link:
continue
articles.append({
"url": link,
"title": entry.get("title"),
"description": entry.get("summary"),
"published_date": pub_date.isoformat() if pub_date else None,
"feed_name": feed_cfg.get("name"),
"feed_url": url,
"category": feed_cfg.get("category"),
"author": entry.get("author"),
})
# Cap articles per feed to avoid flooding the DB and downstream fetches
if max_per_feed > 0:
articles = articles[:max_per_feed]
new_urls = save_articles(conn, articles)
total_new += len(new_urls)
logger.info("Feed '%s': %d new articles (of %d within lookback)",
feed_cfg.get("name", url), len(new_urls), len(articles))
# Fetch full content and optionally summarize newly inserted articles
for i, article_url in enumerate(new_urls):
if i > 0 and not ollama_cfg:
time.sleep(1) # rate limit when Ollama isn't providing natural delay
logger.debug("Fetching content: %s", article_url)
content = fetch_content(article_url)
summary = None
if ollama_cfg:
row = conn.execute(
"SELECT title, description FROM articles WHERE url = ?", (article_url,)
).fetchone()
if row:
summary = generate_summary(row["title"], row["description"], content, ollama_model, ollama_prompt)
if summary:
logger.debug("Generated summary for %s", article_url)
else:
if i > 0:
time.sleep(1) # fallback rate limit on summary failure
conn.execute(
"UPDATE articles SET content = ?, summary = ? WHERE url = ?",
(content, summary, article_url),
)
conn.commit()
logger.info("Total new articles saved: %d", total_new)
# Output recent articles
recent = get_recent_articles(conn, hours_lookback)
fields = [f.strip() for f in args.fields.split(",")]
output = [{k: article[k] for k in fields if k in article} for article in recent]
print(json.dumps(output, ensure_ascii=False, indent=2))
conn.close()
if __name__ == "__main__":