From c0d0a32c8cde441032f253e4a6a5b2cdd3f5e5be Mon Sep 17 00:00:00 2001 From: Yanxin Lu Date: Sun, 22 Feb 2026 11:03:00 -0800 Subject: [PATCH] comment from youlu --- scripts/news_digest/README.md | 8 ++ scripts/news_digest/main.py | 191 +++++++++++++++++----------------- 2 files changed, 103 insertions(+), 96 deletions(-) diff --git a/scripts/news_digest/README.md b/scripts/news_digest/README.md index d01ca43..0d5396e 100644 --- a/scripts/news_digest/README.md +++ b/scripts/news_digest/README.md @@ -159,3 +159,11 @@ HTTP requests are made through a shared `requests.Session` with: - **Rate limiting** (Ollama latency between fetches when configured; 1-second fallback otherwise) Some sites (e.g. paywalled or bot-protected) may still return errors — in those cases the content field is left empty and the RSS description is used as a fallback for summaries. + +## Design notes + +- **Articles without dates are included by default.** `is_within_lookback` returns `True` when an article has no published date, and the query uses `OR published_date IS NULL`. This is intentional — silently dropping articles just because the feed omits a date would be worse than including them. If you only want dated articles, filter on `published_date` in the output. + +- **`generate_summary` accepts both `description` and `content`.** The `description` parameter is not redundant — `body = content or description` uses the RSS description as a fallback when `fetch_content()` fails and returns `None`. This ensures articles still get summarized even when the full page can't be fetched. + +- **`fetch_content` uses a chained ternary for element selection.** The expression `article if article else soup.body if soup.body else soup` picks the most specific container available. This is a common Python pattern and reads top-to-bottom as a priority list. diff --git a/scripts/news_digest/main.py b/scripts/news_digest/main.py index f12e464..12b244c 100644 --- a/scripts/news_digest/main.py +++ b/scripts/news_digest/main.py @@ -369,105 +369,104 @@ def main(): max_per_feed = settings.get("max_articles_per_feed", 0) conn = init_db(args.database) + try: + # Purge old articles + deleted = purge_old_articles(conn, retention_days) + if deleted: + logger.info("Purged %d articles older than %d days", deleted, retention_days) - # Purge old articles - deleted = purge_old_articles(conn, retention_days) - if deleted: - logger.info("Purged %d articles older than %d days", deleted, retention_days) + if args.purge_only: + logger.info("Purge-only mode; exiting") + return - if args.purge_only: - logger.info("Purge-only mode; exiting") + # Fetch feeds + if not args.no_fetch: + feeds = config.get("feeds", []) + total_new = 0 + + # Read ollama config once for summarization during fetch + ollama_cfg = config.get("ollama") + if ollama_cfg: + ollama_model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507") + ollama_prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:") + logger.debug("Ollama summarization enabled (model: %s)", ollama_model) + else: + ollama_model = ollama_prompt = None + logger.debug("Ollama not configured; skipping summarization") + + for feed_cfg in feeds: + if not feed_cfg.get("enabled", True): + logger.debug("Skipping disabled feed: %s", feed_cfg.get("name")) + continue + + url = feed_cfg["url"] + logger.debug("Fetching feed: %s (%s)", feed_cfg.get("name", url), url) + entries = fetch_feed(url) + logger.debug("Got %d entries from %s", len(entries), feed_cfg.get("name", url)) + + articles = [] + for entry in entries: + pub_date = parse_article_date(entry) + if not is_within_lookback(pub_date, hours_lookback): + continue + + link = entry.get("link", "") + if not link: + continue + + articles.append({ + "url": link, + "title": entry.get("title"), + "description": entry.get("summary"), + "published_date": pub_date.isoformat() if pub_date else None, + "feed_name": feed_cfg.get("name"), + "feed_url": url, + "category": feed_cfg.get("category"), + "author": entry.get("author"), + }) + + # Cap articles per feed to avoid flooding the DB and downstream fetches + if max_per_feed > 0: + articles = articles[:max_per_feed] + + new_urls = save_articles(conn, articles) + total_new += len(new_urls) + logger.info("Feed '%s': %d new articles (of %d within lookback)", + feed_cfg.get("name", url), len(new_urls), len(articles)) + + # Fetch full content and optionally summarize newly inserted articles + for i, article_url in enumerate(new_urls): + if i > 0 and not ollama_cfg: + time.sleep(1) # rate limit when Ollama isn't providing natural delay + logger.debug("Fetching content: %s", article_url) + content = fetch_content(article_url) + summary = None + if ollama_cfg: + row = conn.execute( + "SELECT title, description FROM articles WHERE url = ?", (article_url,) + ).fetchone() + if row: + summary = generate_summary(row["title"], row["description"], content, ollama_model, ollama_prompt) + if summary: + logger.debug("Generated summary for %s", article_url) + else: + if i > 0: + time.sleep(1) # fallback rate limit on summary failure + conn.execute( + "UPDATE articles SET content = ?, summary = ? WHERE url = ?", + (content, summary, article_url), + ) + conn.commit() + + logger.info("Total new articles saved: %d", total_new) + + # Output recent articles + recent = get_recent_articles(conn, hours_lookback) + fields = [f.strip() for f in args.fields.split(",")] + output = [{k: article[k] for k in fields if k in article} for article in recent] + print(json.dumps(output, ensure_ascii=False, indent=2)) + finally: conn.close() - return - - # Fetch feeds - if not args.no_fetch: - feeds = config.get("feeds", []) - total_new = 0 - - # Read ollama config once for summarization during fetch - ollama_cfg = config.get("ollama") - if ollama_cfg: - ollama_model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507") - ollama_prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:") - logger.debug("Ollama summarization enabled (model: %s)", ollama_model) - else: - ollama_model = ollama_prompt = None - logger.debug("Ollama not configured; skipping summarization") - - for feed_cfg in feeds: - if not feed_cfg.get("enabled", True): - logger.debug("Skipping disabled feed: %s", feed_cfg.get("name")) - continue - - url = feed_cfg["url"] - logger.debug("Fetching feed: %s (%s)", feed_cfg.get("name", url), url) - entries = fetch_feed(url) - logger.debug("Got %d entries from %s", len(entries), feed_cfg.get("name", url)) - - articles = [] - for entry in entries: - pub_date = parse_article_date(entry) - if not is_within_lookback(pub_date, hours_lookback): - continue - - link = entry.get("link", "") - if not link: - continue - - articles.append({ - "url": link, - "title": entry.get("title"), - "description": entry.get("summary"), - "published_date": pub_date.isoformat() if pub_date else None, - "feed_name": feed_cfg.get("name"), - "feed_url": url, - "category": feed_cfg.get("category"), - "author": entry.get("author"), - }) - - # Cap articles per feed to avoid flooding the DB and downstream fetches - if max_per_feed > 0: - articles = articles[:max_per_feed] - - new_urls = save_articles(conn, articles) - total_new += len(new_urls) - logger.info("Feed '%s': %d new articles (of %d within lookback)", - feed_cfg.get("name", url), len(new_urls), len(articles)) - - # Fetch full content and optionally summarize newly inserted articles - for i, article_url in enumerate(new_urls): - if i > 0 and not ollama_cfg: - time.sleep(1) # rate limit when Ollama isn't providing natural delay - logger.debug("Fetching content: %s", article_url) - content = fetch_content(article_url) - summary = None - if ollama_cfg: - row = conn.execute( - "SELECT title, description FROM articles WHERE url = ?", (article_url,) - ).fetchone() - if row: - summary = generate_summary(row["title"], row["description"], content, ollama_model, ollama_prompt) - if summary: - logger.debug("Generated summary for %s", article_url) - else: - if i > 0: - time.sleep(1) # fallback rate limit on summary failure - conn.execute( - "UPDATE articles SET content = ?, summary = ? WHERE url = ?", - (content, summary, article_url), - ) - conn.commit() - - logger.info("Total new articles saved: %d", total_new) - - # Output recent articles - recent = get_recent_articles(conn, hours_lookback) - fields = [f.strip() for f in args.fields.split(",")] - output = [{k: article[k] for k in fields if k in article} for article in recent] - print(json.dumps(output, ensure_ascii=False, indent=2)) - - conn.close() if __name__ == "__main__":