comment from youlu

This commit is contained in:
Yanxin Lu
2026-02-22 11:03:00 -08:00
parent 20ee5c2211
commit c0d0a32c8c
2 changed files with 103 additions and 96 deletions

View File

@@ -159,3 +159,11 @@ HTTP requests are made through a shared `requests.Session` with:
- **Rate limiting** (when Ollama is configured, the latency of each summarization call naturally spaces out fetches; otherwise a 1-second sleep between fetches is used as a fallback)
Note that some sites (e.g. paywalled or bot-protected ones) may still return errors — in those cases the content field is left empty and the RSS description is used as a fallback for summaries.
## Design notes
- **Articles without dates are included by default.** `is_within_lookback` returns `True` when an article has no published date, and the query uses `OR published_date IS NULL`. This is intentional — silently dropping articles just because the feed omits a date would be worse than including them. If you only want dated articles, filter on `published_date` in the output.
- **`generate_summary` accepts both `description` and `content`.** The `description` parameter is not redundant — `body = content or description` uses the RSS description as a fallback when `fetch_content()` fails and returns `None`. This ensures articles still get summarized even when the full page can't be fetched.
- **`fetch_content` uses a chained ternary for element selection.** The expression `article if article else soup.body if soup.body else soup` picks the most specific container available. This is a common Python pattern and reads top-to-bottom as a priority list.

View File

@@ -369,105 +369,104 @@ def main():
max_per_feed = settings.get("max_articles_per_feed", 0)
conn = init_db(args.database)
try:
# Purge old articles
deleted = purge_old_articles(conn, retention_days)
if deleted:
logger.info("Purged %d articles older than %d days", deleted, retention_days)
# Purge old articles
deleted = purge_old_articles(conn, retention_days)
if deleted:
logger.info("Purged %d articles older than %d days", deleted, retention_days)
if args.purge_only:
logger.info("Purge-only mode; exiting")
return
if args.purge_only:
logger.info("Purge-only mode; exiting")
# Fetch feeds
if not args.no_fetch:
feeds = config.get("feeds", [])
total_new = 0
# Read ollama config once for summarization during fetch
ollama_cfg = config.get("ollama")
if ollama_cfg:
ollama_model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507")
ollama_prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
logger.debug("Ollama summarization enabled (model: %s)", ollama_model)
else:
ollama_model = ollama_prompt = None
logger.debug("Ollama not configured; skipping summarization")
for feed_cfg in feeds:
if not feed_cfg.get("enabled", True):
logger.debug("Skipping disabled feed: %s", feed_cfg.get("name"))
continue
url = feed_cfg["url"]
logger.debug("Fetching feed: %s (%s)", feed_cfg.get("name", url), url)
entries = fetch_feed(url)
logger.debug("Got %d entries from %s", len(entries), feed_cfg.get("name", url))
articles = []
for entry in entries:
pub_date = parse_article_date(entry)
if not is_within_lookback(pub_date, hours_lookback):
continue
link = entry.get("link", "")
if not link:
continue
articles.append({
"url": link,
"title": entry.get("title"),
"description": entry.get("summary"),
"published_date": pub_date.isoformat() if pub_date else None,
"feed_name": feed_cfg.get("name"),
"feed_url": url,
"category": feed_cfg.get("category"),
"author": entry.get("author"),
})
# Cap articles per feed to avoid flooding the DB and downstream fetches
if max_per_feed > 0:
articles = articles[:max_per_feed]
new_urls = save_articles(conn, articles)
total_new += len(new_urls)
logger.info("Feed '%s': %d new articles (of %d within lookback)",
feed_cfg.get("name", url), len(new_urls), len(articles))
# Fetch full content and optionally summarize newly inserted articles
for i, article_url in enumerate(new_urls):
if i > 0 and not ollama_cfg:
time.sleep(1) # rate limit when Ollama isn't providing natural delay
logger.debug("Fetching content: %s", article_url)
content = fetch_content(article_url)
summary = None
if ollama_cfg:
row = conn.execute(
"SELECT title, description FROM articles WHERE url = ?", (article_url,)
).fetchone()
if row:
summary = generate_summary(row["title"], row["description"], content, ollama_model, ollama_prompt)
if summary:
logger.debug("Generated summary for %s", article_url)
else:
if i > 0:
time.sleep(1) # fallback rate limit on summary failure
conn.execute(
"UPDATE articles SET content = ?, summary = ? WHERE url = ?",
(content, summary, article_url),
)
conn.commit()
logger.info("Total new articles saved: %d", total_new)
# Output recent articles
recent = get_recent_articles(conn, hours_lookback)
fields = [f.strip() for f in args.fields.split(",")]
output = [{k: article[k] for k in fields if k in article} for article in recent]
print(json.dumps(output, ensure_ascii=False, indent=2))
finally:
conn.close()
return
# Fetch feeds
if not args.no_fetch:
feeds = config.get("feeds", [])
total_new = 0
# Read ollama config once for summarization during fetch
ollama_cfg = config.get("ollama")
if ollama_cfg:
ollama_model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507")
ollama_prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
logger.debug("Ollama summarization enabled (model: %s)", ollama_model)
else:
ollama_model = ollama_prompt = None
logger.debug("Ollama not configured; skipping summarization")
for feed_cfg in feeds:
if not feed_cfg.get("enabled", True):
logger.debug("Skipping disabled feed: %s", feed_cfg.get("name"))
continue
url = feed_cfg["url"]
logger.debug("Fetching feed: %s (%s)", feed_cfg.get("name", url), url)
entries = fetch_feed(url)
logger.debug("Got %d entries from %s", len(entries), feed_cfg.get("name", url))
articles = []
for entry in entries:
pub_date = parse_article_date(entry)
if not is_within_lookback(pub_date, hours_lookback):
continue
link = entry.get("link", "")
if not link:
continue
articles.append({
"url": link,
"title": entry.get("title"),
"description": entry.get("summary"),
"published_date": pub_date.isoformat() if pub_date else None,
"feed_name": feed_cfg.get("name"),
"feed_url": url,
"category": feed_cfg.get("category"),
"author": entry.get("author"),
})
# Cap articles per feed to avoid flooding the DB and downstream fetches
if max_per_feed > 0:
articles = articles[:max_per_feed]
new_urls = save_articles(conn, articles)
total_new += len(new_urls)
logger.info("Feed '%s': %d new articles (of %d within lookback)",
feed_cfg.get("name", url), len(new_urls), len(articles))
# Fetch full content and optionally summarize newly inserted articles
for i, article_url in enumerate(new_urls):
if i > 0 and not ollama_cfg:
time.sleep(1) # rate limit when Ollama isn't providing natural delay
logger.debug("Fetching content: %s", article_url)
content = fetch_content(article_url)
summary = None
if ollama_cfg:
row = conn.execute(
"SELECT title, description FROM articles WHERE url = ?", (article_url,)
).fetchone()
if row:
summary = generate_summary(row["title"], row["description"], content, ollama_model, ollama_prompt)
if summary:
logger.debug("Generated summary for %s", article_url)
else:
if i > 0:
time.sleep(1) # fallback rate limit on summary failure
conn.execute(
"UPDATE articles SET content = ?, summary = ? WHERE url = ?",
(content, summary, article_url),
)
conn.commit()
logger.info("Total new articles saved: %d", total_new)
# Output recent articles
recent = get_recent_articles(conn, hours_lookback)
fields = [f.strip() for f in args.fields.split(",")]
output = [{k: article[k] for k in fields if k in article} for article in recent]
print(json.dumps(output, ensure_ascii=False, indent=2))
conn.close()
if __name__ == "__main__":