comment from youlu
This commit is contained in:
@@ -159,3 +159,11 @@ HTTP requests are made through a shared `requests.Session` with:
|
|||||||
- **Rate limiting** (Ollama latency between fetches when configured; 1-second fallback otherwise)
|
- **Rate limiting** (Ollama latency between fetches when configured; 1-second fallback otherwise)
|
||||||
|
|
||||||
Some sites (e.g. paywalled or bot-protected) may still return errors — in those cases the content field is left empty and the RSS description is used as a fallback for summaries.
|
Some sites (e.g. paywalled or bot-protected) may still return errors — in those cases the content field is left empty and the RSS description is used as a fallback for summaries.
|
||||||
|
|
||||||
|
## Design notes
|
||||||
|
|
||||||
|
- **Articles without dates are included by default.** `is_within_lookback` returns `True` when an article has no published date, and the query uses `OR published_date IS NULL`. This is intentional — silently dropping articles just because the feed omits a date would be worse than including them. If you only want dated articles, filter on `published_date` in the output.
|
||||||
|
|
||||||
|
- **`generate_summary` accepts both `description` and `content`.** The `description` parameter is not redundant — `body = content or description` uses the RSS description as a fallback when `fetch_content()` fails and returns `None`. This ensures articles still get summarized even when the full page can't be fetched.
|
||||||
|
|
||||||
|
- **`fetch_content` uses a chained ternary for element selection.** The expression `article if article else soup.body if soup.body else soup` picks the most specific container available: `article` if it is truthy, otherwise `soup.body`, otherwise the whole `soup`. Python conditional expressions group right-to-left, so the chain is evaluated left-to-right and reads as a priority list.
|
||||||
|
|||||||
@@ -369,105 +369,104 @@ def main():
|
|||||||
max_per_feed = settings.get("max_articles_per_feed", 0)
|
max_per_feed = settings.get("max_articles_per_feed", 0)
|
||||||
|
|
||||||
conn = init_db(args.database)
|
conn = init_db(args.database)
|
||||||
|
try:
|
||||||
|
# Purge old articles
|
||||||
|
deleted = purge_old_articles(conn, retention_days)
|
||||||
|
if deleted:
|
||||||
|
logger.info("Purged %d articles older than %d days", deleted, retention_days)
|
||||||
|
|
||||||
# Purge old articles
|
if args.purge_only:
|
||||||
deleted = purge_old_articles(conn, retention_days)
|
logger.info("Purge-only mode; exiting")
|
||||||
if deleted:
|
return
|
||||||
logger.info("Purged %d articles older than %d days", deleted, retention_days)
|
|
||||||
|
|
||||||
if args.purge_only:
|
# Fetch feeds
|
||||||
logger.info("Purge-only mode; exiting")
|
if not args.no_fetch:
|
||||||
|
feeds = config.get("feeds", [])
|
||||||
|
total_new = 0
|
||||||
|
|
||||||
|
# Read ollama config once for summarization during fetch
|
||||||
|
ollama_cfg = config.get("ollama")
|
||||||
|
if ollama_cfg:
|
||||||
|
ollama_model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507")
|
||||||
|
ollama_prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
|
||||||
|
logger.debug("Ollama summarization enabled (model: %s)", ollama_model)
|
||||||
|
else:
|
||||||
|
ollama_model = ollama_prompt = None
|
||||||
|
logger.debug("Ollama not configured; skipping summarization")
|
||||||
|
|
||||||
|
for feed_cfg in feeds:
|
||||||
|
if not feed_cfg.get("enabled", True):
|
||||||
|
logger.debug("Skipping disabled feed: %s", feed_cfg.get("name"))
|
||||||
|
continue
|
||||||
|
|
||||||
|
url = feed_cfg["url"]
|
||||||
|
logger.debug("Fetching feed: %s (%s)", feed_cfg.get("name", url), url)
|
||||||
|
entries = fetch_feed(url)
|
||||||
|
logger.debug("Got %d entries from %s", len(entries), feed_cfg.get("name", url))
|
||||||
|
|
||||||
|
articles = []
|
||||||
|
for entry in entries:
|
||||||
|
pub_date = parse_article_date(entry)
|
||||||
|
if not is_within_lookback(pub_date, hours_lookback):
|
||||||
|
continue
|
||||||
|
|
||||||
|
link = entry.get("link", "")
|
||||||
|
if not link:
|
||||||
|
continue
|
||||||
|
|
||||||
|
articles.append({
|
||||||
|
"url": link,
|
||||||
|
"title": entry.get("title"),
|
||||||
|
"description": entry.get("summary"),
|
||||||
|
"published_date": pub_date.isoformat() if pub_date else None,
|
||||||
|
"feed_name": feed_cfg.get("name"),
|
||||||
|
"feed_url": url,
|
||||||
|
"category": feed_cfg.get("category"),
|
||||||
|
"author": entry.get("author"),
|
||||||
|
})
|
||||||
|
|
||||||
|
# Cap articles per feed to avoid flooding the DB and downstream fetches
|
||||||
|
if max_per_feed > 0:
|
||||||
|
articles = articles[:max_per_feed]
|
||||||
|
|
||||||
|
new_urls = save_articles(conn, articles)
|
||||||
|
total_new += len(new_urls)
|
||||||
|
logger.info("Feed '%s': %d new articles (of %d within lookback)",
|
||||||
|
feed_cfg.get("name", url), len(new_urls), len(articles))
|
||||||
|
|
||||||
|
# Fetch full content and optionally summarize newly inserted articles
|
||||||
|
for i, article_url in enumerate(new_urls):
|
||||||
|
if i > 0 and not ollama_cfg:
|
||||||
|
time.sleep(1) # rate limit when Ollama isn't providing natural delay
|
||||||
|
logger.debug("Fetching content: %s", article_url)
|
||||||
|
content = fetch_content(article_url)
|
||||||
|
summary = None
|
||||||
|
if ollama_cfg:
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT title, description FROM articles WHERE url = ?", (article_url,)
|
||||||
|
).fetchone()
|
||||||
|
if row:
|
||||||
|
summary = generate_summary(row["title"], row["description"], content, ollama_model, ollama_prompt)
|
||||||
|
if summary:
|
||||||
|
logger.debug("Generated summary for %s", article_url)
|
||||||
|
else:
|
||||||
|
if i > 0:
|
||||||
|
time.sleep(1) # fallback rate limit on summary failure
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE articles SET content = ?, summary = ? WHERE url = ?",
|
||||||
|
(content, summary, article_url),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
logger.info("Total new articles saved: %d", total_new)
|
||||||
|
|
||||||
|
# Output recent articles
|
||||||
|
recent = get_recent_articles(conn, hours_lookback)
|
||||||
|
fields = [f.strip() for f in args.fields.split(",")]
|
||||||
|
output = [{k: article[k] for k in fields if k in article} for article in recent]
|
||||||
|
print(json.dumps(output, ensure_ascii=False, indent=2))
|
||||||
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
return
|
|
||||||
|
|
||||||
# Fetch feeds
|
|
||||||
if not args.no_fetch:
|
|
||||||
feeds = config.get("feeds", [])
|
|
||||||
total_new = 0
|
|
||||||
|
|
||||||
# Read ollama config once for summarization during fetch
|
|
||||||
ollama_cfg = config.get("ollama")
|
|
||||||
if ollama_cfg:
|
|
||||||
ollama_model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507")
|
|
||||||
ollama_prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
|
|
||||||
logger.debug("Ollama summarization enabled (model: %s)", ollama_model)
|
|
||||||
else:
|
|
||||||
ollama_model = ollama_prompt = None
|
|
||||||
logger.debug("Ollama not configured; skipping summarization")
|
|
||||||
|
|
||||||
for feed_cfg in feeds:
|
|
||||||
if not feed_cfg.get("enabled", True):
|
|
||||||
logger.debug("Skipping disabled feed: %s", feed_cfg.get("name"))
|
|
||||||
continue
|
|
||||||
|
|
||||||
url = feed_cfg["url"]
|
|
||||||
logger.debug("Fetching feed: %s (%s)", feed_cfg.get("name", url), url)
|
|
||||||
entries = fetch_feed(url)
|
|
||||||
logger.debug("Got %d entries from %s", len(entries), feed_cfg.get("name", url))
|
|
||||||
|
|
||||||
articles = []
|
|
||||||
for entry in entries:
|
|
||||||
pub_date = parse_article_date(entry)
|
|
||||||
if not is_within_lookback(pub_date, hours_lookback):
|
|
||||||
continue
|
|
||||||
|
|
||||||
link = entry.get("link", "")
|
|
||||||
if not link:
|
|
||||||
continue
|
|
||||||
|
|
||||||
articles.append({
|
|
||||||
"url": link,
|
|
||||||
"title": entry.get("title"),
|
|
||||||
"description": entry.get("summary"),
|
|
||||||
"published_date": pub_date.isoformat() if pub_date else None,
|
|
||||||
"feed_name": feed_cfg.get("name"),
|
|
||||||
"feed_url": url,
|
|
||||||
"category": feed_cfg.get("category"),
|
|
||||||
"author": entry.get("author"),
|
|
||||||
})
|
|
||||||
|
|
||||||
# Cap articles per feed to avoid flooding the DB and downstream fetches
|
|
||||||
if max_per_feed > 0:
|
|
||||||
articles = articles[:max_per_feed]
|
|
||||||
|
|
||||||
new_urls = save_articles(conn, articles)
|
|
||||||
total_new += len(new_urls)
|
|
||||||
logger.info("Feed '%s': %d new articles (of %d within lookback)",
|
|
||||||
feed_cfg.get("name", url), len(new_urls), len(articles))
|
|
||||||
|
|
||||||
# Fetch full content and optionally summarize newly inserted articles
|
|
||||||
for i, article_url in enumerate(new_urls):
|
|
||||||
if i > 0 and not ollama_cfg:
|
|
||||||
time.sleep(1) # rate limit when Ollama isn't providing natural delay
|
|
||||||
logger.debug("Fetching content: %s", article_url)
|
|
||||||
content = fetch_content(article_url)
|
|
||||||
summary = None
|
|
||||||
if ollama_cfg:
|
|
||||||
row = conn.execute(
|
|
||||||
"SELECT title, description FROM articles WHERE url = ?", (article_url,)
|
|
||||||
).fetchone()
|
|
||||||
if row:
|
|
||||||
summary = generate_summary(row["title"], row["description"], content, ollama_model, ollama_prompt)
|
|
||||||
if summary:
|
|
||||||
logger.debug("Generated summary for %s", article_url)
|
|
||||||
else:
|
|
||||||
if i > 0:
|
|
||||||
time.sleep(1) # fallback rate limit on summary failure
|
|
||||||
conn.execute(
|
|
||||||
"UPDATE articles SET content = ?, summary = ? WHERE url = ?",
|
|
||||||
(content, summary, article_url),
|
|
||||||
)
|
|
||||||
conn.commit()
|
|
||||||
|
|
||||||
logger.info("Total new articles saved: %d", total_new)
|
|
||||||
|
|
||||||
# Output recent articles
|
|
||||||
recent = get_recent_articles(conn, hours_lookback)
|
|
||||||
fields = [f.strip() for f in args.fields.split(",")]
|
|
||||||
output = [{k: article[k] for k in fields if k in article} for article in recent]
|
|
||||||
print(json.dumps(output, ensure_ascii=False, indent=2))
|
|
||||||
|
|
||||||
conn.close()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user