comment from youlu
This commit is contained in:
@@ -369,105 +369,104 @@ def main():
|
||||
max_per_feed = settings.get("max_articles_per_feed", 0)
|
||||
|
||||
conn = init_db(args.database)
|
||||
try:
|
||||
# Purge old articles
|
||||
deleted = purge_old_articles(conn, retention_days)
|
||||
if deleted:
|
||||
logger.info("Purged %d articles older than %d days", deleted, retention_days)
|
||||
|
||||
# Purge old articles
|
||||
deleted = purge_old_articles(conn, retention_days)
|
||||
if deleted:
|
||||
logger.info("Purged %d articles older than %d days", deleted, retention_days)
|
||||
if args.purge_only:
|
||||
logger.info("Purge-only mode; exiting")
|
||||
return
|
||||
|
||||
if args.purge_only:
|
||||
logger.info("Purge-only mode; exiting")
|
||||
# Fetch feeds
|
||||
if not args.no_fetch:
|
||||
feeds = config.get("feeds", [])
|
||||
total_new = 0
|
||||
|
||||
# Read ollama config once for summarization during fetch
|
||||
ollama_cfg = config.get("ollama")
|
||||
if ollama_cfg:
|
||||
ollama_model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507")
|
||||
ollama_prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
|
||||
logger.debug("Ollama summarization enabled (model: %s)", ollama_model)
|
||||
else:
|
||||
ollama_model = ollama_prompt = None
|
||||
logger.debug("Ollama not configured; skipping summarization")
|
||||
|
||||
for feed_cfg in feeds:
|
||||
if not feed_cfg.get("enabled", True):
|
||||
logger.debug("Skipping disabled feed: %s", feed_cfg.get("name"))
|
||||
continue
|
||||
|
||||
url = feed_cfg["url"]
|
||||
logger.debug("Fetching feed: %s (%s)", feed_cfg.get("name", url), url)
|
||||
entries = fetch_feed(url)
|
||||
logger.debug("Got %d entries from %s", len(entries), feed_cfg.get("name", url))
|
||||
|
||||
articles = []
|
||||
for entry in entries:
|
||||
pub_date = parse_article_date(entry)
|
||||
if not is_within_lookback(pub_date, hours_lookback):
|
||||
continue
|
||||
|
||||
link = entry.get("link", "")
|
||||
if not link:
|
||||
continue
|
||||
|
||||
articles.append({
|
||||
"url": link,
|
||||
"title": entry.get("title"),
|
||||
"description": entry.get("summary"),
|
||||
"published_date": pub_date.isoformat() if pub_date else None,
|
||||
"feed_name": feed_cfg.get("name"),
|
||||
"feed_url": url,
|
||||
"category": feed_cfg.get("category"),
|
||||
"author": entry.get("author"),
|
||||
})
|
||||
|
||||
# Cap articles per feed to avoid flooding the DB and downstream fetches
|
||||
if max_per_feed > 0:
|
||||
articles = articles[:max_per_feed]
|
||||
|
||||
new_urls = save_articles(conn, articles)
|
||||
total_new += len(new_urls)
|
||||
logger.info("Feed '%s': %d new articles (of %d within lookback)",
|
||||
feed_cfg.get("name", url), len(new_urls), len(articles))
|
||||
|
||||
# Fetch full content and optionally summarize newly inserted articles
|
||||
for i, article_url in enumerate(new_urls):
|
||||
if i > 0 and not ollama_cfg:
|
||||
time.sleep(1) # rate limit when Ollama isn't providing natural delay
|
||||
logger.debug("Fetching content: %s", article_url)
|
||||
content = fetch_content(article_url)
|
||||
summary = None
|
||||
if ollama_cfg:
|
||||
row = conn.execute(
|
||||
"SELECT title, description FROM articles WHERE url = ?", (article_url,)
|
||||
).fetchone()
|
||||
if row:
|
||||
summary = generate_summary(row["title"], row["description"], content, ollama_model, ollama_prompt)
|
||||
if summary:
|
||||
logger.debug("Generated summary for %s", article_url)
|
||||
else:
|
||||
if i > 0:
|
||||
time.sleep(1) # fallback rate limit on summary failure
|
||||
conn.execute(
|
||||
"UPDATE articles SET content = ?, summary = ? WHERE url = ?",
|
||||
(content, summary, article_url),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
logger.info("Total new articles saved: %d", total_new)
|
||||
|
||||
# Output recent articles
|
||||
recent = get_recent_articles(conn, hours_lookback)
|
||||
fields = [f.strip() for f in args.fields.split(",")]
|
||||
output = [{k: article[k] for k in fields if k in article} for article in recent]
|
||||
print(json.dumps(output, ensure_ascii=False, indent=2))
|
||||
finally:
|
||||
conn.close()
|
||||
return
|
||||
|
||||
# Fetch feeds
|
||||
if not args.no_fetch:
|
||||
feeds = config.get("feeds", [])
|
||||
total_new = 0
|
||||
|
||||
# Read ollama config once for summarization during fetch
|
||||
ollama_cfg = config.get("ollama")
|
||||
if ollama_cfg:
|
||||
ollama_model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507")
|
||||
ollama_prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
|
||||
logger.debug("Ollama summarization enabled (model: %s)", ollama_model)
|
||||
else:
|
||||
ollama_model = ollama_prompt = None
|
||||
logger.debug("Ollama not configured; skipping summarization")
|
||||
|
||||
for feed_cfg in feeds:
|
||||
if not feed_cfg.get("enabled", True):
|
||||
logger.debug("Skipping disabled feed: %s", feed_cfg.get("name"))
|
||||
continue
|
||||
|
||||
url = feed_cfg["url"]
|
||||
logger.debug("Fetching feed: %s (%s)", feed_cfg.get("name", url), url)
|
||||
entries = fetch_feed(url)
|
||||
logger.debug("Got %d entries from %s", len(entries), feed_cfg.get("name", url))
|
||||
|
||||
articles = []
|
||||
for entry in entries:
|
||||
pub_date = parse_article_date(entry)
|
||||
if not is_within_lookback(pub_date, hours_lookback):
|
||||
continue
|
||||
|
||||
link = entry.get("link", "")
|
||||
if not link:
|
||||
continue
|
||||
|
||||
articles.append({
|
||||
"url": link,
|
||||
"title": entry.get("title"),
|
||||
"description": entry.get("summary"),
|
||||
"published_date": pub_date.isoformat() if pub_date else None,
|
||||
"feed_name": feed_cfg.get("name"),
|
||||
"feed_url": url,
|
||||
"category": feed_cfg.get("category"),
|
||||
"author": entry.get("author"),
|
||||
})
|
||||
|
||||
# Cap articles per feed to avoid flooding the DB and downstream fetches
|
||||
if max_per_feed > 0:
|
||||
articles = articles[:max_per_feed]
|
||||
|
||||
new_urls = save_articles(conn, articles)
|
||||
total_new += len(new_urls)
|
||||
logger.info("Feed '%s': %d new articles (of %d within lookback)",
|
||||
feed_cfg.get("name", url), len(new_urls), len(articles))
|
||||
|
||||
# Fetch full content and optionally summarize newly inserted articles
|
||||
for i, article_url in enumerate(new_urls):
|
||||
if i > 0 and not ollama_cfg:
|
||||
time.sleep(1) # rate limit when Ollama isn't providing natural delay
|
||||
logger.debug("Fetching content: %s", article_url)
|
||||
content = fetch_content(article_url)
|
||||
summary = None
|
||||
if ollama_cfg:
|
||||
row = conn.execute(
|
||||
"SELECT title, description FROM articles WHERE url = ?", (article_url,)
|
||||
).fetchone()
|
||||
if row:
|
||||
summary = generate_summary(row["title"], row["description"], content, ollama_model, ollama_prompt)
|
||||
if summary:
|
||||
logger.debug("Generated summary for %s", article_url)
|
||||
else:
|
||||
if i > 0:
|
||||
time.sleep(1) # fallback rate limit on summary failure
|
||||
conn.execute(
|
||||
"UPDATE articles SET content = ?, summary = ? WHERE url = ?",
|
||||
(content, summary, article_url),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
logger.info("Total new articles saved: %d", total_new)
|
||||
|
||||
# Output recent articles
|
||||
recent = get_recent_articles(conn, hours_lookback)
|
||||
fields = [f.strip() for f in args.fields.split(",")]
|
||||
output = [{k: article[k] for k in fields if k in article} for article in recent]
|
||||
print(json.dumps(output, ensure_ascii=False, indent=2))
|
||||
|
||||
conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user