forgot to include files
This commit is contained in:
@@ -1,17 +1,23 @@
|
||||
# RSS News Digest
|
||||
|
||||
Fetches articles from RSS/Atom feeds, downloads full article content, stores everything in SQLite with URL-based deduplication, and outputs a JSON digest to stdout. Optionally generates per-article summaries via a local Ollama model.
|
||||
Fetches articles from RSS/Atom feeds, downloads full article content, stores everything in SQLite with URL-based deduplication, and outputs a JSON digest to stdout. When Ollama is configured, each article is automatically summarized during fetch and the summary is stored in the database.
|
||||
|
||||
HTTP requests use automatic retries with exponential backoff for transient errors (429/5xx), browser-like headers to avoid 403 blocks, and rate limiting between content fetches.
|
||||
|
||||
## Setup
|
||||
|
||||
Install [uv](https://docs.astral.sh/uv/getting-started/installation/) (a fast Python package manager):
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
```
|
||||
|
||||
`run.sh` handles Python and dependency installation automatically — no manual venv or `pip install` needed.
|
||||
|
||||
For AI-powered article summaries, install [Ollama](https://ollama.com) and pull a model:
|
||||
|
||||
```bash
|
||||
ollama pull qwen3
|
||||
ollama pull kamekichi128/qwen3-4b-instruct-2507
|
||||
```
|
||||
|
||||
## Configuration
|
||||
@@ -22,10 +28,11 @@ Edit `config.json` to add feeds and adjust settings:
|
||||
{
|
||||
"settings": {
|
||||
"hours_lookback": 24,
|
||||
"retention_days": 30
|
||||
"retention_days": 30,
|
||||
"max_articles_per_feed": 10
|
||||
},
|
||||
"ollama": {
|
||||
"model": "qwen3",
|
||||
"model": "kamekichi128/qwen3-4b-instruct-2507",
|
||||
"prompt": "Summarize the following news article in 2-3 concise sentences:"
|
||||
},
|
||||
"feeds": [
|
||||
@@ -41,39 +48,50 @@ Edit `config.json` to add feeds and adjust settings:
|
||||
|
||||
- **hours_lookback** — only include articles published within this many hours
|
||||
- **retention_days** — auto-delete articles older than this from the database
|
||||
- **ollama.model** — Ollama model name for digest summaries
|
||||
- **ollama.prompt** — system prompt sent to the model
|
||||
- **max_articles_per_feed** — limit how many articles are saved per feed per run (0 = unlimited)
|
||||
- **ollama.model** — Ollama model name for article summaries (generated during fetch)
|
||||
- **ollama.prompt** — prompt sent to the model for each article
|
||||
- Removing the `ollama` key disables summarization; articles are still fetched normally
|
||||
- **feeds[].enabled** — set to `false` to skip a feed without removing it
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Fetch feeds and print recent articles (default fields: id, title, url)
|
||||
python main.py
|
||||
# Fetch feeds and print recent articles (default fields: id, title, url, published_date, fetched_date, feed_name)
|
||||
./run.sh
|
||||
|
||||
# Verbose mode (logs to stderr)
|
||||
python main.py -v
|
||||
./run.sh -v
|
||||
|
||||
# Override lookback window to 48 hours
|
||||
python main.py --hours 48
|
||||
./run.sh --hours 48
|
||||
|
||||
# Include more fields in output
|
||||
python main.py -f id,title,url,description,published_date
|
||||
./run.sh -f id,title,url,description,published_date
|
||||
|
||||
# Include article summaries in output (requires ollama config)
|
||||
./run.sh -f id,title,url,summary
|
||||
|
||||
# Include full article content in output
|
||||
python main.py -f id,title,url,content
|
||||
./run.sh -f id,title,url,content
|
||||
|
||||
# Review stored articles without fetching new ones
|
||||
python main.py --no-fetch
|
||||
./run.sh --no-fetch
|
||||
|
||||
# Only purge old articles (no fetching or output)
|
||||
python main.py --purge-only -v
|
||||
./run.sh --purge-only -v
|
||||
|
||||
# Generate AI summaries for specific article IDs
|
||||
python main.py --digest 1,3,7
|
||||
# Smoke test: full pipeline (fetch 1 article from first enabled feed + summarize)
|
||||
./run.sh --test
|
||||
|
||||
# Test feed fetching only (1 article from first enabled feed)
|
||||
./run.sh --test feed
|
||||
|
||||
# Test Ollama summarization only (uses hardcoded articles)
|
||||
./run.sh --test summary
|
||||
|
||||
# Custom config and database paths
|
||||
python main.py -c my_config.json -d my_news.db
|
||||
./run.sh -c my_config.json -d my_news.db
|
||||
```
|
||||
|
||||
## Examples
|
||||
@@ -81,14 +99,15 @@ python main.py -c my_config.json -d my_news.db
|
||||
### Fetch feeds and list articles
|
||||
|
||||
```
|
||||
$ python main.py -v
|
||||
$ ./run.sh -v
|
||||
2026-02-21 22:41:44 DEBUG Ollama summarization enabled (model: qwen3)
|
||||
2026-02-21 22:41:44 INFO Feed 'BBC News': 35 new articles (of 36 within lookback)
|
||||
2026-02-21 22:41:54 INFO Feed 'NY Times': 20 new articles (of 20 within lookback)
|
||||
2026-02-21 22:41:57 INFO Feed 'Hacker News': 20 new articles (of 20 within lookback)
|
||||
2026-02-21 22:42:08 INFO Feed '联合早报 中国': 24 new articles (of 24 within lookback)
|
||||
2026-02-21 22:42:23 INFO Feed '澎湃新闻 热点': 15 new articles (of 15 within lookback)
|
||||
2026-02-21 22:42:49 INFO Feed '36氪 热榜': 8 new articles (of 8 within lookback)
|
||||
2026-02-21 22:42:59 INFO Total new articles saved: 122
|
||||
2026-02-21 22:43:54 INFO Feed 'NY Times': 10 new articles (of 10 within lookback)
|
||||
2026-02-21 22:44:57 INFO Feed 'Hacker News': 10 new articles (of 10 within lookback)
|
||||
2026-02-21 22:46:08 INFO Feed '联合早报 中国': 10 new articles (of 10 within lookback)
|
||||
2026-02-21 22:47:23 INFO Feed '澎湃新闻 热点': 10 new articles (of 10 within lookback)
|
||||
2026-02-21 22:48:49 INFO Feed '36氪 热榜': 8 new articles (of 8 within lookback)
|
||||
2026-02-21 22:48:59 INFO Total new articles saved: 83
|
||||
[
|
||||
{"id": 1, "title": "Iran students stage first large anti-government protests since deadly crackdown", "url": "https://www.bbc.com/news/articles/..."},
|
||||
{"id": 2, "title": "Trump says he will increase his new global tariffs to 15%", "url": "https://www.bbc.com/news/articles/..."},
|
||||
@@ -99,7 +118,7 @@ $ python main.py -v
|
||||
### Review stored articles without fetching
|
||||
|
||||
```
|
||||
$ python main.py --no-fetch -f id,title,feed_name
|
||||
$ ./run.sh --no-fetch -f id,title,feed_name
|
||||
[
|
||||
{"id": 1, "title": "Iran students stage first large anti-government protests...", "feed_name": "BBC News"},
|
||||
{"id": 79, "title": "韩国二次电池技术水平被中国反超", "feed_name": "联合早报 中国"},
|
||||
@@ -108,32 +127,6 @@ $ python main.py --no-fetch -f id,title,feed_name
|
||||
]
|
||||
```
|
||||
|
||||
### Generate AI summaries for specific articles
|
||||
|
||||
```
|
||||
$ python main.py --digest 1,79,117
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Iran students stage first large anti-government protests since deadly crackdown",
|
||||
"url": "https://www.bbc.com/news/articles/...",
|
||||
"summary": "Iranian students have held large-scale anti-government protests across multiple cities, marking the first such demonstrations since a deadly crackdown last month. Protesters chanted anti-regime slogans, with clashes reported between demonstrators and government supporters."
|
||||
},
|
||||
{
|
||||
"id": 79,
|
||||
"title": "韩国二次电池技术水平被中国反超",
|
||||
"url": "https://www.zaobao.com/news/china/story20260222-8614291",
|
||||
"summary": "韩国官方报告指出,中国在二次电池技术领域已反超韩国,2024年技术水平评估显示中国领先韩国0.2年,且中国追赶美国的势头明显。"
|
||||
},
|
||||
{
|
||||
"id": 117,
|
||||
"title": "忍无可忍,Ilya宫斗奥特曼!微软CTO爆内幕:全因嫉妒下属太优秀?",
|
||||
"url": "https://www.36kr.com/p/3693861726826112",
|
||||
"summary": "该文章描述了OpenAI内部的权力斗争事件,称微软CTO披露,首席科学家Ilya因嫉妒下属取得突破,联合董事会罢免了CEO奥特曼,引发高管集体离职。"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## Output
|
||||
|
||||
Default output (JSON array to stdout):
|
||||
@@ -145,12 +138,12 @@ Default output (JSON array to stdout):
|
||||
]
|
||||
```
|
||||
|
||||
Digest mode (`--digest`):
|
||||
With summary field (`-f id,title,url,summary`):
|
||||
|
||||
```json
|
||||
[
|
||||
{"id": 1, "title": "Article Title", "url": "https://example.com/article", "summary": "AI-generated summary..."},
|
||||
{"id": 3, "title": "Another Article", "url": "https://example.com/other", "summary": "AI-generated summary..."}
|
||||
{"id": 2, "title": "Another Article", "url": "https://example.com/other", "summary": "AI-generated summary..."}
|
||||
]
|
||||
```
|
||||
|
||||
@@ -158,4 +151,11 @@ Digest mode (`--digest`):
|
||||
|
||||
Articles are stored in `news_digest.db` (SQLite) in the current directory by default. The database is created automatically on first run. Articles older than `retention_days` are purged at the start of each fetch run. Duplicate URLs are ignored via a UNIQUE constraint.
|
||||
|
||||
Each article stores metadata from the RSS feed (title, description, published date, author, etc.) plus the full article content fetched from the article URL. Content is extracted as plain text using BeautifulSoup. Some sites (e.g. paywalled or bot-protected) may return 403 errors — in those cases the content field is left empty and the RSS description is used as a fallback for summaries.
|
||||
Each article stores metadata from the RSS feed (title, description, published date, author, etc.) plus the full article content fetched from the article URL. Content is extracted as plain text using BeautifulSoup.
|
||||
|
||||
HTTP requests are made through a shared `requests.Session` with:
|
||||
- **Automatic retries** (up to 3 retries with exponential backoff) for 429/5xx errors
|
||||
- **Browser-like headers** (User-Agent, Accept) to reduce 403 rejections
|
||||
- **Rate limiting** between content fetches (when Ollama is configured, its summarization latency spaces out requests; otherwise a 1-second pause is inserted between fetches)
|
||||
|
||||
Some sites (e.g. paywalled or bot-protected) may still return errors — in those cases the content field is left empty and the RSS description is used as a fallback for summaries.
|
||||
|
||||
@@ -1,31 +1,14 @@
|
||||
{
|
||||
"settings": {
|
||||
"hours_lookback": 24,
|
||||
"retention_days": 30
|
||||
"retention_days": 30,
|
||||
"max_articles_per_feed": 10
|
||||
},
|
||||
"ollama": {
|
||||
"model": "kamekichi128/qwen3-4b-instruct-2507:latest",
|
||||
"prompt": "Summarize the following news article in 2-3 concise sentences:"
|
||||
},
|
||||
"feeds": [
|
||||
{
|
||||
"url": "https://feeds.bbci.co.uk/news/rss.xml",
|
||||
"name": "BBC News",
|
||||
"category": "World",
|
||||
"enabled": true
|
||||
},
|
||||
{
|
||||
"url": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
|
||||
"name": "NY Times",
|
||||
"category": "World",
|
||||
"enabled": true
|
||||
},
|
||||
{
|
||||
"url": "https://feeds.bbci.co.uk/news/technology/rss.xml",
|
||||
"name": "BBC Tech",
|
||||
"category": "Tech",
|
||||
"enabled": false
|
||||
},
|
||||
{
|
||||
"url": "https://hnrss.org/frontpage",
|
||||
"name": "Hacker News",
|
||||
|
||||
@@ -1,22 +1,90 @@
|
||||
#!/usr/bin/env python3
|
||||
"""RSS News Digest — fetch feeds, store articles with full content in SQLite, optionally summarize via Ollama."""
|
||||
"""RSS News Digest — fetch feeds, store articles with full content in SQLite, and summarize via Ollama during fetch.
|
||||
|
||||
Recommended: run via ./run.sh, which uses `uv` to handle dependencies
|
||||
automatically (no manual venv or pip install needed).
|
||||
|
||||
When an `ollama` key is present in config.json, each newly fetched article is
|
||||
automatically summarized and the result is stored in the database. Ollama
|
||||
latency provides natural rate limiting between HTTP requests; when Ollama is
|
||||
not configured, a 1-second sleep is used instead.
|
||||
|
||||
Uses a requests.Session with automatic retries and browser-like headers to
|
||||
handle transient HTTP errors (429/5xx). A configurable per-feed article cap
|
||||
helps avoid overwhelming upstream servers.
|
||||
|
||||
Use ``--test`` to smoke-test feed fetching and/or Ollama summarization without
|
||||
writing to the database.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from time import mktime
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import URLError
|
||||
|
||||
import feedparser
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
logger = logging.getLogger("news_digest")
|
||||
|
||||
# Hardcoded test articles for --test summary (one English, one Chinese)
|
||||
_TEST_ARTICLES = [
|
||||
{
|
||||
"title": "Global Semiconductor Shortage Eases as New Factories Come Online",
|
||||
"content": (
|
||||
"The global chip shortage that disrupted industries from automotive to "
|
||||
"consumer electronics is finally showing signs of relief. Major semiconductor "
|
||||
"manufacturers including TSMC, Samsung, and Intel have begun production at new "
|
||||
"fabrication plants in Arizona, Texas, and Japan. Industry analysts project that "
|
||||
"global chip capacity will increase by 15% over the next 18 months, potentially "
|
||||
"leading to a supply surplus in certain categories. The shift has already begun "
|
||||
"to impact pricing, with memory chip costs dropping 12% in the last quarter."
|
||||
),
|
||||
},
|
||||
{
|
||||
"title": "中国新能源汽车出口量首次突破年度600万辆大关",
|
||||
"content": (
|
||||
"据中国汽车工业协会最新数据,2025年中国新能源汽车出口量首次突破600万辆,"
|
||||
"同比增长38%。比亚迪、上汽、蔚来等品牌在东南亚、欧洲和南美市场持续扩张。"
|
||||
"分析人士指出,中国在电池技术和供应链方面的优势使其产品在全球市场具有较强"
|
||||
"竞争力,但欧盟加征的反补贴关税可能对未来增长构成挑战。"
|
||||
),
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def _build_session() -> requests.Session:
    """Build the shared HTTP session used for feed and article downloads.

    The returned session:

    * retries a request up to 3 times on 429/500/502/503/504 responses,
      backing off exponentially and honouring any ``Retry-After`` header;
    * sends browser-like ``User-Agent``/``Accept``/``Accept-Language``
      headers on every request.

    Returns:
        A configured ``requests.Session`` with the retrying adapter mounted
        for both ``http://`` and ``https://`` URLs.
    """
    # backoff_factor=1 gives exponentially growing waits between retries;
    # NOTE(review): the exact first delay depends on the installed urllib3
    # version -- confirm if precise timing matters.
    retry_policy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        respect_retry_after_header=True,
    )

    # Headers that mimic a desktop Chrome browser.
    browser_headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/131.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }

    http = requests.Session()
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)
    # One adapter instance serves both URL schemes.
    for scheme in ("http://", "https://"):
        http.mount(scheme, retrying_adapter)
    http.headers.update(browser_headers)
    return http
|
||||
|
||||
|
||||
_session = _build_session()
|
||||
|
||||
|
||||
def load_config(path: str) -> dict:
|
||||
with open(path, encoding="utf-8") as f:
|
||||
@@ -33,6 +101,7 @@ def init_db(db_path: str) -> sqlite3.Connection:
|
||||
title TEXT,
|
||||
description TEXT,
|
||||
content TEXT,
|
||||
summary TEXT,
|
||||
published_date TEXT,
|
||||
fetched_date TEXT NOT NULL,
|
||||
feed_name TEXT,
|
||||
@@ -41,11 +110,6 @@ def init_db(db_path: str) -> sqlite3.Connection:
|
||||
author TEXT
|
||||
)
|
||||
""")
|
||||
# Migrate: add content column if missing (existing DBs)
|
||||
try:
|
||||
conn.execute("ALTER TABLE articles ADD COLUMN content TEXT")
|
||||
except sqlite3.OperationalError:
|
||||
pass
|
||||
conn.commit()
|
||||
return conn
|
||||
|
||||
@@ -65,15 +129,12 @@ def is_within_lookback(dt: datetime | None, hours: int) -> bool:
|
||||
return dt >= cutoff
|
||||
|
||||
|
||||
_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko)"
|
||||
|
||||
|
||||
def fetch_feed(url: str) -> list[dict]:
|
||||
try:
|
||||
req = Request(url, headers={"User-Agent": _USER_AGENT})
|
||||
with urlopen(req, timeout=30) as resp:
|
||||
raw = resp.read()
|
||||
except (URLError, OSError) as e:
|
||||
resp = _session.get(url, timeout=30)
|
||||
resp.raise_for_status()
|
||||
raw = resp.content
|
||||
except requests.RequestException as e:
|
||||
logger.warning("Failed to fetch %s: %s", url, e)
|
||||
return []
|
||||
|
||||
@@ -86,10 +147,10 @@ def fetch_feed(url: str) -> list[dict]:
|
||||
|
||||
def fetch_content(url: str) -> str | None:
|
||||
try:
|
||||
req = Request(url, headers={"User-Agent": _USER_AGENT})
|
||||
with urlopen(req, timeout=15) as resp:
|
||||
html = resp.read()
|
||||
except (URLError, OSError) as e:
|
||||
resp = _session.get(url, timeout=15)
|
||||
resp.raise_for_status()
|
||||
html = resp.content
|
||||
except requests.RequestException as e:
|
||||
logger.warning("Failed to fetch content from %s: %s", url, e)
|
||||
return None
|
||||
|
||||
@@ -157,15 +218,8 @@ def purge_old_articles(conn: sqlite3.Connection, days: int) -> int:
|
||||
def get_recent_articles(conn: sqlite3.Connection, hours: int) -> list[dict]:
|
||||
cutoff = (datetime.now(timezone.utc) - timedelta(hours=hours)).isoformat()
|
||||
rows = conn.execute(
|
||||
"SELECT * FROM articles WHERE fetched_date >= ? ORDER BY id", (cutoff,)
|
||||
).fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
|
||||
def get_articles_by_ids(conn: sqlite3.Connection, ids: list[int]) -> list[dict]:
|
||||
placeholders = ",".join("?" for _ in ids)
|
||||
rows = conn.execute(
|
||||
f"SELECT * FROM articles WHERE id IN ({placeholders}) ORDER BY id", ids
|
||||
"SELECT * FROM articles WHERE published_date >= ? OR published_date IS NULL ORDER BY id",
|
||||
(cutoff,),
|
||||
).fetchall()
|
||||
return [dict(r) for r in rows]
|
||||
|
||||
@@ -194,16 +248,101 @@ def generate_summary(title: str, description: str | None, content: str | None, m
|
||||
return None
|
||||
|
||||
|
||||
def _run_test(mode: str, config: dict) -> None:
    """Run smoke tests for feed fetching, summarization, or both.

    All JSON results go to stdout; status messages go to stderr.

    Args:
        mode: ``"feed"`` to test feed/content fetching only, ``"summary"`` to
            test summarization only, or ``""`` to run both in sequence.
        config: Parsed configuration dict; the ``"feeds"`` and ``"ollama"``
            keys are read.

    Exits:
        Calls ``sys.exit(1)`` on an unknown mode or on any test failure.
        In full mode (``""``) a failed article-content fetch does not stop
        the summary test from running, but the process still exits with
        status 1 afterwards so the failure is visible to callers.
    """
    if mode not in ("", "feed", "summary"):
        print(f"Unknown test mode: {mode!r} (use 'feed', 'summary', or omit)", file=sys.stderr)
        sys.exit(1)

    feed_article = None  # may be populated by feed test for use in full mode
    feed_failed = False  # remembered so full mode can still report failure

    # --- Feed test ---
    if mode in ("", "feed"):
        print("=== Feed test ===", file=sys.stderr)
        feeds = config.get("feeds", [])
        enabled = [f for f in feeds if f.get("enabled", True)]
        if not enabled:
            print("FAIL: no enabled feeds in config", file=sys.stderr)
            sys.exit(1)

        # Only the first enabled feed is exercised.
        feed_cfg = enabled[0]
        url = feed_cfg["url"]
        name = feed_cfg.get("name", url)
        print(f"Fetching feed: {name} ({url})", file=sys.stderr)

        entries = fetch_feed(url)
        if not entries:
            print("FAIL: no entries returned from feed", file=sys.stderr)
            sys.exit(1)

        # Only the first entry of that feed is downloaded.
        entry = entries[0]
        link = entry.get("link", "")
        title = entry.get("title", "")
        print(f"Fetching content: {link}", file=sys.stderr)
        content = fetch_content(link) if link else None

        result = {
            "feed": name,
            "title": title,
            "url": link,
            "content_length": len(content) if content else 0,
        }
        print(json.dumps(result, ensure_ascii=False, indent=2))

        if content:
            print("PASS: feed fetch", file=sys.stderr)
            feed_article = {"title": title, "content": content}
        else:
            print("FAIL: could not fetch article content", file=sys.stderr)
            # Do not exit yet: in full mode the summary test should still
            # run against the hardcoded articles. The failure is reported
            # through the exit status at the end of the function.
            feed_failed = True

    # --- Summary test ---
    if mode in ("", "summary"):
        print("=== Summary test ===", file=sys.stderr)
        ollama_cfg = config.get("ollama")
        if not ollama_cfg:
            print("FAIL: no 'ollama' key in config", file=sys.stderr)
            sys.exit(1)

        model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507")
        prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")

        # Build test inputs: hardcoded articles + fetched article (full mode only)
        articles = list(_TEST_ARTICLES)
        if feed_article:
            articles.append(feed_article)

        all_ok = True
        for article in articles:
            print(f"Summarizing: {article['title']}", file=sys.stderr)
            summary = generate_summary(article["title"], None, article["content"], model, prompt)
            result = {"title": article["title"], "summary": summary}
            print(json.dumps(result, ensure_ascii=False, indent=2))
            if not summary:
                all_ok = False

        if all_ok:
            print("PASS: summary", file=sys.stderr)
        else:
            print("FAIL: one or more summaries failed", file=sys.stderr)
            sys.exit(1)

    # A feed failure must yield a non-zero exit status in every mode, not
    # only when mode == "feed".
    if feed_failed:
        sys.exit(1)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="RSS News Digest")
|
||||
parser.add_argument("-c", "--config", default="config.json", help="Config file path")
|
||||
parser.add_argument("-d", "--database", default="news_digest.db", help="SQLite database path")
|
||||
parser.add_argument("--hours", type=int, help="Override lookback hours")
|
||||
parser.add_argument("-f", "--fields", default="id,title,url", help="Comma-separated output fields")
|
||||
parser.add_argument("--digest", help="Article IDs to summarize (comma-separated, e.g. 1,3,7)")
|
||||
parser.add_argument("-f", "--fields", default="id,title,url,published_date,fetched_date,feed_name", help="Comma-separated output fields")
|
||||
parser.add_argument("--purge-only", action="store_true", help="Only purge old articles")
|
||||
parser.add_argument("--no-fetch", action="store_true", help="Skip fetching feeds, only query stored articles")
|
||||
parser.add_argument("-v", "--verbose", action="store_true", help="Debug logging to stderr")
|
||||
parser.add_argument("--test", nargs="?", const="", metavar="MODE",
|
||||
help="Smoke test: 'feed', 'summary', or omit for full pipeline")
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(
|
||||
@@ -218,39 +357,19 @@ def main():
|
||||
sys.exit(1)
|
||||
|
||||
config = load_config(str(config_path))
|
||||
|
||||
# Handle --test before any DB operations
|
||||
if args.test is not None:
|
||||
_run_test(args.test, config)
|
||||
return
|
||||
|
||||
settings = config.get("settings", {})
|
||||
hours_lookback = args.hours or settings.get("hours_lookback", 24)
|
||||
retention_days = settings.get("retention_days", 30)
|
||||
max_per_feed = settings.get("max_articles_per_feed", 0)
|
||||
|
||||
conn = init_db(args.database)
|
||||
|
||||
# Digest mode — summarize specified articles, then exit
|
||||
if args.digest:
|
||||
ollama_cfg = config.get("ollama", {})
|
||||
model = ollama_cfg.get("model", "qwen3")
|
||||
prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
|
||||
|
||||
ids = [int(x.strip()) for x in args.digest.split(",")]
|
||||
articles = get_articles_by_ids(conn, ids)
|
||||
|
||||
if not articles:
|
||||
logger.warning("No articles found for IDs: %s", ids)
|
||||
|
||||
results = []
|
||||
for article in articles:
|
||||
logger.debug("Summarizing article %d: %s", article["id"], article["title"])
|
||||
summary = generate_summary(article["title"], article.get("description"), article.get("content"), model, prompt)
|
||||
results.append({
|
||||
"id": article["id"],
|
||||
"title": article["title"],
|
||||
"url": article["url"],
|
||||
"summary": summary,
|
||||
})
|
||||
|
||||
print(json.dumps(results, ensure_ascii=False, indent=2))
|
||||
conn.close()
|
||||
return
|
||||
|
||||
# Purge old articles
|
||||
deleted = purge_old_articles(conn, retention_days)
|
||||
if deleted:
|
||||
@@ -266,6 +385,16 @@ def main():
|
||||
feeds = config.get("feeds", [])
|
||||
total_new = 0
|
||||
|
||||
# Read ollama config once for summarization during fetch
|
||||
ollama_cfg = config.get("ollama")
|
||||
if ollama_cfg:
|
||||
ollama_model = ollama_cfg.get("model", "kamekichi128/qwen3-4b-instruct-2507")
|
||||
ollama_prompt = ollama_cfg.get("prompt", "Summarize the following news article in 2-3 concise sentences:")
|
||||
logger.debug("Ollama summarization enabled (model: %s)", ollama_model)
|
||||
else:
|
||||
ollama_model = ollama_prompt = None
|
||||
logger.debug("Ollama not configured; skipping summarization")
|
||||
|
||||
for feed_cfg in feeds:
|
||||
if not feed_cfg.get("enabled", True):
|
||||
logger.debug("Skipping disabled feed: %s", feed_cfg.get("name"))
|
||||
@@ -297,18 +426,37 @@ def main():
|
||||
"author": entry.get("author"),
|
||||
})
|
||||
|
||||
# Cap articles per feed to avoid flooding the DB and downstream fetches
|
||||
if max_per_feed > 0:
|
||||
articles = articles[:max_per_feed]
|
||||
|
||||
new_urls = save_articles(conn, articles)
|
||||
total_new += len(new_urls)
|
||||
logger.info("Feed '%s': %d new articles (of %d within lookback)",
|
||||
feed_cfg.get("name", url), len(new_urls), len(articles))
|
||||
|
||||
# Fetch full content for newly inserted articles
|
||||
for article_url in new_urls:
|
||||
# Fetch full content and optionally summarize newly inserted articles
|
||||
for i, article_url in enumerate(new_urls):
|
||||
if i > 0 and not ollama_cfg:
|
||||
time.sleep(1) # rate limit when Ollama isn't providing natural delay
|
||||
logger.debug("Fetching content: %s", article_url)
|
||||
content = fetch_content(article_url)
|
||||
if content:
|
||||
conn.execute("UPDATE articles SET content = ? WHERE url = ?", (content, article_url))
|
||||
logger.debug("Saved content (%d chars) for %s", len(content), article_url)
|
||||
summary = None
|
||||
if ollama_cfg:
|
||||
row = conn.execute(
|
||||
"SELECT title, description FROM articles WHERE url = ?", (article_url,)
|
||||
).fetchone()
|
||||
if row:
|
||||
summary = generate_summary(row["title"], row["description"], content, ollama_model, ollama_prompt)
|
||||
if summary:
|
||||
logger.debug("Generated summary for %s", article_url)
|
||||
else:
|
||||
if i > 0:
|
||||
time.sleep(1) # fallback rate limit on summary failure
|
||||
conn.execute(
|
||||
"UPDATE articles SET content = ?, summary = ? WHERE url = ?",
|
||||
(content, summary, article_url),
|
||||
)
|
||||
conn.commit()
|
||||
|
||||
logger.info("Total new articles saved: %d", total_new)
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
feedparser>=6.0.0
|
||||
beautifulsoup4>=4.12.0
|
||||
requests>=2.31.0
|
||||
ollama>=0.4.0
|
||||
|
||||
Reference in New Issue
Block a user