Clean up stale comments, dead code, and code quality issues

- Remove dead code: unused PENDING_FILE, _extract_domain(), sender_domain
  field, imap_uid fallback, check_unseen_only config key
- Fix stale comments: removed tag references in README and docstrings,
  top_domains -> top_senders, 1-based number -> scan_index number
- Make _extract_email_address public (used by 3 modules)
- Extract _format_address helper to deduplicate from/to parsing
- Batch pending queue disk I/O in review act/accept (load once, save once)
- Reuse cleared pending dict in scan instead of redundant disk load
- Track envelope IDs during scan loop to catch duplicates
- Fix default confidence_threshold 75 -> 85 to match config and docs
- Update get_relevant_examples default n=10 -> n=5 to match caller
- Add graceful error for --recent with non-numeric value
This commit is contained in:
Yanxin Lu
2026-03-05 15:28:05 -08:00
parent 361e983b0f
commit 723c47bbb3
5 changed files with 70 additions and 83 deletions

View File

@@ -121,6 +121,20 @@ def read_message(envelope_id):
return _himalaya("message", "read", "--preview", "--no-headers", str(envelope_id))
def _format_address(addr_field):
"""Format a himalaya address field (dict, list, or string) into a display string."""
if isinstance(addr_field, dict):
name = addr_field.get("name", "")
addr = addr_field.get("addr", "")
return f"{name} <{addr}>" if name else addr
elif isinstance(addr_field, list) and addr_field:
first = addr_field[0]
name = first.get("name", "")
addr = first.get("addr", "")
return f"{name} <{addr}>" if name else addr
return str(addr_field)
def build_email_data(envelope, body, config):
"""Build the email_data dict expected by classifier and decision_store.
@@ -129,40 +143,11 @@ def build_email_data(envelope, body, config):
"""
max_body = config.get("rules", {}).get("max_body_length", 1000)
# himalaya envelope JSON uses "from" as a nested object or string
sender = envelope.get("from", {})
if isinstance(sender, dict):
# Format: {"name": "Display Name", "addr": "user@example.com"}
name = sender.get("name", "")
addr = sender.get("addr", "")
sender_str = f"{name} <{addr}>" if name else addr
elif isinstance(sender, list) and sender:
first = sender[0]
name = first.get("name", "")
addr = first.get("addr", "")
sender_str = f"{name} <{addr}>" if name else addr
else:
sender_str = str(sender)
# Same for "to"
to = envelope.get("to", {})
if isinstance(to, dict):
name = to.get("name", "")
addr = to.get("addr", "")
to_str = f"{name} <{addr}>" if name else addr
elif isinstance(to, list) and to:
first = to[0]
name = first.get("name", "")
addr = first.get("addr", "")
to_str = f"{name} <{addr}>" if name else addr
else:
to_str = str(to)
return {
"id": str(envelope.get("id", "")),
"subject": envelope.get("subject", "(No Subject)"),
"sender": sender_str,
"recipient": to_str,
"sender": _format_address(envelope.get("from", {})),
"recipient": _format_address(envelope.get("to", {})),
"date": envelope.get("date", ""),
"body": body[:max_body],
}
@@ -322,7 +307,7 @@ def cmd_scan(config, recent=None, dry_run=False):
# Load automation threshold
automation = config.get("automation", {})
confidence_threshold = automation.get("confidence_threshold", 75)
confidence_threshold = automation.get("confidence_threshold", 85)
# Fetch envelopes via himalaya
if recent:
@@ -340,9 +325,8 @@ def cmd_scan(config, recent=None, dry_run=False):
queued = 0
skipped = 0
# Load pending queue once to skip already-queued emails
pending = load_pending()
pending_eids = {v.get("envelope_id") for v in pending.values() if v.get("status") == "pending"}
# Reuse the cleared pending dict from above to skip already-queued emails
pending_eids = {v.get("envelope_id") for v in cleared.values() if v.get("status") == "pending"}
for envelope in envelopes:
eid = envelope.get("id", "?")
@@ -353,6 +337,9 @@ def cmd_scan(config, recent=None, dry_run=False):
skipped += 1
continue
# Track this eid so duplicates within the same envelope list are caught
pending_eids.add(str(eid))
print(f"[{eid}] ", end="", flush=True)
# Read message body without marking as seen
@@ -370,7 +357,7 @@ def cmd_scan(config, recent=None, dry_run=False):
)
# Compute confidence from decision history
sender_email = decision_store._extract_email_address(email_data.get("sender", ""))
sender_email = decision_store.extract_email_address(email_data.get("sender", ""))
confidence = decision_store.compute_confidence(sender_email, action, tags)
tags_str = ", ".join(tags) if tags else "(none)"
@@ -479,7 +466,7 @@ def cmd_review_act(selector, action):
"""Execute an action on one or more pending emails.
Args:
selector: a 1-based number, a msg_id string, or "all".
selector: a scan_index number, a msg_id string, or "all".
action: one of delete/archive/keep/mark_read/label:<name>.
"""
# Validate action
@@ -507,8 +494,11 @@ def cmd_review_act(selector, action):
log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m-%d')}.log"
# Execute action on each target
pending = load_pending()
pending_dirty = False
for msg_id, data in targets:
eid = data.get("envelope_id") or data.get("imap_uid")
eid = data.get("envelope_id")
if not eid:
print(f" {msg_id}: No envelope ID, skipping")
continue
@@ -519,11 +509,10 @@ def cmd_review_act(selector, action):
decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))
# Mark as done in pending queue
pending = load_pending()
pending[msg_id]["status"] = "done"
pending[msg_id]["action"] = action
pending[msg_id]["processed_at"] = datetime.now().isoformat()
save_pending(pending)
pending_dirty = True
log_result(log_file, data, f"REVIEW:{action}", data.get("reason", ""))
print(f" {msg_id}: {action} -> OK ({data['subject'][:40]})")
@@ -531,6 +520,9 @@ def cmd_review_act(selector, action):
log_result(log_file, data, f"REVIEW_FAILED:{action}", data.get("reason", ""))
print(f" {msg_id}: {action} -> FAILED")
if pending_dirty:
save_pending(pending)
def cmd_review_accept():
"""Accept all classifier suggestions for pending emails.
@@ -547,13 +539,16 @@ def cmd_review_accept():
LOGS_DIR.mkdir(exist_ok=True)
log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m-%d')}.log"
pending = load_pending()
pending_dirty = False
for msg_id, data in sorted_items:
action = data.get("suggested_action")
if not action:
print(f" {msg_id}: No suggestion, skipping")
continue
eid = data.get("envelope_id") or data.get("imap_uid")
eid = data.get("envelope_id")
if not eid:
print(f" {msg_id}: No envelope ID, skipping")
continue
@@ -562,11 +557,10 @@ def cmd_review_accept():
if success:
decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))
pending = load_pending()
pending[msg_id]["status"] = "done"
pending[msg_id]["action"] = action
pending[msg_id]["processed_at"] = datetime.now().isoformat()
save_pending(pending)
pending_dirty = True
log_result(log_file, data, f"ACCEPT:{action}", data.get("reason", ""))
print(f" {msg_id}: {action} -> OK ({data['subject'][:40]})")
@@ -574,6 +568,9 @@ def cmd_review_accept():
log_result(log_file, data, f"ACCEPT_FAILED:{action}", data.get("reason", ""))
print(f" {msg_id}: {action} -> FAILED")
if pending_dirty:
save_pending(pending)
def _resolve_target(selector, sorted_items):
"""Resolve a selector (scan_index number or msg_id) to a (msg_id, data) tuple.
@@ -611,7 +608,7 @@ def cmd_stats():
"""Print a summary of the decision history.
Shows total decisions, user vs. auto breakdown, action distribution,
top sender domains, and custom labels.
top senders, and custom labels.
"""
stats = decision_store.get_all_stats()
@@ -672,7 +669,11 @@ if __name__ == "__main__":
i = 0
while i < len(args):
if args[i] == "--recent" and i + 1 < len(args):
recent = int(args[i + 1])
try:
recent = int(args[i + 1])
except ValueError:
print(f"--recent requires a number, got: {args[i + 1]}")
sys.exit(1)
i += 2
elif args[i] == "--dry-run":
dry_run = True