Clean up stale comments, dead code, and code quality issues

- Remove dead code: unused PENDING_FILE, _extract_domain(), sender_domain field, imap_uid fallback, check_unseen_only config key - Fix stale comments: removed tag references in README and docstrings, top_domains -> top_senders, 1-based number -> scan_index number - Make _extract_email_address public (used by 3 modules) - Extract _format_address helper to deduplicate from/to parsing - Batch pending queue disk I/O in review act/accept (load once, save once) - Reuse cleared pending dict in scan instead of redundant disk load - Track envelope IDs during scan loop to catch duplicates - Fix default confidence_threshold 75 -> 85 to match config and docs - Update get_relevant_examples default n=10 -> n=5 to match caller - Add graceful error for --recent with non-numeric value
2026-03-05 15:28:05 -08:00
parent 361e983b0f
commit 723c47bbb3
5 changed files with 70 additions and 83 deletions
--- a/scripts/email_processor/main.py
+++ b/scripts/email_processor/main.py
@@ -121,6 +121,20 @@ def read_message(envelope_id):
    return _himalaya("message", "read", "--preview", "--no-headers", str(envelope_id))


+def _format_address(addr_field):
+    """Format a himalaya address field (dict, list, or string) into a display string."""
+    if isinstance(addr_field, dict):
+        name = addr_field.get("name", "")
+        addr = addr_field.get("addr", "")
+        return f"{name} <{addr}>" if name else addr
+    elif isinstance(addr_field, list) and addr_field:
+        first = addr_field[0]
+        name = first.get("name", "")
+        addr = first.get("addr", "")
+        return f"{name} <{addr}>" if name else addr
+    return str(addr_field)
+
+
 def build_email_data(envelope, body, config):
    """Build the email_data dict expected by classifier and decision_store.

@@ -129,40 +143,11 @@ def build_email_data(envelope, body, config):
    """
    max_body = config.get("rules", {}).get("max_body_length", 1000)

-    # himalaya envelope JSON uses "from" as a nested object or string
-    sender = envelope.get("from", {})
-    if isinstance(sender, dict):
-        # Format: {"name": "Display Name", "addr": "user@example.com"}
-        name = sender.get("name", "")
-        addr = sender.get("addr", "")
-        sender_str = f"{name} <{addr}>" if name else addr
-    elif isinstance(sender, list) and sender:
-        first = sender[0]
-        name = first.get("name", "")
-        addr = first.get("addr", "")
-        sender_str = f"{name} <{addr}>" if name else addr
-    else:
-        sender_str = str(sender)
-
-    # Same for "to"
-    to = envelope.get("to", {})
-    if isinstance(to, dict):
-        name = to.get("name", "")
-        addr = to.get("addr", "")
-        to_str = f"{name} <{addr}>" if name else addr
-    elif isinstance(to, list) and to:
-        first = to[0]
-        name = first.get("name", "")
-        addr = first.get("addr", "")
-        to_str = f"{name} <{addr}>" if name else addr
-    else:
-        to_str = str(to)
-
    return {
        "id": str(envelope.get("id", "")),
        "subject": envelope.get("subject", "(No Subject)"),
-        "sender": sender_str,
-        "recipient": to_str,
+        "sender": _format_address(envelope.get("from", {})),
+        "recipient": _format_address(envelope.get("to", {})),
        "date": envelope.get("date", ""),
        "body": body[:max_body],
    }
@@ -322,7 +307,7 @@ def cmd_scan(config, recent=None, dry_run=False):

    # Load automation threshold
    automation = config.get("automation", {})
-    confidence_threshold = automation.get("confidence_threshold", 75)
+    confidence_threshold = automation.get("confidence_threshold", 85)

    # Fetch envelopes via himalaya
    if recent:
@@ -340,9 +325,8 @@ def cmd_scan(config, recent=None, dry_run=False):
    queued = 0
    skipped = 0

-    # Load pending queue once to skip already-queued emails
-    pending = load_pending()
-    pending_eids = {v.get("envelope_id") for v in pending.values() if v.get("status") == "pending"}
+    # Reuse the cleared pending dict from above to skip already-queued emails
+    pending_eids = {v.get("envelope_id") for v in cleared.values() if v.get("status") == "pending"}

    for envelope in envelopes:
        eid = envelope.get("id", "?")
@@ -353,6 +337,9 @@ def cmd_scan(config, recent=None, dry_run=False):
            skipped += 1
            continue

+        # Track this eid so duplicates within the same envelope list are caught
+        pending_eids.add(str(eid))
+
        print(f"[{eid}] ", end="", flush=True)

        # Read message body without marking as seen
@@ -370,7 +357,7 @@ def cmd_scan(config, recent=None, dry_run=False):
        )

        # Compute confidence from decision history
-        sender_email = decision_store._extract_email_address(email_data.get("sender", ""))
+        sender_email = decision_store.extract_email_address(email_data.get("sender", ""))
        confidence = decision_store.compute_confidence(sender_email, action, tags)

        tags_str = ", ".join(tags) if tags else "(none)"
@@ -479,7 +466,7 @@ def cmd_review_act(selector, action):
    """Execute an action on one or more pending emails.

    Args:
-        selector: a 1-based number, a msg_id string, or "all".
+        selector: a scan_index number, a msg_id string, or "all".
        action:   one of delete/archive/keep/mark_read/label:<name>.
    """
    # Validate action
@@ -507,8 +494,11 @@ def cmd_review_act(selector, action):
    log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m-%d')}.log"

    # Execute action on each target
+    pending = load_pending()
+    pending_dirty = False
+
    for msg_id, data in targets:
-        eid = data.get("envelope_id") or data.get("imap_uid")
+        eid = data.get("envelope_id")
        if not eid:
            print(f"  {msg_id}: No envelope ID, skipping")
            continue
@@ -519,11 +509,10 @@ def cmd_review_act(selector, action):
            decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))

            # Mark as done in pending queue
-            pending = load_pending()
            pending[msg_id]["status"] = "done"
            pending[msg_id]["action"] = action
            pending[msg_id]["processed_at"] = datetime.now().isoformat()
-            save_pending(pending)
+            pending_dirty = True

            log_result(log_file, data, f"REVIEW:{action}", data.get("reason", ""))
            print(f"  {msg_id}: {action} -> OK ({data['subject'][:40]})")
@@ -531,6 +520,9 @@ def cmd_review_act(selector, action):
            log_result(log_file, data, f"REVIEW_FAILED:{action}", data.get("reason", ""))
            print(f"  {msg_id}: {action} -> FAILED")

+    if pending_dirty:
+        save_pending(pending)
+

 def cmd_review_accept():
    """Accept all classifier suggestions for pending emails.
@@ -547,13 +539,16 @@ def cmd_review_accept():
    LOGS_DIR.mkdir(exist_ok=True)
    log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m-%d')}.log"

+    pending = load_pending()
+    pending_dirty = False
+
    for msg_id, data in sorted_items:
        action = data.get("suggested_action")
        if not action:
            print(f"  {msg_id}: No suggestion, skipping")
            continue

-        eid = data.get("envelope_id") or data.get("imap_uid")
+        eid = data.get("envelope_id")
        if not eid:
            print(f"  {msg_id}: No envelope ID, skipping")
            continue
@@ -562,11 +557,10 @@ def cmd_review_accept():
        if success:
            decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))

-            pending = load_pending()
            pending[msg_id]["status"] = "done"
            pending[msg_id]["action"] = action
            pending[msg_id]["processed_at"] = datetime.now().isoformat()
-            save_pending(pending)
+            pending_dirty = True

            log_result(log_file, data, f"ACCEPT:{action}", data.get("reason", ""))
            print(f"  {msg_id}: {action} -> OK ({data['subject'][:40]})")
@@ -574,6 +568,9 @@ def cmd_review_accept():
            log_result(log_file, data, f"ACCEPT_FAILED:{action}", data.get("reason", ""))
            print(f"  {msg_id}: {action} -> FAILED")

+    if pending_dirty:
+        save_pending(pending)
+

 def _resolve_target(selector, sorted_items):
    """Resolve a selector (scan_index number or msg_id) to a (msg_id, data) tuple.
@@ -611,7 +608,7 @@ def cmd_stats():
    """Print a summary of the decision history.

    Shows total decisions, user vs. auto breakdown, action distribution,
-    top sender domains, and custom labels.
+    top senders, and custom labels.
    """
    stats = decision_store.get_all_stats()

@@ -672,7 +669,11 @@ if __name__ == "__main__":
    i = 0
    while i < len(args):
        if args[i] == "--recent" and i + 1 < len(args):
-            recent = int(args[i + 1])
+            try:
+                recent = int(args[i + 1])
+            except ValueError:
+                print(f"--recent requires a number, got: {args[i + 1]}")
+                sys.exit(1)
            i += 2
        elif args[i] == "--dry-run":
            dry_run = True