email-processor: fix concurrency bugs and several other issues

- Add fcntl file locking around read-modify-write cycles on both decision_history.json and pending_emails.json to prevent data corruption from parallel processes
- Pass --page-size 500 to himalaya envelope list to avoid silently missing emails beyond the default first page
- Use ollama.Client(host=...) so the config.json host setting is actually respected
- Fall back to sender-only matching in compute_confidence when the LLM returns no valid taxonomy tags, instead of always returning 50%
- Fix _format_address to return an empty string instead of literal "None" or "[]" for missing address fields
2026-03-20 18:58:13 -07:00
parent 4e3c6acab6
commit 71672b31ca
3 changed files with 78 additions and 59 deletions
--- a/scripts/email_processor/decision_store.py
+++ b/scripts/email_processor/decision_store.py
@@ -12,8 +12,10 @@ recipient, subject, summary, action taken, and whether it was a user or
 auto decision.
 """

+import fcntl
 import json
 import re
+from contextlib import contextmanager
 from datetime import datetime, timedelta
 from pathlib import Path
 from collections import Counter
@@ -34,6 +36,19 @@ _STOP_WORDS = {"re", "fwd", "the", "a", "an", "is", "to", "for", "and", "or", "y
 # Internal helpers
 # ---------------------------------------------------------------------------

+@contextmanager
+def file_lock(path):
+    """Acquire an exclusive file lock for safe concurrent access."""
+    DATA_DIR.mkdir(exist_ok=True)
+    lock_path = str(path) + ".lock"
+    with open(lock_path, "w") as lock_file:
+        fcntl.flock(lock_file, fcntl.LOCK_EX)
+        try:
+            yield
+        finally:
+            fcntl.flock(lock_file, fcntl.LOCK_UN)
+
+
 def _load_history():
    """Load the full decision history list from disk."""
    if not HISTORY_FILE.exists():
@@ -69,7 +84,6 @@ def record_decision(email_data, action, source="user", tags=None):
        source:     "user" (manual review) or "auto" (high-confidence).
        tags:       list of category tags from the classifier taxonomy.
    """
-    history = _load_history()
    entry = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "sender": email_data.get("sender", ""),
@@ -80,8 +94,10 @@ def record_decision(email_data, action, source="user", tags=None):
        "source": source,
        "tags": tags or [],
    }
-    history.append(entry)
-    _save_history(history)
+    with file_lock(HISTORY_FILE):
+        history = _load_history()
+        history.append(entry)
+        _save_history(history)
    return entry


@@ -152,23 +168,28 @@ def compute_confidence(sender_email, action, tags):
    Returns an integer 0-100.
    """
    history = _load_history()
-    if not history or not tags:
+    if not history:
        return 50

-    # Find past decisions with same sender and sufficient tag overlap
+    # Find past decisions with same sender and sufficient tag overlap.
+    # If tags are empty (LLM returned no valid taxonomy tags), fall back
+    # to sender-only matching so history still contributes to confidence.
    matches = []
    for entry in history:
        entry_email = extract_email_address(entry.get("sender", ""))
        if entry_email != sender_email:
            continue

-        entry_tags = entry.get("tags", [])
-        if not entry_tags:
-            continue
-
-        shared = len(set(tags) & set(entry_tags))
-        min_len = min(len(tags), len(entry_tags))
-        if min_len > 0 and shared / min_len >= 0.5:
+        if tags:
+            entry_tags = entry.get("tags", [])
+            if not entry_tags:
+                continue
+            shared = len(set(tags) & set(entry_tags))
+            min_len = min(len(tags), len(entry_tags))
+            if min_len > 0 and shared / min_len >= 0.5:
+                matches.append(entry)
+        else:
+            # No tags to compare — match on sender alone
            matches.append(entry)

    if not matches: