email-processor: fix concurrency bugs and several other issues

- Add fcntl file locking around read-modify-write cycles on both
  decision_history.json and pending_emails.json to prevent data
  corruption from parallel processes
- Pass --page-size 500 to himalaya envelope list to avoid silently
  missing emails beyond the default first page
- Use ollama.Client(host=...) so the config.json host setting is
  actually respected
- Fall back to sender-only matching in compute_confidence when the LLM
  returns no valid taxonomy tags, instead of always returning 50%
- Fix _format_address to return empty string instead of literal
  "None" or "[]" for missing address fields
This commit is contained in:
Yanxin Lu
2026-03-20 18:58:13 -07:00
parent 4e3c6acab6
commit 71672b31ca
3 changed files with 78 additions and 59 deletions

View File

@@ -12,8 +12,10 @@ recipient, subject, summary, action taken, and whether it was a user or
auto decision.
"""
import fcntl
import json
import re
from contextlib import contextmanager
from datetime import datetime, timedelta
from pathlib import Path
from collections import Counter
@@ -34,6 +36,19 @@ _STOP_WORDS = {"re", "fwd", "the", "a", "an", "is", "to", "for", "and", "or", "y
# Internal helpers
# ---------------------------------------------------------------------------
@contextmanager
def file_lock(path):
    """Hold an exclusive ``fcntl.flock`` lock for *path* during the ``with`` body.

    The lock is taken on a sidecar file named ``<path>.lock`` rather than on
    *path* itself. ``LOCK_EX`` is requested without ``LOCK_NB``, so the call
    blocks until any other holder releases the lock.

    Args:
        path: The file being protected (``str`` or ``Path``). It is only used
            to derive the lock file name; it is never opened here.

    Yields:
        None. The lock is held for as long as the ``with`` block runs and is
        released on exit, including when the block raises.
    """
    lock_path = Path(str(path) + ".lock")
    # The lock file lives next to *path*, so that directory is the one that
    # must exist for open() to succeed. parents=True also covers a missing
    # intermediate directory.
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    # Kept from the original: make sure the shared data directory exists.
    DATA_DIR.mkdir(exist_ok=True)
    with open(lock_path, "w") as lock_file:
        fcntl.flock(lock_file, fcntl.LOCK_EX)
        try:
            yield
        finally:
            # Release explicitly before the descriptor is closed.
            fcntl.flock(lock_file, fcntl.LOCK_UN)
def _load_history():
"""Load the full decision history list from disk."""
if not HISTORY_FILE.exists():
@@ -69,7 +84,6 @@ def record_decision(email_data, action, source="user", tags=None):
source: "user" (manual review) or "auto" (high-confidence).
tags: list of category tags from the classifier taxonomy.
"""
history = _load_history()
entry = {
"timestamp": datetime.now().isoformat(timespec="seconds"),
"sender": email_data.get("sender", ""),
@@ -80,8 +94,10 @@ def record_decision(email_data, action, source="user", tags=None):
"source": source,
"tags": tags or [],
}
history.append(entry)
_save_history(history)
with file_lock(HISTORY_FILE):
history = _load_history()
history.append(entry)
_save_history(history)
return entry
@@ -152,23 +168,28 @@ def compute_confidence(sender_email, action, tags):
Returns an integer 0-100.
"""
history = _load_history()
if not history or not tags:
if not history:
return 50
# Find past decisions with same sender and sufficient tag overlap
# Find past decisions with same sender and sufficient tag overlap.
# If tags are empty (LLM returned no valid taxonomy tags), fall back
# to sender-only matching so history still contributes to confidence.
matches = []
for entry in history:
entry_email = extract_email_address(entry.get("sender", ""))
if entry_email != sender_email:
continue
entry_tags = entry.get("tags", [])
if not entry_tags:
continue
shared = len(set(tags) & set(entry_tags))
min_len = min(len(tags), len(entry_tags))
if min_len > 0 and shared / min_len >= 0.5:
if tags:
entry_tags = entry.get("tags", [])
if not entry_tags:
continue
shared = len(set(tags) & set(entry_tags))
min_len = min(len(tags), len(entry_tags))
if min_len > 0 and shared / min_len >= 0.5:
matches.append(entry)
else:
# No tags to compare — match on sender alone
matches.append(entry)
if not matches: