email-processor: fix concurrency bugs and several other issues
- Add fcntl file locking around read-modify-write cycles on both decision_history.json and pending_emails.json to prevent data corruption from parallel processes.
- Pass --page-size 500 to himalaya envelope list to avoid silently missing emails beyond the default first page.
- Use ollama.Client(host=...) so the config.json host setting is actually respected.
- Fall back to sender-only matching in compute_confidence when the LLM returns no valid taxonomy tags, instead of always returning 50%.
- Fix _format_address to return an empty string instead of the literal "None" or "[]" for missing address fields.
This commit is contained in:
@@ -12,8 +12,10 @@ recipient, subject, summary, action taken, and whether it was a user or
|
||||
auto decision.
|
||||
"""
|
||||
|
||||
import fcntl
|
||||
import json
|
||||
import re
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
@@ -34,6 +36,19 @@ _STOP_WORDS = {"re", "fwd", "the", "a", "an", "is", "to", "for", "and", "or", "y
|
||||
# Internal helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@contextmanager
def file_lock(path):
    """Hold an exclusive advisory lock tied to *path* for the ``with`` body.

    The lock is taken on a sidecar file named ``<path>.lock`` rather than on
    *path* itself, so the data file can be rewritten freely while the lock is
    held. ``DATA_DIR`` is created first if it does not exist yet.

    The ``flock`` call uses ``LOCK_EX`` without ``LOCK_NB``, so it waits until
    any other holder releases the lock. The lock is released and the sidecar
    file closed when the ``with`` body exits, whether normally or by raising.
    """
    DATA_DIR.mkdir(exist_ok=True)
    handle = open(f"{path}.lock", "w")
    try:
        fcntl.flock(handle, fcntl.LOCK_EX)
        try:
            yield
        finally:
            # Release explicitly before the descriptor is closed below.
            fcntl.flock(handle, fcntl.LOCK_UN)
    finally:
        handle.close()
|
||||
|
||||
|
||||
def _load_history():
|
||||
"""Load the full decision history list from disk."""
|
||||
if not HISTORY_FILE.exists():
|
||||
@@ -69,7 +84,6 @@ def record_decision(email_data, action, source="user", tags=None):
|
||||
source: "user" (manual review) or "auto" (high-confidence).
|
||||
tags: list of category tags from the classifier taxonomy.
|
||||
"""
|
||||
history = _load_history()
|
||||
entry = {
|
||||
"timestamp": datetime.now().isoformat(timespec="seconds"),
|
||||
"sender": email_data.get("sender", ""),
|
||||
@@ -80,8 +94,10 @@ def record_decision(email_data, action, source="user", tags=None):
|
||||
"source": source,
|
||||
"tags": tags or [],
|
||||
}
|
||||
history.append(entry)
|
||||
_save_history(history)
|
||||
with file_lock(HISTORY_FILE):
|
||||
history = _load_history()
|
||||
history.append(entry)
|
||||
_save_history(history)
|
||||
return entry
|
||||
|
||||
|
||||
@@ -152,23 +168,28 @@ def compute_confidence(sender_email, action, tags):
|
||||
Returns an integer 0-100.
|
||||
"""
|
||||
history = _load_history()
|
||||
if not history or not tags:
|
||||
if not history:
|
||||
return 50
|
||||
|
||||
# Find past decisions with same sender and sufficient tag overlap
|
||||
# Find past decisions with same sender and sufficient tag overlap.
|
||||
# If tags are empty (LLM returned no valid taxonomy tags), fall back
|
||||
# to sender-only matching so history still contributes to confidence.
|
||||
matches = []
|
||||
for entry in history:
|
||||
entry_email = extract_email_address(entry.get("sender", ""))
|
||||
if entry_email != sender_email:
|
||||
continue
|
||||
|
||||
entry_tags = entry.get("tags", [])
|
||||
if not entry_tags:
|
||||
continue
|
||||
|
||||
shared = len(set(tags) & set(entry_tags))
|
||||
min_len = min(len(tags), len(entry_tags))
|
||||
if min_len > 0 and shared / min_len >= 0.5:
|
||||
if tags:
|
||||
entry_tags = entry.get("tags", [])
|
||||
if not entry_tags:
|
||||
continue
|
||||
shared = len(set(tags) & set(entry_tags))
|
||||
min_len = min(len(tags), len(entry_tags))
|
||||
if min_len > 0 and shared / min_len >= 0.5:
|
||||
matches.append(entry)
|
||||
else:
|
||||
# No tags to compare — match on sender alone
|
||||
matches.append(entry)
|
||||
|
||||
if not matches:
|
||||
|
||||
Reference in New Issue
Block a user