email-processor: fix concurrency bugs and several other issues
- Add fcntl file locking around read-modify-write cycles on both decision_history.json and pending_emails.json to prevent data corruption from parallel processes - Pass --page-size 500 to himalaya envelope list to avoid silently missing emails beyond the default first page - Use ollama.Client(host=...) so the config.json host setting is actually respected - Fall back to sender-only matching in compute_confidence when LLM returns no valid taxonomy tags, instead of always returning 50% - Fix _format_address to return empty string instead of literal "None" or "[]" for missing address fields
This commit is contained in:
@@ -100,7 +100,7 @@ def get_unseen_envelopes():
|
||||
Returns a list of envelope dicts from himalaya's JSON output.
|
||||
Each has keys like: id, subject, from, to, date, flags.
|
||||
"""
|
||||
return _himalaya_json("envelope", "list", "not", "flag", "seen")
|
||||
return _himalaya_json("envelope", "list", "-s", "500", "not", "flag", "seen")
|
||||
|
||||
|
||||
def get_recent_envelopes(days):
|
||||
@@ -110,7 +110,7 @@ def get_recent_envelopes(days):
|
||||
bulk-classifying historical mail.
|
||||
"""
|
||||
since = (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")
|
||||
return _himalaya_json("envelope", "list", "after", since)
|
||||
return _himalaya_json("envelope", "list", "-s", "500", "after", since)
|
||||
|
||||
|
||||
def read_message(envelope_id):
|
||||
@@ -134,7 +134,9 @@ def _format_address(addr_field):
|
||||
name = first.get("name", "")
|
||||
addr = first.get("addr", "")
|
||||
return f"{name} <{addr}>" if name else addr
|
||||
return str(addr_field)
|
||||
elif isinstance(addr_field, str) and addr_field:
|
||||
return addr_field
|
||||
return ""
|
||||
|
||||
|
||||
def build_email_data(envelope, body, config):
|
||||
@@ -226,28 +228,28 @@ def add_to_pending(email_data, summary, reason, action_suggestion, confidence, t
|
||||
alongside the email metadata so the user can see what the model thought.
|
||||
Uses envelope_id as the primary identifier for review commands.
|
||||
"""
|
||||
pending = load_pending()
|
||||
|
||||
# Generate a stable ID from envelope ID + subject
|
||||
eid = str(email_data["id"])
|
||||
key = f"{eid}_{email_data['subject']}"
|
||||
msg_id = f"msg_{hashlib.md5(key.encode()).hexdigest()[:8]}"
|
||||
|
||||
pending[msg_id] = {
|
||||
"envelope_id": eid,
|
||||
"subject": email_data["subject"],
|
||||
"sender": email_data["sender"],
|
||||
"recipient": email_data.get("recipient", ""),
|
||||
"summary": summary,
|
||||
"reason": reason,
|
||||
"suggested_action": action_suggestion,
|
||||
"confidence": confidence,
|
||||
"tags": tags or [],
|
||||
"email_date": email_data.get("date", ""),
|
||||
"status": "pending",
|
||||
"found_at": datetime.now().isoformat(),
|
||||
}
|
||||
save_pending(pending)
|
||||
with decision_store.file_lock(PENDING_FILE):
|
||||
pending = load_pending()
|
||||
pending[msg_id] = {
|
||||
"envelope_id": eid,
|
||||
"subject": email_data["subject"],
|
||||
"sender": email_data["sender"],
|
||||
"recipient": email_data.get("recipient", ""),
|
||||
"summary": summary,
|
||||
"reason": reason,
|
||||
"suggested_action": action_suggestion,
|
||||
"confidence": confidence,
|
||||
"tags": tags or [],
|
||||
"email_date": email_data.get("date", ""),
|
||||
"status": "pending",
|
||||
"found_at": datetime.now().isoformat(),
|
||||
}
|
||||
save_pending(pending)
|
||||
return msg_id
|
||||
|
||||
|
||||
@@ -287,10 +289,11 @@ def cmd_scan(config, recent=None, dry_run=False):
|
||||
print("=" * 50)
|
||||
|
||||
# Clear done items from previous scan cycles
|
||||
pending = load_pending()
|
||||
cleared = {k: v for k, v in pending.items() if v.get("status") != "done"}
|
||||
if len(cleared) < len(pending):
|
||||
save_pending(cleared)
|
||||
with decision_store.file_lock(PENDING_FILE):
|
||||
pending = load_pending()
|
||||
cleared = {k: v for k, v in pending.items() if v.get("status") != "done"}
|
||||
if len(cleared) < len(pending):
|
||||
save_pending(cleared)
|
||||
|
||||
LOGS_DIR.mkdir(exist_ok=True)
|
||||
log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m-%d')}.log"
|
||||
@@ -484,9 +487,6 @@ def cmd_review_act(selector, action):
|
||||
log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m-%d')}.log"
|
||||
|
||||
# Execute action on each target
|
||||
pending = load_pending()
|
||||
pending_dirty = False
|
||||
|
||||
for msg_id, data in targets:
|
||||
eid = data.get("envelope_id")
|
||||
if not eid:
|
||||
@@ -498,11 +498,13 @@ def cmd_review_act(selector, action):
|
||||
# Record decision for future learning
|
||||
decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))
|
||||
|
||||
# Mark as done in pending queue
|
||||
pending[msg_id]["status"] = "done"
|
||||
pending[msg_id]["action"] = action
|
||||
pending[msg_id]["processed_at"] = datetime.now().isoformat()
|
||||
pending_dirty = True
|
||||
# Mark as done in pending queue (locked to avoid concurrent corruption)
|
||||
with decision_store.file_lock(PENDING_FILE):
|
||||
pending = load_pending()
|
||||
pending[msg_id]["status"] = "done"
|
||||
pending[msg_id]["action"] = action
|
||||
pending[msg_id]["processed_at"] = datetime.now().isoformat()
|
||||
save_pending(pending)
|
||||
|
||||
log_result(log_file, data, f"REVIEW:{action}", data.get("reason", ""))
|
||||
print(f" {msg_id}: {action} -> OK ({data['subject'][:40]})")
|
||||
@@ -510,9 +512,6 @@ def cmd_review_act(selector, action):
|
||||
log_result(log_file, data, f"REVIEW_FAILED:{action}", data.get("reason", ""))
|
||||
print(f" {msg_id}: {action} -> FAILED")
|
||||
|
||||
if pending_dirty:
|
||||
save_pending(pending)
|
||||
|
||||
|
||||
def cmd_review_accept():
|
||||
"""Accept all classifier suggestions for pending emails.
|
||||
@@ -529,9 +528,6 @@ def cmd_review_accept():
|
||||
LOGS_DIR.mkdir(exist_ok=True)
|
||||
log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m-%d')}.log"
|
||||
|
||||
pending = load_pending()
|
||||
pending_dirty = False
|
||||
|
||||
for msg_id, data in sorted_items:
|
||||
action = data.get("suggested_action")
|
||||
if not action:
|
||||
@@ -547,10 +543,12 @@ def cmd_review_accept():
|
||||
if success:
|
||||
decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))
|
||||
|
||||
pending[msg_id]["status"] = "done"
|
||||
pending[msg_id]["action"] = action
|
||||
pending[msg_id]["processed_at"] = datetime.now().isoformat()
|
||||
pending_dirty = True
|
||||
with decision_store.file_lock(PENDING_FILE):
|
||||
pending = load_pending()
|
||||
pending[msg_id]["status"] = "done"
|
||||
pending[msg_id]["action"] = action
|
||||
pending[msg_id]["processed_at"] = datetime.now().isoformat()
|
||||
save_pending(pending)
|
||||
|
||||
log_result(log_file, data, f"ACCEPT:{action}", data.get("reason", ""))
|
||||
print(f" {msg_id}: {action} -> OK ({data['subject'][:40]})")
|
||||
@@ -558,9 +556,6 @@ def cmd_review_accept():
|
||||
log_result(log_file, data, f"ACCEPT_FAILED:{action}", data.get("reason", ""))
|
||||
print(f" {msg_id}: {action} -> FAILED")
|
||||
|
||||
if pending_dirty:
|
||||
save_pending(pending)
|
||||
|
||||
|
||||
def _resolve_target(selector, sorted_items):
|
||||
"""Resolve a selector (envelope_id or msg_id) to a (msg_id, data) tuple.
|
||||
|
||||
Reference in New Issue
Block a user