Stabilize review indices and deduplicate tag taxonomy

Review items now get a stable scan_index assigned during scan, so
sequential review commands don't target the wrong emails after earlier
items are resolved. Done items are cleared at the start of each scan, and
numbering restarts at 1 once no pending items remain.

Reduce the tag taxonomy from 21 to 14 tags by removing duplicates: drop
invoice/payment (covered by billing), delivery (covered by shipping),
discount/marketing (covered by promotion), and the generic
notification/update tags.
This commit is contained in:
Yanxin Lu
2026-03-05 15:02:49 -08:00
parent 81bc42075f
commit 361e983b0f
3 changed files with 53 additions and 25 deletions

View File

@@ -237,6 +237,9 @@ def add_to_pending(email_data, summary, reason, action_suggestion, confidence, t
Stores the classifier's suggestion, computed confidence, and tags
alongside the email metadata so the user can see what the model thought.
Each item gets a stable scan_index (assigned sequentially within a scan
cycle) so that review commands can reference items by number without
indices shifting after earlier items are resolved.
"""
pending = load_pending()
@@ -245,6 +248,14 @@ def add_to_pending(email_data, summary, reason, action_suggestion, confidence, t
key = f"{eid}_{email_data['subject']}"
msg_id = f"msg_{hashlib.md5(key.encode()).hexdigest()[:8]}"
# Assign the next scan_index: max of existing pending items + 1
existing_indices = [
v.get("scan_index", 0)
for v in pending.values()
if v.get("status") == "pending"
]
next_index = max(existing_indices, default=0) + 1
pending[msg_id] = {
"envelope_id": eid,
"subject": email_data["subject"],
@@ -258,6 +269,7 @@ def add_to_pending(email_data, summary, reason, action_suggestion, confidence, t
"email_date": email_data.get("date", ""),
"status": "pending",
"found_at": datetime.now().isoformat(),
"scan_index": next_index,
}
save_pending(pending)
return msg_id
@@ -298,6 +310,13 @@ def cmd_scan(config, recent=None, dry_run=False):
print(f"Email Processor - {mode}")
print("=" * 50)
# Clear done items left over from previous scan cycles. New items are
# numbered after the highest scan_index still pending (from 1 if none).
pending = load_pending()
cleared = {k: v for k, v in pending.items() if v.get("status") != "done"}
if len(cleared) < len(pending):
save_pending(cleared)
LOGS_DIR.mkdir(exist_ok=True)
log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m-%d')}.log"
@@ -412,10 +431,10 @@ def cmd_scan(config, recent=None, dry_run=False):
# ---------------------------------------------------------------------------
def _get_pending_items():
"""Return only pending (not done) items, sorted by found_at."""
"""Return only pending (not done) items, sorted by scan_index."""
pending = load_pending()
items = {k: v for k, v in pending.items() if v.get("status") == "pending"}
sorted_items = sorted(items.items(), key=lambda x: x[1].get("found_at", ""))
sorted_items = sorted(items.items(), key=lambda x: x[1].get("scan_index", 0))
return sorted_items
@@ -434,12 +453,13 @@ def cmd_review_list():
print(f"Pending emails: {len(sorted_items)}")
print("=" * 60)
for i, (msg_id, data) in enumerate(sorted_items, 1):
for msg_id, data in sorted_items:
num = data.get("scan_index", "?")
suggested = data.get("suggested_action", "?")
conf = data.get("confidence", "?")
tags = data.get("tags", [])
tags_str = ", ".join(tags) if tags else "(none)"
print(f"\n {i}. [{msg_id}]")
print(f"\n {num}. [{msg_id}]")
print(f" Subject: {data.get('subject', 'N/A')[:55]}")
print(f" From: {data.get('sender', 'N/A')[:55]}")
print(f" To: {data.get('recipient', 'N/A')[:40]}")
@@ -556,18 +576,21 @@ def cmd_review_accept():
def _resolve_target(selector, sorted_items):
"""Resolve a selector (number or msg_id) to a (msg_id, data) tuple.
"""Resolve a selector (scan_index number or msg_id) to a (msg_id, data) tuple.
When given a number, looks up the pending item whose scan_index matches
(stable across deletions). When given a string, looks up by msg_id.
Returns None and prints an error if the selector is invalid.
"""
# Try as 1-based index
# Try as scan_index number
try:
idx = int(selector) - 1
if 0 <= idx < len(sorted_items):
return sorted_items[idx]
else:
print(f"Invalid number. Range: 1-{len(sorted_items)}")
return None
idx = int(selector)
for msg_id, data in sorted_items:
if data.get("scan_index") == idx:
return (msg_id, data)
valid = [str(d.get("scan_index")) for _, d in sorted_items]
print(f"No item with number {idx}. Valid numbers: {', '.join(valid)}")
return None
except ValueError:
pass