Compute confidence from decision history instead of LLM

This commit is contained in:
Yanxin Lu
2026-03-04 14:23:50 -08:00
parent 720f4ef1ad
commit 64e28b55d1
4 changed files with 133 additions and 84 deletions

View File

@@ -71,7 +71,7 @@ def _extract_email_address(sender):
# Public API
# ---------------------------------------------------------------------------
def record_decision(email_data, action, source="user"):
def record_decision(email_data, action, source="user", tags=None):
"""Append a decision to the history file.
Args:
@@ -79,6 +79,7 @@ def record_decision(email_data, action, source="user"):
action: one of "delete", "archive", "keep", "mark_read",
or "label:<name>".
source: "user" (manual review) or "auto" (high-confidence).
tags: list of category tags from the classifier taxonomy.
"""
history = _load_history()
entry = {
@@ -90,6 +91,7 @@ def record_decision(email_data, action, source="user"):
"summary": email_data.get("summary", ""),
"action": action,
"source": source,
"tags": tags or [],
}
history.append(entry)
_save_history(history)
@@ -99,10 +101,9 @@ def record_decision(email_data, action, source="user"):
def get_relevant_examples(email_data, n=10):
"""Find the N most relevant past decisions for a given email.
Relevance is scored by three signals:
- Exact sender domain match: +3 points
- Recipient string match: +2 points
- Subject keyword overlap: +1 point per shared word
Relevance is scored by two signals:
- Exact sender email address match: +3 points
- Subject keyword overlap: +1 point per shared word
Only entries with score > 0 are considered. Results are returned
sorted by descending relevance.
@@ -111,8 +112,7 @@ def get_relevant_examples(email_data, n=10):
if not history:
return []
target_domain = _extract_domain(email_data.get("sender", ""))
target_recipient = email_data.get("recipient", "").lower()
target_email = _extract_email_address(email_data.get("sender", ""))
target_words = (
set(re.findall(r"\w+", email_data.get("subject", "").lower())) - _STOP_WORDS
)
@@ -121,15 +121,11 @@ def get_relevant_examples(email_data, n=10):
for entry in history:
score = 0
# Signal 1: sender domain match
if target_domain and entry.get("sender_domain", "") == target_domain:
# Signal 1: sender email match
if target_email and _extract_email_address(entry.get("sender", "")) == target_email:
score += 3
# Signal 2: recipient substring match
if target_recipient and target_recipient in entry.get("recipient", "").lower():
score += 2
# Signal 3: subject keyword overlap
# Signal 2: subject keyword overlap
entry_words = (
set(re.findall(r"\w+", entry.get("subject", "").lower())) - _STOP_WORDS
)
@@ -142,27 +138,64 @@ def get_relevant_examples(email_data, n=10):
return [entry for _, entry in scored[:n]]
def get_sender_stats(sender_email):
    """Get action distribution for a sender email address.

    Args:
        sender_email: normalized sender address to look up in history.

    Returns:
        A dict mapping action name to count, e.g.
        {"delete": 5, "keep": 2, "archive": 1}.
    """
    # Tally actions only for history entries whose extracted sender
    # address matches the one we were asked about.
    tally = Counter(
        entry["action"]
        for entry in _load_history()
        if _extract_email_address(entry.get("sender", "")) == sender_email
    )
    return dict(tally)
def compute_confidence(sender_email, action, tags):
    """Compute confidence from decision history by matching email signatures.

    A "signature" is (sender_email, tags). A past decision matches when it
    has the same sender email AND at least 50% tag overlap with the
    current email (overlap measured against the shorter tag list).

    Confidence combines two factors:
      1. Agreement: the fraction of matching decisions that chose *action*.
      2. Match-count cap: each match raises the cap by 10%
         (1 match -> max 10%, 5 matches -> 50%, 10+ -> 100%).

    Args:
        sender_email: normalized sender address of the current email.
        action: proposed action to score, e.g. "delete" or "keep".
        tags: list of category tags from the classifier taxonomy.

    Returns:
        An integer 0-100. Falls back to a neutral 50 when there is no
        history, no tags, or no matching past decision.
    """
    history = _load_history()
    if not history or not tags:
        return 50

    # Collect past decisions with the same sender and >= 50% tag overlap.
    current_tags = set(tags)
    matches = []
    for past in history:
        if _extract_email_address(past.get("sender", "")) != sender_email:
            continue
        past_tags = past.get("tags", [])
        if not past_tags:
            continue
        overlap = len(current_tags & set(past_tags))
        # Compare against the shorter raw list so a small tag set can
        # still fully match a larger one.
        denom = min(len(tags), len(past_tags))
        if denom > 0 and overlap / denom >= 0.5:
            matches.append(past)

    if not matches:
        return 50

    # Agreement: share of matches that picked the same action.
    total = len(matches)
    agreed = sum(1 for match in matches if match["action"] == action)
    agreement = round(agreed / total * 100)

    # Cap confidence until enough history accumulates (10% per match).
    return min(agreement, min(total * 10, 100))
def get_known_labels():
@@ -194,13 +227,13 @@ def get_all_stats():
by_action = Counter(e["action"] for e in history)
by_source = Counter(e["source"] for e in history)
# Top 10 sender domains by decision count
domain_counts = Counter(e.get("sender_domain", "") for e in history)
top_domains = domain_counts.most_common(10)
# Top 10 sender addresses by decision count
sender_counts = Counter(_extract_email_address(e.get("sender", "")) for e in history)
top_senders = sender_counts.most_common(10)
return {
"total": total,
"by_action": dict(by_action),
"by_source": dict(by_source),
"top_domains": top_domains,
"top_senders": top_senders,
}