Compute confidence from decision history instead of LLM

2026-03-04 14:23:50 -08:00
parent 720f4ef1ad
commit 64e28b55d1
4 changed files with 133 additions and 84 deletions
--- a/scripts/email_processor/main.py
+++ b/scripts/email_processor/main.py
@@ -232,11 +232,11 @@ def save_pending(pending):
        json.dump(pending, f, indent=2, ensure_ascii=False)


-def add_to_pending(email_data, summary, reason, action_suggestion, confidence):
+def add_to_pending(email_data, summary, reason, action_suggestion, confidence, tags=None):
    """Add an email to the pending queue for manual review.

-    Stores the classifier's suggestion and confidence alongside the
-    email metadata so the user can see what the model thought.
+    Stores the classifier's suggestion, computed confidence, and tags
+    alongside the email metadata so the user can see what the model thought.
    """
    pending = load_pending()

@@ -254,6 +254,7 @@ def add_to_pending(email_data, summary, reason, action_suggestion, confidence):
        "reason": reason,
        "suggested_action": action_suggestion,
        "confidence": confidence,
+        "tags": tags or [],
        "email_date": email_data.get("date", ""),
        "status": "pending",
        "found_at": datetime.now().isoformat(),
@@ -283,10 +284,10 @@ def log_result(log_file, email_data, action, detail, duration=None):
 def cmd_scan(config, recent=None, dry_run=False):
    """Fetch emails, classify each one, then auto-act or queue.

-    Auto-action is based on a single confidence threshold. When the
-    decision history has fewer than 20 entries, a higher threshold (95%)
-    is used to be conservative during the learning phase. Once enough
-    history accumulates, the configured threshold takes over.
+    Confidence is computed from decision history by matching the email's
+    signature (sender_email, tags) against past decisions. New/unknown
+    senders start at 50% (queued). Confidence grows as consistent history
+    accumulates.

    Args:
        config:  full config dict.
@@ -302,17 +303,7 @@ def cmd_scan(config, recent=None, dry_run=False):

    # Load automation threshold
    automation = config.get("automation", {})
-    configured_threshold = automation.get("confidence_threshold", 75)
-
-    # Adaptive threshold: be conservative when history is thin
-    stats = decision_store.get_all_stats()
-    total_decisions = stats["total"] if stats else 0
-    bootstrap_min = automation.get("bootstrap_min_decisions", 20)
-    if total_decisions < bootstrap_min:
-        confidence_threshold = 95
-        print(f"Learning phase ({total_decisions}/{bootstrap_min} decisions) — threshold: 95%\n")
-    else:
-        confidence_threshold = configured_threshold
+    confidence_threshold = automation.get("confidence_threshold", 75)

    # Fetch envelopes via himalaya
    if recent:
@@ -354,12 +345,18 @@ def cmd_scan(config, recent=None, dry_run=False):
        email_data = build_email_data(envelope, body, config)
        print(f"{email_data['subject'][:55]}")

-        # Run the LLM classifier (includes few-shot examples from history)
-        action, confidence, summary, reason, duration = classifier.classify_email(
+        # Run the LLM classifier (returns tags instead of confidence)
+        action, tags, summary, reason, duration = classifier.classify_email(
            email_data, config
        )

+        # Compute confidence from decision history
+        sender_email = decision_store._extract_email_address(email_data.get("sender", ""))
+        confidence = decision_store.compute_confidence(sender_email, action, tags)
+
+        tags_str = ", ".join(tags) if tags else "(none)"
        print(f"    -> {action} (confidence: {confidence}%, {duration:.1f}s)")
+        print(f"       tags: [{tags_str}]")
        print(f"       {reason[:80]}")

        # Auto-act if confidence meets threshold
@@ -379,7 +376,7 @@ def cmd_scan(config, recent=None, dry_run=False):
            success = execute_action(eid, action)
            if success:
                decision_store.record_decision(
-                    {**email_data, "summary": summary}, action, source="auto"
+                    {**email_data, "summary": summary}, action, source="auto", tags=tags
                )
                log_result(log_file, email_data, f"AUTO:{action}", reason, duration)
                print(f"    ** AUTO-executed: {action}")
@@ -388,11 +385,11 @@ def cmd_scan(config, recent=None, dry_run=False):
                # Himalaya action failed — fall back to queuing
                log_result(log_file, email_data, "AUTO_FAILED", reason, duration)
                print(f"    !! Auto-action failed, queuing instead")
-                add_to_pending(email_data, summary, reason, action, confidence)
+                add_to_pending(email_data, summary, reason, action, confidence, tags)
                queued += 1
        else:
            # Not enough confidence or history — queue for manual review
-            add_to_pending(email_data, summary, reason, action, confidence)
+            add_to_pending(email_data, summary, reason, action, confidence, tags)
            log_result(log_file, email_data, f"QUEUED:{action}@{confidence}%", reason, duration)
            print(f"    -> Queued (confidence {confidence}% < {confidence_threshold}%)")
            queued += 1
@@ -440,11 +437,14 @@ def cmd_review_list():
    for i, (msg_id, data) in enumerate(sorted_items, 1):
        suggested = data.get("suggested_action", "?")
        conf = data.get("confidence", "?")
+        tags = data.get("tags", [])
+        tags_str = ", ".join(tags) if tags else "(none)"
        print(f"\n  {i}. [{msg_id}]")
        print(f"     Subject: {data.get('subject', 'N/A')[:55]}")
        print(f"     From: {data.get('sender', 'N/A')[:55]}")
        print(f"     To: {data.get('recipient', 'N/A')[:40]}")
        print(f"     Summary: {data.get('summary', 'N/A')[:70]}")
+        print(f"     Tags: [{tags_str}]")
        print(f"     Suggested: {suggested} ({conf}% confidence)")

    print(f"\n{'=' * 60}")
@@ -496,7 +496,7 @@ def cmd_review_act(selector, action):
        success = execute_action(eid, action)
        if success:
            # Record decision for future learning
-            decision_store.record_decision(data, action, source="user")
+            decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))

            # Mark as done in pending queue
            pending = load_pending()
@@ -540,7 +540,7 @@ def cmd_review_accept():

        success = execute_action(eid, action)
        if success:
-            decision_store.record_decision(data, action, source="user")
+            decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))

            pending = load_pending()
            pending[msg_id]["status"] = "done"
@@ -616,14 +616,14 @@ def cmd_stats():
    for action, count in sorted(stats["by_action"].items(), key=lambda x: -x[1]):
        print(f"  {action}: {count}")

-    # Top sender domains with per-domain action counts
-    print(f"\nTop sender domains:")
-    for domain, count in stats["top_domains"]:
-        domain_stats = decision_store.get_sender_stats(domain)
+    # Top sender addresses with per-sender action counts
+    print(f"\nTop senders:")
+    for sender, count in stats["top_senders"]:
+        sender_stats = decision_store.get_sender_stats(sender)
        detail = ", ".join(
-            f"{a}:{c}" for a, c in sorted(domain_stats.items(), key=lambda x: -x[1])
+            f"{a}:{c}" for a, c in sorted(sender_stats.items(), key=lambda x: -x[1])
        )
-        print(f"  {domain}: {count} ({detail})")
+        print(f"  {sender}: {count} ({detail})")

    # Custom labels
    labels = decision_store.get_known_labels()