Compute confidence from decision history instead of LLM

2026-03-04 14:23:50 -08:00
parent 720f4ef1ad
commit 64e28b55d1
4 changed files with 133 additions and 84 deletions
--- a/scripts/email_processor/classifier.py
+++ b/scripts/email_processor/classifier.py
@@ -5,7 +5,10 @@ Classifier - LLM-based email classification with learning.
 This module builds a rich prompt for the local Ollama model (Qwen3) that
 includes few-shot examples from past user decisions, per-sender statistics,
 and a list of known labels. The model returns a structured response with
-an action, confidence score, summary, and reason.
+an action, category tags, summary, and reason.
+
+Confidence is NOT produced by the LLM — it is computed externally from
+decision history by decision_store.compute_confidence().

 The prompt structure:
  1. System instructions (action definitions)
@@ -13,7 +16,7 @@ The prompt structure:
  3. Sender statistics ("linkedin.com: deleted 8 times, kept 2 times")
  4. Few-shot examples (top 5 most relevant past decisions)
  5. The email to classify (subject, sender, recipient, body preview)
-  6. Output format specification
+  6. Output format specification (action, tags, summary, reason)
 """

 import time
@@ -24,6 +27,15 @@ import decision_store

 LOGS_DIR = Path(__file__).parent / "logs"

+TAG_TAXONOMY = [
+    "receipt", "invoice", "payment", "billing",
+    "shipping", "delivery",
+    "promotion", "discount", "marketing", "newsletter",
+    "notification", "security", "social",
+    "reminder", "confirmation", "update", "alert",
+    "personal", "account", "subscription", "travel",
+]
+

 def _build_prompt(email_data, config):
    """Assemble the full classification prompt with learning context.
@@ -36,8 +48,8 @@ def _build_prompt(email_data, config):

    # Gather learning context from decision history
    examples = decision_store.get_relevant_examples(email_data, n=10)
-    sender_domain = decision_store._extract_domain(email_data.get("sender", ""))
-    sender_stats = decision_store.get_sender_stats(sender_domain) if sender_domain else {}
+    sender_email = decision_store._extract_email_address(email_data.get("sender", ""))
+    sender_stats = decision_store.get_sender_stats(sender_email) if sender_email else {}
    known_labels = decision_store.get_known_labels()

    # /no_think disables Qwen3's chain-of-thought, giving faster + shorter output
@@ -63,7 +75,7 @@ def _build_prompt(email_data, config):
        stats_str = ", ".join(
            f"{action} {count} times" for action, count in sender_stats.items()
        )
-        parts.append(f"\nHistory for {sender_domain}: {stats_str}\n")
+        parts.append(f"\nHistory for {sender_email}: {stats_str}\n")

    # Section 4: Few-shot examples (top 5 most relevant past decisions)
    if examples:
@@ -86,10 +98,11 @@ def _build_prompt(email_data, config):
    )

    # Section 6: Required output format
+    tags_list = ", ".join(TAG_TAXONOMY)
    parts.append(
        "Respond in this exact format (nothing else):\n"
        "Action: [delete|archive|keep|mark_read|label:<name>]\n"
-        "Confidence: [0-100]\n"
+        f"Tags: [comma-separated tags from: {tags_list}]\n"
        "Summary: [one sentence summary of the email]\n"
        "Reason: [brief explanation for your classification]"
    )
@@ -97,18 +110,19 @@ def _build_prompt(email_data, config):
    return "\n".join(parts)


-def _log_llm(prompt, output, email_data, action, confidence, duration):
+def _log_llm(prompt, output, email_data, action, tags, duration):
    """Log the full LLM prompt and response to logs/llm_YYYY-MM-DD.log."""
    LOGS_DIR.mkdir(exist_ok=True)
    log_file = LOGS_DIR / f"llm_{datetime.now().strftime('%Y-%m-%d')}.log"
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    subject = email_data.get("subject", "(No Subject)")[:60]
    sender = email_data.get("sender", "(Unknown)")[:60]
+    tags_str = ", ".join(tags)

    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"{'=' * 70}\n")
        f.write(f"[{timestamp}] {subject}\n")
-        f.write(f"From: {sender} | Result: {action} @ {confidence}% | {duration:.1f}s\n")
+        f.write(f"From: {sender} | Result: {action} tags=[{tags_str}] | {duration:.1f}s\n")
        f.write(f"{'-' * 70}\n")
        f.write(f"PROMPT:\n{prompt}\n")
        f.write(f"{'-' * 70}\n")
@@ -121,17 +135,19 @@ def _parse_response(output):

    Expected format (one per line):
        Action: delete
-        Confidence: 92
+        Tags: promotion, marketing, newsletter
        Summary: Promotional offer from retailer
        Reason: Clearly a marketing email with discount offer

-    Falls back to safe defaults (keep, 50% confidence) on parse failure.
+    Falls back to safe defaults (keep, empty tags) on parse failure.
    """
    action = "keep"
-    confidence = 50
+    tags = []
    summary = "No summary"
    reason = "Unknown"

+    valid_tags = set(TAG_TAXONOMY)
+
    for line in output.strip().split("\n"):
        line = line.strip()
        if line.startswith("Action:"):
@@ -139,25 +155,26 @@ def _parse_response(output):
            valid_actions = {"delete", "archive", "keep", "mark_read"}
            if raw_action in valid_actions or raw_action.startswith("label:"):
                action = raw_action
-        elif line.startswith("Confidence:"):
-            try:
-                confidence = int(line.replace("Confidence:", "").strip().rstrip("%"))
-                confidence = max(0, min(100, confidence))  # clamp to 0-100
-            except ValueError:
-                confidence = 50
+        elif line.startswith("Tags:"):
+            raw_tags = line.replace("Tags:", "").strip()
+            tags = [
+                t.strip().lower()
+                for t in raw_tags.split(",")
+                if t.strip().lower() in valid_tags
+            ]
        elif line.startswith("Summary:"):
            summary = line.replace("Summary:", "").strip()[:200]
        elif line.startswith("Reason:"):
            reason = line.replace("Reason:", "").strip()

-    return action, confidence, summary, reason
+    return action, tags, summary, reason


 def classify_email(email_data, config):
    """Classify an email using the local LLM with few-shot learning context.

    Connects to Ollama, sends the assembled prompt, and parses the response.
-    On any error, falls back to "keep" with 0% confidence so the email
+    On any error, falls back to "keep" with empty tags so the email
    gets queued for manual review rather than auto-acted upon.

    Args:
@@ -165,7 +182,7 @@ def classify_email(email_data, config):
        config:     full config dict (needs ollama.model and rules.max_body_length).

    Returns:
-        Tuple of (action, confidence, summary, reason, duration_seconds).
+        Tuple of (action, tags, summary, reason, duration_seconds).
    """
    import ollama

@@ -177,15 +194,15 @@ def classify_email(email_data, config):
        # Low temperature for consistent classification
        response = ollama.generate(model=model, prompt=prompt, options={"temperature": 0.1})
        output = response["response"]
-        action, confidence, summary, reason = _parse_response(output)
+        action, tags, summary, reason = _parse_response(output)
    except Exception as e:
-        # On failure, default to "keep" with 0 confidence -> always queued
+        # On failure, default to "keep" with empty tags -> always queued
        output = f"ERROR: {e}"
        action = "keep"
-        confidence = 0
+        tags = []
        summary = "Classification failed"
        reason = f"error - {str(e)[:100]}"

    duration = time.time() - start_time
-    _log_llm(prompt, output, email_data, action, confidence, duration)
-    return action, confidence, summary, reason, duration
+    _log_llm(prompt, output, email_data, action, tags, duration)
+    return action, tags, summary, reason, duration