Compute confidence from decision history instead of LLM

2026-03-04 14:23:50 -08:00
parent 720f4ef1ad
commit 64e28b55d1
4 changed files with 133 additions and 84 deletions
--- a/scripts/email_processor/classifier.py
+++ b/scripts/email_processor/classifier.py
@@ -5,7 +5,10 @@ Classifier - LLM-based email classification with learning.
 This module builds a rich prompt for the local Ollama model (Qwen3) that
 includes few-shot examples from past user decisions, per-sender statistics,
 and a list of known labels. The model returns a structured response with
-an action, confidence score, summary, and reason.
+an action, category tags, summary, and reason.
 Confidence is NOT produced by the LLM — it is computed externally from
 decision history by decision_store.compute_confidence().
 The prompt structure:
  1. System instructions (action definitions)
@@ -13,7 +16,7 @@ The prompt structure:
  3. Sender statistics ("linkedin.com: deleted 8 times, kept 2 times")
  4. Few-shot examples (top 5 most relevant past decisions)
  5. The email to classify (subject, sender, recipient, body preview)
-  6. Output format specification
+  6. Output format specification (action, tags, summary, reason)
 """
 import time
@@ -24,6 +27,15 @@ import decision_store
 LOGS_DIR = Path(__file__).parent / "logs"
 TAG_TAXONOMY = [
    "receipt", "invoice", "payment", "billing",
    "shipping", "delivery",
    "promotion", "discount", "marketing", "newsletter",
    "notification", "security", "social",
    "reminder", "confirmation", "update", "alert",
    "personal", "account", "subscription", "travel",
 ]
 def _build_prompt(email_data, config):
    """Assemble the full classification prompt with learning context.
@@ -36,8 +48,8 @@ def _build_prompt(email_data, config):
    # Gather learning context from decision history
    examples = decision_store.get_relevant_examples(email_data, n=10)
-    sender_domain = decision_store._extract_domain(email_data.get("sender", ""))
+    sender_email = decision_store._extract_email_address(email_data.get("sender", ""))
-    sender_stats = decision_store.get_sender_stats(sender_domain) if sender_domain else {}
+    sender_stats = decision_store.get_sender_stats(sender_email) if sender_email else {}
    known_labels = decision_store.get_known_labels()
    # /no_think disables Qwen3's chain-of-thought, giving faster + shorter output
@@ -63,7 +75,7 @@ def _build_prompt(email_data, config):
        stats_str = ", ".join(
            f"{action} {count} times" for action, count in sender_stats.items()
        )
-        parts.append(f"\nHistory for {sender_domain}: {stats_str}\n")
+        parts.append(f"\nHistory for {sender_email}: {stats_str}\n")
    # Section 4: Few-shot examples (top 5 most relevant past decisions)
    if examples:
@@ -86,10 +98,11 @@ def _build_prompt(email_data, config):
    )
    # Section 6: Required output format
    tags_list = ", ".join(TAG_TAXONOMY)
    parts.append(
        "Respond in this exact format (nothing else):\n"
        "Action: [delete|archive|keep|mark_read|label:<name>]\n"
-        "Confidence: [0-100]\n"
+        f"Tags: [comma-separated tags from: {tags_list}]\n"
        "Summary: [one sentence summary of the email]\n"
        "Reason: [brief explanation for your classification]"
    )
@@ -97,18 +110,19 @@ def _build_prompt(email_data, config):
    return "\n".join(parts)
-def _log_llm(prompt, output, email_data, action, confidence, duration):
+def _log_llm(prompt, output, email_data, action, tags, duration):
    """Log the full LLM prompt and response to logs/llm_YYYY-MM-DD.log."""
    LOGS_DIR.mkdir(exist_ok=True)
    log_file = LOGS_DIR / f"llm_{datetime.now().strftime('%Y-%m-%d')}.log"
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    subject = email_data.get("subject", "(No Subject)")[:60]
    sender = email_data.get("sender", "(Unknown)")[:60]
    tags_str = ", ".join(tags)
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"{'=' * 70}\n")
        f.write(f"[{timestamp}] {subject}\n")
-        f.write(f"From: {sender} | Result: {action} @ {confidence}% | {duration:.1f}s\n")
+        f.write(f"From: {sender} | Result: {action} tags=[{tags_str}] | {duration:.1f}s\n")
        f.write(f"{'-' * 70}\n")
        f.write(f"PROMPT:\n{prompt}\n")
        f.write(f"{'-' * 70}\n")
@@ -121,17 +135,19 @@ def _parse_response(output):
    Expected format (one per line):
        Action: delete
-        Confidence: 92
+        Tags: promotion, marketing, newsletter
        Summary: Promotional offer from retailer
        Reason: Clearly a marketing email with discount offer
-    Falls back to safe defaults (keep, 50% confidence) on parse failure.
+    Falls back to safe defaults (keep, empty tags) on parse failure.
    """
    action = "keep"
-    confidence = 50
+    tags = []
    summary = "No summary"
    reason = "Unknown"
    valid_tags = set(TAG_TAXONOMY)
    for line in output.strip().split("\n"):
        line = line.strip()
        if line.startswith("Action:"):
@@ -139,25 +155,26 @@ def _parse_response(output):
            valid_actions = {"delete", "archive", "keep", "mark_read"}
            if raw_action in valid_actions or raw_action.startswith("label:"):
                action = raw_action
-        elif line.startswith("Confidence:"):
+        elif line.startswith("Tags:"):
-            try:
+            raw_tags = line.replace("Tags:", "").strip()
-                confidence = int(line.replace("Confidence:", "").strip().rstrip("%"))
+            tags = [
-                confidence = max(0, min(100, confidence))  # clamp to 0-100
+                t.strip().lower()
-            except ValueError:
+                for t in raw_tags.split(",")
-                confidence = 50
+                if t.strip().lower() in valid_tags
            ]
        elif line.startswith("Summary:"):
            summary = line.replace("Summary:", "").strip()[:200]
        elif line.startswith("Reason:"):
            reason = line.replace("Reason:", "").strip()
-    return action, confidence, summary, reason
+    return action, tags, summary, reason
 def classify_email(email_data, config):
    """Classify an email using the local LLM with few-shot learning context.
    Connects to Ollama, sends the assembled prompt, and parses the response.
-    On any error, falls back to "keep" with 0% confidence so the email
+    On any error, falls back to "keep" with empty tags so the email
    gets queued for manual review rather than auto-acted upon.
    Args:
@@ -165,7 +182,7 @@ def classify_email(email_data, config):
        config:     full config dict (needs ollama.model and rules.max_body_length).
    Returns:
-        Tuple of (action, confidence, summary, reason, duration_seconds).
+        Tuple of (action, tags, summary, reason, duration_seconds).
    """
    import ollama
@@ -177,15 +194,15 @@ def classify_email(email_data, config):
        # Low temperature for consistent classification
        response = ollama.generate(model=model, prompt=prompt, options={"temperature": 0.1})
        output = response["response"]
-        action, confidence, summary, reason = _parse_response(output)
+        action, tags, summary, reason = _parse_response(output)
    except Exception as e:
-        # On failure, default to "keep" with 0 confidence -> always queued
+        # On failure, default to "keep" with empty tags -> always queued
        output = f"ERROR: {e}"
        action = "keep"
-        confidence = 0
+        tags = []
        summary = "Classification failed"
        reason = f"error - {str(e)[:100]}"
    duration = time.time() - start_time
-    _log_llm(prompt, output, email_data, action, confidence, duration)
+    _log_llm(prompt, output, email_data, action, tags, duration)
-    return action, confidence, summary, reason, duration
+    return action, tags, summary, reason, duration
--- a/scripts/email_processor/config.json
+++ b/scripts/email_processor/config.json
@@ -8,7 +8,6 @@
    "check_unseen_only": true
  },
  "automation": {
-    "confidence_threshold": 75,
+    "confidence_threshold": 85
-    "bootstrap_min_decisions": 30
  }
 }
--- a/scripts/email_processor/decision_store.py
+++ b/scripts/email_processor/decision_store.py
@@ -71,7 +71,7 @@ def _extract_email_address(sender):
 # Public API
 # ---------------------------------------------------------------------------
-def record_decision(email_data, action, source="user"):
+def record_decision(email_data, action, source="user", tags=None):
    """Append a decision to the history file.
    Args:
@@ -79,6 +79,7 @@ def record_decision(email_data, action, source="user"):
        action:     one of "delete", "archive", "keep", "mark_read",
                    or "label:<name>".
        source:     "user" (manual review) or "auto" (high-confidence).
        tags:       list of category tags from the classifier taxonomy.
    """
    history = _load_history()
    entry = {
@@ -90,6 +91,7 @@ def record_decision(email_data, action, source="user"):
        "summary": email_data.get("summary", ""),
        "action": action,
        "source": source,
        "tags": tags or [],
    }
    history.append(entry)
    _save_history(history)
@@ -99,10 +101,9 @@ def record_decision(email_data, action, source="user"):
 def get_relevant_examples(email_data, n=10):
    """Find the N most relevant past decisions for a given email.
-    Relevance is scored by three signals:
+    Relevance is scored by two signals:
-      - Exact sender domain match:        +3 points
+      - Exact sender email address match:  +3 points
-      - Recipient string match:           +2 points
+      - Subject keyword overlap:           +1 point per shared word
-      - Subject keyword overlap:          +1 point per shared word
    Only entries with score > 0 are considered. Results are returned
    sorted by descending relevance.
@@ -111,8 +112,7 @@ def get_relevant_examples(email_data, n=10):
    if not history:
        return []
-    target_domain = _extract_domain(email_data.get("sender", ""))
+    target_email = _extract_email_address(email_data.get("sender", ""))
-    target_recipient = email_data.get("recipient", "").lower()
    target_words = (
        set(re.findall(r"\w+", email_data.get("subject", "").lower())) - _STOP_WORDS
    )
@@ -121,15 +121,11 @@ def get_relevant_examples(email_data, n=10):
    for entry in history:
        score = 0
-        # Signal 1: sender domain match
+        # Signal 1: sender email match
-        if target_domain and entry.get("sender_domain", "") == target_domain:
+        if target_email and _extract_email_address(entry.get("sender", "")) == target_email:
            score += 3
-        # Signal 2: recipient substring match
+        # Signal 2: subject keyword overlap
-        if target_recipient and target_recipient in entry.get("recipient", "").lower():
-            score += 2
-        # Signal 3: subject keyword overlap
        entry_words = (
            set(re.findall(r"\w+", entry.get("subject", "").lower())) - _STOP_WORDS
        )
@@ -142,27 +138,64 @@ def get_relevant_examples(email_data, n=10):
    return [entry for _, entry in scored[:n]]
-def get_sender_stats(sender_domain):
+def get_sender_stats(sender_email):
-    """Get action distribution for a sender domain.
+    """Get action distribution for a sender email address.
    Returns a dict like {"delete": 5, "keep": 2, "archive": 1}.
    """
    history = _load_history()
    actions = Counter()
    for entry in history:
-        if entry.get("sender_domain", "") == sender_domain:
+        if _extract_email_address(entry.get("sender", "")) == sender_email:
            actions[entry["action"]] += 1
    return dict(actions)
-def get_sender_history_count(sender_domain):
+def compute_confidence(sender_email, action, tags):
-    """Count total past decisions for a sender domain.
+    """Compute confidence from decision history by matching email signatures.
-    Used by the scan command to decide whether there is enough history
+    A "signature" is (sender_email, tags). Past decisions match if they have
-    to trust auto-actions for this sender.
+    the same sender email AND at least 50% tag overlap with the current email.
    Confidence is based on two factors:
      1. Agreement: what fraction of matching decisions chose the same action.
      2. Match-count cap: limits confidence until enough history exists
         (1 match -> max 10%, 5 matches -> 50%, 10+ -> 100%).
    Returns an integer 0-100.
    """
    history = _load_history()
-    return sum(1 for e in history if e.get("sender_domain", "") == sender_domain)
+    if not history or not tags:
        return 50
    # Find past decisions with same sender and sufficient tag overlap
    matches = []
    for entry in history:
        entry_email = _extract_email_address(entry.get("sender", ""))
        if entry_email != sender_email:
            continue
        entry_tags = entry.get("tags", [])
        if not entry_tags:
            continue
        shared = len(set(tags) & set(entry_tags))
        min_len = min(len(tags), len(entry_tags))
        if min_len > 0 and shared / min_len >= 0.5:
            matches.append(entry)
    if not matches:
        return 50
    # Agreement: fraction of matches with the same action
    matching_action = sum(1 for m in matches if m["action"] == action)
    total = len(matches)
    agreement = round(matching_action / total * 100)
    # Cap by match count: each match adds 10% to the cap
    cap = min(total * 10, 100)
    return min(agreement, cap)
 def get_known_labels():
@@ -194,13 +227,13 @@ def get_all_stats():
    by_action = Counter(e["action"] for e in history)
    by_source = Counter(e["source"] for e in history)
-    # Top 10 sender domains by decision count
+    # Top 10 sender addresses by decision count
-    domain_counts = Counter(e.get("sender_domain", "") for e in history)
+    sender_counts = Counter(_extract_email_address(e.get("sender", "")) for e in history)
-    top_domains = domain_counts.most_common(10)
+    top_senders = sender_counts.most_common(10)
    return {
        "total": total,
        "by_action": dict(by_action),
        "by_source": dict(by_source),
-        "top_domains": top_domains,
+        "top_senders": top_senders,
    }
--- a/scripts/email_processor/main.py
+++ b/scripts/email_processor/main.py
@@ -232,11 +232,11 @@ def save_pending(pending):
        json.dump(pending, f, indent=2, ensure_ascii=False)
-def add_to_pending(email_data, summary, reason, action_suggestion, confidence):
+def add_to_pending(email_data, summary, reason, action_suggestion, confidence, tags=None):
    """Add an email to the pending queue for manual review.
-    Stores the classifier's suggestion and confidence alongside the
+    Stores the classifier's suggestion, computed confidence, and tags
-    email metadata so the user can see what the model thought.
+    alongside the email metadata so the user can see what the model thought.
    """
    pending = load_pending()
@@ -254,6 +254,7 @@ def add_to_pending(email_data, summary, reason, action_suggestion, confidence):
        "reason": reason,
        "suggested_action": action_suggestion,
        "confidence": confidence,
        "tags": tags or [],
        "email_date": email_data.get("date", ""),
        "status": "pending",
        "found_at": datetime.now().isoformat(),
@@ -283,10 +284,10 @@ def log_result(log_file, email_data, action, detail, duration=None):
 def cmd_scan(config, recent=None, dry_run=False):
    """Fetch emails, classify each one, then auto-act or queue.
-    Auto-action is based on a single confidence threshold. When the
+    Confidence is computed from decision history by matching the email's
-    decision history has fewer than 20 entries, a higher threshold (95%)
+    signature (sender_email, tags) against past decisions. New/unknown
-    is used to be conservative during the learning phase. Once enough
+    senders start at 50% (queued). Confidence grows as consistent history
-    history accumulates, the configured threshold takes over.
+    accumulates.
    Args:
        config:  full config dict.
@@ -302,17 +303,7 @@ def cmd_scan(config, recent=None, dry_run=False):
    # Load automation threshold
    automation = config.get("automation", {})
-    configured_threshold = automation.get("confidence_threshold", 75)
+    confidence_threshold = automation.get("confidence_threshold", 75)
-    # Adaptive threshold: be conservative when history is thin
-    stats = decision_store.get_all_stats()
-    total_decisions = stats["total"] if stats else 0
-    bootstrap_min = automation.get("bootstrap_min_decisions", 20)
-    if total_decisions < bootstrap_min:
-        confidence_threshold = 95
-        print(f"Learning phase ({total_decisions}/{bootstrap_min} decisions) — threshold: 95%\n")
-    else:
-        confidence_threshold = configured_threshold
    # Fetch envelopes via himalaya
    if recent:
@@ -354,12 +345,18 @@ def cmd_scan(config, recent=None, dry_run=False):
        email_data = build_email_data(envelope, body, config)
        print(f"{email_data['subject'][:55]}")
-        # Run the LLM classifier (includes few-shot examples from history)
+        # Run the LLM classifier (returns tags instead of confidence)
-        action, confidence, summary, reason, duration = classifier.classify_email(
+        action, tags, summary, reason, duration = classifier.classify_email(
            email_data, config
        )
        # Compute confidence from decision history
        sender_email = decision_store._extract_email_address(email_data.get("sender", ""))
        confidence = decision_store.compute_confidence(sender_email, action, tags)
        tags_str = ", ".join(tags) if tags else "(none)"
        print(f"    -> {action} (confidence: {confidence}%, {duration:.1f}s)")
        print(f"       tags: [{tags_str}]")
        print(f"       {reason[:80]}")
        # Auto-act if confidence meets threshold
@@ -379,7 +376,7 @@ def cmd_scan(config, recent=None, dry_run=False):
            success = execute_action(eid, action)
            if success:
                decision_store.record_decision(
-                    {**email_data, "summary": summary}, action, source="auto"
+                    {**email_data, "summary": summary}, action, source="auto", tags=tags
                )
                log_result(log_file, email_data, f"AUTO:{action}", reason, duration)
                print(f"    ** AUTO-executed: {action}")
@@ -388,11 +385,11 @@ def cmd_scan(config, recent=None, dry_run=False):
                # Himalaya action failed — fall back to queuing
                log_result(log_file, email_data, "AUTO_FAILED", reason, duration)
                print(f"    !! Auto-action failed, queuing instead")
-                add_to_pending(email_data, summary, reason, action, confidence)
+                add_to_pending(email_data, summary, reason, action, confidence, tags)
                queued += 1
        else:
            # Not enough confidence or history — queue for manual review
-            add_to_pending(email_data, summary, reason, action, confidence)
+            add_to_pending(email_data, summary, reason, action, confidence, tags)
            log_result(log_file, email_data, f"QUEUED:{action}@{confidence}%", reason, duration)
            print(f"    -> Queued (confidence {confidence}% < {confidence_threshold}%)")
            queued += 1
@@ -440,11 +437,14 @@ def cmd_review_list():
    for i, (msg_id, data) in enumerate(sorted_items, 1):
        suggested = data.get("suggested_action", "?")
        conf = data.get("confidence", "?")
        tags = data.get("tags", [])
        tags_str = ", ".join(tags) if tags else "(none)"
        print(f"\n  {i}. [{msg_id}]")
        print(f"     Subject: {data.get('subject', 'N/A')[:55]}")
        print(f"     From: {data.get('sender', 'N/A')[:55]}")
        print(f"     To: {data.get('recipient', 'N/A')[:40]}")
        print(f"     Summary: {data.get('summary', 'N/A')[:70]}")
        print(f"     Tags: [{tags_str}]")
        print(f"     Suggested: {suggested} ({conf}% confidence)")
    print(f"\n{'=' * 60}")
@@ -496,7 +496,7 @@ def cmd_review_act(selector, action):
        success = execute_action(eid, action)
        if success:
            # Record decision for future learning
-            decision_store.record_decision(data, action, source="user")
+            decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))
            # Mark as done in pending queue
            pending = load_pending()
@@ -540,7 +540,7 @@ def cmd_review_accept():
        success = execute_action(eid, action)
        if success:
-            decision_store.record_decision(data, action, source="user")
+            decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))
            pending = load_pending()
            pending[msg_id]["status"] = "done"
@@ -616,14 +616,14 @@ def cmd_stats():
    for action, count in sorted(stats["by_action"].items(), key=lambda x: -x[1]):
        print(f"  {action}: {count}")
-    # Top sender domains with per-domain action counts
+    # Top sender addresses with per-sender action counts
-    print(f"\nTop sender domains:")
+    print(f"\nTop senders:")
-    for domain, count in stats["top_domains"]:
+    for sender, count in stats["top_senders"]:
-        domain_stats = decision_store.get_sender_stats(domain)
+        sender_stats = decision_store.get_sender_stats(sender)
        detail = ", ".join(
-            f"{a}:{c}" for a, c in sorted(domain_stats.items(), key=lambda x: -x[1])
+            f"{a}:{c}" for a, c in sorted(sender_stats.items(), key=lambda x: -x[1])
        )
-        print(f"  {domain}: {count} ({detail})")
+        print(f"  {sender}: {count} ({detail})")
    # Custom labels
    labels = decision_store.get_known_labels()