Compute confidence from decision history instead of LLM

This commit is contained in:
Yanxin Lu
2026-03-04 14:23:50 -08:00
parent 720f4ef1ad
commit 64e28b55d1
4 changed files with 133 additions and 84 deletions

View File

@@ -232,11 +232,11 @@ def save_pending(pending):
json.dump(pending, f, indent=2, ensure_ascii=False)
def add_to_pending(email_data, summary, reason, action_suggestion, confidence):
def add_to_pending(email_data, summary, reason, action_suggestion, confidence, tags=None):
"""Add an email to the pending queue for manual review.
Stores the classifier's suggestion and confidence alongside the
email metadata so the user can see what the model thought.
Stores the classifier's suggestion, computed confidence, and tags
alongside the email metadata so the user can see what the model thought.
"""
pending = load_pending()
@@ -254,6 +254,7 @@ def add_to_pending(email_data, summary, reason, action_suggestion, confidence):
"reason": reason,
"suggested_action": action_suggestion,
"confidence": confidence,
"tags": tags or [],
"email_date": email_data.get("date", ""),
"status": "pending",
"found_at": datetime.now().isoformat(),
@@ -283,10 +284,10 @@ def log_result(log_file, email_data, action, detail, duration=None):
def cmd_scan(config, recent=None, dry_run=False):
"""Fetch emails, classify each one, then auto-act or queue.
Auto-action is based on a single confidence threshold. When the
decision history has fewer than 20 entries, a higher threshold (95%)
is used to be conservative during the learning phase. Once enough
history accumulates, the configured threshold takes over.
Confidence is computed from decision history by matching the email's
signature (sender_email, tags) against past decisions. New/unknown
senders start at 50% (queued). Confidence grows as consistent history
accumulates.
Args:
config: full config dict.
@@ -302,17 +303,7 @@ def cmd_scan(config, recent=None, dry_run=False):
# Load automation threshold
automation = config.get("automation", {})
configured_threshold = automation.get("confidence_threshold", 75)
# Adaptive threshold: be conservative when history is thin
stats = decision_store.get_all_stats()
total_decisions = stats["total"] if stats else 0
bootstrap_min = automation.get("bootstrap_min_decisions", 20)
if total_decisions < bootstrap_min:
confidence_threshold = 95
print(f"Learning phase ({total_decisions}/{bootstrap_min} decisions) — threshold: 95%\n")
else:
confidence_threshold = configured_threshold
confidence_threshold = automation.get("confidence_threshold", 75)
# Fetch envelopes via himalaya
if recent:
@@ -354,12 +345,18 @@ def cmd_scan(config, recent=None, dry_run=False):
email_data = build_email_data(envelope, body, config)
print(f"{email_data['subject'][:55]}")
# Run the LLM classifier (includes few-shot examples from history)
action, confidence, summary, reason, duration = classifier.classify_email(
# Run the LLM classifier (returns tags instead of confidence)
action, tags, summary, reason, duration = classifier.classify_email(
email_data, config
)
# Compute confidence from decision history
sender_email = decision_store._extract_email_address(email_data.get("sender", ""))
confidence = decision_store.compute_confidence(sender_email, action, tags)
tags_str = ", ".join(tags) if tags else "(none)"
print(f" -> {action} (confidence: {confidence}%, {duration:.1f}s)")
print(f" tags: [{tags_str}]")
print(f" {reason[:80]}")
# Auto-act if confidence meets threshold
@@ -379,7 +376,7 @@ def cmd_scan(config, recent=None, dry_run=False):
success = execute_action(eid, action)
if success:
decision_store.record_decision(
{**email_data, "summary": summary}, action, source="auto"
{**email_data, "summary": summary}, action, source="auto", tags=tags
)
log_result(log_file, email_data, f"AUTO:{action}", reason, duration)
print(f" ** AUTO-executed: {action}")
@@ -388,11 +385,11 @@ def cmd_scan(config, recent=None, dry_run=False):
# Himalaya action failed — fall back to queuing
log_result(log_file, email_data, "AUTO_FAILED", reason, duration)
print(f" !! Auto-action failed, queuing instead")
add_to_pending(email_data, summary, reason, action, confidence)
add_to_pending(email_data, summary, reason, action, confidence, tags)
queued += 1
else:
# Not enough confidence or history — queue for manual review
add_to_pending(email_data, summary, reason, action, confidence)
add_to_pending(email_data, summary, reason, action, confidence, tags)
log_result(log_file, email_data, f"QUEUED:{action}@{confidence}%", reason, duration)
print(f" -> Queued (confidence {confidence}% < {confidence_threshold}%)")
queued += 1
@@ -440,11 +437,14 @@ def cmd_review_list():
for i, (msg_id, data) in enumerate(sorted_items, 1):
suggested = data.get("suggested_action", "?")
conf = data.get("confidence", "?")
tags = data.get("tags", [])
tags_str = ", ".join(tags) if tags else "(none)"
print(f"\n {i}. [{msg_id}]")
print(f" Subject: {data.get('subject', 'N/A')[:55]}")
print(f" From: {data.get('sender', 'N/A')[:55]}")
print(f" To: {data.get('recipient', 'N/A')[:40]}")
print(f" Summary: {data.get('summary', 'N/A')[:70]}")
print(f" Tags: [{tags_str}]")
print(f" Suggested: {suggested} ({conf}% confidence)")
print(f"\n{'=' * 60}")
@@ -496,7 +496,7 @@ def cmd_review_act(selector, action):
success = execute_action(eid, action)
if success:
# Record decision for future learning
decision_store.record_decision(data, action, source="user")
decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))
# Mark as done in pending queue
pending = load_pending()
@@ -540,7 +540,7 @@ def cmd_review_accept():
success = execute_action(eid, action)
if success:
decision_store.record_decision(data, action, source="user")
decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))
pending = load_pending()
pending[msg_id]["status"] = "done"
@@ -616,14 +616,14 @@ def cmd_stats():
for action, count in sorted(stats["by_action"].items(), key=lambda x: -x[1]):
print(f" {action}: {count}")
# Top sender domains with per-domain action counts
print(f"\nTop sender domains:")
for domain, count in stats["top_domains"]:
domain_stats = decision_store.get_sender_stats(domain)
# Top sender addresses with per-sender action counts
print(f"\nTop senders:")
for sender, count in stats["top_senders"]:
sender_stats = decision_store.get_sender_stats(sender)
detail = ", ".join(
f"{a}:{c}" for a, c in sorted(domain_stats.items(), key=lambda x: -x[1])
f"{a}:{c}" for a, c in sorted(sender_stats.items(), key=lambda x: -x[1])
)
print(f" {domain}: {count} ({detail})")
print(f" {sender}: {count} ({detail})")
# Custom labels
labels = decision_store.get_known_labels()