Compute confidence from decision history instead of LLM
This commit is contained in:
@@ -232,11 +232,11 @@ def save_pending(pending):
|
||||
json.dump(pending, f, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
def add_to_pending(email_data, summary, reason, action_suggestion, confidence):
|
||||
def add_to_pending(email_data, summary, reason, action_suggestion, confidence, tags=None):
|
||||
"""Add an email to the pending queue for manual review.
|
||||
|
||||
Stores the classifier's suggestion and confidence alongside the
|
||||
email metadata so the user can see what the model thought.
|
||||
Stores the classifier's suggestion, computed confidence, and tags
|
||||
alongside the email metadata so the user can see what the model thought.
|
||||
"""
|
||||
pending = load_pending()
|
||||
|
||||
@@ -254,6 +254,7 @@ def add_to_pending(email_data, summary, reason, action_suggestion, confidence):
|
||||
"reason": reason,
|
||||
"suggested_action": action_suggestion,
|
||||
"confidence": confidence,
|
||||
"tags": tags or [],
|
||||
"email_date": email_data.get("date", ""),
|
||||
"status": "pending",
|
||||
"found_at": datetime.now().isoformat(),
|
||||
@@ -283,10 +284,10 @@ def log_result(log_file, email_data, action, detail, duration=None):
|
||||
def cmd_scan(config, recent=None, dry_run=False):
|
||||
"""Fetch emails, classify each one, then auto-act or queue.
|
||||
|
||||
Auto-action is based on a single confidence threshold. When the
|
||||
decision history has fewer than 20 entries, a higher threshold (95%)
|
||||
is used to be conservative during the learning phase. Once enough
|
||||
history accumulates, the configured threshold takes over.
|
||||
Confidence is computed from decision history by matching the email's
|
||||
signature (sender_email, tags) against past decisions. New/unknown
|
||||
senders start at 50% (queued). Confidence grows as consistent history
|
||||
accumulates.
|
||||
|
||||
Args:
|
||||
config: full config dict.
|
||||
@@ -302,17 +303,7 @@ def cmd_scan(config, recent=None, dry_run=False):
|
||||
|
||||
# Load automation threshold
|
||||
automation = config.get("automation", {})
|
||||
configured_threshold = automation.get("confidence_threshold", 75)
|
||||
|
||||
# Adaptive threshold: be conservative when history is thin
|
||||
stats = decision_store.get_all_stats()
|
||||
total_decisions = stats["total"] if stats else 0
|
||||
bootstrap_min = automation.get("bootstrap_min_decisions", 20)
|
||||
if total_decisions < bootstrap_min:
|
||||
confidence_threshold = 95
|
||||
print(f"Learning phase ({total_decisions}/{bootstrap_min} decisions) — threshold: 95%\n")
|
||||
else:
|
||||
confidence_threshold = configured_threshold
|
||||
confidence_threshold = automation.get("confidence_threshold", 75)
|
||||
|
||||
# Fetch envelopes via himalaya
|
||||
if recent:
|
||||
@@ -354,12 +345,18 @@ def cmd_scan(config, recent=None, dry_run=False):
|
||||
email_data = build_email_data(envelope, body, config)
|
||||
print(f"{email_data['subject'][:55]}")
|
||||
|
||||
# Run the LLM classifier (includes few-shot examples from history)
|
||||
action, confidence, summary, reason, duration = classifier.classify_email(
|
||||
# Run the LLM classifier (returns tags; confidence is computed separately below)
|
||||
action, tags, summary, reason, duration = classifier.classify_email(
|
||||
email_data, config
|
||||
)
|
||||
|
||||
# Compute confidence from decision history
|
||||
sender_email = decision_store._extract_email_address(email_data.get("sender", ""))
|
||||
confidence = decision_store.compute_confidence(sender_email, action, tags)
|
||||
|
||||
tags_str = ", ".join(tags) if tags else "(none)"
|
||||
print(f" -> {action} (confidence: {confidence}%, {duration:.1f}s)")
|
||||
print(f" tags: [{tags_str}]")
|
||||
print(f" {reason[:80]}")
|
||||
|
||||
# Auto-act if confidence meets threshold
|
||||
@@ -379,7 +376,7 @@ def cmd_scan(config, recent=None, dry_run=False):
|
||||
success = execute_action(eid, action)
|
||||
if success:
|
||||
decision_store.record_decision(
|
||||
{**email_data, "summary": summary}, action, source="auto"
|
||||
{**email_data, "summary": summary}, action, source="auto", tags=tags
|
||||
)
|
||||
log_result(log_file, email_data, f"AUTO:{action}", reason, duration)
|
||||
print(f" ** AUTO-executed: {action}")
|
||||
@@ -388,11 +385,11 @@ def cmd_scan(config, recent=None, dry_run=False):
|
||||
# Himalaya action failed — fall back to queuing
|
||||
log_result(log_file, email_data, "AUTO_FAILED", reason, duration)
|
||||
print(f" !! Auto-action failed, queuing instead")
|
||||
add_to_pending(email_data, summary, reason, action, confidence)
|
||||
add_to_pending(email_data, summary, reason, action, confidence, tags)
|
||||
queued += 1
|
||||
else:
|
||||
# Confidence below threshold — queue for manual review
|
||||
add_to_pending(email_data, summary, reason, action, confidence)
|
||||
add_to_pending(email_data, summary, reason, action, confidence, tags)
|
||||
log_result(log_file, email_data, f"QUEUED:{action}@{confidence}%", reason, duration)
|
||||
print(f" -> Queued (confidence {confidence}% < {confidence_threshold}%)")
|
||||
queued += 1
|
||||
@@ -440,11 +437,14 @@ def cmd_review_list():
|
||||
for i, (msg_id, data) in enumerate(sorted_items, 1):
|
||||
suggested = data.get("suggested_action", "?")
|
||||
conf = data.get("confidence", "?")
|
||||
tags = data.get("tags", [])
|
||||
tags_str = ", ".join(tags) if tags else "(none)"
|
||||
print(f"\n {i}. [{msg_id}]")
|
||||
print(f" Subject: {data.get('subject', 'N/A')[:55]}")
|
||||
print(f" From: {data.get('sender', 'N/A')[:55]}")
|
||||
print(f" To: {data.get('recipient', 'N/A')[:40]}")
|
||||
print(f" Summary: {data.get('summary', 'N/A')[:70]}")
|
||||
print(f" Tags: [{tags_str}]")
|
||||
print(f" Suggested: {suggested} ({conf}% confidence)")
|
||||
|
||||
print(f"\n{'=' * 60}")
|
||||
@@ -496,7 +496,7 @@ def cmd_review_act(selector, action):
|
||||
success = execute_action(eid, action)
|
||||
if success:
|
||||
# Record decision for future learning
|
||||
decision_store.record_decision(data, action, source="user")
|
||||
decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))
|
||||
|
||||
# Mark as done in pending queue
|
||||
pending = load_pending()
|
||||
@@ -540,7 +540,7 @@ def cmd_review_accept():
|
||||
|
||||
success = execute_action(eid, action)
|
||||
if success:
|
||||
decision_store.record_decision(data, action, source="user")
|
||||
decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))
|
||||
|
||||
pending = load_pending()
|
||||
pending[msg_id]["status"] = "done"
|
||||
@@ -616,14 +616,14 @@ def cmd_stats():
|
||||
for action, count in sorted(stats["by_action"].items(), key=lambda x: -x[1]):
|
||||
print(f" {action}: {count}")
|
||||
|
||||
# Top sender domains with per-domain action counts
|
||||
print(f"\nTop sender domains:")
|
||||
for domain, count in stats["top_domains"]:
|
||||
domain_stats = decision_store.get_sender_stats(domain)
|
||||
# Top sender addresses with per-sender action counts
|
||||
print(f"\nTop senders:")
|
||||
for sender, count in stats["top_senders"]:
|
||||
sender_stats = decision_store.get_sender_stats(sender)
|
||||
detail = ", ".join(
|
||||
f"{a}:{c}" for a, c in sorted(domain_stats.items(), key=lambda x: -x[1])
|
||||
f"{a}:{c}" for a, c in sorted(sender_stats.items(), key=lambda x: -x[1])
|
||||
)
|
||||
print(f" {domain}: {count} ({detail})")
|
||||
print(f" {sender}: {count} ({detail})")
|
||||
|
||||
# Custom labels
|
||||
labels = decision_store.get_known_labels()
|
||||
|
||||
Reference in New Issue
Block a user