Compute confidence from decision history instead of LLM
This commit is contained in:
@@ -5,7 +5,10 @@ Classifier - LLM-based email classification with learning.
|
|||||||
This module builds a rich prompt for the local Ollama model (Qwen3) that
|
This module builds a rich prompt for the local Ollama model (Qwen3) that
|
||||||
includes few-shot examples from past user decisions, per-sender statistics,
|
includes few-shot examples from past user decisions, per-sender statistics,
|
||||||
and a list of known labels. The model returns a structured response with
|
and a list of known labels. The model returns a structured response with
|
||||||
an action, confidence score, summary, and reason.
|
an action, category tags, summary, and reason.
|
||||||
|
|
||||||
|
Confidence is NOT produced by the LLM — it is computed externally from
|
||||||
|
decision history by decision_store.compute_confidence().
|
||||||
|
|
||||||
The prompt structure:
|
The prompt structure:
|
||||||
1. System instructions (action definitions)
|
1. System instructions (action definitions)
|
||||||
@@ -13,7 +16,7 @@ The prompt structure:
|
|||||||
3. Sender statistics ("linkedin.com: deleted 8 times, kept 2 times")
|
3. Sender statistics ("linkedin.com: deleted 8 times, kept 2 times")
|
||||||
4. Few-shot examples (top 5 most relevant past decisions)
|
4. Few-shot examples (top 5 most relevant past decisions)
|
||||||
5. The email to classify (subject, sender, recipient, body preview)
|
5. The email to classify (subject, sender, recipient, body preview)
|
||||||
6. Output format specification
|
6. Output format specification (action, tags, summary, reason)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import time
|
import time
|
||||||
@@ -24,6 +27,15 @@ import decision_store
|
|||||||
|
|
||||||
LOGS_DIR = Path(__file__).parent / "logs"
|
LOGS_DIR = Path(__file__).parent / "logs"
|
||||||
|
|
||||||
|
TAG_TAXONOMY = [
|
||||||
|
"receipt", "invoice", "payment", "billing",
|
||||||
|
"shipping", "delivery",
|
||||||
|
"promotion", "discount", "marketing", "newsletter",
|
||||||
|
"notification", "security", "social",
|
||||||
|
"reminder", "confirmation", "update", "alert",
|
||||||
|
"personal", "account", "subscription", "travel",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def _build_prompt(email_data, config):
|
def _build_prompt(email_data, config):
|
||||||
"""Assemble the full classification prompt with learning context.
|
"""Assemble the full classification prompt with learning context.
|
||||||
@@ -36,8 +48,8 @@ def _build_prompt(email_data, config):
|
|||||||
|
|
||||||
# Gather learning context from decision history
|
# Gather learning context from decision history
|
||||||
examples = decision_store.get_relevant_examples(email_data, n=10)
|
examples = decision_store.get_relevant_examples(email_data, n=10)
|
||||||
sender_domain = decision_store._extract_domain(email_data.get("sender", ""))
|
sender_email = decision_store._extract_email_address(email_data.get("sender", ""))
|
||||||
sender_stats = decision_store.get_sender_stats(sender_domain) if sender_domain else {}
|
sender_stats = decision_store.get_sender_stats(sender_email) if sender_email else {}
|
||||||
known_labels = decision_store.get_known_labels()
|
known_labels = decision_store.get_known_labels()
|
||||||
|
|
||||||
# /no_think disables Qwen3's chain-of-thought, giving faster + shorter output
|
# /no_think disables Qwen3's chain-of-thought, giving faster + shorter output
|
||||||
@@ -63,7 +75,7 @@ def _build_prompt(email_data, config):
|
|||||||
stats_str = ", ".join(
|
stats_str = ", ".join(
|
||||||
f"{action} {count} times" for action, count in sender_stats.items()
|
f"{action} {count} times" for action, count in sender_stats.items()
|
||||||
)
|
)
|
||||||
parts.append(f"\nHistory for {sender_domain}: {stats_str}\n")
|
parts.append(f"\nHistory for {sender_email}: {stats_str}\n")
|
||||||
|
|
||||||
# Section 4: Few-shot examples (top 5 most relevant past decisions)
|
# Section 4: Few-shot examples (top 5 most relevant past decisions)
|
||||||
if examples:
|
if examples:
|
||||||
@@ -86,10 +98,11 @@ def _build_prompt(email_data, config):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Section 6: Required output format
|
# Section 6: Required output format
|
||||||
|
tags_list = ", ".join(TAG_TAXONOMY)
|
||||||
parts.append(
|
parts.append(
|
||||||
"Respond in this exact format (nothing else):\n"
|
"Respond in this exact format (nothing else):\n"
|
||||||
"Action: [delete|archive|keep|mark_read|label:<name>]\n"
|
"Action: [delete|archive|keep|mark_read|label:<name>]\n"
|
||||||
"Confidence: [0-100]\n"
|
f"Tags: [comma-separated tags from: {tags_list}]\n"
|
||||||
"Summary: [one sentence summary of the email]\n"
|
"Summary: [one sentence summary of the email]\n"
|
||||||
"Reason: [brief explanation for your classification]"
|
"Reason: [brief explanation for your classification]"
|
||||||
)
|
)
|
||||||
@@ -97,18 +110,19 @@ def _build_prompt(email_data, config):
|
|||||||
return "\n".join(parts)
|
return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
def _log_llm(prompt, output, email_data, action, confidence, duration):
|
def _log_llm(prompt, output, email_data, action, tags, duration):
|
||||||
"""Log the full LLM prompt and response to logs/llm_YYYY-MM-DD.log."""
|
"""Log the full LLM prompt and response to logs/llm_YYYY-MM-DD.log."""
|
||||||
LOGS_DIR.mkdir(exist_ok=True)
|
LOGS_DIR.mkdir(exist_ok=True)
|
||||||
log_file = LOGS_DIR / f"llm_{datetime.now().strftime('%Y-%m-%d')}.log"
|
log_file = LOGS_DIR / f"llm_{datetime.now().strftime('%Y-%m-%d')}.log"
|
||||||
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
subject = email_data.get("subject", "(No Subject)")[:60]
|
subject = email_data.get("subject", "(No Subject)")[:60]
|
||||||
sender = email_data.get("sender", "(Unknown)")[:60]
|
sender = email_data.get("sender", "(Unknown)")[:60]
|
||||||
|
tags_str = ", ".join(tags)
|
||||||
|
|
||||||
with open(log_file, "a", encoding="utf-8") as f:
|
with open(log_file, "a", encoding="utf-8") as f:
|
||||||
f.write(f"{'=' * 70}\n")
|
f.write(f"{'=' * 70}\n")
|
||||||
f.write(f"[{timestamp}] {subject}\n")
|
f.write(f"[{timestamp}] {subject}\n")
|
||||||
f.write(f"From: {sender} | Result: {action} @ {confidence}% | {duration:.1f}s\n")
|
f.write(f"From: {sender} | Result: {action} tags=[{tags_str}] | {duration:.1f}s\n")
|
||||||
f.write(f"{'-' * 70}\n")
|
f.write(f"{'-' * 70}\n")
|
||||||
f.write(f"PROMPT:\n{prompt}\n")
|
f.write(f"PROMPT:\n{prompt}\n")
|
||||||
f.write(f"{'-' * 70}\n")
|
f.write(f"{'-' * 70}\n")
|
||||||
@@ -121,17 +135,19 @@ def _parse_response(output):
|
|||||||
|
|
||||||
Expected format (one per line):
|
Expected format (one per line):
|
||||||
Action: delete
|
Action: delete
|
||||||
Confidence: 92
|
Tags: promotion, marketing, newsletter
|
||||||
Summary: Promotional offer from retailer
|
Summary: Promotional offer from retailer
|
||||||
Reason: Clearly a marketing email with discount offer
|
Reason: Clearly a marketing email with discount offer
|
||||||
|
|
||||||
Falls back to safe defaults (keep, 50% confidence) on parse failure.
|
Falls back to safe defaults (keep, empty tags) on parse failure.
|
||||||
"""
|
"""
|
||||||
action = "keep"
|
action = "keep"
|
||||||
confidence = 50
|
tags = []
|
||||||
summary = "No summary"
|
summary = "No summary"
|
||||||
reason = "Unknown"
|
reason = "Unknown"
|
||||||
|
|
||||||
|
valid_tags = set(TAG_TAXONOMY)
|
||||||
|
|
||||||
for line in output.strip().split("\n"):
|
for line in output.strip().split("\n"):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if line.startswith("Action:"):
|
if line.startswith("Action:"):
|
||||||
@@ -139,25 +155,26 @@ def _parse_response(output):
|
|||||||
valid_actions = {"delete", "archive", "keep", "mark_read"}
|
valid_actions = {"delete", "archive", "keep", "mark_read"}
|
||||||
if raw_action in valid_actions or raw_action.startswith("label:"):
|
if raw_action in valid_actions or raw_action.startswith("label:"):
|
||||||
action = raw_action
|
action = raw_action
|
||||||
elif line.startswith("Confidence:"):
|
elif line.startswith("Tags:"):
|
||||||
try:
|
raw_tags = line.replace("Tags:", "").strip()
|
||||||
confidence = int(line.replace("Confidence:", "").strip().rstrip("%"))
|
tags = [
|
||||||
confidence = max(0, min(100, confidence)) # clamp to 0-100
|
t.strip().lower()
|
||||||
except ValueError:
|
for t in raw_tags.split(",")
|
||||||
confidence = 50
|
if t.strip().lower() in valid_tags
|
||||||
|
]
|
||||||
elif line.startswith("Summary:"):
|
elif line.startswith("Summary:"):
|
||||||
summary = line.replace("Summary:", "").strip()[:200]
|
summary = line.replace("Summary:", "").strip()[:200]
|
||||||
elif line.startswith("Reason:"):
|
elif line.startswith("Reason:"):
|
||||||
reason = line.replace("Reason:", "").strip()
|
reason = line.replace("Reason:", "").strip()
|
||||||
|
|
||||||
return action, confidence, summary, reason
|
return action, tags, summary, reason
|
||||||
|
|
||||||
|
|
||||||
def classify_email(email_data, config):
|
def classify_email(email_data, config):
|
||||||
"""Classify an email using the local LLM with few-shot learning context.
|
"""Classify an email using the local LLM with few-shot learning context.
|
||||||
|
|
||||||
Connects to Ollama, sends the assembled prompt, and parses the response.
|
Connects to Ollama, sends the assembled prompt, and parses the response.
|
||||||
On any error, falls back to "keep" with 0% confidence so the email
|
On any error, falls back to "keep" with empty tags so the email
|
||||||
gets queued for manual review rather than auto-acted upon.
|
gets queued for manual review rather than auto-acted upon.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -165,7 +182,7 @@ def classify_email(email_data, config):
|
|||||||
config: full config dict (needs ollama.model and rules.max_body_length).
|
config: full config dict (needs ollama.model and rules.max_body_length).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (action, confidence, summary, reason, duration_seconds).
|
Tuple of (action, tags, summary, reason, duration_seconds).
|
||||||
"""
|
"""
|
||||||
import ollama
|
import ollama
|
||||||
|
|
||||||
@@ -177,15 +194,15 @@ def classify_email(email_data, config):
|
|||||||
# Low temperature for consistent classification
|
# Low temperature for consistent classification
|
||||||
response = ollama.generate(model=model, prompt=prompt, options={"temperature": 0.1})
|
response = ollama.generate(model=model, prompt=prompt, options={"temperature": 0.1})
|
||||||
output = response["response"]
|
output = response["response"]
|
||||||
action, confidence, summary, reason = _parse_response(output)
|
action, tags, summary, reason = _parse_response(output)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# On failure, default to "keep" with 0 confidence -> always queued
|
# On failure, default to "keep" with empty tags -> always queued
|
||||||
output = f"ERROR: {e}"
|
output = f"ERROR: {e}"
|
||||||
action = "keep"
|
action = "keep"
|
||||||
confidence = 0
|
tags = []
|
||||||
summary = "Classification failed"
|
summary = "Classification failed"
|
||||||
reason = f"error - {str(e)[:100]}"
|
reason = f"error - {str(e)[:100]}"
|
||||||
|
|
||||||
duration = time.time() - start_time
|
duration = time.time() - start_time
|
||||||
_log_llm(prompt, output, email_data, action, confidence, duration)
|
_log_llm(prompt, output, email_data, action, tags, duration)
|
||||||
return action, confidence, summary, reason, duration
|
return action, tags, summary, reason, duration
|
||||||
|
|||||||
@@ -8,7 +8,6 @@
|
|||||||
"check_unseen_only": true
|
"check_unseen_only": true
|
||||||
},
|
},
|
||||||
"automation": {
|
"automation": {
|
||||||
"confidence_threshold": 75,
|
"confidence_threshold": 85
|
||||||
"bootstrap_min_decisions": 30
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ def _extract_email_address(sender):
|
|||||||
# Public API
|
# Public API
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def record_decision(email_data, action, source="user"):
|
def record_decision(email_data, action, source="user", tags=None):
|
||||||
"""Append a decision to the history file.
|
"""Append a decision to the history file.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -79,6 +79,7 @@ def record_decision(email_data, action, source="user"):
|
|||||||
action: one of "delete", "archive", "keep", "mark_read",
|
action: one of "delete", "archive", "keep", "mark_read",
|
||||||
or "label:<name>".
|
or "label:<name>".
|
||||||
source: "user" (manual review) or "auto" (high-confidence).
|
source: "user" (manual review) or "auto" (high-confidence).
|
||||||
|
tags: list of category tags from the classifier taxonomy.
|
||||||
"""
|
"""
|
||||||
history = _load_history()
|
history = _load_history()
|
||||||
entry = {
|
entry = {
|
||||||
@@ -90,6 +91,7 @@ def record_decision(email_data, action, source="user"):
|
|||||||
"summary": email_data.get("summary", ""),
|
"summary": email_data.get("summary", ""),
|
||||||
"action": action,
|
"action": action,
|
||||||
"source": source,
|
"source": source,
|
||||||
|
"tags": tags or [],
|
||||||
}
|
}
|
||||||
history.append(entry)
|
history.append(entry)
|
||||||
_save_history(history)
|
_save_history(history)
|
||||||
@@ -99,10 +101,9 @@ def record_decision(email_data, action, source="user"):
|
|||||||
def get_relevant_examples(email_data, n=10):
|
def get_relevant_examples(email_data, n=10):
|
||||||
"""Find the N most relevant past decisions for a given email.
|
"""Find the N most relevant past decisions for a given email.
|
||||||
|
|
||||||
Relevance is scored by three signals:
|
Relevance is scored by two signals:
|
||||||
- Exact sender domain match: +3 points
|
- Exact sender email address match: +3 points
|
||||||
- Recipient string match: +2 points
|
- Subject keyword overlap: +1 point per shared word
|
||||||
- Subject keyword overlap: +1 point per shared word
|
|
||||||
|
|
||||||
Only entries with score > 0 are considered. Results are returned
|
Only entries with score > 0 are considered. Results are returned
|
||||||
sorted by descending relevance.
|
sorted by descending relevance.
|
||||||
@@ -111,8 +112,7 @@ def get_relevant_examples(email_data, n=10):
|
|||||||
if not history:
|
if not history:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
target_domain = _extract_domain(email_data.get("sender", ""))
|
target_email = _extract_email_address(email_data.get("sender", ""))
|
||||||
target_recipient = email_data.get("recipient", "").lower()
|
|
||||||
target_words = (
|
target_words = (
|
||||||
set(re.findall(r"\w+", email_data.get("subject", "").lower())) - _STOP_WORDS
|
set(re.findall(r"\w+", email_data.get("subject", "").lower())) - _STOP_WORDS
|
||||||
)
|
)
|
||||||
@@ -121,15 +121,11 @@ def get_relevant_examples(email_data, n=10):
|
|||||||
for entry in history:
|
for entry in history:
|
||||||
score = 0
|
score = 0
|
||||||
|
|
||||||
# Signal 1: sender domain match
|
# Signal 1: sender email match
|
||||||
if target_domain and entry.get("sender_domain", "") == target_domain:
|
if target_email and _extract_email_address(entry.get("sender", "")) == target_email:
|
||||||
score += 3
|
score += 3
|
||||||
|
|
||||||
# Signal 2: recipient substring match
|
# Signal 2: subject keyword overlap
|
||||||
if target_recipient and target_recipient in entry.get("recipient", "").lower():
|
|
||||||
score += 2
|
|
||||||
|
|
||||||
# Signal 3: subject keyword overlap
|
|
||||||
entry_words = (
|
entry_words = (
|
||||||
set(re.findall(r"\w+", entry.get("subject", "").lower())) - _STOP_WORDS
|
set(re.findall(r"\w+", entry.get("subject", "").lower())) - _STOP_WORDS
|
||||||
)
|
)
|
||||||
@@ -142,27 +138,64 @@ def get_relevant_examples(email_data, n=10):
|
|||||||
return [entry for _, entry in scored[:n]]
|
return [entry for _, entry in scored[:n]]
|
||||||
|
|
||||||
|
|
||||||
def get_sender_stats(sender_domain):
|
def get_sender_stats(sender_email):
|
||||||
"""Get action distribution for a sender domain.
|
"""Get action distribution for a sender email address.
|
||||||
|
|
||||||
Returns a dict like {"delete": 5, "keep": 2, "archive": 1}.
|
Returns a dict like {"delete": 5, "keep": 2, "archive": 1}.
|
||||||
"""
|
"""
|
||||||
history = _load_history()
|
history = _load_history()
|
||||||
actions = Counter()
|
actions = Counter()
|
||||||
for entry in history:
|
for entry in history:
|
||||||
if entry.get("sender_domain", "") == sender_domain:
|
if _extract_email_address(entry.get("sender", "")) == sender_email:
|
||||||
actions[entry["action"]] += 1
|
actions[entry["action"]] += 1
|
||||||
return dict(actions)
|
return dict(actions)
|
||||||
|
|
||||||
|
|
||||||
def get_sender_history_count(sender_domain):
|
def compute_confidence(sender_email, action, tags):
|
||||||
"""Count total past decisions for a sender domain.
|
"""Compute confidence from decision history by matching email signatures.
|
||||||
|
|
||||||
Used by the scan command to decide whether there is enough history
|
A "signature" is (sender_email, tags). Past decisions match if they have
|
||||||
to trust auto-actions for this sender.
|
the same sender email AND share at least 50% of the tags in the smaller of the two tag sets (the current email's vs. the past entry's).
|
||||||
|
|
||||||
|
Confidence is based on two factors:
|
||||||
|
1. Agreement: what fraction of matching decisions chose the same action.
|
||||||
|
2. Match-count cap: limits confidence until enough history exists
|
||||||
|
(1 match -> max 10%, 5 matches -> 50%, 10+ -> 100%).
|
||||||
|
|
||||||
|
Returns an integer 0-100; returns a neutral 50 when there is no history, no tags, or no matching past decision.
|
||||||
"""
|
"""
|
||||||
history = _load_history()
|
history = _load_history()
|
||||||
return sum(1 for e in history if e.get("sender_domain", "") == sender_domain)
|
if not history or not tags:
|
||||||
|
return 50
|
||||||
|
|
||||||
|
# Find past decisions with same sender and sufficient tag overlap
|
||||||
|
matches = []
|
||||||
|
for entry in history:
|
||||||
|
entry_email = _extract_email_address(entry.get("sender", ""))
|
||||||
|
if entry_email != sender_email:
|
||||||
|
continue
|
||||||
|
|
||||||
|
entry_tags = entry.get("tags", [])
|
||||||
|
if not entry_tags:
|
||||||
|
continue
|
||||||
|
|
||||||
|
shared = len(set(tags) & set(entry_tags))
|
||||||
|
min_len = min(len(tags), len(entry_tags))
|
||||||
|
if min_len > 0 and shared / min_len >= 0.5:
|
||||||
|
matches.append(entry)
|
||||||
|
|
||||||
|
if not matches:
|
||||||
|
return 50
|
||||||
|
|
||||||
|
# Agreement: fraction of matches with the same action
|
||||||
|
matching_action = sum(1 for m in matches if m["action"] == action)
|
||||||
|
total = len(matches)
|
||||||
|
agreement = round(matching_action / total * 100)
|
||||||
|
|
||||||
|
# Cap by match count: each match adds 10% to the cap
|
||||||
|
cap = min(total * 10, 100)
|
||||||
|
|
||||||
|
return min(agreement, cap)
|
||||||
|
|
||||||
|
|
||||||
def get_known_labels():
|
def get_known_labels():
|
||||||
@@ -194,13 +227,13 @@ def get_all_stats():
|
|||||||
by_action = Counter(e["action"] for e in history)
|
by_action = Counter(e["action"] for e in history)
|
||||||
by_source = Counter(e["source"] for e in history)
|
by_source = Counter(e["source"] for e in history)
|
||||||
|
|
||||||
# Top 10 sender domains by decision count
|
# Top 10 sender addresses by decision count
|
||||||
domain_counts = Counter(e.get("sender_domain", "") for e in history)
|
sender_counts = Counter(_extract_email_address(e.get("sender", "")) for e in history)
|
||||||
top_domains = domain_counts.most_common(10)
|
top_senders = sender_counts.most_common(10)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"total": total,
|
"total": total,
|
||||||
"by_action": dict(by_action),
|
"by_action": dict(by_action),
|
||||||
"by_source": dict(by_source),
|
"by_source": dict(by_source),
|
||||||
"top_domains": top_domains,
|
"top_senders": top_senders,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -232,11 +232,11 @@ def save_pending(pending):
|
|||||||
json.dump(pending, f, indent=2, ensure_ascii=False)
|
json.dump(pending, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
|
|
||||||
def add_to_pending(email_data, summary, reason, action_suggestion, confidence):
|
def add_to_pending(email_data, summary, reason, action_suggestion, confidence, tags=None):
|
||||||
"""Add an email to the pending queue for manual review.
|
"""Add an email to the pending queue for manual review.
|
||||||
|
|
||||||
Stores the classifier's suggestion and confidence alongside the
|
Stores the classifier's suggestion, computed confidence, and tags
|
||||||
email metadata so the user can see what the model thought.
|
alongside the email metadata so the user can see what the model thought.
|
||||||
"""
|
"""
|
||||||
pending = load_pending()
|
pending = load_pending()
|
||||||
|
|
||||||
@@ -254,6 +254,7 @@ def add_to_pending(email_data, summary, reason, action_suggestion, confidence):
|
|||||||
"reason": reason,
|
"reason": reason,
|
||||||
"suggested_action": action_suggestion,
|
"suggested_action": action_suggestion,
|
||||||
"confidence": confidence,
|
"confidence": confidence,
|
||||||
|
"tags": tags or [],
|
||||||
"email_date": email_data.get("date", ""),
|
"email_date": email_data.get("date", ""),
|
||||||
"status": "pending",
|
"status": "pending",
|
||||||
"found_at": datetime.now().isoformat(),
|
"found_at": datetime.now().isoformat(),
|
||||||
@@ -283,10 +284,10 @@ def log_result(log_file, email_data, action, detail, duration=None):
|
|||||||
def cmd_scan(config, recent=None, dry_run=False):
|
def cmd_scan(config, recent=None, dry_run=False):
|
||||||
"""Fetch emails, classify each one, then auto-act or queue.
|
"""Fetch emails, classify each one, then auto-act or queue.
|
||||||
|
|
||||||
Auto-action is based on a single confidence threshold. When the
|
Confidence is computed from decision history by matching the email's
|
||||||
decision history has fewer than 20 entries, a higher threshold (95%)
|
signature (sender_email, tags) against past decisions. New/unknown
|
||||||
is used to be conservative during the learning phase. Once enough
|
senders start at 50% (queued). Confidence grows as consistent history
|
||||||
history accumulates, the configured threshold takes over.
|
accumulates.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
config: full config dict.
|
config: full config dict.
|
||||||
@@ -302,17 +303,7 @@ def cmd_scan(config, recent=None, dry_run=False):
|
|||||||
|
|
||||||
# Load automation threshold
|
# Load automation threshold
|
||||||
automation = config.get("automation", {})
|
automation = config.get("automation", {})
|
||||||
configured_threshold = automation.get("confidence_threshold", 75)
|
confidence_threshold = automation.get("confidence_threshold", 75)
|
||||||
|
|
||||||
# Adaptive threshold: be conservative when history is thin
|
|
||||||
stats = decision_store.get_all_stats()
|
|
||||||
total_decisions = stats["total"] if stats else 0
|
|
||||||
bootstrap_min = automation.get("bootstrap_min_decisions", 20)
|
|
||||||
if total_decisions < bootstrap_min:
|
|
||||||
confidence_threshold = 95
|
|
||||||
print(f"Learning phase ({total_decisions}/{bootstrap_min} decisions) — threshold: 95%\n")
|
|
||||||
else:
|
|
||||||
confidence_threshold = configured_threshold
|
|
||||||
|
|
||||||
# Fetch envelopes via himalaya
|
# Fetch envelopes via himalaya
|
||||||
if recent:
|
if recent:
|
||||||
@@ -354,12 +345,18 @@ def cmd_scan(config, recent=None, dry_run=False):
|
|||||||
email_data = build_email_data(envelope, body, config)
|
email_data = build_email_data(envelope, body, config)
|
||||||
print(f"{email_data['subject'][:55]}")
|
print(f"{email_data['subject'][:55]}")
|
||||||
|
|
||||||
# Run the LLM classifier (includes few-shot examples from history)
|
# Run the LLM classifier (returns tags instead of confidence)
|
||||||
action, confidence, summary, reason, duration = classifier.classify_email(
|
action, tags, summary, reason, duration = classifier.classify_email(
|
||||||
email_data, config
|
email_data, config
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Compute confidence from decision history
|
||||||
|
sender_email = decision_store._extract_email_address(email_data.get("sender", ""))
|
||||||
|
confidence = decision_store.compute_confidence(sender_email, action, tags)
|
||||||
|
|
||||||
|
tags_str = ", ".join(tags) if tags else "(none)"
|
||||||
print(f" -> {action} (confidence: {confidence}%, {duration:.1f}s)")
|
print(f" -> {action} (confidence: {confidence}%, {duration:.1f}s)")
|
||||||
|
print(f" tags: [{tags_str}]")
|
||||||
print(f" {reason[:80]}")
|
print(f" {reason[:80]}")
|
||||||
|
|
||||||
# Auto-act if confidence meets threshold
|
# Auto-act if confidence meets threshold
|
||||||
@@ -379,7 +376,7 @@ def cmd_scan(config, recent=None, dry_run=False):
|
|||||||
success = execute_action(eid, action)
|
success = execute_action(eid, action)
|
||||||
if success:
|
if success:
|
||||||
decision_store.record_decision(
|
decision_store.record_decision(
|
||||||
{**email_data, "summary": summary}, action, source="auto"
|
{**email_data, "summary": summary}, action, source="auto", tags=tags
|
||||||
)
|
)
|
||||||
log_result(log_file, email_data, f"AUTO:{action}", reason, duration)
|
log_result(log_file, email_data, f"AUTO:{action}", reason, duration)
|
||||||
print(f" ** AUTO-executed: {action}")
|
print(f" ** AUTO-executed: {action}")
|
||||||
@@ -388,11 +385,11 @@ def cmd_scan(config, recent=None, dry_run=False):
|
|||||||
# Himalaya action failed — fall back to queuing
|
# Himalaya action failed — fall back to queuing
|
||||||
log_result(log_file, email_data, "AUTO_FAILED", reason, duration)
|
log_result(log_file, email_data, "AUTO_FAILED", reason, duration)
|
||||||
print(f" !! Auto-action failed, queuing instead")
|
print(f" !! Auto-action failed, queuing instead")
|
||||||
add_to_pending(email_data, summary, reason, action, confidence)
|
add_to_pending(email_data, summary, reason, action, confidence, tags)
|
||||||
queued += 1
|
queued += 1
|
||||||
else:
|
else:
|
||||||
# Not enough confidence or history — queue for manual review
|
# Not enough confidence or history — queue for manual review
|
||||||
add_to_pending(email_data, summary, reason, action, confidence)
|
add_to_pending(email_data, summary, reason, action, confidence, tags)
|
||||||
log_result(log_file, email_data, f"QUEUED:{action}@{confidence}%", reason, duration)
|
log_result(log_file, email_data, f"QUEUED:{action}@{confidence}%", reason, duration)
|
||||||
print(f" -> Queued (confidence {confidence}% < {confidence_threshold}%)")
|
print(f" -> Queued (confidence {confidence}% < {confidence_threshold}%)")
|
||||||
queued += 1
|
queued += 1
|
||||||
@@ -440,11 +437,14 @@ def cmd_review_list():
|
|||||||
for i, (msg_id, data) in enumerate(sorted_items, 1):
|
for i, (msg_id, data) in enumerate(sorted_items, 1):
|
||||||
suggested = data.get("suggested_action", "?")
|
suggested = data.get("suggested_action", "?")
|
||||||
conf = data.get("confidence", "?")
|
conf = data.get("confidence", "?")
|
||||||
|
tags = data.get("tags", [])
|
||||||
|
tags_str = ", ".join(tags) if tags else "(none)"
|
||||||
print(f"\n {i}. [{msg_id}]")
|
print(f"\n {i}. [{msg_id}]")
|
||||||
print(f" Subject: {data.get('subject', 'N/A')[:55]}")
|
print(f" Subject: {data.get('subject', 'N/A')[:55]}")
|
||||||
print(f" From: {data.get('sender', 'N/A')[:55]}")
|
print(f" From: {data.get('sender', 'N/A')[:55]}")
|
||||||
print(f" To: {data.get('recipient', 'N/A')[:40]}")
|
print(f" To: {data.get('recipient', 'N/A')[:40]}")
|
||||||
print(f" Summary: {data.get('summary', 'N/A')[:70]}")
|
print(f" Summary: {data.get('summary', 'N/A')[:70]}")
|
||||||
|
print(f" Tags: [{tags_str}]")
|
||||||
print(f" Suggested: {suggested} ({conf}% confidence)")
|
print(f" Suggested: {suggested} ({conf}% confidence)")
|
||||||
|
|
||||||
print(f"\n{'=' * 60}")
|
print(f"\n{'=' * 60}")
|
||||||
@@ -496,7 +496,7 @@ def cmd_review_act(selector, action):
|
|||||||
success = execute_action(eid, action)
|
success = execute_action(eid, action)
|
||||||
if success:
|
if success:
|
||||||
# Record decision for future learning
|
# Record decision for future learning
|
||||||
decision_store.record_decision(data, action, source="user")
|
decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))
|
||||||
|
|
||||||
# Mark as done in pending queue
|
# Mark as done in pending queue
|
||||||
pending = load_pending()
|
pending = load_pending()
|
||||||
@@ -540,7 +540,7 @@ def cmd_review_accept():
|
|||||||
|
|
||||||
success = execute_action(eid, action)
|
success = execute_action(eid, action)
|
||||||
if success:
|
if success:
|
||||||
decision_store.record_decision(data, action, source="user")
|
decision_store.record_decision(data, action, source="user", tags=data.get("tags", []))
|
||||||
|
|
||||||
pending = load_pending()
|
pending = load_pending()
|
||||||
pending[msg_id]["status"] = "done"
|
pending[msg_id]["status"] = "done"
|
||||||
@@ -616,14 +616,14 @@ def cmd_stats():
|
|||||||
for action, count in sorted(stats["by_action"].items(), key=lambda x: -x[1]):
|
for action, count in sorted(stats["by_action"].items(), key=lambda x: -x[1]):
|
||||||
print(f" {action}: {count}")
|
print(f" {action}: {count}")
|
||||||
|
|
||||||
# Top sender domains with per-domain action counts
|
# Top sender addresses with per-sender action counts
|
||||||
print(f"\nTop sender domains:")
|
print(f"\nTop senders:")
|
||||||
for domain, count in stats["top_domains"]:
|
for sender, count in stats["top_senders"]:
|
||||||
domain_stats = decision_store.get_sender_stats(domain)
|
sender_stats = decision_store.get_sender_stats(sender)
|
||||||
detail = ", ".join(
|
detail = ", ".join(
|
||||||
f"{a}:{c}" for a, c in sorted(domain_stats.items(), key=lambda x: -x[1])
|
f"{a}:{c}" for a, c in sorted(sender_stats.items(), key=lambda x: -x[1])
|
||||||
)
|
)
|
||||||
print(f" {domain}: {count} ({detail})")
|
print(f" {sender}: {count} ({detail})")
|
||||||
|
|
||||||
# Custom labels
|
# Custom labels
|
||||||
labels = decision_store.get_known_labels()
|
labels = decision_store.get_known_labels()
|
||||||
|
|||||||
Reference in New Issue
Block a user