email processor

This commit is contained in:
Yanxin Lu
2026-02-26 20:54:07 -08:00
parent c5c9be3f00
commit b14a93866e
18 changed files with 1365 additions and 666 deletions

View File

@@ -1,297 +1,704 @@
#!/usr/bin/env python3
"""
Email Processor - Learning-based mailbox cleanup using Himalaya + Ollama.

Uses himalaya CLI for all IMAP operations (no raw imaplib, no stored
credentials). Uses a local Qwen3 model via Ollama for classification,
with few-shot learning from past user decisions.

All commands are non-interactive — they take arguments, mutate files on
disk, and exit. Suitable for cron (OpenClaw) and scripting.

Subcommands:
    python main.py scan                         # classify unseen emails
    python main.py scan --recent 30             # classify last 30 days
    python main.py scan --dry-run               # classify only, no changes
    python main.py scan --recent 7 --dry-run    # combine both
    python main.py review list                  # print pending queue
    python main.py review <num-or-id> <action>  # act on one email
    python main.py review all <action>          # act on all pending
    python main.py review accept                # accept all suggestions
    python main.py stats                        # show decision history
    python main.py migrate                      # import old decisions

Action mapping (what each classification does to the email):
    delete    -> himalaya message delete <id>   (moves to Trash)
    archive   -> himalaya message move Archive <id>
    keep      -> no-op (leave unread in inbox)
    mark_read -> himalaya flag add <id> seen
    label:X   -> himalaya message move <X> <id>
"""
import json
import imaplib
import email
import os
import subprocess
import hashlib
import sys
from datetime import datetime
from datetime import datetime, timedelta
from pathlib import Path
# Config
import classifier
import decision_store
# ---------------------------------------------------------------------------
# Paths — all relative to the script's own directory
# ---------------------------------------------------------------------------
SCRIPT_DIR = Path(__file__).parent
CONFIG_FILE = SCRIPT_DIR / "config.json"
LOGS_DIR = SCRIPT_DIR / "logs"
DATA_DIR = SCRIPT_DIR / "data"
PENDING_FILE = DATA_DIR / "pending_emails.json"
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
def load_config():
    """Load config.json from the script directory.

    Only ollama, rules, and automation settings are needed — himalaya
    manages its own IMAP config separately.

    Returns:
        dict: the parsed configuration.

    Raises:
        FileNotFoundError: if config.json does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(CONFIG_FILE) as f:
        return json.load(f)
def connect_imap(config):
    """Open an SSL IMAP connection and log in.

    NOTE(review): legacy imaplib path — presumably superseded by the
    himalaya CLI wrappers elsewhere in this file; confirm before reusing.

    Args:
        config: dict with an 'imap' section providing host, port,
            email, and password.

    Returns:
        An authenticated imaplib.IMAP4_SSL client.
    """
    imap_config = config['imap']
    mail = imaplib.IMAP4_SSL(imap_config['host'], imap_config['port'])
    mail.login(imap_config['email'], imap_config['password'])
    return mail
def get_unseen_emails(mail):
    """Return the IDs (as bytes) of every unseen message in INBOX.

    Args:
        mail: an authenticated imaplib-style client.
    """
    mail.select('INBOX')
    _, search_data = mail.search(None, 'UNSEEN')
    return search_data[0].split()
# ---------------------------------------------------------------------------
# Himalaya CLI wrappers
#
# All IMAP operations go through himalaya, which handles connection,
# auth, and protocol details. We call it as a subprocess and parse
# its JSON output.
# ---------------------------------------------------------------------------
def fetch_email(mail, email_id):
    """Fetch one message over IMAP and extract basic fields.

    NOTE(review): legacy imaplib path, and TRUNCATED in this view — the
    non-multipart fallback and the return statement do not appear here;
    confirm the full definition before relying on it.

    Args:
        mail: an authenticated imaplib client.
        email_id: message sequence number/UID as used by mail.fetch.
    """
    _, msg_data = mail.fetch(email_id, '(RFC822)')
    raw_email = msg_data[0][1]
    msg = email.message_from_bytes(raw_email)
    # Extract subject
    subject = msg['Subject'] or '(No Subject)'
    # Extract sender
    sender = msg['From'] or '(Unknown)'
    # Extract recipient
    recipient = msg['To'] or '(Unknown)'
    # Extract date
    date = msg['Date'] or datetime.now().isoformat()
    # Extract body: first text/plain part of a multipart message
    body = ""
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
                try:
                    body = part.get_payload(decode=True).decode('utf-8', errors='ignore')
                    break
                except:
                    # best-effort decode; skip undecodable parts
                    pass
def _himalaya(*args):
    """Invoke the himalaya CLI and return its captured stdout.

    Raises:
        subprocess.CalledProcessError: if himalaya exits non-zero.
    """
    completed = subprocess.run(
        ["himalaya", *args],
        capture_output=True,
        text=True,
        check=True,
    )
    return completed.stdout
def _himalaya_json(*args):
    """Run a himalaya command in JSON output mode and parse the result."""
    raw = _himalaya("-o", "json", *args)
    return json.loads(raw)
# ---------------------------------------------------------------------------
# Email fetching via himalaya
# ---------------------------------------------------------------------------


def get_unseen_envelopes():
    """Fetch envelope metadata for every unseen email in INBOX.

    Returns a list of envelope dicts from himalaya's JSON output; each
    carries keys such as id, subject, from, to, date, and flags.
    """
    query = ("envelope", "list", "not", "flag", "seen")
    return _himalaya_json(*query)
def get_recent_envelopes(days):
    """Fetch envelope metadata for all emails from the last `days` days.

    Includes both read and unread mail — useful for testing and for
    bulk-classifying historical messages.
    """
    cutoff = datetime.now() - timedelta(days=days)
    return _himalaya_json("envelope", "list", "after", cutoff.strftime("%Y-%m-%d"))
def read_message(envelope_id):
    """Return the plain-text body of a message without marking it seen.

    The --preview flag stops himalaya from adding the \\Seen flag, so
    the email stays unread for the eventual action to handle;
    --no-headers strips the header block.
    """
    args = ("message", "read", "--preview", "--no-headers", str(envelope_id))
    return _himalaya(*args)
def _format_address(value):
    """Render a himalaya address field as a display string.

    himalaya's envelope JSON encodes an address either as a dict
    ({"name": ..., "addr": ...}), a list of such dicts, or some other
    value; unknown shapes fall back to str().
    """
    if isinstance(value, dict):
        name = value.get("name", "")
        addr = value.get("addr", "")
        return f"{name} <{addr}>" if name else addr
    if isinstance(value, list) and value:
        first = value[0]
        name = first.get("name", "")
        addr = first.get("addr", "")
        return f"{name} <{addr}>" if name else addr
    return str(value)


def build_email_data(envelope, body, config):
    """Build the email_data dict expected by classifier and decision_store.

    Combines envelope metadata (from `himalaya envelope list`) with the
    message body (from `himalaya message read`).

    Args:
        envelope: envelope dict from himalaya's JSON output.
        body: plain-text message body.
        config: full config dict; rules.max_body_length caps the stored
            body (default 1000 characters).

    Returns:
        dict with id, subject, sender, recipient, date, and body keys.
    """
    max_body = config.get("rules", {}).get("max_body_length", 1000)
    return {
        "id": str(envelope.get("id", "")),
        "subject": envelope.get("subject", "(No Subject)"),
        "sender": _format_address(envelope.get("from", {})),
        "recipient": _format_address(envelope.get("to", {})),
        "date": envelope.get("date", ""),
        "body": body[:max_body],
    }
# ---------------------------------------------------------------------------
# IMAP actions via himalaya
#
# Each function executes one himalaya command. Returns True on success.
# On failure, prints the error and returns False.
# ---------------------------------------------------------------------------


def execute_action(envelope_id, action):
    """Dispatch an action string to the appropriate himalaya command.

    Action mapping:
        "delete"    -> himalaya message delete <id>
        "archive"   -> himalaya message move Archive <id>
        "keep"      -> no-op (leave unread in inbox)
        "mark_read" -> himalaya flag add <id> seen
        "label:X"   -> himalaya message move <X> <id>

    Args:
        envelope_id: himalaya envelope ID (any str()-able value).
        action: one of the action strings above.

    Returns:
        True on success, False on an unknown action or a himalaya error.
    """
    eid = str(envelope_id)
    try:
        if action == "delete":
            _himalaya("message", "delete", eid)
        elif action == "archive":
            _himalaya("message", "move", "Archive", eid)
        elif action == "keep":
            pass  # leave unread in inbox — no IMAP changes
        elif action == "mark_read":
            _himalaya("flag", "add", eid, "seen")
        elif action.startswith("label:"):
            folder = action[6:]
            _himalaya("message", "move", folder, eid)
        else:
            print(f" Unknown action: {action}")
            return False
        return True
    except subprocess.CalledProcessError as e:
        print(f" Himalaya error: {e.stderr.strip()}")
        return False
def log_result(log_file, email_data, analysis, action, duration=None):
    """Append a log entry with the Qwen3 analysis and timing.

    NOTE(review): legacy variant — a second `log_result` with signature
    (log_file, email_data, action, detail, duration) is defined later in
    this file; at import time the later definition wins, making this one
    dead code. Confirm and remove.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # duration of 0 (falsy) is rendered the same as None: no suffix
    duration_str = f" ({duration:.1f}s)" if duration else ""
    with open(log_file, 'a') as f:
        f.write(f"[{timestamp}] {action}{duration_str}: {email_data['subject'][:60]}\n")
        f.write(f" From: {email_data['sender']}\n")
        f.write(f" Analysis: {analysis}\n\n")
# ---------------------------------------------------------------------------
# Pending queue — emails awaiting manual review
#
# Stored as a JSON dict in data/pending_emails.json, keyed by msg_id.
# Each entry tracks the envelope ID (for himalaya), classifier suggestion,
# and status (pending/done).
# ---------------------------------------------------------------------------


def load_pending():
    """Load the pending queue from disk.

    Returns:
        dict keyed by msg_id; empty dict when no queue file exists yet.
    """
    if not PENDING_FILE.exists():
        return {}
    with open(PENDING_FILE, "r", encoding="utf-8") as f:
        return json.load(f)
def save_pending(pending):
    """Write the pending queue to disk, creating the data dir if needed.

    Args:
        pending: dict keyed by msg_id, as returned by load_pending().
    """
    DATA_DIR.mkdir(exist_ok=True)
    with open(PENDING_FILE, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII subjects readable in the file
        json.dump(pending, f, indent=2, ensure_ascii=False)
def add_to_pending(email_data, summary, reason, action_suggestion, confidence):
    """Add an email to the pending queue for manual review.

    Stores the classifier's suggestion and confidence alongside the
    email metadata so the user can see what the model thought.

    Args:
        email_data: dict from build_email_data().
        summary: one-line summary produced by the classifier.
        reason: classifier's explanation for its suggestion.
        action_suggestion: suggested action string (delete/archive/...).
        confidence: classifier confidence (percentage).

    Returns:
        The generated msg_id key under which the entry was stored.
    """
    pending = load_pending()

    # Generate a stable ID from envelope ID + subject
    eid = str(email_data["id"])
    key = f"{eid}_{email_data['subject']}"
    msg_id = f"msg_{hashlib.md5(key.encode()).hexdigest()[:8]}"

    pending[msg_id] = {
        "envelope_id": eid,
        "subject": email_data["subject"],
        "sender": email_data["sender"],
        "recipient": email_data.get("recipient", ""),
        "summary": summary,
        "reason": reason,
        "suggested_action": action_suggestion,
        "confidence": confidence,
        "email_date": email_data.get("date", ""),
        "status": "pending",
        "found_at": datetime.now().isoformat(),
    }
    save_pending(pending)
    return msg_id
def main():
    """Legacy scan entry point.

    NOTE(review): truncated in this view — only the startup banner and
    config load appear here; the rest of the body seems to have been
    superseded by cmd_scan(). Confirm it is dead before removing.
    """
    print("📧 Email Processor Starting...")
    # Load config
    config = load_config()
    # Setup logging
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------


def log_result(log_file, email_data, action, detail, duration=None):
    """Append a short multi-line log entry for one processed email.

    Args:
        log_file: path to the daily log file (opened in append mode).
        email_data: dict with at least 'subject' and 'sender'.
        action: short action tag, e.g. "AUTO:delete".
        detail: free-form explanation (classifier reason, etc.).
        duration: optional classification time in seconds.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    timing = f" ({duration:.1f}s)" if duration else ""
    entry = (
        f"[{stamp}] {action}{timing}: {email_data['subject'][:60]}\n"
        f" From: {email_data['sender']}\n"
        f" Detail: {detail}\n\n"
    )
    with open(log_file, "a") as f:
        f.write(entry)
# ---------------------------------------------------------------------------
# Subcommand: scan
# ---------------------------------------------------------------------------


def cmd_scan(config, recent=None, dry_run=False):
    """Fetch emails, classify each one, then auto-act or queue.

    Auto-action is based on a single confidence threshold. While the
    decision history holds fewer than `bootstrap_min_decisions` entries,
    a conservative 95% threshold applies; once enough history
    accumulates, the configured threshold takes over.

    Args:
        config: full config dict.
        recent: if set, fetch emails from the last N days (not just unseen).
        dry_run: if True, classify and print but skip all actions.
    """
    mode = "DRY RUN" if dry_run else "Scan"
    print(f"Email Processor - {mode}")
    print("=" * 50)

    LOGS_DIR.mkdir(exist_ok=True)
    log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m-%d')}.log"

    # Load automation threshold
    automation = config.get("automation", {})
    configured_threshold = automation.get("confidence_threshold", 75)

    # Adaptive threshold: be conservative when history is thin
    stats = decision_store.get_all_stats()
    total_decisions = stats["total"] if stats else 0
    bootstrap_min = automation.get("bootstrap_min_decisions", 20)
    if total_decisions < bootstrap_min:
        confidence_threshold = 95
        print(f"Learning phase ({total_decisions}/{bootstrap_min} decisions) — threshold: 95%\n")
    else:
        confidence_threshold = configured_threshold

    # Fetch envelopes via himalaya
    if recent:
        envelopes = get_recent_envelopes(recent)
        print(f"Found {len(envelopes)} emails from last {recent} days\n")
    else:
        envelopes = get_unseen_envelopes()
        print(f"Found {len(envelopes)} unread emails\n")
    if not envelopes:
        print("No new emails to process.")
        return

    auto_acted = 0
    queued = 0
    for envelope in envelopes:
        eid = envelope.get("id", "?")
        print(f"[{eid}] ", end="", flush=True)

        # Read message body without marking as seen; tolerate read errors
        try:
            body = read_message(eid)
        except subprocess.CalledProcessError:
            body = ""
        email_data = build_email_data(envelope, body, config)
        print(f"{email_data['subject'][:55]}")

        # Run the LLM classifier (includes few-shot examples from history)
        action, confidence, summary, reason, duration = classifier.classify_email(
            email_data, config
        )
        print(f" -> {action} (confidence: {confidence}%, {duration:.1f}s)")
        print(f" {reason[:80]}")

        # Auto-act if confidence meets threshold
        can_auto = confidence >= confidence_threshold
        if dry_run:
            # Dry run: log what would happen, touch nothing
            log_result(log_file, email_data, f"DRYRUN:{action}@{confidence}%", reason, duration)
            if can_auto:
                print(f" -> Would AUTO-execute: {action}")
                auto_acted += 1
            else:
                print(f" -> Would queue for review")
                queued += 1
        elif can_auto:
            # Auto-execute the action via himalaya
            success = execute_action(eid, action)
            if success:
                decision_store.record_decision(
                    {**email_data, "summary": summary}, action, source="auto"
                )
                log_result(log_file, email_data, f"AUTO:{action}", reason, duration)
                print(f" ** AUTO-executed: {action}")
                auto_acted += 1
            else:
                # Himalaya action failed — fall back to queuing
                log_result(log_file, email_data, "AUTO_FAILED", reason, duration)
                print(f" !! Auto-action failed, queuing instead")
                add_to_pending(email_data, summary, reason, action, confidence)
                queued += 1
        else:
            # Not enough confidence or history — queue for manual review
            add_to_pending(email_data, summary, reason, action, confidence)
            # Mark as read to prevent re-processing on next scan
            if not dry_run:
                try:
                    _himalaya("flag", "add", str(eid), "seen")
                except subprocess.CalledProcessError:
                    pass
            log_result(log_file, email_data, f"QUEUED:{action}@{confidence}%", reason, duration)
            print(f" -> Queued (confidence {confidence}% < {confidence_threshold}%)")
            queued += 1

    # Print run summary
    print(f"\n{'=' * 50}")
    print(f"Processed: {len(envelopes)} emails")
    print(f" Auto-acted: {auto_acted}")
    print(f" Queued for review: {queued}")
    print(f"\nRun 'python main.py review list' to see pending emails")
# ---------------------------------------------------------------------------
# Subcommand: review
#
# Non-interactive: each invocation takes arguments, acts, and exits.
# No input() calls. Compatible with cron and scripting.
# ---------------------------------------------------------------------------


def _get_pending_items():
    """Return (msg_id, data) pairs still awaiting review, oldest first."""
    queue = load_pending()
    open_items = [
        (mid, entry) for mid, entry in queue.items()
        if entry.get("status") == "pending"
    ]
    open_items.sort(key=lambda pair: pair[1].get("found_at", ""))
    return open_items
def cmd_review_list():
    """Print the pending queue and exit.

    Shows each email with its number, ID, subject, sender, summary,
    and the classifier's suggested action with confidence.
    """
    sorted_items = _get_pending_items()
    if not sorted_items:
        print("No pending emails to review.")
        return
    print(f"Pending emails: {len(sorted_items)}")
    print("=" * 60)
    # 1-based numbering so the printed index matches `review <number>`
    for i, (msg_id, data) in enumerate(sorted_items, 1):
        suggested = data.get("suggested_action", "?")
        conf = data.get("confidence", "?")
        print(f"\n {i}. [{msg_id}]")
        print(f" Subject: {data.get('subject', 'N/A')[:55]}")
        print(f" From: {data.get('sender', 'N/A')[:55]}")
        print(f" To: {data.get('recipient', 'N/A')[:40]}")
        print(f" Summary: {data.get('summary', 'N/A')[:70]}")
        print(f" Suggested: {suggested} ({conf}% confidence)")
    # Trailing usage help for the non-interactive workflow
    print(f"\n{'=' * 60}")
    print("Usage:")
    print(" python main.py review <number> <action>")
    print(" python main.py review all <action>")
    print(" python main.py review accept")
    print("Actions: delete / archive / keep / mark_read / label:<name>")
def _apply_review_action(msg_id, data, action, log_file, tag):
    """Execute `action` on one pending entry, then record and log it.

    Shared by cmd_review_act and cmd_review_accept (the previous code
    duplicated this loop body in both). On success, records a "user"
    decision, marks the entry done in the pending queue, and logs
    "<tag>:<action>"; on failure logs "<tag>_FAILED:<action>".

    Returns:
        True when the himalaya action succeeded.
    """
    eid = data.get("envelope_id") or data.get("imap_uid")
    if not eid:
        print(f" {msg_id}: No envelope ID, skipping")
        return False
    success = execute_action(eid, action)
    if success:
        # Record decision for future learning
        decision_store.record_decision(data, action, source="user")
        # Mark as done in pending queue
        pending = load_pending()
        pending[msg_id]["status"] = "done"
        pending[msg_id]["action"] = action
        pending[msg_id]["processed_at"] = datetime.now().isoformat()
        save_pending(pending)
        log_result(log_file, data, f"{tag}:{action}", data.get("reason", ""))
        print(f" {msg_id}: {action} -> OK ({data['subject'][:40]})")
    else:
        log_result(log_file, data, f"{tag}_FAILED:{action}", data.get("reason", ""))
        print(f" {msg_id}: {action} -> FAILED")
    return success


def cmd_review_act(selector, action):
    """Execute an action on one or more pending emails.

    Args:
        selector: a 1-based number, a msg_id string, or "all".
        action: one of delete/archive/keep/mark_read/label:<name>.

    Exits with status 1 on an invalid action or selector.
    """
    # Validate action
    valid_actions = {"delete", "archive", "keep", "mark_read"}
    if action not in valid_actions and not action.startswith("label:"):
        print(f"Invalid action: {action}")
        print(f"Valid: {', '.join(sorted(valid_actions))}, label:<name>")
        sys.exit(1)

    sorted_items = _get_pending_items()
    if not sorted_items:
        print("No pending emails to review.")
        return

    # Resolve targets
    if selector == "all":
        targets = sorted_items
    else:
        target = _resolve_target(selector, sorted_items)
        if target is None:
            sys.exit(1)
        targets = [target]

    LOGS_DIR.mkdir(exist_ok=True)
    log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m-%d')}.log"

    # Execute action on each target
    for msg_id, data in targets:
        _apply_review_action(msg_id, data, action, log_file, "REVIEW")


def cmd_review_accept():
    """Accept all classifier suggestions for pending emails.

    For each pending email, executes the suggested_action that the
    classifier assigned during scan. Records each as a "user" decision
    since the user explicitly chose to accept.
    """
    sorted_items = _get_pending_items()
    if not sorted_items:
        print("No pending emails to review.")
        return

    LOGS_DIR.mkdir(exist_ok=True)
    log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m-%d')}.log"

    for msg_id, data in sorted_items:
        action = data.get("suggested_action")
        if not action:
            print(f" {msg_id}: No suggestion, skipping")
            continue
        _apply_review_action(msg_id, data, action, log_file, "ACCEPT")
def _resolve_target(selector, sorted_items):
"""Resolve a selector (number or msg_id) to a (msg_id, data) tuple.
Returns None and prints an error if the selector is invalid.
"""
# Try as 1-based index
try:
idx = int(selector) - 1
if 0 <= idx < len(sorted_items):
return sorted_items[idx]
else:
print(f"Invalid number. Range: 1-{len(sorted_items)}")
return None
except ValueError:
pass
# Try as msg_id
for msg_id, data in sorted_items:
if msg_id == selector:
return (msg_id, data)
print(f"Not found: {selector}")
return None
# ---------------------------------------------------------------------------
# Subcommand: stats
# ---------------------------------------------------------------------------
def cmd_stats():
    """Print a summary of the decision history.

    Shows total decisions, user vs. auto breakdown, action distribution,
    top sender domains, and custom labels.
    """
    # stats shape (per usage below): {"total", "by_source", "by_action",
    # "top_domains"} — assumes decision_store.get_all_stats() provides
    # these keys; falsy result means no history yet.
    stats = decision_store.get_all_stats()
    if not stats:
        print("No decision history yet.")
        print("Run 'python main.py scan' and 'python main.py review' to build history.")
        return
    print("Decision History Stats")
    print("=" * 50)
    print(f"Total decisions: {stats['total']}")
    # User vs. auto breakdown
    print(f"\nBy source:")
    for source, count in sorted(stats["by_source"].items()):
        pct = count / stats["total"] * 100
        print(f" {source}: {count} ({pct:.0f}%)")
    auto = stats["by_source"].get("auto", 0)
    if stats["total"] > 0:
        print(f" Automation rate: {auto / stats['total'] * 100:.0f}%")
    # Action distribution, most frequent first
    print(f"\nBy action:")
    for action, count in sorted(stats["by_action"].items(), key=lambda x: -x[1]):
        print(f" {action}: {count}")
    # Top sender domains with per-domain action counts
    print(f"\nTop sender domains:")
    for domain, count in stats["top_domains"]:
        domain_stats = decision_store.get_sender_stats(domain)
        detail = ", ".join(
            f"{a}:{c}" for a, c in sorted(domain_stats.items(), key=lambda x: -x[1])
        )
        print(f" {domain}: {count} ({detail})")
    # Custom labels
    labels = decision_store.get_known_labels()
    if labels:
        print(f"\nKnown labels: {', '.join(sorted(labels))}")
# ---------------------------------------------------------------------------
# Subcommand: migrate
# ---------------------------------------------------------------------------
def cmd_migrate():
    """Import old pending_emails.json 'done' entries into decision history.

    Run once after upgrading from the old system. Converts old action
    names (archived/kept/deleted) to new ones (archive/keep/delete).
    Delegates entirely to decision_store.migrate_pending().
    """
    decision_store.migrate_pending()
# ---------------------------------------------------------------------------
# Entry point & argument parsing
#
# Simple hand-rolled parser — no external dependencies. Supports:
#   main.py [subcommand] [--recent N] [--dry-run] [review-args...]
#
# Fix: removed a stray legacy `main()` call that ran the old scan
# before the parser even executed.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    args = sys.argv[1:]
    subcommand = "scan"
    recent = None
    dry_run = False
    extra_args = []  # for review subcommand arguments

    # Parse args
    i = 0
    while i < len(args):
        if args[i] == "--recent" and i + 1 < len(args):
            recent = int(args[i + 1])
            i += 2
        elif args[i] == "--dry-run":
            dry_run = True
            i += 1
        elif not args[i].startswith("--") and subcommand == "scan" and not extra_args:
            # First positional arg is the subcommand.
            # NOTE: an explicit "scan" followed by another positional would
            # re-trigger this branch and replace the subcommand — quirk of
            # the original parser, kept as-is.
            subcommand = args[i]
            i += 1
        elif not args[i].startswith("--"):
            # Remaining positional args go to the subcommand
            extra_args.append(args[i])
            i += 1
        else:
            print(f"Unknown flag: {args[i]}")
            sys.exit(1)

    config = load_config()
    if subcommand == "scan":
        cmd_scan(config, recent=recent, dry_run=dry_run)
    elif subcommand == "review":
        if not extra_args or extra_args[0] == "list":
            cmd_review_list()
        elif extra_args[0] == "accept":
            cmd_review_accept()
        elif len(extra_args) == 2:
            cmd_review_act(extra_args[0], extra_args[1])
        else:
            print("Usage:")
            print(" python main.py review list")
            print(" python main.py review <number-or-id> <action>")
            print(" python main.py review all <action>")
            print(" python main.py review accept")
            sys.exit(1)
    elif subcommand == "stats":
        cmd_stats()
    elif subcommand == "migrate":
        cmd_migrate()
    else:
        print(f"Unknown subcommand: {subcommand}")
        print("Usage: python main.py [scan|review|stats|migrate] [--recent N] [--dry-run]")
        sys.exit(1)