Email-Agent/Obsolete/labeler.py
milo 5bdd911cf4 First push
from FORK client
2025-05-06 11:51:33 -04:00

115 lines
3.4 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import yaml
import mysql.connector
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
# === Load multilingual model for KeyBERT ===
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
kw_model = KeyBERT(model)

# === Load label hierarchy from YAML ===
# The path can be overridden with LABEL_CONFIG_PATH; defaults to labels.yml
# in the current working directory.
LABEL_FILE = os.getenv("LABEL_CONFIG_PATH", "labels.yml")
with open(LABEL_FILE, "r", encoding="utf-8") as f:
    # safe_load returns None for an empty document; fall back to an empty
    # tree so the label matcher can still iterate over it.
    label_config = yaml.safe_load(f) or {}
print(f"📂 Using label config: {LABEL_FILE}")
print(label_config)

# === DB Credentials ===
# Every setting is read from the environment, with a fallback default.
DB_HOST = os.getenv("DB_HOST", "localhost")
DB_PORT = int(os.getenv("DB_PORT", "3306"))
DB_USER = os.getenv("DB_USER", "emailuser")
# SECURITY(review): a real-looking password is hard-coded as the fallback.
# It is kept here only so existing runs without DB_PASSWORD set keep working;
# rotate it and require the environment variable instead.
DB_PASSWORD = os.getenv("DB_PASSWORD", "miguel33020")
DB_NAME = os.getenv("DB_NAME", "emailassistant")

# === Connect to DB ===
conn = mysql.connector.connect(
    host=DB_HOST,
    port=DB_PORT,
    user=DB_USER,
    password=DB_PASSWORD,
    database=DB_NAME,
)
# dictionary=True: rows are accessed by column name further down
# (e.g. email["id"]).
cursor = conn.cursor(dictionary=True)
# === Logging Helper ===
def log_event(cursor, level, source, message):
    """Insert one row into the ``logs`` table through *cursor*.

    The statement is parameterized, so the values are passed to the driver
    separately from the SQL text. Nothing is committed here; the row only
    becomes durable when the caller commits the owning connection.

    Args:
        cursor: DB-API style cursor exposing ``execute(sql, params)``.
        level: Severity written to the ``level`` column (callers in this
            file use "INFO", "WARNING" and "ERROR").
        source: Name of the component emitting the event.
        message: Free-form text written to the ``message`` column.
    """
    cursor.execute(
        "INSERT INTO logs (level, source, message) VALUES (%s, %s, %s)",
        (level, source, message),
    )
# === Recursive label matcher ===
def match_labels(keywords, label_tree, prefix=""):
    """Return the path of the first label in *label_tree* matching *keywords*.

    The tree is walked depth-first in mapping order. For each label its
    children are searched before its own keywords, so a deeper label takes
    priority over the parent that contains it.

    Args:
        keywords: Container of lower-cased keywords; membership is tested
            with ``in`` (exact match, no substring matching).
        label_tree: Mapping of label name to a node mapping that may carry
            a ``keywords`` list and a ``children`` mapping of the same
            shape. ``None`` or an empty mapping matches nothing.
        prefix: Path of the parent label; used by the recursion.

    Returns:
        The slash-separated label path (e.g. ``"work/finance"``), or
        ``None`` when no label matches.
    """
    # An empty YAML value (e.g. "children:") parses to None.
    if not label_tree:
        return None

    for label, data in label_tree.items():
        # A label declared with no body ("personal:") also parses to None.
        data = data or {}
        full_label = f"{prefix}/{label}".strip("/")

        # First check children so the most specific label wins.
        child_match = match_labels(
            keywords, data.get("children"), prefix=full_label
        )
        if child_match:
            return child_match

        # Then check this level. str() tolerates non-string YAML scalars
        # such as a bare number used as a keyword.
        label_keywords = (
            str(kw).lower() for kw in (data.get("keywords") or [])
        )
        if any(kw in keywords for kw in label_keywords):
            return full_label

    return None
# === Fetch all emails ===
# The skip for emails that already carry an ai_category is disabled, so every
# row in the table is (re-)labeled on each run.
try:
    cursor.execute("SELECT id, subject, ai_category FROM emails")
    emails = cursor.fetchall()

    # === Main Labeling Loop ===
    for email in emails:
        email_id = email["id"]
        subject = email["subject"]

        # Nothing to extract keywords from: record it and move on.
        if not subject or not subject.strip():
            log_event(cursor, "WARNING", "labeler", f"Skipped empty subject for email ID {email_id}")
            continue

        try:
            keywords = kw_model.extract_keywords(
                subject,
                keyphrase_ngram_range=(1, 2),
                stop_words="english",
                top_n=5,
            )
            # Lower-case and de-duplicate while keeping the order returned
            # by extract_keywords, so the stored ai_keywords string is the
            # same on every run (joining a set gives an arbitrary order).
            keyword_list = list(dict.fromkeys(k[0].lower() for k in keywords))
            label = match_labels(set(keyword_list), label_config) or "unlabeled"

            cursor.execute("""
                UPDATE emails
                SET ai_category = %s,
                    ai_keywords = %s,
                    ai_label_source = %s,
                    ai_confidence = %s,
                    is_ai_reviewed = FALSE
                WHERE id = %s
            """, (label, ", ".join(keyword_list), "labeler_v1.0", 1.0, email_id))
            log_event(cursor, "INFO", "labeler", f"Labeled email {email_id} as '{label}'")
            print(f"🏷️ Email {email_id} labeled as: {label}")
        except Exception as e:
            # Best effort: one failing email must not stop the whole batch.
            log_event(cursor, "ERROR", "labeler", f"Error labeling email ID {email_id}: {str(e)}")
            print(f"❌ Error labeling email {email_id}: {e}")

    # === Commit ===
    # All updates and log rows are committed together once the loop finishes.
    conn.commit()
finally:
    # === Close ===
    # Release the cursor and connection even if the fetch, a log write or
    # the commit raised.
    try:
        cursor.close()
    finally:
        conn.close()