# Listing metadata: 115 lines, 3.4 KiB, Python
import os
|
||
import yaml
|
||
import mysql.connector
|
||
from keybert import KeyBERT
|
||
from sentence_transformers import SentenceTransformer
|
||
|
||
|
||
# === Load multilingual model for KeyBERT ===
# The sentence-transformer is loaded once at import time and shared by every
# keyword extraction performed by this script.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
kw_model = KeyBERT(model)

# === Load label hierarchy from YAML ===
# The path can be overridden with the LABEL_CONFIG_PATH environment variable.
LABEL_FILE = os.getenv("LABEL_CONFIG_PATH", "labels.yml")
with open(LABEL_FILE, "r", encoding="utf-8") as f:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so the label tree can always be iterated with .items().
    label_config = yaml.safe_load(f) or {}

print(f"📂 Using label config: {LABEL_FILE}")
print(label_config)

# === DB Credentials ===
# Every setting is read from the environment, with a development fallback.
DB_HOST = os.getenv("DB_HOST", "localhost")
# os.getenv returns a string when the variable is set, so the fallback is a
# string too and int() handles both cases the same way.
DB_PORT = int(os.getenv("DB_PORT", "3306"))
DB_USER = os.getenv("DB_USER", "emailuser")
# SECURITY(review): a password is hard-coded here as the fallback value. It is
# kept so existing deployments keep working, but it should be rotated and
# supplied only through DB_PASSWORD.
DB_PASSWORD = os.getenv("DB_PASSWORD", "miguel33020")
DB_NAME = os.getenv("DB_NAME", "emailassistant")

# === Connect to DB ===
conn = mysql.connector.connect(
    host=DB_HOST,
    port=DB_PORT,
    user=DB_USER,
    password=DB_PASSWORD,
    database=DB_NAME,
)
# dictionary=True makes fetched rows addressable by column name.
cursor = conn.cursor(dictionary=True)
|
||
|
||
# === Logging Helper ===
|
||
def log_event(cursor, level, source, message):
    """Insert one row into the ``logs`` table.

    The statement is parameterized, so the values are passed to the driver
    separately from the SQL text. This function does not commit; the row
    becomes permanent only when the caller commits the connection.

    Args:
        cursor: Database cursor whose ``execute(sql, params)`` runs the INSERT.
        level: Value stored in the ``level`` column (callers in this file
            pass "INFO", "WARNING" or "ERROR").
        source: Value stored in the ``source`` column.
        message: Value stored in the ``message`` column.
    """
    cursor.execute(
        "INSERT INTO logs (level, source, message) VALUES (%s, %s, %s)",
        (level, source, message),
    )
|
||
|
||
# === Recursive label matcher ===
|
||
def match_labels(keywords, label_tree, prefix=""):
    """Return the path of the first label in ``label_tree`` that matches.

    Labels are visited in mapping order. For each label its children are
    searched first, so a matching descendant wins over its parent. A label
    matches when any of its configured keywords (lower-cased) is found in
    ``keywords`` via the ``in`` operator.

    Args:
        keywords: Container of already lower-cased keywords to test against
            (the caller in this file passes a set of strings).
        label_tree: Mapping of label name to a node mapping with optional
            ``"keywords"`` (iterable) and ``"children"`` (nested mapping of
            the same shape). ``None`` or an empty mapping matches nothing.
        prefix: Path of the parent label; used to build the "a/b/c" result.

    Returns:
        The slash-separated label path of the first match, or ``None`` when
        nothing matches.
    """
    if not label_tree:
        return None

    for label, data in label_tree.items():
        # A bare YAML key ("spam:") loads as None; treat it as an empty node.
        node = data or {}
        full_label = f"{prefix}/{label}".strip("/")

        # Check children first so the most specific label takes priority.
        # A missing or empty "children:" key yields None, handled by the guard above.
        child_match = match_labels(keywords, node.get("children"), prefix=full_label)
        if child_match:
            return child_match

        # "keywords:" with no value loads as None; YAML scalars such as a
        # year load as int, hence str() before lower-casing.
        label_keywords = (str(kw).lower() for kw in node.get("keywords") or ())
        if any(kw in keywords for kw in label_keywords):
            return full_label

    return None
|
||
|
||
|
||
|
||
# === Fetch emails and label them ===
# NOTE: the query selects every email, so rows that already carry a label are
# re-labeled on each run; the guard that skipped them is kept below, disabled.
try:
    cursor.execute("SELECT id, subject, ai_category FROM emails")
    emails = cursor.fetchall()

    # === Main Labeling Loop ===
    for email in emails:
        email_id = email["id"]
        subject = email["subject"]

        # Disabled guard for already-labeled emails:
        # current_label = email["ai_category"]
        # if current_label not in [None, "None", ""]:
        #     print(f"ℹ️ Email {email_id} already has label '{current_label}'")
        #     continue

        # Nothing to extract keywords from: record it and move on.
        if not subject or not subject.strip():
            log_event(cursor, "WARNING", "labeler", f"Skipped empty subject for email ID {email_id}")
            continue

        try:
            extracted = kw_model.extract_keywords(
                subject,
                keyphrase_ngram_range=(1, 2),
                stop_words="english",
                top_n=5,
            )
            # Keep the keywords in the order KeyBERT returned them, dropping
            # case-insensitive duplicates, so the stored string is stable
            # from run to run (joining a set gives an arbitrary order).
            ranked_keywords = list(dict.fromkeys(item[0].lower() for item in extracted))
            keyword_set = set(ranked_keywords)
            label = match_labels(keyword_set, label_config) or "unlabeled"

            cursor.execute(
                """
                UPDATE emails
                SET ai_category = %s,
                    ai_keywords = %s,
                    ai_label_source = %s,
                    ai_confidence = %s,
                    is_ai_reviewed = FALSE
                WHERE id = %s
                """,
                (label, ", ".join(ranked_keywords), "labeler_v1.0", 1.0, email_id),
            )

            log_event(cursor, "INFO", "labeler", f"Labeled email {email_id} as '{label}'")
            print(f"🏷️ Email {email_id} labeled as: {label}")

        except Exception as e:
            # Best-effort batch: one failing email must not stop the rest, so
            # the error is recorded and the loop continues.
            log_event(cursor, "ERROR", "labeler", f"Error labeling email ID {email_id}: {e}")
            print(f"❌ Error labeling email {email_id}: {e}")

    # === Commit & Close ===
    # Updates and log rows are committed together once the loop has finished.
    conn.commit()
finally:
    # Release the cursor and connection even if the run aborted part-way.
    try:
        cursor.close()
    finally:
        conn.close()