import os import yaml import mysql.connector from keybert import KeyBERT from sentence_transformers import SentenceTransformer # === Load multilingual model for KeyBERT === model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2") kw_model = KeyBERT(model) # === Load label hierarchy from YAML === LABEL_FILE = os.getenv("LABEL_CONFIG_PATH", "labels.yml") with open(LABEL_FILE, "r", encoding="utf-8") as f: label_config = yaml.safe_load(f) print(f"📂 Using label config: {LABEL_FILE}") print(label_config) # === DB Credentials === DB_HOST = os.getenv("DB_HOST", "localhost") DB_PORT = int(os.getenv("DB_PORT", 3306)) DB_USER = os.getenv("DB_USER", "emailuser") DB_PASSWORD = os.getenv("DB_PASSWORD", "miguel33020") DB_NAME = os.getenv("DB_NAME", "emailassistant") # === Connect to DB === conn = mysql.connector.connect( host=DB_HOST, port=DB_PORT, user=DB_USER, password=DB_PASSWORD, database=DB_NAME ) cursor = conn.cursor(dictionary=True) # === Logging Helper === def log_event(cursor, level, source, message): cursor.execute( "INSERT INTO logs (level, source, message) VALUES (%s, %s, %s)", (level, source, message) ) # === Recursive label matcher === def match_labels(keywords, label_tree, prefix=""): for label, data in label_tree.items(): full_label = f"{prefix}/{label}".strip("/") label_keywords = [kw.lower() for kw in data.get("keywords", [])] # First check children children = data.get("children", {}) child_match = match_labels(keywords, children, prefix=full_label) if child_match: return child_match # Then check this level (so children take priority) if any(kw in keywords for kw in label_keywords): return full_label return None # === Fetch emails that haven't been labeled === cursor.execute("SELECT id, subject, ai_category FROM emails") emails = cursor.fetchall() # === Main Labeling Loop === for email in emails: email_id = email["id"] subject = email["subject"] current_label = email["ai_category"] # if current_label not in [None, "None", ""]: # print(f"â„šī¸ Email {email_id} already has label '{current_label}'") # continue if not subject or not subject.strip(): log_event(cursor, "WARNING", "labeler", f"Skipped empty subject for email ID {email_id}") continue try: keywords = kw_model.extract_keywords( subject, keyphrase_ngram_range=(1, 2), stop_words="english", top_n=5 ) keyword_set = set(k[0].lower() for k in keywords) label = match_labels(keyword_set, label_config) or "unlabeled" cursor.execute(""" UPDATE emails SET ai_category = %s, ai_keywords = %s, ai_label_source = %s, ai_confidence = %s, is_ai_reviewed = FALSE WHERE id = %s """, (label, ", ".join(keyword_set), "labeler_v1.0", 1.0, email_id)) log_event(cursor, "INFO", "labeler", f"Labeled email {email_id} as '{label}'") print(f"đŸˇī¸ Email {email_id} labeled as: {label}") except Exception as e: log_event(cursor, "ERROR", "labeler", f"Error labeling email ID {email_id}: {str(e)}") print(f"❌ Error labeling email {email_id}: {e}") # === Commit & Close === conn.commit() cursor.close() conn.close()