import os
import mysql.connector
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from collections import Counter

# === Load multilingual model for KeyBERT ===
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
kw_model = KeyBERT(model)

# === DB Credentials ===
DB_HOST = os.getenv("DB_HOST", "localhost")
DB_PORT = int(os.getenv("DB_PORT", 3306))
DB_USER = os.getenv("DB_USER", "emailuser")
DB_PASSWORD = os.getenv("DB_PASSWORD", "miguel33020")
DB_NAME = os.getenv("DB_NAME", "emailassistant")

# === Connect to DB ===
conn = mysql.connector.connect(
    host=DB_HOST,
    port=DB_PORT,
    user=DB_USER,
    password=DB_PASSWORD,
    database=DB_NAME
)
cursor = conn.cursor(dictionary=True)

# === Fetch only unlabeled emails ===
cursor.execute("SELECT id, subject FROM emails WHERE ai_category = 'unlabeled'")
emails = cursor.fetchall()

print(f"šŸ” Analyzing {len(emails)} unlabeled emails...")

keyword_counter = Counter()

for email in emails:
    subject = email["subject"]
    if not subject:
        continue
    try:
        keywords = kw_model.extract_keywords(
            subject,
            keyphrase_ngram_range=(1, 2),
            stop_words="english",
            top_n=5
        )
        keyword_counter.update([kw[0].lower() for kw in keywords])
    except Exception as e:
        print(f"āŒ Error processing email ID {email['id']}: {e}")

# === Output top missing keywords ===
print("\nšŸ“Š Top keywords in unlabeled emails:")
for word, count in keyword_counter.most_common(30):
    print(f"{word}: {count}")

cursor.close()
conn.close()