# Source listing: 58 lines, 1.6 KiB, Text
"""Report the most frequent KeyBERT keywords among unlabeled email subjects.

The script connects to the MySQL database described by the ``DB_*``
environment variables, reads ``id`` and ``subject`` for every row of
``emails`` whose ``ai_category`` is ``'unlabeled'``, extracts up to five
keywords / two-word keyphrases per subject with KeyBERT, and prints the
30 most common ones together with their counts.
"""

import os
from collections import Counter

import mysql.connector
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

# === Load multilingual model for KeyBERT ===
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
kw_model = KeyBERT(model)

# === DB Credentials ===
# Each setting comes from the environment, with a fallback default.
DB_HOST = os.getenv("DB_HOST", "localhost")
# The default is a string so os.getenv always yields a str before int().
DB_PORT = int(os.getenv("DB_PORT", "3306"))
DB_USER = os.getenv("DB_USER", "emailuser")
# SECURITY NOTE(review): a password is hard-coded as the fallback value.
# It should be rotated and supplied only through DB_PASSWORD; the fallback
# is kept here solely so existing invocations keep working.
DB_PASSWORD = os.getenv("DB_PASSWORD", "miguel33020")
DB_NAME = os.getenv("DB_NAME", "emailassistant")

# === Connect to DB ===
conn = mysql.connector.connect(
    host=DB_HOST,
    port=DB_PORT,
    user=DB_USER,
    password=DB_PASSWORD,
    database=DB_NAME,
)

# === Fetch only unlabeled emails ===
# try/finally guarantees the cursor and connection are released even when
# the query fails. fetchall() loads every row into memory, so the
# connection is closed before the (slow) keyword extraction starts.
try:
    cursor = conn.cursor(dictionary=True)  # rows come back as dicts
    try:
        cursor.execute(
            "SELECT id, subject FROM emails WHERE ai_category = 'unlabeled'"
        )
        emails = cursor.fetchall()
    finally:
        cursor.close()
finally:
    conn.close()

print(f"🔍 Analyzing {len(emails)} unlabeled emails...")

# Maps lower-cased keyword/keyphrase -> number of subjects it was extracted from.
keyword_counter = Counter()

for email in emails:
    subject = email["subject"]
    if not subject:
        # NULL or empty subject: nothing to extract.
        continue

    try:
        keywords = kw_model.extract_keywords(
            subject,
            keyphrase_ngram_range=(1, 2),
            stop_words="english",
            top_n=5,
        )
        # Each entry is indexed at [0] for the keyword text; the rest
        # (presumably a relevance score) is ignored.
        keyword_counter.update(kw[0].lower() for kw in keywords)
    except Exception as e:
        # Best effort: one bad subject must not abort the whole analysis.
        print(f"❌ Error processing email ID {email['id']}: {e}")

# === Output top missing keywords ===
print("\n📊 Top keywords in unlabeled emails:")
for word, count in keyword_counter.most_common(30):
    print(f"{word}: {count}")