97 lines
3.2 KiB
Python
97 lines
3.2 KiB
Python
import mysql.connector
|
|
import json
|
|
import re
|
|
import spacy
|
|
from bs4 import BeautifulSoup
|
|
from datetime import datetime
|
|
|
|
# === Load spaCy model ===
|
|
nlp = spacy.load("en_core_web_sm")
|
|
|
|
# === Logging helper ===
|
|
def log_event(cursor, level, source, message):
|
|
cursor.execute(
|
|
"INSERT INTO logs (level, source, message) VALUES (%s, %s, %s)",
|
|
(level, source, message)
|
|
)
|
|
|
|
# === Extract all links from body ===
|
|
def extract_links(text):
|
|
return re.findall(r'https?://[^\s<>()"]+', text)
|
|
|
|
# === Extract unsubscribe links ===
|
|
def extract_unsubscribe_link(text):
|
|
# Match links that contain the word "unsubscribe"
|
|
matches = re.findall(r'(https?://[^\s()"]*unsubscribe[^\s()"]*)', text, re.IGNORECASE)
|
|
if matches:
|
|
return matches[0] # Return the first match
|
|
return None
|
|
|
|
# === Clean email body ===
|
|
def clean_body(body):
|
|
soup = BeautifulSoup(body, "html.parser")
|
|
return soup.get_text(separator=' ', strip=True)
|
|
|
|
# === Main cleaning logic ===
|
|
def clean_emails():
|
|
conn = mysql.connector.connect(
|
|
host="localhost",
|
|
user="emailuser",
|
|
password="miguel33020",
|
|
database="emailassistant"
|
|
)
|
|
cursor = conn.cursor(dictionary=True)
|
|
|
|
cursor.execute("SELECT * FROM emails WHERE body IS NOT NULL")
|
|
emails = cursor.fetchall()
|
|
|
|
for email in emails:
|
|
email_id = email["id"]
|
|
body = email["body"]
|
|
|
|
cleaned_body = clean_body(body)
|
|
links = extract_links(cleaned_body)
|
|
unsubscribe_link = extract_unsubscribe_link(cleaned_body)
|
|
|
|
# Attempt to parse attachments
|
|
attachments_data = None
|
|
if email.get("attachments"):
|
|
try:
|
|
attachments_data = json.loads(email["attachments"])
|
|
except json.JSONDecodeError:
|
|
try:
|
|
# Quick fix: replace single quotes with double quotes
|
|
attachments_data = json.loads(email["attachments"].replace("'", '"'))
|
|
log_event(cursor, "WARNING", "cleaner", f"Auto-corrected JSON in attachments (email ID {email_id})")
|
|
except Exception as e2:
|
|
log_event(cursor, "ERROR", "cleaner", f"Attachment parse failed (ID {email_id}): {str(e2)}")
|
|
attachments_data = None
|
|
|
|
# Update database
|
|
try:
|
|
cursor.execute("""
|
|
UPDATE emails
|
|
SET body = %s,
|
|
links = %s,
|
|
unsubscribe_data = %s,
|
|
attachments = %s
|
|
WHERE id = %s
|
|
""", (
|
|
cleaned_body,
|
|
json.dumps(links),
|
|
unsubscribe_link,
|
|
json.dumps(attachments_data) if attachments_data else None,
|
|
email_id
|
|
))
|
|
conn.commit()
|
|
print(f"✅ Cleaned email {email_id}")
|
|
log_event(cursor, "INFO", "cleaner", f"Successfully cleaned email ID {email_id}")
|
|
except Exception as e:
|
|
print(f"❌ Error updating email {email_id}: {e}")
|
|
log_event(cursor, "ERROR", "cleaner", f"DB update failed for email ID {email_id}: {str(e)}")
|
|
|
|
cursor.close()
|
|
conn.close()
|
|
|
|
if __name__ == "__main__":
|
|
clean_emails()
|