import mysql.connector import json import re import spacy from bs4 import BeautifulSoup from datetime import datetime # === Load spaCy model === nlp = spacy.load("en_core_web_sm") # === Logging helper === def log_event(cursor, level, source, message): cursor.execute( "INSERT INTO logs (level, source, message) VALUES (%s, %s, %s)", (level, source, message) ) # === Extract all links from body === def extract_links(text): return re.findall(r'https?://[^\s<>()"]+', text) # === Extract unsubscribe links === def extract_unsubscribe_link(text): # Match links that contain the word "unsubscribe" matches = re.findall(r'(https?://[^\s()"]*unsubscribe[^\s()"]*)', text, re.IGNORECASE) if matches: return matches[0] # Return the first match return None # === Clean email body === def clean_body(body): soup = BeautifulSoup(body, "html.parser") return soup.get_text(separator=' ', strip=True) # === Main cleaning logic === def clean_emails(): conn = mysql.connector.connect( host="localhost", user="emailuser", password="miguel33020", database="emailassistant" ) cursor = conn.cursor(dictionary=True) cursor.execute("SELECT * FROM emails WHERE body IS NOT NULL") emails = cursor.fetchall() for email in emails: email_id = email["id"] body = email["body"] cleaned_body = clean_body(body) links = extract_links(cleaned_body) unsubscribe_link = extract_unsubscribe_link(cleaned_body) # Attempt to parse attachments attachments_data = None if email.get("attachments"): try: attachments_data = json.loads(email["attachments"]) except json.JSONDecodeError: try: # Quick fix: replace single quotes with double quotes attachments_data = json.loads(email["attachments"].replace("'", '"')) log_event(cursor, "WARNING", "cleaner", f"Auto-corrected JSON in attachments (email ID {email_id})") except Exception as e2: log_event(cursor, "ERROR", "cleaner", f"Attachment parse failed (ID {email_id}): {str(e2)}") attachments_data = None # Update database try: cursor.execute(""" UPDATE emails SET body = %s, links = %s, unsubscribe_data = %s, attachments = %s WHERE id = %s """, ( cleaned_body, json.dumps(links), unsubscribe_link, json.dumps(attachments_data) if attachments_data else None, email_id )) conn.commit() print(f"✅ Cleaned email {email_id}") log_event(cursor, "INFO", "cleaner", f"Successfully cleaned email ID {email_id}") except Exception as e: print(f"❌ Error updating email {email_id}: {e}") log_event(cursor, "ERROR", "cleaner", f"DB update failed for email ID {email_id}: {str(e)}") cursor.close() conn.close() if __name__ == "__main__": clean_emails()