Email-Agent/Obsolete/cleaner.py
milo 89052f53fd First push
from FORK client
2025-05-06 11:13:15 -04:00

97 lines
3.2 KiB
Python

import mysql.connector
import json
import re
import spacy
from bs4 import BeautifulSoup
from datetime import datetime
# === Load spaCy model ===
nlp = spacy.load("en_core_web_sm")
# === Logging helper ===
def log_event(cursor, level, source, message):
cursor.execute(
"INSERT INTO logs (level, source, message) VALUES (%s, %s, %s)",
(level, source, message)
)
# === Extract all links from body ===
def extract_links(text):
return re.findall(r'https?://[^\s<>()"]+', text)
# === Extract unsubscribe links ===
def extract_unsubscribe_link(text):
# Match links that contain the word "unsubscribe"
matches = re.findall(r'(https?://[^\s()"]*unsubscribe[^\s()"]*)', text, re.IGNORECASE)
if matches:
return matches[0] # Return the first match
return None
# === Clean email body ===
def clean_body(body):
soup = BeautifulSoup(body, "html.parser")
return soup.get_text(separator=' ', strip=True)
# === Main cleaning logic ===
def clean_emails():
conn = mysql.connector.connect(
host="localhost",
user="emailuser",
password="miguel33020",
database="emailassistant"
)
cursor = conn.cursor(dictionary=True)
cursor.execute("SELECT * FROM emails WHERE body IS NOT NULL")
emails = cursor.fetchall()
for email in emails:
email_id = email["id"]
body = email["body"]
cleaned_body = clean_body(body)
links = extract_links(cleaned_body)
unsubscribe_link = extract_unsubscribe_link(cleaned_body)
# Attempt to parse attachments
attachments_data = None
if email.get("attachments"):
try:
attachments_data = json.loads(email["attachments"])
except json.JSONDecodeError:
try:
# Quick fix: replace single quotes with double quotes
attachments_data = json.loads(email["attachments"].replace("'", '"'))
log_event(cursor, "WARNING", "cleaner", f"Auto-corrected JSON in attachments (email ID {email_id})")
except Exception as e2:
log_event(cursor, "ERROR", "cleaner", f"Attachment parse failed (ID {email_id}): {str(e2)}")
attachments_data = None
# Update database
try:
cursor.execute("""
UPDATE emails
SET body = %s,
links = %s,
unsubscribe_data = %s,
attachments = %s
WHERE id = %s
""", (
cleaned_body,
json.dumps(links),
unsubscribe_link,
json.dumps(attachments_data) if attachments_data else None,
email_id
))
conn.commit()
print(f"✅ Cleaned email {email_id}")
log_event(cursor, "INFO", "cleaner", f"Successfully cleaned email ID {email_id}")
except Exception as e:
print(f"❌ Error updating email {email_id}: {e}")
log_event(cursor, "ERROR", "cleaner", f"DB update failed for email ID {email_id}: {str(e)}")
cursor.close()
conn.close()
if __name__ == "__main__":
clean_emails()