import os
import json
import re

from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize

# Ensure NLTK resources are available (newer NLTK releases also require
# the "punkt_tab" resource for word_tokenize)
nltk.download("punkt")
nltk.download("punkt_tab")

# Directory where HTML files are stored
HTML_DIR = "tests/"

# Directory where parsed JSON datasets will be saved
DATASET_DIR = "data/pytorch_dataset/"

# Ensure dataset directory exists
os.makedirs(DATASET_DIR, exist_ok=True)

# Set of words to ignore (common in crypto pages but not token symbols)
IGNORE_WORDS = {"API", "Swap", "Powerful", "Gasless", "Dashboard", "Wallet", "Token", "Trade"}


def extract_token_data(html_content):
    """Extract token data (name, symbol, price, market cap) from structured HTML elements or metadata."""
    soup = BeautifulSoup(html_content, "html.parser")
    dataset = []

    # Extract the token name from the <title> or <meta name="description"> tag
    title = soup.find("title")
    meta_description = soup.find("meta", attrs={"name": "description"})

    token_name = title.get_text(strip=True) if title else "N/A"
    if meta_description:
        meta_text = meta_description.get("content", "")
        token_name = meta_text.split("|")[0].strip()  # Sites often use "Token Name | Description"

    # Extract the remaining page text and tokenize it
    full_text = soup.get_text(separator=" ", strip=True)
    words = word_tokenize(full_text)

    # Find potential ticker symbols (short, all-uppercase tokens), skipping common UI words
    possible_symbols = [word for word in words if word.isupper() and len(word) <= 5 and word not in IGNORE_WORDS]
    symbol = possible_symbols[0] if possible_symbols else "N/A"

    # Extract prices. word_tokenize splits "$100" into ["$", "100"], so match
    # price patterns against the raw text rather than against individual tokens.
    possible_prices = re.findall(r"\$\s?\d[\d,.]*|\d[\d,.]*\s?USD", full_text)
    price = possible_prices[0] if possible_prices else "N/A"

    # Store the extracted fields for this page
    dataset.append({
        "name": token_name,
        "symbol": symbol,
        "price": price,
        "market_cap": "N/A"
    })

    return dataset


def parse_html_files():
    """Parse all HTML files in the tests/ directory and save one JSON dataset per file."""
    for filename in os.listdir(HTML_DIR):
        if filename.endswith(".html"):
            file_path = os.path.join(HTML_DIR, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                html_content = file.read()

            token_data = extract_token_data(html_content)

            json_filename = os.path.splitext(filename)[0] + ".json"
            json_path = os.path.join(DATASET_DIR, json_filename)
            with open(json_path, "w", encoding="utf-8") as json_file:
                json.dump(token_data, json_file, indent=4)

            print(f"✅ Extracted data from {filename} → Saved: {json_filename}")


if __name__ == "__main__":
    print("🚀 Parsing HTML files to generate PyTorch dataset...")
    parse_html_files()
    print("✅ Dataset generation complete! JSON files saved in data/pytorch_dataset/")
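

# ---------------------------------------------------------------------------
# Optional follow-up (a minimal sketch, not part of the original pipeline):
# the JSON files written above can be wrapped in a torch.utils.data.Dataset
# so a DataLoader can consume them. The class name TokenJSONDataset and the
# string-passthrough __getitem__ are illustrative assumptions; a real
# pipeline would tokenize/numericalize the fields into tensors first.
import glob

from torch.utils.data import Dataset


class TokenJSONDataset(Dataset):
    """Loads the per-page JSON records produced by parse_html_files()."""

    def __init__(self, dataset_dir=DATASET_DIR):
        self.records = []
        for json_path in glob.glob(os.path.join(dataset_dir, "*.json")):
            with open(json_path, "r", encoding="utf-8") as f:
                # Each file holds a list of token dicts; flatten them all.
                self.records.extend(json.load(f))

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        # Returns the raw record dict; the default DataLoader collate_fn
        # will batch these into a dict of lists of strings.
        return self.records[idx]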