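"""Parse saved HTML pages of crypto token sites and emit one JSON record per page.

For each .html file in HTML_DIR, the script heuristically extracts a token
name, ticker symbol, and price, and writes the result to DATASET_DIR as a
JSON file with the same base name.
"""
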
import os
import json
import re

from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize

# Ensure NLTK resources are available
nltk.download("punkt")
nltk.download("punkt_tab")  # newer NLTK releases (>= 3.9) resolve word_tokenize via "punkt_tab"

# Directory where HTML files are stored
HTML_DIR = "tests/"
# Directory where parsed JSON datasets will be saved
DATASET_DIR = "data/pytorch_dataset/"

# Ensure dataset directory exists
os.makedirs(DATASET_DIR, exist_ok=True)

# List of words to ignore (common on crypto pages but not token names)
IGNORE_WORDS = {"API", "Swap", "Powerful", "Gasless", "Dashboard", "Wallet", "Token", "Trade"}

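# For example, a landing-page headline like "Powerful Gasless Swap API" would
# otherwise be picked up as a symbol candidate ("API") by the symbol filter in
# extract_token_data() below.
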
def extract_token_data(html_content):
    """Extracts token data (name, symbol, price, market cap) from structured HTML elements or metadata."""
    soup = BeautifulSoup(html_content, "html.parser")
    dataset = []

    # Extract token name from <title> or <meta> tags
    title = soup.find("title")
    meta_description = soup.find("meta", attrs={"name": "description"})

    token_name = title.get_text(strip=True) if title else "N/A"
    if meta_description:
        meta_text = meta_description.get("content", "")
        token_name = meta_text.split("|")[0].strip()  # Often, sites use "Token Name | Description"

    # Extract the remaining page text and tokenize it for filtering
    full_text = soup.get_text(separator=" ", strip=True)
    words = word_tokenize(full_text)

    # Find potential ticker symbols (uppercase, short, not a known non-symbol word)
    possible_symbols = [word for word in words if word.isupper() and len(word) <= 5 and word not in IGNORE_WORDS]
    symbol = possible_symbols[0] if possible_symbols else "N/A"

    # Extract prices. word_tokenize splits "$1.23" into "$" and "1.23", so match
    # prices against the raw text rather than the token list.
    possible_prices = re.findall(r"\$\s?\d[\d,]*(?:\.\d+)?|\d[\d,]*(?:\.\d+)?\s?USD", full_text)
    price = possible_prices[0] if possible_prices else "N/A"

    # Store results
    dataset.append({
        "name": token_name,
        "symbol": symbol,
        "price": price,
        "market_cap": "N/A"  # Not reliably recoverable from raw page text
    })

    return dataset

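# Hypothetical example: for a page whose <meta name="description"> content is
# "PepeCoin | The fastest meme token" and whose body mentions "PEPE" and
# "$0.0012", extract_token_data() returns
# [{"name": "PepeCoin", "symbol": "PEPE", "price": "$0.0012", "market_cap": "N/A"}]
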
def parse_html_files():
    """Parses all HTML files in the tests/ directory and saves JSON dataset."""
    for filename in os.listdir(HTML_DIR):
        if filename.endswith(".html"):
            file_path = os.path.join(HTML_DIR, filename)

            with open(file_path, "r", encoding="utf-8") as file:
                html_content = file.read()
            token_data = extract_token_data(html_content)

            json_filename = os.path.splitext(filename)[0] + ".json"
            json_path = os.path.join(DATASET_DIR, json_filename)

            with open(json_path, "w", encoding="utf-8") as json_file:
                json.dump(token_data, json_file, indent=4)

            print(f"✅ Extracted data from {filename} → Saved: {json_filename}")

if __name__ == "__main__":
    print("🚀 Parsing HTML files to generate PyTorch dataset...")
    parse_html_files()
    print("✅ Dataset generation complete! JSON files saved in data/pytorch_dataset/")

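# --- Usage sketch (illustrative, not executed by this script) ---
# A minimal sketch of how the emitted JSON files could back a PyTorch dataset,
# assuming torch is installed; the TokenDataset name and flat record layout are
# assumptions, not part of this file:
#
#   from torch.utils.data import Dataset
#
#   class TokenDataset(Dataset):
#       """Collects every JSON record written by parse_html_files()."""
#
#       def __init__(self, data_dir=DATASET_DIR):
#           self.records = []
#           for name in sorted(os.listdir(data_dir)):
#               if name.endswith(".json"):
#                   with open(os.path.join(data_dir, name), encoding="utf-8") as f:
#                       self.records.extend(json.load(f))
#
#       def __len__(self):
#           return len(self.records)
#
#       def __getitem__(self, idx):
#           return self.records[idx]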