# Crypto-Scraper/tests/parser_pytorch_dataset.py

import os
import re
import json
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize

# Ensure NLTK tokenizer data is available ("punkt_tab" is required by newer NLTK releases)
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
# Directory where HTML files are stored
HTML_DIR = "tests/"

# Directory where parsed JSON datasets will be saved
DATASET_DIR = "data/pytorch_dataset/"

# Ensure the dataset directory exists
os.makedirs(DATASET_DIR, exist_ok=True)

# Words to ignore (common on crypto pages but not token names)
IGNORE_WORDS = {"API", "Swap", "Powerful", "Gasless", "Dashboard", "Wallet", "Token", "Trade"}

def extract_token_data(html_content):
    """Extract token data (name, symbol, price, market cap) from structured HTML elements or metadata."""
    soup = BeautifulSoup(html_content, "html.parser")
    dataset = []

    # Extract the token name from the <title> or <meta name="description"> tag
    title = soup.find("title")
    meta_description = soup.find("meta", attrs={"name": "description"})
    token_name = title.get_text(strip=True) if title else "N/A"
    if meta_description:
        meta_text = meta_description.get("content", "")
        token_name = meta_text.split("|")[0].strip()  # Sites often use "Token Name | Description"

    # Extract the remaining visible text and tokenize it
    full_text = soup.get_text(separator=" ", strip=True)
    words = word_tokenize(full_text)

    # Find potential ticker symbols (short, all-uppercase tokens not in the ignore list)
    possible_symbols = [word for word in words if word.isupper() and len(word) <= 5 and word not in IGNORE_WORDS]
    symbol = possible_symbols[0] if possible_symbols else "N/A"

    # Extract prices: word_tokenize splits "$" from the digits, so match
    # "$1.23"-style strings in the raw text rather than token by token
    possible_prices = re.findall(r"\$\s?\d[\d,]*(?:\.\d+)?", full_text)
    if not possible_prices:
        # Fall back to tokens with a "USD" suffix, e.g. "1.23USD"
        possible_prices = [word for word in words if word.endswith("USD") and word != "USD"]
    price = possible_prices[0] if possible_prices else "N/A"

    # Store the result
    dataset.append({
        "name": token_name,
        "symbol": symbol,
        "price": price,
        "market_cap": "N/A"
    })
    return dataset
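
# Example (hypothetical HTML, for illustration only):
#   extract_token_data('<html><head><title>FooCoin</title></head>'
#                      '<body>FOO trades at $1.23</body></html>')
# would return roughly:
#   [{"name": "FooCoin", "symbol": "FOO", "price": "$1.23", "market_cap": "N/A"}]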

def parse_html_files():
    """Parse all HTML files in the tests/ directory and save one JSON dataset per file."""
    for filename in os.listdir(HTML_DIR):
        if filename.endswith(".html"):
            file_path = os.path.join(HTML_DIR, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                html_content = file.read()
            token_data = extract_token_data(html_content)
            json_filename = os.path.splitext(filename)[0] + ".json"
            json_path = os.path.join(DATASET_DIR, json_filename)
            with open(json_path, "w", encoding="utf-8") as json_file:
                json.dump(token_data, json_file, indent=4)
            print(f"✅ Extracted data from {filename} → Saved: {json_filename}")

if __name__ == "__main__":
    print("🚀 Parsing HTML files to generate PyTorch dataset...")
    parse_html_files()
    print(f"✅ Dataset generation complete! JSON files saved in {DATASET_DIR}")