# Crypto-Scraper/tests/parser_pytorch_dataset.py

import os
import re
import json
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize

# Ensure NLTK tokenizer data is available ("punkt_tab" is required by newer NLTK releases)
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
# Directory where HTML files are stored
HTML_DIR = "tests/"

# Directory where parsed JSON datasets will be saved
DATASET_DIR = "data/pytorch_dataset/"

# Ensure the dataset directory exists
os.makedirs(DATASET_DIR, exist_ok=True)

# Words to ignore (common on crypto pages but not token names)
IGNORE_WORDS = {"API", "Swap", "Powerful", "Gasless", "Dashboard", "Wallet", "Token", "Trade"}

def extract_token_data(html_content):
    """Extract token data (name, symbol, price, market cap) from structured HTML elements or metadata."""
    soup = BeautifulSoup(html_content, "html.parser")
    dataset = []

    # Extract the token name from the <title> or <meta name="description"> tag
    title = soup.find("title")
    meta_description = soup.find("meta", attrs={"name": "description"})
    token_name = title.get_text(strip=True) if title else "N/A"
    if meta_description:
        meta_text = meta_description.get("content", "")
        token_name = meta_text.split("|")[0].strip()  # Sites often use "Token Name | Description"

    # Extract the remaining visible text and tokenize it
    full_text = soup.get_text(separator=" ", strip=True)
    words = word_tokenize(full_text)

    # Find potential ticker symbols (short, all-uppercase tokens not in the ignore list)
    possible_symbols = [word for word in words if word.isupper() and len(word) <= 5 and word not in IGNORE_WORDS]
    symbol = possible_symbols[0] if possible_symbols else "N/A"

    # Extract prices: word_tokenize splits "$" from the digits, so match
    # "$1.23"-style strings in the raw text rather than token by token
    possible_prices = re.findall(r"\$\s?\d[\d,]*(?:\.\d+)?", full_text)
    if not possible_prices:
        # Fall back to tokens with a "USD" suffix, e.g. "1.23USD"
        possible_prices = [word for word in words if word.endswith("USD") and word != "USD"]
    price = possible_prices[0] if possible_prices else "N/A"

    # Store the result
    dataset.append({
        "name": token_name,
        "symbol": symbol,
        "price": price,
        "market_cap": "N/A"
    })
    return dataset
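
# Example (hypothetical HTML, for illustration only):
#   extract_token_data('<html><head><title>FooCoin</title></head>'
#                      '<body>FOO trades at $1.23</body></html>')
# would return roughly:
#   [{"name": "FooCoin", "symbol": "FOO", "price": "$1.23", "market_cap": "N/A"}]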

def parse_html_files():
    """Parse all HTML files in the tests/ directory and save one JSON dataset per file."""
    for filename in os.listdir(HTML_DIR):
        if filename.endswith(".html"):
            file_path = os.path.join(HTML_DIR, filename)
            with open(file_path, "r", encoding="utf-8") as file:
                html_content = file.read()
            token_data = extract_token_data(html_content)
            json_filename = os.path.splitext(filename)[0] + ".json"
            json_path = os.path.join(DATASET_DIR, json_filename)
            with open(json_path, "w", encoding="utf-8") as json_file:
                json.dump(token_data, json_file, indent=4)
            print(f"✅ Extracted data from {filename} → Saved: {json_filename}")

if __name__ == "__main__":
    print("🚀 Parsing HTML files to generate PyTorch dataset...")
    parse_html_files()
    print(f"✅ Dataset generation complete! JSON files saved in {DATASET_DIR}")