# ml/preprocessing.py
import pandas as pd
import numpy as np


def _read_ohlcv(path):
    """Read a header-less OHLCV table from ``path`` into a DataFrame."""
    columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
    # Paths ending in .csv go through read_csv; everything else (including
    # file-like objects) keeps the original read_excel behaviour.
    if str(path).lower().endswith('.csv'):
        return pd.read_csv(path, header=None, names=columns)
    return pd.read_excel(path, header=None, names=columns)


def _rsi(close, period=14):
    """Return the simple-moving-average RSI of ``close`` on a 0-100 scale."""
    delta = close.diff()
    avg_gain = delta.clip(lower=0).rolling(period).mean()
    avg_loss = (-delta).clip(lower=0).rolling(period).mean()
    total = avg_gain + avg_loss
    # 100 * gain / (gain + loss) is algebraically 100 - 100 / (1 + gain / loss)
    # but stays finite when there are no losses (-> 100). A completely flat
    # window (gain == loss == 0) is reported as the neutral value 50.
    # Warm-up rows keep their NaN because NaN != 0 evaluates to True.
    return (100 * avg_gain / total).where(total != 0, 50.0)


def load_and_preprocess_data(path, *, future_window=4, threshold=0.001):
    """Load OHLCV bars, resample to 15 minutes and build features + target.

    Args:
        path: Location of a header-less table whose columns are, in order,
            timestamp, open, high, low, close, volume. Paths ending in
            ``.csv`` are read with ``pd.read_csv``; anything else is passed
            to ``pd.read_excel``.
        future_window: Number of 15-minute bars to look ahead when building
            the target. Must be at least 1.
        threshold: Fractional rise (0.001 == 0.1%) that the highest close in
            the look-ahead window must exceed for the target to be 1.

    Returns:
        A ``(features, target)`` tuple: ``features`` is a DataFrame with the
        18 feature columns listed at the end of this function, ``target`` is
        an integer Series of 0/1 labels on the same index.

    Raises:
        ValueError: If ``future_window`` is smaller than 1.
    """
    if future_window < 1:
        raise ValueError('future_window must be >= 1')

    df = _read_ohlcv(path)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)

    # Resample to 15-minute intervals; bins without any bars have NaN prices
    # and are removed by dropna().
    df = df.resample('15min').agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
    }).dropna()

    close = df['close']

    # Trend / momentum features
    df['sma_10'] = close.rolling(10).mean()
    df['sma_30'] = close.rolling(30).mean()
    df['rsi_14'] = _rsi(close, 14)
    df['momentum'] = close - close.shift(4)
    df['price_delta'] = close - df['open']
    df['vol_rolling'] = df['volume'].rolling(10).mean()

    # Bollinger %B: position of close inside the 2-sigma bands
    # (0 at the lower band, 1 at the upper band).
    rolling_mean = close.rolling(20).mean()
    rolling_std = close.rolling(20).std()
    lower_band = rolling_mean - 2 * rolling_std
    upper_band = rolling_mean + 2 * rolling_std
    df['bollinger_b'] = (close - lower_band) / (upper_band - lower_band)

    # MACD
    ema12 = close.ewm(span=12, adjust=False).mean()
    ema26 = close.ewm(span=26, adjust=False).mean()
    df['macd'] = ema12 - ema26

    # Timestamp-based features
    df['hour'] = df.index.hour
    df['weekday'] = df.index.weekday

    # Simulated portfolio balance and buy-in value (placeholders for now)
    df['balance'] = 10000.0  # Placeholder: could be dynamic in real-time
    df['buy_in'] = close.shift(1)  # Simulated buy price
    df['pnl_per_trade'] = close - df['buy_in']  # Fake PnL calc

    # Target: will the close rise by more than `threshold` within the next
    # `future_window` bars? Shifting back by the window and taking a trailing
    # rolling max yields max(close[t+1 .. t+future_window]) at row t.
    df['future_max'] = close.shift(-future_window).rolling(future_window).max()
    df['target'] = np.where(df['future_max'] > close * (1 + threshold), 1, 0)

    # Rows with incomplete look-back/look-ahead windows hold NaN; a division
    # by a (near-)zero denominator can yield +/-inf, which dropna() alone
    # would let through, so convert those to NaN first.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    # Define feature set
    features = [
        'open', 'high', 'low', 'close', 'volume',
        'sma_10', 'sma_30', 'rsi_14', 'momentum', 'price_delta',
        'vol_rolling', 'bollinger_b', 'macd', 'hour', 'weekday',
        'balance', 'buy_in', 'pnl_per_trade',
    ]
    return df[features], df['target']