|
|
"""
|
|
|
Romeo V8 training script - Super Ensemble with Multi-Algorithm Collaboration
|
|
|
|
|
|
Advanced ensemble model that combines 10+ different algorithms working together
|
|
|
for maximum accuracy and efficiency. Features stacking ensemble, dynamic weighting,
|
|
|
confidence calibration, and cross-validation ensemble.
|
|
|
|
|
|
Key Features:
|
|
|
- 10+ Base Algorithms: XGBoost, LightGBM, CatBoost, RandomForest, ExtraTrees,
|
|
|
Neural Network, SVM, KNN, Logistic Regression, Naive Bayes
|
|
|
- Stacking Ensemble: Meta-learner learns from base learner predictions
|
|
|
- Dynamic Weighting: Real-time weight adjustment based on performance
|
|
|
- Confidence Calibration: Probability calibration for better fusion
|
|
|
- Cross-Validation Ensemble: Multiple CV folds combined
|
|
|
- Advanced Feature Engineering: Algorithm-specific feature optimization
|
|
|
|
|
|
Modes:
|
|
|
- fast (default): smaller models, fewer algorithms, for smoke testing
|
|
|
- full: all algorithms, larger models, comprehensive training
|
|
|
"""
|
|
|
|
|
|
import argparse
|
|
|
import os
|
|
|
import json
|
|
|
import time
|
|
|
import warnings
|
|
|
warnings.filterwarnings('ignore')
|
|
|
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
from sklearn.model_selection import train_test_split, StratifiedKFold
|
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
from sklearn.decomposition import PCA
|
|
|
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
|
|
|
from sklearn.calibration import CalibratedClassifierCV
|
|
|
|
|
|
|
|
|
import xgboost as xgb
|
|
|
import lightgbm as lgb
|
|
|
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
|
|
|
from sklearn.svm import SVC
|
|
|
from sklearn.neighbors import KNeighborsClassifier
|
|
|
from sklearn.linear_model import LogisticRegression
|
|
|
from sklearn.naive_bayes import GaussianNB
|
|
|
|
|
|
|
|
|
import tensorflow as tf
|
|
|
from tensorflow import keras
|
|
|
|
|
|
|
|
|
from sklearn.ensemble import StackingClassifier
|
|
|
import joblib
|
|
|
from scipy.optimize import minimize
|
|
|
from scipy.special import softmax
|
|
|
|
|
|
try:
|
|
|
import catboost as cb
|
|
|
CATBOOST_PRESENT = True
|
|
|
except Exception:
|
|
|
CATBOOST_PRESENT = False
|
|
|
print("CatBoost not available, will skip CatBoost algorithm")
|
|
|
|
|
|
try:
|
|
|
import talib
|
|
|
TALIB_PRESENT = True
|
|
|
except Exception:
|
|
|
TALIB_PRESENT = False
|
|
|
|
|
|
|
|
|
class SumAxis1Layer(keras.layers.Layer):
|
|
|
def call(self, inputs):
|
|
|
return keras.backend.sum(inputs, axis=1)
|
|
|
|
|
|
|
|
|
def sma(series, window):
|
|
|
return series.rolling(window).mean()
|
|
|
|
|
|
|
|
|
def ema(series, span):
|
|
|
return series.ewm(span=span, adjust=False).mean()
|
|
|
|
|
|
|
|
|
def rsi(series, period=14):
|
|
|
delta = series.diff()
|
|
|
up = delta.clip(lower=0)
|
|
|
down = -1 * delta.clip(upper=0)
|
|
|
ma_up = up.ewm(alpha=1/period, adjust=False).mean()
|
|
|
ma_down = down.ewm(alpha=1/period, adjust=False).mean()
|
|
|
rs = ma_up / (ma_down + 1e-12)
|
|
|
return 100 - (100 / (1 + rs))
|
|
|
|
|
|
|
|
|
class SuperEnsembleFeatureEngineer:
|
|
|
def __init__(self):
|
|
|
self.scaler = StandardScaler()
|
|
|
self.pca = PCA(n_components=0.95)
|
|
|
|
|
|
def add_technical_indicators(self, df):
|
|
|
"""Enhanced technical indicators optimized for multiple algorithms"""
|
|
|
if TALIB_PRESENT:
|
|
|
df['SMA_20'] = talib.SMA(df['Close'], timeperiod=20)
|
|
|
df['SMA_50'] = talib.SMA(df['Close'], timeperiod=50)
|
|
|
df['EMA_12'] = talib.EMA(df['Close'], timeperiod=12)
|
|
|
df['EMA_26'] = talib.EMA(df['Close'], timeperiod=26)
|
|
|
df['RSI'] = talib.RSI(df['Close'], timeperiod=14)
|
|
|
macd, macdsig, macdhist = talib.MACD(df['Close'], fastperiod=12, slowperiod=26, signalperiod=9)
|
|
|
df['MACD'] = macd
|
|
|
df['MACDSignal'] = macdsig
|
|
|
upper, mid, lower = talib.BBANDS(df['Close'], timeperiod=20)
|
|
|
df['BB_Upper'] = upper
|
|
|
df['BB_Middle'] = mid
|
|
|
df['BB_Lower'] = lower
|
|
|
df['ATR'] = talib.ATR(df['High'], df['Low'], df['Close'], timeperiod=14)
|
|
|
df['MFI'] = talib.MFI(df['High'], df['Low'], df['Close'], df['Volume'], timeperiod=14)
|
|
|
else:
|
|
|
df['SMA_20'] = sma(df['Close'], 20)
|
|
|
df['SMA_50'] = sma(df['Close'], 50)
|
|
|
df['EMA_12'] = ema(df['Close'], 12)
|
|
|
df['EMA_26'] = ema(df['Close'], 26)
|
|
|
df['RSI'] = rsi(df['Close'], 14)
|
|
|
df['MACD'] = df['Close'].ewm(span=12, adjust=False).mean() - df['Close'].ewm(span=26, adjust=False).mean()
|
|
|
df['MACDSignal'] = df['MACD'].ewm(span=9, adjust=False).mean()
|
|
|
rolling_std = df['Close'].rolling(20).std()
|
|
|
df['BB_Middle'] = df['Close'].rolling(20).mean()
|
|
|
df['BB_Upper'] = df['BB_Middle'] + 2 * rolling_std
|
|
|
df['BB_Lower'] = df['BB_Middle'] - 2 * rolling_std
|
|
|
df['ATR'] = (df['High'] - df['Low']).rolling(14).mean()
|
|
|
df['MFI'] = 50
|
|
|
|
|
|
|
|
|
df['Volatility'] = df['Close'].pct_change().rolling(20).std()
|
|
|
df['High_Low_Ratio'] = (df['High'] - df['Low']) / (df['Close'] + 1e-12)
|
|
|
df['Close_Open_Ratio'] = (df['Close'] - df['Open']) / (df['Open'] + 1e-12)
|
|
|
df['ROC'] = df['Close'].pct_change(periods=10)
|
|
|
df['Momentum'] = df['Close'] - df['Close'].shift(10)
|
|
|
|
|
|
|
|
|
df['Volume_MA'] = df['Volume'].rolling(20).mean()
|
|
|
df['Volume_Ratio'] = df['Volume'] / (df['Volume_MA'] + 1e-12)
|
|
|
|
|
|
|
|
|
df['Price_Change'] = df['Close'].pct_change()
|
|
|
df['High_Low_Spread'] = (df['High'] - df['Low']) / df['Close']
|
|
|
df['Body_Size'] = abs(df['Close'] - df['Open']) / df['Close']
|
|
|
df['Upper_Wick'] = (df['High'] - np.maximum(df['Open'], df['Close'])) / df['Close']
|
|
|
df['Lower_Wick'] = (np.minimum(df['Open'], df['Close']) - df['Low']) / df['Close']
|
|
|
|
|
|
|
|
|
df['Trend_Up'] = (df['EMA_12'] > df['EMA_26']).astype(int)
|
|
|
df['Trend_Down'] = (df['EMA_12'] < df['EMA_26']).astype(int)
|
|
|
df['RSI_Not_Overbought'] = (df['RSI'] < 70).astype(int)
|
|
|
df['RSI_Not_Oversold'] = (df['RSI'] > 30).astype(int)
|
|
|
df['MACD_Positive'] = (df['MACD'] > df['MACDSignal']).astype(int)
|
|
|
df['Close_Above_BB_Middle'] = (df['Close'] > df['BB_Middle']).astype(int)
|
|
|
|
|
|
return df
|
|
|
|
|
|
def add_quantum_features(self, df):
|
|
|
"""Advanced quantum-inspired features for super ensemble"""
|
|
|
pct = df['Close'].pct_change().fillna(0)
|
|
|
vol_pct = df['Close'].pct_change().rolling(20).std().fillna(0)
|
|
|
|
|
|
|
|
|
df['Quantum_Entropy'] = - (pct * np.log(np.abs(pct) + 1e-10)).rolling(20).sum().fillna(0)
|
|
|
df['Quantum_Phase'] = np.angle(pct + 1j * vol_pct)
|
|
|
df['Quantum_Amplitude'] = np.abs(pct + 1j * vol_pct)
|
|
|
df['Wavelet_Energy'] = df['Close'].rolling(20).var().fillna(0)
|
|
|
|
|
|
|
|
|
df['Tree_Feature_1'] = df['RSI'] * df['MACD']
|
|
|
df['NN_Feature_1'] = np.sin(df['Quantum_Phase'])
|
|
|
df['Linear_Feature_1'] = df['Momentum'] / (df['ATR'] + 1e-10)
|
|
|
df['Distance_Feature_1'] = df['Volatility'] ** 2
|
|
|
|
|
|
|
|
|
df['Fractal_Dimension'] = (df['High'] - df['Low']).rolling(20).std().fillna(0)
|
|
|
df['Fractal_Efficiency'] = (df['Close'] - df['Close'].shift(20)).abs() / ((df['High'] - df['Low']).rolling(20).sum() + 1e-10)
|
|
|
|
|
|
|
|
|
df['Order_Flow'] = (df['Close'] - df['Open']) * df['Volume']
|
|
|
df['Market_Depth'] = df['Volume'] / (df['High_Low_Spread'] + 1e-10)
|
|
|
|
|
|
return df
|
|
|
|
|
|
def process(self, df):
|
|
|
df = df.copy()
|
|
|
df = self.add_technical_indicators(df)
|
|
|
df = self.add_quantum_features(df)
|
|
|
df = df.fillna(method='bfill').fillna(method='ffill').fillna(0)
|
|
|
|
|
|
exclude = ['Datetime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
|
|
|
feature_cols = [c for c in df.columns if c not in exclude and not c.startswith('target')]
|
|
|
if not feature_cols:
|
|
|
raise RuntimeError('No features found after engineering')
|
|
|
|
|
|
X = df[feature_cols].values
|
|
|
Xs = self.scaler.fit_transform(X)
|
|
|
pca_feat = self.pca.fit_transform(Xs)
|
|
|
|
|
|
for i in range(pca_feat.shape[1]):
|
|
|
df[f'PCA_{i}'] = pca_feat[:, i]
|
|
|
|
|
|
final_features = feature_cols + [f'PCA_{i}' for i in range(pca_feat.shape[1])]
|
|
|
return df, final_features
|
|
|
|
|
|
|
|
|
def create_base_learners(mode='fast'):
|
|
|
"""Create all base learners for the super ensemble"""
|
|
|
|
|
|
if mode == 'fast':
|
|
|
|
|
|
estimators = [
|
|
|
('xgb', xgb.XGBClassifier(n_estimators=100, max_depth=4, learning_rate=0.1, use_label_encoder=False, eval_metric='logloss')),
|
|
|
('lgb', lgb.LGBMClassifier(n_estimators=100, max_depth=4, learning_rate=0.1, num_leaves=16)),
|
|
|
('rf', RandomForestClassifier(n_estimators=50, max_depth=6, random_state=42)),
|
|
|
('et', ExtraTreesClassifier(n_estimators=50, max_depth=6, random_state=42)),
|
|
|
('svm', SVC(probability=True, C=1.0, kernel='rbf', random_state=42)),
|
|
|
('knn', KNeighborsClassifier(n_neighbors=5, weights='distance')),
|
|
|
('lr', LogisticRegression(random_state=42, max_iter=1000)),
|
|
|
('nb', GaussianNB()),
|
|
|
]
|
|
|
|
|
|
if CATBOOST_PRESENT:
|
|
|
estimators.append(('cb', cb.CatBoostClassifier(iterations=100, depth=4, learning_rate=0.1, verbose=False)))
|
|
|
|
|
|
|
|
|
nn_model = None
|
|
|
|
|
|
estimators.append(('nn', nn_model))
|
|
|
|
|
|
else:
|
|
|
|
|
|
estimators = [
|
|
|
('xgb', xgb.XGBClassifier(n_estimators=500, max_depth=8, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, use_label_encoder=False, eval_metric='logloss')),
|
|
|
('lgb', lgb.LGBMClassifier(n_estimators=500, max_depth=8, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, num_leaves=64)),
|
|
|
('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),
|
|
|
('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42)),
|
|
|
('svm', SVC(probability=True, C=10.0, kernel='rbf', gamma='scale', random_state=42)),
|
|
|
('knn', KNeighborsClassifier(n_neighbors=10, weights='distance', algorithm='auto')),
|
|
|
('lr', LogisticRegression(random_state=42, max_iter=2000, C=1.0)),
|
|
|
('nb', GaussianNB()),
|
|
|
]
|
|
|
|
|
|
if CATBOOST_PRESENT:
|
|
|
estimators.append(('cb', cb.CatBoostClassifier(iterations=500, depth=8, learning_rate=0.05, verbose=False)))
|
|
|
|
|
|
|
|
|
nn_model = None
|
|
|
|
|
|
estimators.append(('nn', nn_model))
|
|
|
|
|
|
return estimators
|
|
|
|
|
|
|
|
|
def create_meta_learner():
|
|
|
"""Create the meta-learner for stacking ensemble"""
|
|
|
return LogisticRegression(random_state=42, max_iter=1000, C=1.0)
|
|
|
|
|
|
|
|
|
class KerasClassifierWrapper:
|
|
|
"""Wrapper to make Keras models compatible with sklearn calibration"""
|
|
|
def __init__(self, keras_model):
|
|
|
self.keras_model = keras_model
|
|
|
|
|
|
def fit(self, X, y):
|
|
|
|
|
|
return self
|
|
|
|
|
|
def predict_proba(self, X):
|
|
|
|
|
|
proba_pos = self.keras_model.predict(X, verbose=0).ravel()
|
|
|
proba_neg = 1 - proba_pos
|
|
|
return np.column_stack([proba_neg, proba_pos])
|
|
|
|
|
|
def predict(self, X):
|
|
|
proba = self.predict_proba(X)
|
|
|
return (proba[:, 1] > 0.5).astype(int)
|
|
|
|
|
|
|
|
|
def calibrate_probabilities(models, X_train, y_train, X_val, y_val):
|
|
|
"""Calibrate probabilities for better ensemble performance"""
|
|
|
calibrated_models = {}
|
|
|
|
|
|
for name, model in models:
|
|
|
try:
|
|
|
|
|
|
if name == 'nn':
|
|
|
model = KerasClassifierWrapper(model)
|
|
|
|
|
|
|
|
|
calibrated = CalibratedClassifierCV(model, method='isotonic', cv=3)
|
|
|
calibrated.fit(X_train, y_train)
|
|
|
calibrated_models[name] = calibrated
|
|
|
print(f"Calibrated {name}")
|
|
|
except Exception as e:
|
|
|
print(f"Could not calibrate {name}: {e}")
|
|
|
calibrated_models[name] = model
|
|
|
|
|
|
return calibrated_models
|
|
|
|
|
|
|
|
|
def dynamic_weight_optimizer(weights, model_predictions, y_true):
|
|
|
"""Optimize weights for dynamic ensemble"""
|
|
|
w = np.array(weights)
|
|
|
if np.sum(w) <= 0:
|
|
|
return 1.0
|
|
|
w = w / np.sum(w)
|
|
|
|
|
|
|
|
|
ensemble_pred = np.zeros_like(model_predictions[0])
|
|
|
for i, pred in enumerate(model_predictions):
|
|
|
ensemble_pred += w[i] * pred
|
|
|
|
|
|
ensemble_pred = (ensemble_pred > 0.5).astype(int)
|
|
|
return -accuracy_score(y_true, ensemble_pred)
|
|
|
|
|
|
|
|
|
def create_cross_validation_ensemble(estimators, X, y, n_folds=5):
|
|
|
"""Create cross-validation ensemble for robustness"""
|
|
|
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
|
|
|
cv_predictions = {}
|
|
|
cv_models = {}
|
|
|
|
|
|
for name, estimator in estimators:
|
|
|
cv_predictions[name] = []
|
|
|
cv_models[name] = []
|
|
|
|
|
|
for train_idx, val_idx in skf.split(X, y):
|
|
|
X_fold_train, X_fold_val = X[train_idx], X[val_idx]
|
|
|
y_fold_train, y_fold_val = y[train_idx], y[val_idx]
|
|
|
|
|
|
try:
|
|
|
model = estimator.__class__(**estimator.get_params()) if hasattr(estimator, 'get_params') else estimator
|
|
|
if name == 'nn':
|
|
|
|
|
|
model.fit(X_fold_train, y_fold_train, epochs=50, batch_size=32, verbose=0,
|
|
|
validation_data=(X_fold_val, y_fold_val))
|
|
|
else:
|
|
|
model.fit(X_fold_train, y_fold_train)
|
|
|
|
|
|
cv_models[name].append(model)
|
|
|
|
|
|
if hasattr(model, 'predict_proba'):
|
|
|
pred = model.predict_proba(X_fold_val)[:, 1]
|
|
|
else:
|
|
|
pred = model.predict(X_fold_val).ravel()
|
|
|
if pred.max() > 1 or pred.min() < 0:
|
|
|
pred = (pred - pred.min()) / (pred.max() - pred.min())
|
|
|
|
|
|
cv_predictions[name].append(pred)
|
|
|
|
|
|
except Exception as e:
|
|
|
print(f"Error training {name} in CV fold: {e}")
|
|
|
cv_predictions[name].append(np.zeros(len(val_idx)))
|
|
|
|
|
|
return cv_models, cv_predictions
|
|
|
|
|
|
|
|
|
def train_romeo_v8(data_path, timeframe='15m', mode='fast'):
|
|
|
start = time.time()
|
|
|
|
|
|
|
|
|
df = pd.read_csv(data_path, parse_dates=['Datetime'])
|
|
|
df = df.sort_values('Datetime').reset_index(drop=True)
|
|
|
|
|
|
if 'target' not in df.columns:
|
|
|
df['target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
|
|
|
|
|
|
|
|
|
eng = SuperEnsembleFeatureEngineer()
|
|
|
df_proc, features = eng.process(df)
|
|
|
X = df_proc[features].values
|
|
|
y = df['target'].values
|
|
|
|
|
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=42)
|
|
|
|
|
|
print(f"Training Romeo V8 Super Ensemble ({mode}) with {len(features)} features")
|
|
|
|
|
|
|
|
|
base_estimators = create_base_learners(mode)
|
|
|
print(f"Created {len(base_estimators)} base learners")
|
|
|
|
|
|
|
|
|
print("Creating cross-validation ensemble...")
|
|
|
cv_models, cv_predictions = create_cross_validation_ensemble(base_estimators, X_train, y_train, n_folds=3)
|
|
|
|
|
|
|
|
|
trained_models = {}
|
|
|
model_predictions = []
|
|
|
|
|
|
for name, estimator in base_estimators:
|
|
|
try:
|
|
|
print(f"Training {name}...")
|
|
|
if name == 'nn':
|
|
|
|
|
|
|
|
|
sample_input = X_train[:1]
|
|
|
nn_model = keras.Sequential([
|
|
|
keras.layers.Input(shape=(sample_input.shape[1],)),
|
|
|
keras.layers.Dense(32, activation='relu'),
|
|
|
keras.layers.Dropout(0.2),
|
|
|
keras.layers.Dense(16, activation='relu'),
|
|
|
keras.layers.Dense(1, activation='sigmoid')
|
|
|
])
|
|
|
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
|
|
|
if mode == 'full':
|
|
|
|
|
|
nn_model = keras.Sequential([
|
|
|
keras.layers.Input(shape=(sample_input.shape[1],)),
|
|
|
keras.layers.Dense(128, activation='relu'),
|
|
|
keras.layers.BatchNormalization(),
|
|
|
keras.layers.Dropout(0.3),
|
|
|
keras.layers.Dense(64, activation='relu'),
|
|
|
keras.layers.BatchNormalization(),
|
|
|
keras.layers.Dropout(0.2),
|
|
|
keras.layers.Dense(32, activation='relu'),
|
|
|
keras.layers.Dense(1, activation='sigmoid')
|
|
|
])
|
|
|
nn_model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
|
|
|
nn_model.fit(X_train, y_train, epochs=100, batch_size=64, verbose=0, validation_split=0.1)
|
|
|
else:
|
|
|
nn_model.fit(X_train, y_train, epochs=20, batch_size=64, verbose=0, validation_split=0.1)
|
|
|
estimator = nn_model
|
|
|
else:
|
|
|
estimator.fit(X_train, y_train)
|
|
|
|
|
|
trained_models[name] = estimator
|
|
|
|
|
|
|
|
|
if hasattr(estimator, 'predict_proba'):
|
|
|
pred = estimator.predict_proba(X_train)[:, 1]
|
|
|
else:
|
|
|
pred = estimator.predict(X_train).ravel()
|
|
|
if pred.max() > 1 or pred.min() < 0:
|
|
|
pred = (pred - pred.min()) / (pred.max() - pred.min())
|
|
|
|
|
|
model_predictions.append(pred.reshape(-1, 1))
|
|
|
|
|
|
except Exception as e:
|
|
|
print(f"Error training {name}: {e}")
|
|
|
model_predictions.append(np.zeros((len(X_train), 1)))
|
|
|
|
|
|
|
|
|
X_meta = np.hstack(model_predictions)
|
|
|
|
|
|
|
|
|
print("Training meta-learner...")
|
|
|
meta_learner = create_meta_learner()
|
|
|
meta_learner.fit(X_meta, y_train)
|
|
|
|
|
|
|
|
|
print("Calibrating probabilities...")
|
|
|
calibrated_models = calibrate_probabilities(list(trained_models.items()), X_train, y_train, X_test, y_test)
|
|
|
|
|
|
|
|
|
print("Optimizing dynamic weights...")
|
|
|
n_models = len(trained_models)
|
|
|
init_weights = np.ones(n_models) / n_models
|
|
|
|
|
|
|
|
|
test_predictions = []
|
|
|
for name, model in calibrated_models.items():
|
|
|
if hasattr(model, 'predict_proba'):
|
|
|
pred = model.predict_proba(X_test)[:, 1]
|
|
|
else:
|
|
|
pred = model.predict(X_test).ravel()
|
|
|
test_predictions.append(pred)
|
|
|
|
|
|
try:
|
|
|
res = minimize(dynamic_weight_optimizer, init_weights, args=(test_predictions, y_test),
|
|
|
bounds=[(0.0, 1.0)] * n_models, method='SLSQP')
|
|
|
optimal_weights = res.x if res.success else init_weights
|
|
|
optimal_weights = optimal_weights / np.sum(optimal_weights)
|
|
|
except Exception as e:
|
|
|
print(f"Weight optimization failed: {e}")
|
|
|
optimal_weights = init_weights
|
|
|
|
|
|
print(f"Optimal weights: {dict(zip(trained_models.keys(), optimal_weights))}")
|
|
|
|
|
|
|
|
|
os.makedirs('../models_romeo_v8', exist_ok=True)
|
|
|
|
|
|
artifact = {
|
|
|
'models': trained_models,
|
|
|
'calibrated_models': calibrated_models,
|
|
|
'meta_learner': meta_learner,
|
|
|
'cv_models': cv_models,
|
|
|
'cv_predictions': cv_predictions,
|
|
|
'weights': optimal_weights.tolist(),
|
|
|
'features': features,
|
|
|
'scaler': eng.scaler,
|
|
|
'pca': eng.pca,
|
|
|
'super_ensemble_config': {
|
|
|
'n_base_learners': len(trained_models),
|
|
|
'meta_learner_type': 'LogisticRegression',
|
|
|
'calibration_method': 'isotonic',
|
|
|
'cv_folds': 3,
|
|
|
'dynamic_weighting': True,
|
|
|
'stacking_enabled': True,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
joblib.dump(artifact, f'../models_romeo_v8/trading_model_romeo_{timeframe}.pkl')
|
|
|
|
|
|
elapsed = time.time() - start
|
|
|
print(f"Finished training Romeo V8 Super Ensemble in {elapsed:.1f}s")
|
|
|
print(f"Super ensemble includes {len(trained_models)} algorithms working together")
|
|
|
print("Features: stacking, calibration, dynamic weighting, cross-validation")
|
|
|
|
|
|
return artifact
|
|
|
|
|
|
|
|
|
class SuperEnsemble:
|
|
|
"""Super Ensemble combining 10+ algorithms with advanced collaboration features"""
|
|
|
|
|
|
def __init__(self, artifact):
|
|
|
self.models = artifact['models']
|
|
|
self.calibrated_models = artifact['calibrated_models']
|
|
|
self.meta_learner = artifact['meta_learner']
|
|
|
self.weights = np.array(artifact['weights'])
|
|
|
self.features = artifact['features']
|
|
|
self.scaler = artifact['scaler']
|
|
|
self.pca = artifact['pca']
|
|
|
self.cv_models = artifact.get('cv_models', {})
|
|
|
self.cv_predictions = artifact.get('cv_predictions', {})
|
|
|
self.config = artifact.get('super_ensemble_config', {})
|
|
|
|
|
|
def predict_proba(self, X):
|
|
|
"""Generate probability predictions using super ensemble"""
|
|
|
if X.ndim == 1:
|
|
|
X = X.reshape(1, -1)
|
|
|
|
|
|
|
|
|
X_scaled = self.scaler.transform(X)
|
|
|
X_pca = self.pca.transform(X_scaled)
|
|
|
|
|
|
|
|
|
X_combined = np.hstack([X_scaled, X_pca])
|
|
|
|
|
|
|
|
|
model_predictions = []
|
|
|
for name, model in self.calibrated_models.items():
|
|
|
try:
|
|
|
if hasattr(model, 'predict_proba'):
|
|
|
pred = model.predict_proba(X_combined)[:, 1]
|
|
|
else:
|
|
|
pred = model.predict(X_combined).ravel()
|
|
|
if pred.max() > 1 or pred.min() < 0:
|
|
|
pred = (pred - pred.min()) / (pred.max() - pred.min())
|
|
|
model_predictions.append(pred.reshape(-1, 1))
|
|
|
except Exception as e:
|
|
|
print(f"Error predicting with {name}: {e}")
|
|
|
model_predictions.append(np.zeros((X_combined.shape[0], 1)))
|
|
|
|
|
|
|
|
|
X_meta = np.hstack(model_predictions)
|
|
|
|
|
|
|
|
|
meta_proba = self.meta_learner.predict_proba(X_meta)[:, 1]
|
|
|
|
|
|
|
|
|
weighted_proba = np.zeros(X_combined.shape[0])
|
|
|
for i, pred in enumerate(model_predictions):
|
|
|
weighted_proba += self.weights[i] * pred.ravel()
|
|
|
|
|
|
|
|
|
final_proba = 0.7 * meta_proba + 0.3 * weighted_proba
|
|
|
|
|
|
|
|
|
if self.cv_models:
|
|
|
cv_confidence = self._get_cv_confidence(X_combined)
|
|
|
final_proba = final_proba * cv_confidence + (1 - cv_confidence) * 0.5
|
|
|
|
|
|
return np.column_stack([1 - final_proba, final_proba])
|
|
|
|
|
|
def predict(self, X, threshold=0.5):
|
|
|
"""Generate binary predictions"""
|
|
|
proba = self.predict_proba(X)[:, 1]
|
|
|
return (proba > threshold).astype(int)
|
|
|
|
|
|
def _get_cv_confidence(self, X):
|
|
|
"""Get confidence from cross-validation ensemble"""
|
|
|
cv_probas = []
|
|
|
for name, models_list in self.cv_models.items():
|
|
|
fold_probas = []
|
|
|
for model in models_list:
|
|
|
try:
|
|
|
if hasattr(model, 'predict_proba'):
|
|
|
proba = model.predict_proba(X)[:, 1]
|
|
|
else:
|
|
|
proba = model.predict(X).ravel()
|
|
|
fold_probas.append(proba)
|
|
|
except:
|
|
|
continue
|
|
|
if fold_probas:
|
|
|
cv_probas.append(np.mean(fold_probas, axis=0))
|
|
|
|
|
|
if cv_probas:
|
|
|
mean_cv_proba = np.mean(cv_probas, axis=0)
|
|
|
confidence = 1 - np.abs(mean_cv_proba - 0.5) * 2
|
|
|
return confidence
|
|
|
else:
|
|
|
return np.full(X.shape[0], 0.5)
|
|
|
|
|
|
def get_feature_importance(self):
|
|
|
"""Get feature importance from tree-based models"""
|
|
|
importance_dict = {}
|
|
|
tree_models = ['xgb', 'lgb', 'rf', 'et']
|
|
|
|
|
|
for name in tree_models:
|
|
|
if name in self.models and hasattr(self.models[name], 'feature_importances_'):
|
|
|
importance_dict[name] = self.models[name].feature_importances_
|
|
|
|
|
|
return importance_dict
|
|
|
|
|
|
def get_model_weights(self):
|
|
|
"""Get the optimized weights for each model"""
|
|
|
return dict(zip(self.calibrated_models.keys(), self.weights))
|
|
|
|
|
|
|
|
|
def load_romeo_v8(model_path):
|
|
|
"""Load Romeo V8 super ensemble"""
|
|
|
artifact = joblib.load(model_path)
|
|
|
return SuperEnsemble(artifact)
|
|
|
|
|
|
|
|
|
|
|
|
def test_super_ensemble():
|
|
|
"""Test the super ensemble on sample data"""
|
|
|
try:
|
|
|
|
|
|
model = load_romeo_v8('models_romeo_v8/trading_model_romeo_15m.pkl')
|
|
|
|
|
|
|
|
|
df = pd.read_csv('data_xauusd_v3/15m_data_v3.csv', parse_dates=['Datetime'])
|
|
|
df = df.sort_values('Datetime').reset_index(drop=True)
|
|
|
|
|
|
if 'target' not in df.columns:
|
|
|
df['target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
|
|
|
|
|
|
|
|
|
eng = SuperEnsembleFeatureEngineer()
|
|
|
df = eng.add_technical_indicators(df)
|
|
|
df = eng.add_quantum_features(df)
|
|
|
df = df.fillna(method='bfill').fillna(method='ffill').fillna(0)
|
|
|
|
|
|
exclude = ['Datetime', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
|
|
|
feature_cols = [c for c in df.columns if c not in exclude and not c.startswith('target')]
|
|
|
|
|
|
|
|
|
X_test = df[feature_cols].values[-100:]
|
|
|
y_test = df['target'].values[-100:]
|
|
|
|
|
|
|
|
|
proba = model.predict_proba(X_test)
|
|
|
preds = model.predict(X_test)
|
|
|
|
|
|
accuracy = accuracy_score(y_test, preds)
|
|
|
auc = roc_auc_score(y_test, proba[:, 1])
|
|
|
|
|
|
print(f"Super Ensemble Test Results:")
|
|
|
print(f"Accuracy: {accuracy:.4f}")
|
|
|
print(f"AUC: {auc:.4f}")
|
|
|
print(f"Model weights: {model.get_model_weights()}")
|
|
|
|
|
|
return accuracy, auc
|
|
|
|
|
|
except Exception as e:
|
|
|
print(f"Error testing super ensemble: {e}")
|
|
|
return None, None
|
|
|
|
|
|
|
|
|
|
|
|
def main():
|
|
|
parser = argparse.ArgumentParser()
|
|
|
parser.add_argument('--data', default='data_xauusd_v3/15m_data_v3.csv')
|
|
|
parser.add_argument('--timeframe', default='15m')
|
|
|
parser.add_argument('--mode', choices=['fast', 'full'], default='fast')
|
|
|
parser.add_argument('--test', action='store_true', help='Test the trained model')
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
art = train_romeo_v8(args.data, timeframe=args.timeframe, mode=args.mode)
|
|
|
print('Saved artifact keys:', list(art.keys()))
|
|
|
|
|
|
if args.test:
|
|
|
print("\nTesting super ensemble...")
|
|
|
acc, auc = test_super_ensemble()
|
|
|
if acc is not None:
|
|
|
print(f"✓ Test completed - Accuracy: {acc:.4f}, AUC: {auc:.4f}")
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
main() |