In [1]:
from datetime import datetime
import pandas as pd
from feature_engine.timeseries.forecasting import LagFeatures
from data.stocks import get_dataset
from data.stocks import SentimentSource
import xgboost as xgb
import optuna
import numpy as np
from sklearn.metrics import root_mean_squared_error
from optuna_integration import XGBoostPruningCallback
# start_date = datetime(2007, 1, 1)
# end_date = datetime(2016, 8, 17)


#TODO implement functions that call and test 3 models on the same data
#TODO classification
# .apply(lambda x: 1 if x > 0 else 0)

# dji = get_dataset('^DJI', SentimentSource.REUTERS, True)
# gspc = get_dataset('^GSPC', SentimentSource.REUTERS, True)
# Load the XOM dataset with sentiment features from both news sources.
df = get_dataset(
    ticker='XOM',
    scale_price=100,
    scale_vol=1e7,
    sentiment='nyt_and_reu',
    use_regular_close=False
)
df = df.drop(columns=['nyt_vader_comp', 'reu_finbert_sent', 'reu_vader_comp', 'nyt_vader_sent', 'reu_vader_sent', 'close'])

# Columns that carry a per-source prefix.
prefixes = ['nyt_', 'reu_']
prefixed_cols = [col for col in df.columns if col.startswith(tuple(prefixes))]

# Map every prefixed column to its base name (everything after the first '_').
base_name_of = {col: col.split('_', 1)[1] for col in prefixed_cols}
base_names = set(base_name_of.values())

# Average the columns that share a base name.
# - Iterate in sorted order: iterating the set directly would make the order of
#   the new columns (and therefore of every lag feature fed to the model)
#   depend on string hash randomisation and differ between kernel restarts.
# - Compare base names exactly: a suffix test (`endswith`) would also pick up
#   columns whose base name merely ends with another base name.
for base_name in sorted(base_names):
    matching_cols = [col for col in prefixed_cols if base_name_of[col] == base_name]
    df[base_name] = df[matching_cols].mean(axis=1)

# Drop the original prefixed columns
df = df.drop(columns=prefixed_cols)

# Every current column except the target is removed again after lagging, so
# only lagged values remain as predictors.
og_cols = df.columns.tolist()
og_cols.remove('adj_close')

# Build lag features for periods 1 through 14.
lag_transformer = LagFeatures(periods=list(range(1, 15)))
lagged_df = lag_transformer.fit_transform(df)

# Remove rows containing NaN (e.g. the leading rows with incomplete lags) and
# the un-lagged source columns.
lagged_df = lagged_df.dropna().drop(columns=og_cols)

# Predictors: everything except the un-lagged target. Target: adj_close.
X = lagged_df.drop(columns=['adj_close'])
y = lagged_df.adj_close

In [2]:
def ts_split(X, y, n):
    """
    Splits time series data chronologically, holding out the last n observations.

    No shuffling is performed: the first ``len - n`` observations form the
    training set and the final ``n`` observations form the test set.

    Parameters:
    X (array-like): The feature dataset (anything that supports slicing).
    y (array-like): The target dataset (anything that supports slicing).
    n (int): The number of trailing observations to use as the test set.
        With n == 0 the test sets are empty and all data is training data.

    Returns:
    X_train, X_test, y_train, y_test: Split datasets.
    """
    if n == 0:
        # -0 == 0, so X[:-0] would be empty and X[-0:] would be everything,
        # i.e. the split would be inverted. Handle the empty hold-out explicitly.
        return X[:], X[:0], y[:], y[:0]

    # Split the datasets: everything before the last n rows vs. the last n rows.
    X_train, X_test = X[:-n], X[-n:]
    y_train, y_test = y[:-n], y[-n:]

    return X_train, X_test, y_train, y_test

# Hold out the last 252 rows of X and y as the test set; all earlier rows form the training set.
X_train, X_test, y_train, y_test = ts_split(X, y, 252)

In [4]:
def objective(trial: optuna.Trial) -> float:
    """
    Optuna objective: train an XGBoost regressor with sampled hyper-parameters
    and score it on a validation slice.

    The module-level X_train / y_train are split with ts_split so that their
    last 252 rows become the validation set; the model is fitted on the rest.

    Parameters:
    trial (optuna.Trial): The trial used to sample hyper-parameters and to
        report intermediate values to the pruning callback.

    Returns:
    float: Validation RMSE multiplied by 100.
    """
    # Sampled booster settings; the order of suggest_* calls is kept as-is.
    xgb_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 9),
    }
    rounds = trial.suggest_int('num_boost_rounds', 500, 1500)

    # Carve the validation slice off the end of the training data.
    X_opt, X_val, y_opt, y_val = ts_split(X_train, y_train, 252)
    fit_matrix = xgb.DMatrix(X_opt, y_opt)
    val_matrix = xgb.DMatrix(X_val, y_val)

    # The callback watches the 'rmse' metric of the eval set named 'validation'.
    pruner = XGBoostPruningCallback(trial, 'validation-rmse')
    booster = xgb.train(
        params=xgb_params,
        dtrain=fit_matrix,
        evals=[(val_matrix, 'validation')],
        callbacks=[pruner],
        num_boost_round=rounds,
        verbose_eval=False,
    )

    val_pred = booster.predict(val_matrix)
    return root_mean_squared_error(y_val, val_pred) * 100


# Run the hyper-parameter search; lower objective values are better.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=1000)

# Summarise the search and show the best trial found.
print('Number of finished trials:', len(study.trials))
print('Best trial:')
trial = study.best_trial
print(' Value: {:.5f}'.format(trial.value))
print(' Params: ')
for key, value in trial.params.items():
    print(' {}: {}'.format(key, value))

# model_name = input('Model name: ')

# Settings that are fixed rather than tuned.
const_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    random_state=42,
)

# Turn the best trial's parameters into a booster parameter dict:
# the number of boosting rounds is taken out into n_rounds, and the
# fixed settings are merged in.
best_params = trial.params
n_rounds = best_params.pop('num_boost_rounds')
best_params.update(const_params)

[I 2024-05-31 19:11:15,812] A new study created in memory with name: no-name-f6f5f480-a380-4cbc-b56f-7e66a4fc6448
[I 2024-05-31 19:11:36,336] Trial 0 finished with value: 3.011515435869531 and parameters: {'booster': 'gbtree', 'lambda': 5.111523552408159e-06, 'alpha': 2.386697183556482e-05, 'subsample': 0.9470180048917057, 'colsample_bytree': 0.6599356911484081, 'learning_rate': 0.0008442603261871952, 'max_depth': 3, 'num_boost_rounds': 1408}. Best is trial 0 with value: 3.011515435869531.
[I 2024-05-31 19:12:11,124] Trial 1 finished with value: 0.7632041863037787 and parameters: {'booster': 'gbtree', 'lambda': 0.0001191679101832303, 'alpha': 6.26247651111337e-05, 'subsample': 0.9705635546494571, 'colsample_bytree': 0.6930134776338135, 'learning_rate': 0.0068483926691428835, 'max_depth': 5, 'num_boost_rounds': 1414}. Best is trial 1 with value: 0.7632041863037787.
[I 2024-05-31 19:25:51,998] Trial 2 finished with value: 0.7527490278666076 and parameters: {'booster': 'dart', 'lambda': 7

KeyboardInterrupt: 

In [6]:
# Hand-picked booster parameters used for the final fit.
best_params = {
    'eta': 0.01,
    'colsample_bytree': 0.6,
    'subsample': 0.9,
    'max_depth': 6,
    'lambda': 1.0,
    'alpha': 0.0001,
    'objective': 'reg:squarederror',
    'random_state': 42
}

# Fit on the full training split and predict the held-out test split.
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test, y_test)
model = xgb.train(best_params, dtrain, num_boost_round=800)
print()

# 'weight' importance scores, ordered from highest to lowest score.
importance_dict = model.get_score(importance_type='weight')
sorted_importance = sorted(
    importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Test RMSE, multiplied by 100.
preds = model.predict(dtest)
print(root_mean_squared_error(y_test, preds) * 100)

# Print the sorted key-value pairs
for feature, importance in sorted_importance:
    print(f"{feature}: {importance}")


1.0021918308112492
adj_close_lag_1: 2880.0
adj_close_lag_2: 858.0
return_lag_1: 755.0
rsi_ema_lag_1: 719.0
open_lag_1: 619.0
volume_lag_1: 522.0
high_lag_1: 514.0
low_lag_1: 461.0
adj_close_lag_3: 388.0
log1p_return_lag_1: 360.0
return_lag_2: 352.0
ewma_20_lag_1: 300.0
return_lag_8: 298.0
volume_lag_2: 260.0
volume_lag_3: 215.0
blob_pol_lag_1: 214.0
rsi_ema_lag_2: 213.0
vader_neg_lag_1: 206.0
adj_close_lag_4: 204.0
finbert_pos_lag_1: 195.0
blob_sub_lag_4: 193.0
finbert_sent_lag_1: 192.0
vader_neu_lag_3: 192.0
blob_sub_lag_1: 191.0
ewmstd_20_lag_1: 187.0
finbert_neg_lag_12: 187.0
blob_pol_lag_7: 185.0
finbert_neu_lag_1: 183.0
return_lag_4: 179.0
blob_pol_lag_3: 174.0
return_lag_3: 173.0
volume_lag_4: 173.0
blob_pol_lag_10: 172.0
vader_pos_lag_7: 171.0
vader_pos_lag_12: 171.0
blob_pol_lag_5: 170.0
blob_sub_lag_3: 168.0
blob_sub_lag_8: 168.0
finbert_neg_lag_1: 164.0
blob_pol_lag_4: 164.0
vader_pos_lag_6: 164.0
vader_neg_lag_5: 163.0
vader_neu_lag_1: 162.0
finbert_neu_lag_4: 161.0
vader_n

Best trial:
 Value: 0.01113
 Params:
 max_depth: 3
 learning_rate: 0.013019168747264056
 subsample: 0.8300701590363793
 colsample_bytree: 0.5999155901719391
 reg_alpha: 1.1965214202530094e-08
 reg_lambda: 3.7725711320800848
 num_boost_rounds: 580