In [1]:
# !pip install optuna
!pip install yfinance

Collecting yfinance
 Downloading yfinance-0.2.40-py2.py3-none-any.whl.metadata (11 kB)
Collecting multitasking>=0.0.7 (from yfinance)
 Downloading multitasking-0.0.11-py3-none-any.whl.metadata (5.5 kB)
Collecting peewee>=3.16.2 (from yfinance)
 Downloading peewee-3.17.5.tar.gz (3.0 MB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[?25h Installing build dependencies ... [?25l- \ | / - done
[?25h Getting requirements to build wheel ... [?25l- done
[?25h Preparing metadata (pyproject.toml) ... [?25l- done
Downloading yfinance-0.2.40-py2.py3-none-any.whl (73 kB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multitasking-0.0.11-py3-none-any.whl (8.5 kB)
Building wheels for collected packages: peewee
 Building wheel for peewee (pyproject.toml) ... [?25l- \ | / done
[?25h Created wheel for pee

In [2]:
!pip install optuna_integration

Collecting optuna_integration
 Downloading optuna_integration-3.6.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna_integration-3.6.0-py3-none-any.whl (93 kB)
[2K [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.4/93.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: optuna_integration
Successfully installed optuna_integration-3.6.0


In [3]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import yfinance as yf
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import optuna
from optuna_integration import TFKerasPruningCallback
from sklearn.metrics import roc_auc_score
from functools import partial
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))
from typing import Union, Optional, Tuple, Any
tf.keras.utils.set_random_seed(
 21052003
)
def get_stock(ticker, start, end):

 def convert_colnames(value):
 value = value.lower()
 value = value.replace(" ", "_")
 return value

 def check_object(obj):
 if obj is None:
 raise ValueError('Return of yfinance download is None')
 elif isinstance(obj, pd.DataFrame):
 if obj.empty:
 raise ValueError('DataFrame is empty')
 else:
 raise ValueError('Return of yfinance download is niether a Dataframe nor None')

 try:
 df = yf.download(ticker, start=start, end=end, progress=False)
 df.columns = map(convert_colnames, df.columns)
 df.index.names = ['date']
 check_object(df)
 return df
 except Exception as e:
 print('An error occured while downloading data\n' + str(e))

def add_sentiment(stock_data: pd.DataFrame, file_name) -> pd.DataFrame:
 file_path = os.path.join('/kaggle', 'input', 'sentiments', file_name)
 sentiment_data = pd.read_csv(file_path, index_col='date', parse_dates=['date'])
 merged_df = pd.merge(stock_data, sentiment_data, left_index=True, right_index=True, how='left')
 return merged_df


def calc_rsi(over: pd.Series, fn_roll: callable, scale_down: bool = True) -> pd.Series:
 # Get the difference in price from previous step
 delta = over.diff()
 # Get rid of the first row, which is NaN since it did not have a previous row to calculate the differences
 delta = delta[1:]

 # Make the positive gains (up) and negative gains (down) Series
 up, down = delta.clip(lower=0), delta.clip(upper=0).abs()

 roll_up, roll_down = fn_roll(up), fn_roll(down)
 rs = roll_up / roll_down
 rsi = 100.0 - (100.0 / (1.0 + rs))

 # Avoid division-by-zero if `roll_down` is zero
 # This prevents inf and/or nan values.
 rsi[:] = np.select([roll_down == 0, roll_up == 0, True], [100, 0, rsi])
 # rsi = rsi.case_when([((roll_down == 0), 100), ((roll_up == 0), 0)])
 # This alternative to np.select works only for pd.__version__ >= 2.2.0.
 rsi.name = 'rsi'

 # Assert range
 valid_rsi = rsi[13:]
 assert ((0 <= valid_rsi) & (valid_rsi <= 100)).all()
 # Note: rsi[:length - 1] is excluded from above assertion because it is NaN for SMA.
 rsi = rsi.reindex(over.index)
 if scale_down:
 rsi = rsi / 100
 return rsi


def scale_stock_data(data: pd.DataFrame, scale_volume: float, scale_price: float) -> pd.DataFrame:
 data['volume'] = data['volume'] / scale_volume
 data['open'] = data['open'] / scale_price
 data['high'] = data['high'] / scale_price
 data['low'] = data['low'] / scale_price
 data['close'] = data['close'] / scale_price
 data['adj_close'] = data['adj_close'] / scale_price
 return data


def calculate_indicators(prices_dataframe: pd.DataFrame, use_regular_close=False) -> pd.DataFrame:
 if use_regular_close:
 col = 'close'
 prices_dataframe = prices_dataframe.drop(columns=['adj_close'])
 else:
 col = 'adj_close'
 prices_dataframe['return'] = prices_dataframe[col].pct_change(1).iloc[1:]
 prices_dataframe['log1p_return'] = np.log1p(prices_dataframe['return'])

 prices_dataframe['rsi_ema'] = calc_rsi(prices_dataframe[col], lambda s: s.ewm(span=14).mean())
 # prices_dataframe['rsi_sma'] = calc_rsi(prices_dataframe[col], lambda s: s.rolling(14).mean())

 # prices_dataframe['smstd_20'] = prices_dataframe[col].rolling(window=20).std()
 # prices_dataframe['sma_20'] = prices_dataframe[col].rolling(window=20).mean()
 prices_dataframe['ewma_20'] = prices_dataframe[col].ewm(span=20).mean()
 prices_dataframe['ewma_60'] = prices_dataframe[col].ewm(span=60).mean()
 prices_dataframe['ewmstd_20'] = prices_dataframe[col].ewm(span=20).std()
 prices_dataframe['macd'] = prices_dataframe[col].ewm(span=12).mean() - prices_dataframe[col].ewm(span=26).mean()
 # prices_dataframe['upper_band_sma'] = prices_dataframe.ewma_20 + (2 * prices_dataframe.ewmstd_20)
 # prices_dataframe['lower_band_sma'] = prices_dataframe.sma_20 - (2 * prices_dataframe.ewmstd_20)

 # prices_dataframe['upper_band_ewma'] = prices_dataframe.ewma_20 + (2 * prices_dataframe.ewmstd_20)
 # prices_dataframe['lower_band_ewma'] = prices_dataframe.ewma_20 - (2 * prices_dataframe.ewmstd_20)

 return prices_dataframe


def get_dataset(ticker: str,
 scale_price: float,
 scale_vol: float,
 sentiment: str,
 use_regular_close=False) -> Union[pd.DataFrame, None]:
 start = datetime(2007, 1, 1)
 end = datetime(2016, 8, 16)
 try:
 if sentiment == 'nyt_and_reu':
 sentiments = ['reuters.csv', 'nytimes.csv']
 else:
 sentiments = None

 data = get_stock(ticker, start, end)
 data = scale_stock_data(data, scale_vol, scale_price)
 data = calculate_indicators(data, use_regular_close)
 data = data.dropna()
 data['clabel'] = data['return'].apply(lambda x: 1 if x > 0 else 0)
 if sentiments is None:
 return data
 for sent in sentiments:
 data = add_sentiment(data, sent)
 return data
 except Exception as e:
 print(f'Failed to get {ticker} due to error: {e}')
 return None

 
def get_sequences(data: pd.DataFrame,
 target_col: str,
 time_steps: int = 10) -> tuple[np.ndarray, np.ndarray]:

 feature_sequences = []
 targets = []

 for i in range(time_steps, len(data)):
 features_sequence = data.iloc[i - time_steps:i, :]
 target = data[target_col].iloc[i]
 feature_sequences.append(features_sequence)
 targets.append(target)

 # (batch_dim, sequence_size, features)
 feature_sequences = np.array(feature_sequences)
 targets = np.array(targets)
 targets = targets.reshape(targets.shape[0], 1)

 return feature_sequences, targets


def get_train_val_test(feature_sequences, target, n: int = 252):

 opt_sequences = feature_sequences[:-n]
 test_sequences = feature_sequences[-n:]
 opt_target = target[:-n]
 test_target = target[-n:]
 
 train_sequences = opt_sequences[:-n]
 val_sequences = opt_sequences[-n:]
 train_target = opt_target[:-n]
 val_target = opt_target[-n:]
 
 
 return train_sequences, val_sequences, test_sequences, train_target, val_target, test_target

def get_tt(feature_sequences, target, n: int = 252):

 opt_sequences = feature_sequences[:-n]
 test_sequences = feature_sequences[-n:]
 opt_target = target[:-n]
 test_target = target[-n:]
 
 return opt_sequences, test_sequences, opt_target, test_target


2024-05-31 20:47:07.091531: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-31 20:47:07.091665: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-31 20:47:07.224419: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0



In [4]:
tickers = ['XOM', 'AAPL', 'JPM', 'PG']
df = get_dataset(tickers[1],
 100,
 1e7,
 'nyt_and_reu')
df = df.drop(columns=['nyt_vader_comp', 'reu_finbert_sent', 'reu_vader_comp', 'nyt_vader_sent', 'reu_vader_sent', 'close'])
prefixes = ['nyt_', 'reu_']
prefixed_cols = [col for col in df.columns if any(col.startswith(prefix) for prefix in prefixes)]

# Extract base column names
base_names = set(col.split('_', 1)[1] for col in prefixed_cols)

# Average values of columns with matching base names
for base_name in base_names:
 matching_cols = [col for col in prefixed_cols if col.endswith(base_name)]
 df[base_name] = df[matching_cols].mean(axis=1)

# Drop the original prefixed columns
df.drop(columns=prefixed_cols, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 data['clabel'] = data['return'].apply(lambda x: 1 if x > 0 else 0)


In [5]:
# Ensure TensorFlow uses GPU
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
 try:
 for gpu in gpus:
 tf.config.experimental.set_memory_growth(gpu, True)
 except RuntimeError as e:
 print(e)

# 


def create_lstm_model(input_shape, lstm_units_1, lstm_units_2, dense_units, eta):
 inputs = Input(shape=input_shape)
 x = LSTM(units=lstm_units_1, return_sequences=True)(inputs)
 x = LSTM(units=lstm_units_2)(x)
 x = Dense(units=dense_units, activation='relu')(x)
 outputs = Dense(units=1, activation='sigmoid')(x)
 
 model = Model(inputs=inputs, outputs=outputs)
 
 model.compile(optimizer=Adam(eta), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])
 return model

def objective(trial, df):
 # Hyperparameter space
 
 powers_of_two = [2 ** n for n in range(2, 9)]
 lstm_units_1 = trial.suggest_categorical('lstm_units_1', powers_of_two)
 lstm_units_2 = trial.suggest_categorical('lstm_units_2', powers_of_two)
 dense_units = trial.suggest_categorical('dense_units', powers_of_two)
 batch_size = 128
 epochs = 128
 eta = trial.suggest_float('eta', 1e-4, 1e-1, log=True)
 slen = trial.suggest_int('sequence_length', 10, 60)
 fs, t = get_sequences(df, 'clabel', slen)
 trs, vs, tss, trt, vt, tst = get_train_val_test(fs, t)
 # Create model
 model = create_lstm_model(input_shape=(trs.shape[1], trs.shape[2]), 
 lstm_units_1=lstm_units_1, 
 lstm_units_2=lstm_units_2, 
 dense_units=dense_units,
 eta=eta)
 
 # Early stopping callback
# early_stopping = EarlyStopping(monitor='val_loss', patience=5)
 
 # Pruning callback
 pruning_callback = TFKerasPruningCallback(trial, 'val_loss')
 
 
 # Train model
 history = model.fit(trs, trt,
 validation_data=(vs, vt),
 epochs=epochs,
 batch_size=batch_size,
 callbacks=[pruning_callback],
 verbose=0)
 
 # Evaluate model
 val_loss, auc = model.evaluate(vs, vt, verbose=0)
 return auc

objective_wrap = partial(objective, df=df)
study = optuna.create_study(direction='maximize')
study.optimize(objective_wrap, n_trials=1000)

ofile = open('/kaggle/working/output.txt', 'w')
print('Best trial:', file=ofile)
trial = study.best_trial

print(' Value: {}'.format(trial.value), file=ofile)

print(' Params: ', file=ofile)
for key, value in trial.params.items():
 print(' {}: {}'.format(key, value), file=ofile)

[I 2024-05-31 20:47:20,597] A new study created in memory with name: no-name-ec9d3986-eba9-4614-8693-e5fadc833048
[I 2024-05-31 20:47:46,713] Trial 0 finished with value: 0.5 and parameters: {'lstm_units_1': 128, 'lstm_units_2': 64, 'dense_units': 16, 'eta': 0.08994154633501211, 'sequence_length': 24}. Best is trial 0 with value: 0.5.
[I 2024-05-31 20:48:10,115] Trial 1 finished with value: 0.5658575296401978 and parameters: {'lstm_units_1': 4, 'lstm_units_2': 32, 'dense_units': 256, 'eta': 0.0002879735177979203, 'sequence_length': 46}. Best is trial 1 with value: 0.5658575296401978.
[I 2024-05-31 20:48:33,490] Trial 2 finished with value: 0.5 and parameters: {'lstm_units_1': 4, 'lstm_units_2': 16, 'dense_units': 32, 'eta': 0.04662958353861651, 'sequence_length': 48}. Best is trial 1 with value: 0.5658575296401978.
[I 2024-05-31 20:49:04,881] Trial 3 finished with value: 0.49255985021591187 and parameters: {'lstm_units_1': 256, 'lstm_units_2': 256, 'dense_units': 16, 'eta': 0.004998660

In [6]:
best_params = trial.params
fs, t = get_sequences(df, 'clabel', best_params['sequence_length'])
trs, tss, trt, tst = get_tt(fs, t)
final_model = create_lstm_model(input_shape=(trs.shape[1], trs.shape[2]),
 lstm_units_1=best_params['lstm_units_1'],
 lstm_units_2=best_params['lstm_units_2'],
 dense_units=best_params['dense_units'],
 eta=best_params['eta'])

# Train final model
final_model.fit(trs, trt,
 validation_data=(tss, tst),
 epochs=128,
 batch_size=128,
 verbose=1)

# Predict and compute AUC
val_loss, auc = final_model.evaluate(tss, tst, verbose=1)
print(f'AUC {auc:.6f}', file=ofile)
ofile.close()

Epoch 1/128
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - auc_1000: 0.4947 - loss: 0.8483 - val_auc_1000: 0.5151 - val_loss: 0.6937
Epoch 2/128
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - auc_1000: 0.4836 - loss: 0.6938 - val_auc_1000: 0.5000 - val_loss: 0.6936
Epoch 3/128
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - auc_1000: 0.4897 - loss: 0.6924 - val_auc_1000: 0.4357 - val_loss: 0.6935
Epoch 4/128
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - auc_1000: 0.4804 - loss: 0.6931 - val_auc_1000: 0.4417 - val_loss: 0.6943
Epoch 5/128
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - auc_1000: 0.4897 - loss: 0.6900 - val_auc_1000: 0.4839 - val_loss: 0.6937
Epoch 6/128
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - auc_1000: 0.4883 - loss: 0.6903 - val_auc_1000: 0.4769 - val_loss: 0.6939
Epoch 7/128
[1m17/17[0m [32m━━