{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [
"from datetime import datetime\n",
"import pandas as pd\n",
"from feature_engine.timeseries.forecasting import LagFeatures\n",
"from data.stocks import get_dataset\n",
"from data.stocks import SentimentSource\n",
"import xgboost as xgb\n",
"import optuna\n",
"import numpy as np\n",
"from sklearn.metrics import root_mean_squared_error\n",
"from optuna_integration import XGBoostPruningCallback\n",
"# start_date = datetime(2007, 1, 1)\n",
"# end_date = datetime(2016, 8, 17)\n",
"\n",
"\n",
"#TODO implement functions that call and test 3 models on the same data\n",
"#TODO classification\n",
"# .apply(lambda x: 1 if x > 0 else 0)\n",
"\n",
"# dji = get_dataset('^DJI', SentimentSource.REUTERS, True)\n",
"# gspc = get_dataset('^GSPC', SentimentSource.REUTERS, True)\n",
"df = get_dataset(\n",
"    ticker='XOM',\n",
"    scale_price=100,\n",
"    scale_vol=1e7,\n",
"    sentiment='nyt_and_reu',\n",
"    use_regular_close=False\n",
")\n",
"df = df.drop(columns=['nyt_vader_comp', 'reu_finbert_sent', 'reu_vader_comp', 'nyt_vader_sent', 'reu_vader_sent', 'close'])\n",
"\n",
"# Columns that carry a per-source sentiment prefix\n",
"prefixes = ['nyt_', 'reu_']\n",
"prefixed_cols = [col for col in df.columns if any(col.startswith(prefix) for prefix in prefixes)]\n",
"\n",
"# Extract base column names (the part after the first underscore)\n",
"base_names = set(col.split('_', 1)[1] for col in prefixed_cols)\n",
"\n",
"# Average values of columns with matching base names.\n",
"# - sorted(): a set of strings iterates in a hash-dependent order that can differ\n",
"#   between kernel restarts, which would reorder the derived feature columns.\n",
"# - exact comparison: endswith() would also pick up any column whose base name\n",
"#   merely ends with this one.\n",
"for base_name in sorted(base_names):\n",
"    matching_cols = [col for col in prefixed_cols if col.split('_', 1)[1] == base_name]\n",
"    df[base_name] = df[matching_cols].mean(axis=1)\n",
"\n",
"# Drop the original prefixed columns\n",
"df = df.drop(columns=prefixed_cols)\n",
"\n",
"# Every column except the target; their un-lagged values are dropped after lagging\n",
"og_cols = df.columns.tolist()\n",
"og_cols.remove('adj_close')\n",
"\n",
"# Add lags of 1..14 rows, discard rows containing NaN, then keep only the lagged\n",
"# features (X) and the current-row target (y).\n",
"lag_transformer = LagFeatures(periods=list(range(1, 15)))\n",
"lagged_df = lag_transformer.fit_transform(df)\n",
"lagged_df = lagged_df.dropna().drop(columns=og_cols)\n",
"X = lagged_df.drop(columns=['adj_close'])\n",
"y = lagged_df.adj_close"
] }, { "cell_type": "code", "execution_count": 2, "outputs": [], "source": [
"def ts_split(X, y, n):\n",
"    \"\"\"\n",
"    Splits time series data into training and testing sets without shuffling.\n",
"\n",
"    The final n observations become the test set and everything before them\n",
"    becomes the training set. The inputs are assumed to already be in\n",
"    chronological order.\n",
"\n",
"    Parameters:\n",
"    X (array-like): The feature dataset.\n",
"    y (array-like): The target dataset, aligned row-for-row with X.\n",
"    n (int): The number of trailing observations to hold out as the test set.\n",
"\n",
"    Returns:\n",
"    X_train, X_test, y_train, y_test: Split datasets.\n",
"\n",
"    Raises:\n",
"    ValueError: If X and y differ in length, or if n is not between 1 and len(X) - 1.\n",
"    \"\"\"\n",
"    n_obs = len(X)\n",
"    if len(y) != n_obs:\n",
"        raise ValueError(f'X and y must have the same length, got {n_obs} and {len(y)}')\n",
"    if not 0 < n < n_obs:\n",
"        raise ValueError(f'n must be between 1 and {n_obs - 1}, got {n}')\n",
"\n",
"    # Slice from an explicit non-negative index: the negative form X[:-n] silently\n",
"    # produces an empty training set when n == 0.\n",
"    split_at = n_obs - n\n",
"    X_train, X_test = X[:split_at], X[split_at:]\n",
"    y_train, y_test = y[:split_at], y[split_at:]\n",
"\n",
"    return X_train, X_test, y_train, y_test\n",
"\n",
"# Hold out the final 252 rows as the test set\n",
"X_train, X_test, y_train, y_test = ts_split(X, y, 252)"
], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 4, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[I 2024-05-31 19:11:15,812] A new study created in memory with name: no-name-f6f5f480-a380-4cbc-b56f-7e66a4fc6448\n", "[I 2024-05-31 19:11:36,336] Trial 0 finished with value: 3.011515435869531 and parameters: {'booster': 'gbtree', 'lambda': 5.111523552408159e-06, 'alpha': 2.386697183556482e-05, 'subsample': 0.9470180048917057, 'colsample_bytree': 0.6599356911484081, 'learning_rate': 0.0008442603261871952, 'max_depth': 3, 'num_boost_rounds': 1408}. Best is trial 0 with value: 3.011515435869531.\n", "[I 2024-05-31 19:12:11,124] Trial 1 finished with value: 0.7632041863037787 and parameters: {'booster': 'gbtree', 'lambda': 0.0001191679101832303, 'alpha': 6.26247651111337e-05, 'subsample': 0.9705635546494571, 'colsample_bytree': 0.6930134776338135, 'learning_rate': 0.0068483926691428835, 'max_depth': 5, 'num_boost_rounds': 1414}. 
Best is trial 1 with value: 0.7632041863037787.\n", "[I 2024-05-31 19:25:51,998] Trial 2 finished with value: 0.7527490278666076 and parameters: {'booster': 'dart', 'lambda': 7.802344621883337e-07, 'alpha': 0.007860299789017285, 'subsample': 0.9619284940088776, 'colsample_bytree': 0.8989576002566275, 'learning_rate': 0.005976504337302015, 'max_depth': 1, 'num_boost_rounds': 1278}. Best is trial 2 with value: 0.7527490278666076.\n", "[I 2024-05-31 19:45:23,979] Trial 3 finished with value: 0.831719605956121 and parameters: {'booster': 'dart', 'lambda': 0.13491534981922632, 'alpha': 0.04389684969897374, 'subsample': 0.8820983140539309, 'colsample_bytree': 0.9397547349968005, 'learning_rate': 0.3718109996818719, 'max_depth': 5, 'num_boost_rounds': 1409}. Best is trial 2 with value: 0.7527490278666076.\n", "[W 2024-05-31 19:46:34,658] Trial 4 failed with parameters: {'booster': 'dart', 'lambda': 9.891669061476927e-08, 'alpha': 0.00037010223328544205, 'subsample': 0.906026126233236, 'colsample_bytree': 0.6130203472323721, 'learning_rate': 0.0006735890216453622, 'max_depth': 5, 'num_boost_rounds': 730} because of the following error: KeyboardInterrupt().\n", "Traceback (most recent call last):\n", " File \"C:\\Users\\boomb\\anaconda3\\lib\\site-packages\\optuna\\study\\_optimize.py\", line 200, in _run_trial\n", " value_or_values = func(trial)\n", " File \"C:\\Users\\boomb\\AppData\\Local\\Temp\\ipykernel_12252\\1686883986.py\", line 20, in objective\n", " bst = xgb.train(params=params,\n", " File \"C:\\Users\\boomb\\anaconda3\\lib\\site-packages\\xgboost\\core.py\", line 620, in inner_f\n", " return func(**kwargs)\n", " File \"C:\\Users\\boomb\\anaconda3\\lib\\site-packages\\xgboost\\training.py\", line 185, in train\n", " bst.update(dtrain, i, obj)\n", " File \"C:\\Users\\boomb\\anaconda3\\lib\\site-packages\\xgboost\\core.py\", line 1918, in update\n", " _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,\n", "KeyboardInterrupt\n", "[W 2024-05-31 19:46:34,660] Trial 
4 failed with value None.\n" ] }, { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", "\u001B[1;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", "\u001B[1;32m~\\AppData\\Local\\Temp\\ipykernel_12252\\1686883986.py\u001B[0m in \u001B[0;36m\u001B[1;34m\u001B[0m\n\u001B[0;32m 30\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 31\u001B[0m \u001B[0mstudy\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0moptuna\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mcreate_study\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mdirection\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;34m'minimize'\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m---> 32\u001B[1;33m \u001B[0mstudy\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0moptimize\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mobjective\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mn_trials\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;36m1000\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 33\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 34\u001B[0m \u001B[0mprint\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m'Number of finished trials:'\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mlen\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mstudy\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mtrials\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n", "\u001B[1;32m~\\anaconda3\\lib\\site-packages\\optuna\\study\\study.py\u001B[0m in \u001B[0;36moptimize\u001B[1;34m(self, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_trial, show_progress_bar)\u001B[0m\n\u001B[0;32m 449\u001B[0m \u001B[0mIf\u001B[0m \u001B[0mnested\u001B[0m \u001B[0minvocation\u001B[0m \u001B[0mof\u001B[0m \u001B[0mthis\u001B[0m \u001B[0mmethod\u001B[0m 
\u001B[0moccurs\u001B[0m\u001B[1;33m.\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 450\u001B[0m \"\"\"\n\u001B[1;32m--> 451\u001B[1;33m _optimize(\n\u001B[0m\u001B[0;32m 452\u001B[0m \u001B[0mstudy\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 453\u001B[0m \u001B[0mfunc\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mfunc\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n", "\u001B[1;32m~\\anaconda3\\lib\\site-packages\\optuna\\study\\_optimize.py\u001B[0m in \u001B[0;36m_optimize\u001B[1;34m(study, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_trial, show_progress_bar)\u001B[0m\n\u001B[0;32m 64\u001B[0m \u001B[1;32mtry\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 65\u001B[0m \u001B[1;32mif\u001B[0m \u001B[0mn_jobs\u001B[0m \u001B[1;33m==\u001B[0m \u001B[1;36m1\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m---> 66\u001B[1;33m _optimize_sequential(\n\u001B[0m\u001B[0;32m 67\u001B[0m \u001B[0mstudy\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 68\u001B[0m \u001B[0mfunc\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n", "\u001B[1;32m~\\anaconda3\\lib\\site-packages\\optuna\\study\\_optimize.py\u001B[0m in \u001B[0;36m_optimize_sequential\u001B[1;34m(study, func, n_trials, timeout, catch, callbacks, gc_after_trial, reseed_sampler_rng, time_start, progress_bar)\u001B[0m\n\u001B[0;32m 161\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 162\u001B[0m \u001B[1;32mtry\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 163\u001B[1;33m \u001B[0mfrozen_trial\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0m_run_trial\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mstudy\u001B[0m\u001B[1;33m,\u001B[0m 
\u001B[0mfunc\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mcatch\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 164\u001B[0m \u001B[1;32mfinally\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 165\u001B[0m \u001B[1;31m# The following line mitigates memory problems that can be occurred in some\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n", "\u001B[1;32m~\\anaconda3\\lib\\site-packages\\optuna\\study\\_optimize.py\u001B[0m in \u001B[0;36m_run_trial\u001B[1;34m(study, func, catch)\u001B[0m\n\u001B[0;32m 249\u001B[0m \u001B[1;32mand\u001B[0m \u001B[1;32mnot\u001B[0m \u001B[0misinstance\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mfunc_err\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mcatch\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 250\u001B[0m ):\n\u001B[1;32m--> 251\u001B[1;33m \u001B[1;32mraise\u001B[0m \u001B[0mfunc_err\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 252\u001B[0m \u001B[1;32mreturn\u001B[0m \u001B[0mfrozen_trial\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 253\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n", "\u001B[1;32m~\\anaconda3\\lib\\site-packages\\optuna\\study\\_optimize.py\u001B[0m in \u001B[0;36m_run_trial\u001B[1;34m(study, func, catch)\u001B[0m\n\u001B[0;32m 198\u001B[0m \u001B[1;32mwith\u001B[0m \u001B[0mget_heartbeat_thread\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mtrial\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_trial_id\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mstudy\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_storage\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 199\u001B[0m \u001B[1;32mtry\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 200\u001B[1;33m \u001B[0mvalue_or_values\u001B[0m \u001B[1;33m=\u001B[0m 
\u001B[0mfunc\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mtrial\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 201\u001B[0m \u001B[1;32mexcept\u001B[0m \u001B[0mexceptions\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mTrialPruned\u001B[0m \u001B[1;32mas\u001B[0m \u001B[0me\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 202\u001B[0m \u001B[1;31m# TODO(mamu): Handle multi-objective cases.\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n", "\u001B[1;32m~\\AppData\\Local\\Temp\\ipykernel_12252\\1686883986.py\u001B[0m in \u001B[0;36mobjective\u001B[1;34m(trial)\u001B[0m\n\u001B[0;32m 18\u001B[0m \u001B[0mdval\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mxgb\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mDMatrix\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mX_val\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0my_val\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 19\u001B[0m \u001B[0mpruning_callback\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mXGBoostPruningCallback\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mtrial\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;34m'validation-rmse'\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m---> 20\u001B[1;33m bst = xgb.train(params=params,\n\u001B[0m\u001B[0;32m 21\u001B[0m \u001B[0mdtrain\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdopt\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 22\u001B[0m \u001B[0mevals\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mdval\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;34m'validation'\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n", "\u001B[1;32m~\\anaconda3\\lib\\site-packages\\xgboost\\core.py\u001B[0m in \u001B[0;36minner_f\u001B[1;34m(*args, **kwargs)\u001B[0m\n\u001B[0;32m 
618\u001B[0m \u001B[1;32mfor\u001B[0m \u001B[0mk\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0marg\u001B[0m \u001B[1;32min\u001B[0m \u001B[0mzip\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0msig\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mparameters\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0margs\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 619\u001B[0m \u001B[0mkwargs\u001B[0m\u001B[1;33m[\u001B[0m\u001B[0mk\u001B[0m\u001B[1;33m]\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0marg\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 620\u001B[1;33m \u001B[1;32mreturn\u001B[0m \u001B[0mfunc\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m**\u001B[0m\u001B[0mkwargs\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 621\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 622\u001B[0m \u001B[1;32mreturn\u001B[0m \u001B[0minner_f\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n", "\u001B[1;32m~\\anaconda3\\lib\\site-packages\\xgboost\\training.py\u001B[0m in \u001B[0;36mtrain\u001B[1;34m(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, custom_metric)\u001B[0m\n\u001B[0;32m 183\u001B[0m \u001B[1;32mif\u001B[0m \u001B[0mcb_container\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mbefore_iteration\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mbst\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mi\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtrain\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mevals\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 184\u001B[0m \u001B[1;32mbreak\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 185\u001B[1;33m \u001B[0mbst\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mupdate\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mdtrain\u001B[0m\u001B[1;33m,\u001B[0m 
\u001B[0mi\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mobj\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 186\u001B[0m \u001B[1;32mif\u001B[0m \u001B[0mcb_container\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mafter_iteration\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mbst\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mi\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtrain\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mevals\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 187\u001B[0m \u001B[1;32mbreak\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n", "\u001B[1;32m~\\anaconda3\\lib\\site-packages\\xgboost\\core.py\u001B[0m in \u001B[0;36mupdate\u001B[1;34m(self, dtrain, iteration, fobj)\u001B[0m\n\u001B[0;32m 1916\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1917\u001B[0m \u001B[1;32mif\u001B[0m \u001B[0mfobj\u001B[0m \u001B[1;32mis\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 1918\u001B[1;33m _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,\n\u001B[0m\u001B[0;32m 1919\u001B[0m \u001B[0mctypes\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mc_int\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0miteration\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1920\u001B[0m dtrain.handle))\n", "\u001B[1;31mKeyboardInterrupt\u001B[0m: " ] } ], "source": [
"# Search settings\n",
"SEED = 42  # used for the Optuna sampler and passed to XGBoost as random_state\n",
"N_TRIALS = 1000  # upper bound on the number of trials\n",
"# Wall-clock budget for the whole study. The 'dart' trials logged by an earlier run\n",
"# took well over ten minutes each, so the trial count alone is not a practical bound.\n",
"TIMEOUT_SECONDS = 3 * 60 * 60\n",
"\n",
"\n",
"def objective(trial: optuna.Trial) -> float:\n",
"    \"\"\"\n",
"    Trains one XGBoost candidate and scores it on a held-out validation window.\n",
"\n",
"    The last 252 rows of the module-level X_train / y_train are set aside with\n",
"    ts_split for validation and the booster is fitted on the remaining rows.\n",
"    The pruning callback watches 'validation-rmse' so that Optuna can stop\n",
"    unpromising trials early.\n",
"\n",
"    Parameters:\n",
"    trial (optuna.Trial): Trial used to sample the hyperparameters.\n",
"\n",
"    Returns:\n",
"    float: Validation RMSE multiplied by 100 (lower is better).\n",
"    \"\"\"\n",
"    # X_train / y_train are only read here, so no `global` declaration is needed.\n",
"    params = {\n",
"        'objective': 'reg:squarederror',\n",
"        'eval_metric': 'rmse',\n",
"        # Same key and value as const_params below, so every trial is trained\n",
"        # with the setting the final configuration uses.\n",
"        'random_state': SEED,\n",
"        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),\n",
"        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),\n",
"        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),\n",
"        'subsample': trial.suggest_float('subsample', 0.8, 1.0),\n",
"        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),\n",
"        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.5, log=True),\n",
"        'max_depth': trial.suggest_int('max_depth', 1, 9),\n",
"    }\n",
"    num_boost_round = trial.suggest_int('num_boost_rounds', 500, 1500)\n",
"\n",
"    # Chronological split: tune on the older rows, validate on the newest 252.\n",
"    X_opt, X_val, y_opt, y_val = ts_split(X_train, y_train, 252)\n",
"    dopt = xgb.DMatrix(X_opt, y_opt)\n",
"    dval = xgb.DMatrix(X_val, y_val)\n",
"\n",
"    # The observation key is '<eval name>-<metric>', matching the evals entry below.\n",
"    pruning_callback = XGBoostPruningCallback(trial, 'validation-rmse')\n",
"    bst = xgb.train(params=params,\n",
"                    dtrain=dopt,\n",
"                    evals=[(dval, 'validation')],\n",
"                    callbacks=[pruning_callback],\n",
"                    num_boost_round=num_boost_round,\n",
"                    verbose_eval=False)\n",
"    y_pred = bst.predict(dval)\n",
"\n",
"    # NOTE(review): the factor 100 presumably undoes scale_price=100 from get_dataset - confirm.\n",
"    return root_mean_squared_error(y_val, y_pred) * 100\n",
"\n",
"\n",
"study = optuna.create_study(\n",
"    direction='minimize',\n",
"    # Seeded so the sequence of sampled configurations is repeatable\n",
"    sampler=optuna.samplers.TPESampler(seed=SEED),\n",
")\n",
"# Stops at N_TRIALS or TIMEOUT_SECONDS, whichever is reached first\n",
"study.optimize(objective, n_trials=N_TRIALS, timeout=TIMEOUT_SECONDS)\n",
"\n",
"print('Number of finished trials:', len(study.trials))\n",
"print('Best trial:')\n",
"trial = study.best_trial\n",
"print(' Value: {:.5f}'.format(trial.value))\n",
"print(' Params: ')\n",
"for key, value in trial.params.items():\n",
"    print(' {}: {}'.format(key, value))\n",
"\n",
"# model_name = input('Model name: ')\n",
"\n",
"# Work on a copy so the trial's own parameter record is left untouched\n",
"best_params = dict(trial.params)\n",
"\n",
"const_params = {\n",
"    'objective': 'reg:squarederror',\n",
"    'eval_metric': 'rmse',\n",
"    'random_state': SEED\n",
"}\n",
"\n",
"# The round count is an argument of xgb.train, not a booster parameter, so it is\n",
"# split off before the fixed settings are merged in.\n",
"n_rounds = best_params.pop('num_boost_rounds')\n",
"best_params.update(const_params)"
], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 6, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "1.0021918308112492\n", "adj_close_lag_1: 2880.0\n", "adj_close_lag_2: 858.0\n", "return_lag_1: 755.0\n", "rsi_ema_lag_1: 719.0\n", "open_lag_1: 619.0\n", "volume_lag_1: 522.0\n", "high_lag_1: 514.0\n", "low_lag_1: 461.0\n", "adj_close_lag_3: 388.0\n", "log1p_return_lag_1: 360.0\n", "return_lag_2: 352.0\n", 
"ewma_20_lag_1: 300.0\n", "return_lag_8: 298.0\n", "volume_lag_2: 260.0\n", "volume_lag_3: 215.0\n", "blob_pol_lag_1: 214.0\n", "rsi_ema_lag_2: 213.0\n", "vader_neg_lag_1: 206.0\n", "adj_close_lag_4: 204.0\n", "finbert_pos_lag_1: 195.0\n", "blob_sub_lag_4: 193.0\n", "finbert_sent_lag_1: 192.0\n", "vader_neu_lag_3: 192.0\n", "blob_sub_lag_1: 191.0\n", "ewmstd_20_lag_1: 187.0\n", "finbert_neg_lag_12: 187.0\n", "blob_pol_lag_7: 185.0\n", "finbert_neu_lag_1: 183.0\n", "return_lag_4: 179.0\n", "blob_pol_lag_3: 174.0\n", "return_lag_3: 173.0\n", "volume_lag_4: 173.0\n", "blob_pol_lag_10: 172.0\n", "vader_pos_lag_7: 171.0\n", "vader_pos_lag_12: 171.0\n", "blob_pol_lag_5: 170.0\n", "blob_sub_lag_3: 168.0\n", "blob_sub_lag_8: 168.0\n", "finbert_neg_lag_1: 164.0\n", "blob_pol_lag_4: 164.0\n", "vader_pos_lag_6: 164.0\n", "vader_neg_lag_5: 163.0\n", "vader_neu_lag_1: 162.0\n", "finbert_neu_lag_4: 161.0\n", "vader_neg_lag_2: 160.0\n", "vader_neu_lag_4: 159.0\n", "finbert_pos_lag_4: 158.0\n", "macd_lag_1: 156.0\n", "blob_pol_lag_12: 154.0\n", "finbert_pos_lag_13: 154.0\n", "finbert_neu_lag_14: 154.0\n", "ewma_60_lag_1: 153.0\n", "volume_lag_5: 153.0\n", "blob_pol_lag_2: 152.0\n", "vader_pos_lag_3: 152.0\n", "finbert_pos_lag_5: 152.0\n", "finbert_sent_lag_8: 152.0\n", "vader_pos_lag_14: 152.0\n", "vader_pos_lag_2: 151.0\n", "vader_neg_lag_3: 149.0\n", "adj_close_lag_6: 148.0\n", "blob_pol_lag_9: 148.0\n", "return_lag_10: 147.0\n", "blob_sub_lag_12: 147.0\n", "finbert_pos_lag_9: 146.0\n", "vader_neu_lag_11: 146.0\n", "vader_neu_lag_2: 145.0\n", "finbert_pos_lag_7: 143.0\n", "return_lag_13: 143.0\n", "blob_pol_lag_8: 142.0\n", "return_lag_12: 142.0\n", "return_lag_6: 141.0\n", "finbert_neu_lag_6: 141.0\n", "finbert_sent_lag_6: 141.0\n", "blob_sub_lag_11: 141.0\n", "finbert_pos_lag_11: 141.0\n", "adj_close_lag_5: 140.0\n", "vader_neg_lag_7: 140.0\n", "blob_sub_lag_10: 140.0\n", "return_lag_5: 139.0\n", "blob_sub_lag_6: 139.0\n", "vader_pos_lag_1: 138.0\n", "finbert_pos_lag_3: 
138.0\n", "finbert_neu_lag_5: 138.0\n", "log1p_return_lag_2: 137.0\n", "finbert_sent_lag_13: 137.0\n", "finbert_neu_lag_11: 136.0\n", "finbert_pos_lag_12: 136.0\n", "vader_pos_lag_4: 135.0\n", "blob_sub_lag_2: 133.0\n", "rsi_ema_lag_4: 133.0\n", "volume_lag_6: 133.0\n", "volume_lag_9: 133.0\n", "finbert_neu_lag_7: 131.0\n", "vader_neu_lag_9: 131.0\n", "return_lag_11: 131.0\n", "vader_neg_lag_13: 131.0\n", "vader_pos_lag_8: 130.0\n", "volume_lag_10: 130.0\n", "return_lag_9: 129.0\n", "finbert_pos_lag_10: 129.0\n", "ewma_20_lag_2: 127.0\n", "log1p_return_lag_8: 127.0\n", "ewmstd_20_lag_3: 126.0\n", "vader_neu_lag_6: 126.0\n", "finbert_neu_lag_10: 123.0\n", "vader_neg_lag_9: 122.0\n", "finbert_pos_lag_14: 122.0\n", "blob_pol_lag_11: 121.0\n", "vader_neg_lag_4: 120.0\n", "vader_neg_lag_6: 120.0\n", "return_lag_14: 120.0\n", "finbert_sent_lag_3: 119.0\n", "finbert_pos_lag_2: 118.0\n", "finbert_sent_lag_2: 118.0\n", "return_lag_7: 118.0\n", "finbert_neg_lag_13: 118.0\n", "vader_pos_lag_13: 118.0\n", "finbert_neu_lag_9: 117.0\n", "volume_lag_11: 117.0\n", "vader_pos_lag_11: 116.0\n", "finbert_neu_lag_2: 114.0\n", "finbert_neu_lag_8: 114.0\n", "finbert_sent_lag_12: 113.0\n", "vader_pos_lag_5: 112.0\n", "vader_neu_lag_5: 111.0\n", "vader_neu_lag_12: 111.0\n", "vader_neg_lag_14: 111.0\n", "finbert_neg_lag_2: 110.0\n", "finbert_neu_lag_3: 110.0\n", "finbert_sent_lag_11: 110.0\n", "finbert_neg_lag_5: 109.0\n", "finbert_neg_lag_6: 109.0\n", "blob_sub_lag_14: 109.0\n", "finbert_sent_lag_5: 108.0\n", "finbert_sent_lag_7: 108.0\n", "finbert_neg_lag_14: 108.0\n", "finbert_sent_lag_14: 108.0\n", "blob_pol_lag_6: 107.0\n", "finbert_pos_lag_8: 107.0\n", "vader_pos_lag_9: 107.0\n", "blob_sub_lag_13: 106.0\n", "volume_lag_8: 105.0\n", "rsi_ema_lag_8: 105.0\n", "vader_neu_lag_8: 105.0\n", "adj_close_lag_9: 104.0\n", "vader_neg_lag_12: 104.0\n", "finbert_neg_lag_8: 103.0\n", "blob_sub_lag_9: 102.0\n", "rsi_ema_lag_11: 102.0\n", "blob_pol_lag_13: 102.0\n", "vader_neu_lag_7: 101.0\n", 
"vader_neu_lag_10: 101.0\n", "vader_neg_lag_11: 101.0\n", "blob_sub_lag_5: 100.0\n", "vader_neu_lag_13: 100.0\n", "rsi_ema_lag_3: 96.0\n", "finbert_neg_lag_3: 96.0\n", "volume_lag_14: 96.0\n", "adj_close_lag_8: 95.0\n", "volume_lag_12: 94.0\n", "finbert_neu_lag_13: 94.0\n", "vader_neg_lag_8: 93.0\n", "low_lag_2: 92.0\n", "ewmstd_20_lag_2: 92.0\n", "adj_close_lag_7: 92.0\n", "finbert_neu_lag_12: 92.0\n", "open_lag_14: 92.0\n", "finbert_pos_lag_6: 91.0\n", "low_lag_3: 89.0\n", "rsi_ema_lag_7: 89.0\n", "ewmstd_20_lag_7: 89.0\n", "ewmstd_20_lag_12: 89.0\n", "high_lag_13: 89.0\n", "rsi_ema_lag_14: 89.0\n", "blob_sub_lag_7: 87.0\n", "vader_neg_lag_10: 87.0\n", "rsi_ema_lag_5: 86.0\n", "volume_lag_7: 86.0\n", "rsi_ema_lag_9: 86.0\n", "finbert_neg_lag_11: 86.0\n", "finbert_sent_lag_10: 85.0\n", "ewmstd_20_lag_4: 84.0\n", "finbert_neg_lag_4: 84.0\n", "finbert_neg_lag_10: 84.0\n", "blob_pol_lag_14: 84.0\n", "macd_lag_12: 83.0\n", "rsi_ema_lag_10: 82.0\n", "rsi_ema_lag_6: 81.0\n", "rsi_ema_lag_12: 81.0\n", "rsi_ema_lag_13: 81.0\n", "finbert_neg_lag_7: 80.0\n", "ewmstd_20_lag_8: 79.0\n", "open_lag_2: 78.0\n", "volume_lag_13: 78.0\n", "ewmstd_20_lag_14: 78.0\n", "adj_close_lag_10: 77.0\n", "vader_neu_lag_14: 77.0\n", "ewmstd_20_lag_5: 76.0\n", "finbert_sent_lag_9: 76.0\n", "vader_pos_lag_10: 76.0\n", "log1p_return_lag_4: 75.0\n", "finbert_neg_lag_9: 73.0\n", "ewmstd_20_lag_6: 72.0\n", "log1p_return_lag_10: 72.0\n", "high_lag_2: 70.0\n", "open_lag_5: 70.0\n", "open_lag_7: 70.0\n", "open_lag_3: 69.0\n", "finbert_sent_lag_4: 68.0\n", "ewmstd_20_lag_13: 67.0\n", "log1p_return_lag_14: 67.0\n", "log1p_return_lag_12: 65.0\n", "adj_close_lag_14: 65.0\n", "high_lag_4: 62.0\n", "open_lag_6: 62.0\n", "ewma_60_lag_14: 62.0\n", "high_lag_10: 61.0\n", "open_lag_4: 60.0\n", "log1p_return_lag_13: 60.0\n", "macd_lag_13: 60.0\n", "macd_lag_14: 59.0\n", "low_lag_14: 58.0\n", "macd_lag_3: 57.0\n", "adj_close_lag_13: 57.0\n", "adj_close_lag_11: 55.0\n", "open_lag_13: 55.0\n", "log1p_return_lag_3: 
54.0\n", "log1p_return_lag_5: 54.0\n", "macd_lag_2: 53.0\n", "log1p_return_lag_6: 53.0\n", "log1p_return_lag_9: 53.0\n", "ewmstd_20_lag_9: 52.0\n", "ewma_60_lag_2: 51.0\n", "open_lag_9: 50.0\n", "low_lag_13: 50.0\n", "ewma_60_lag_3: 49.0\n", "log1p_return_lag_11: 49.0\n", "macd_lag_11: 49.0\n", "macd_lag_9: 48.0\n", "ewma_20_lag_10: 48.0\n", "open_lag_11: 48.0\n", "ewma_60_lag_13: 48.0\n", "high_lag_14: 48.0\n", "blob_sub_med_lag_1: 47.0\n", "ewma_20_lag_3: 47.0\n", "ewmstd_20_lag_10: 47.0\n", "low_lag_4: 45.0\n", "ewma_20_lag_9: 44.0\n", "macd_lag_10: 44.0\n", "ewmstd_20_lag_11: 44.0\n", "ewma_60_lag_12: 44.0\n", "low_lag_5: 43.0\n", "ewma_20_lag_14: 43.0\n", "high_lag_7: 42.0\n", "log1p_return_lag_7: 42.0\n", "low_lag_9: 42.0\n", "ewma_20_lag_8: 41.0\n", "high_lag_6: 39.0\n", "open_lag_10: 39.0\n", "low_lag_6: 38.0\n", "low_lag_8: 38.0\n", "adj_close_lag_12: 38.0\n", "macd_lag_5: 37.0\n", "macd_lag_8: 37.0\n", "macd_lag_7: 36.0\n", "open_lag_12: 36.0\n", "macd_lag_6: 35.0\n", "high_lag_9: 35.0\n", "high_lag_3: 34.0\n", "macd_lag_4: 34.0\n", "ewma_20_lag_5: 34.0\n", "low_lag_7: 34.0\n", "ewma_20_lag_6: 33.0\n", "high_lag_5: 32.0\n", "ewma_20_lag_4: 30.0\n", "high_lag_8: 29.0\n", "ewma_20_lag_13: 29.0\n", "high_lag_11: 27.0\n", "low_lag_11: 27.0\n", "low_lag_10: 25.0\n", "ewma_20_lag_12: 25.0\n", "blob_sub_med_lag_10: 24.0\n", "blob_sub_med_lag_2: 23.0\n", "ewma_60_lag_4: 22.0\n", "open_lag_8: 22.0\n", "high_lag_12: 22.0\n", "ewma_60_lag_7: 21.0\n", "blob_sub_med_lag_9: 20.0\n", "ewma_60_lag_11: 19.0\n", "low_lag_12: 19.0\n", "ewma_60_lag_5: 18.0\n", "ewma_60_lag_9: 17.0\n", "ewma_20_lag_11: 17.0\n", "ewma_20_lag_7: 16.0\n", "ewma_60_lag_8: 16.0\n", "blob_sub_med_lag_6: 15.0\n", "blob_sub_med_lag_4: 12.0\n", "ewma_60_lag_6: 12.0\n", "blob_sub_med_lag_13: 11.0\n", "ewma_60_lag_10: 10.0\n", "blob_sub_med_lag_11: 9.0\n", "blob_sub_med_lag_7: 7.0\n", "blob_sub_med_lag_12: 6.0\n", "blob_sub_med_lag_5: 5.0\n", "blob_sub_med_lag_3: 4.0\n", "blob_sub_med_lag_8: 3.0\n", 
"blob_sub_lag_14: 1.0\n" ] } ], "source": [
"# Fit a booster with a fixed, hand-written configuration on the whole training\n",
"# window and evaluate it on the held-out test rows.\n",
"dtrain = xgb.DMatrix(X_train, y_train)\n",
"dtest = xgb.DMatrix(X_test, y_test)\n",
"\n",
"# NOTE: rebinding best_params replaces whatever an earlier cell stored under this name.\n",
"best_params = {\n",
"    'eta': 0.01,\n",
"    'colsample_bytree': 0.6,\n",
"    'subsample': 0.9,\n",
"    'max_depth': 6,\n",
"    'lambda': 1.0,\n",
"    'alpha': 0.0001,\n",
"    'objective': 'reg:squarederror',\n",
"    'random_state': 42\n",
"}\n",
"model = xgb.train(best_params, dtrain, num_boost_round=800)\n",
"print()\n",
"\n",
"# Test-set RMSE, multiplied by 100\n",
"preds = model.predict(dtest)\n",
"print(root_mean_squared_error(y_test, preds) * 100)\n",
"\n",
"# Rank the features by their 'weight' importance score, largest first\n",
"importance_dict = model.get_score(importance_type='weight')\n",
"sorted_importance = list(importance_dict.items())\n",
"sorted_importance.sort(key=lambda pair: pair[1], reverse=True)\n",
"\n",
"# Print the sorted key-value pairs\n",
"for feature, importance in sorted_importance:\n",
"    print(f\"{feature}: {importance}\")"
], "metadata": { "collapsed": false } }, { "cell_type": "markdown", "source": [ "Best trial:\n", " Value: 0.01113\n", " Params:\n", " max_depth: 3\n", " learning_rate: 0.013019168747264056\n", " subsample: 0.8300701590363793\n", " colsample_bytree: 0.5999155901719391\n", " reg_alpha: 1.1965214202530094e-08\n", " reg_lambda: 3.7725711320800848\n", " num_boost_rounds: 580" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [], "metadata": { "collapsed": false } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }