In [43]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from statsmodels.stats.stattools import durbin_watson
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.stattools import kpss

In [30]:
df_apple = pd.read_csv('../coal-price-data/investing/AAPL Historical Data.csv')
df_walmart = pd.read_csv('../coal-price-data/investing/WMT Historical Data.csv')
df_tesla = pd.read_csv('../coal-price-data/investing/TSLA Historical Data.csv')

In [3]:
# df_apple.rename(columns = {'Price':'apple'}, inplace = True)
# df_walmart.rename(columns = {'Price':'walmart'}, inplace = True)
# df_tesla.rename(columns = {'Price':'tesla'}, inplace = True)

In [4]:
df_apple

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,02/01/2024,182.52,183.97,191.00,179.26,45.12M,-1.02%
1,01/01/2024,184.40,187.15,196.38,180.17,1.19B,-4.22%
2,12/01/2023,192.53,190.33,199.62,187.45,1.06B,1.36%
3,11/01/2023,189.95,171.00,192.93,170.12,1.10B,11.23%
4,10/01/2023,170.77,171.22,182.34,165.67,1.17B,-0.26%
...,...,...,...,...,...,...,...
513,05/01/1981,0.15,0.13,0.15,0.12,590.42M,15.38%
514,04/01/1981,0.13,0.11,0.13,0.11,536.93M,18.18%
515,03/01/1981,0.11,0.12,0.12,0.10,700.72M,-8.33%
516,02/01/1981,0.12,0.12,0.13,0.11,321.62M,-7.69%


In [5]:
df = pd.merge(df_apple[['Date', 'Price']], df_walmart[['Date', 'Price']], on='Date', how='right').rename(columns = {'Price_x':'apple', 'Price_y':'walmart'})
df = df.merge(df_tesla[['Date', 'Price']], on='Date', how='right').rename(columns={'Price':'tesla'})

In [6]:
df

Unnamed: 0,Date,apple,walmart,tesla
0,02/01/2024,182.52,175.56,191.97
1,01/01/2024,184.40,165.25,187.29
2,12/01/2023,192.53,157.65,248.48
3,11/01/2023,189.95,155.69,240.08
4,10/01/2023,170.77,163.41,200.84
...,...,...,...,...
159,11/01/2010,11.11,54.09,2.36
160,10/01/2010,10.75,54.17,1.46
161,09/01/2010,10.13,53.52,1.36
162,08/01/2010,8.68,50.14,1.30


In [7]:
df['Date'] =  pd.to_datetime(df['Date'])

In [8]:
df

Unnamed: 0,Date,apple,walmart,tesla
0,2024-02-01,182.52,175.56,191.97
1,2024-01-01,184.40,165.25,187.29
2,2023-12-01,192.53,157.65,248.48
3,2023-11-01,189.95,155.69,240.08
4,2023-10-01,170.77,163.41,200.84
...,...,...,...,...
159,2010-11-01,11.11,54.09,2.36
160,2010-10-01,10.75,54.17,1.46
161,2010-09-01,10.13,53.52,1.36
162,2010-08-01,8.68,50.14,1.30


In [9]:
df = df.set_index('Date').rename_axis('company', axis=1)

In [10]:
df

company,apple,walmart,tesla
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-02-01,182.52,175.56,191.97
2024-01-01,184.40,165.25,187.29
2023-12-01,192.53,157.65,248.48
2023-11-01,189.95,155.69,240.08
2023-10-01,170.77,163.41,200.84
...,...,...,...
2010-11-01,11.11,54.09,2.36
2010-10-01,10.75,54.17,1.46
2010-09-01,10.13,53.52,1.36
2010-08-01,8.68,50.14,1.30


In [11]:
fig = px.line(df, facet_col="company", facet_col_wrap=1)
fig.update_yaxes(matches=None)
fig.show()

In [12]:
fig = px.area(df, facet_col='company', facet_col_wrap=1)
fig.update_yaxes(matches=None)
fig.show()

In [13]:
n_obs = 20
df_train, df_test = df[0:-n_obs], df[-n_obs:]

In [14]:
def adf_test(df):
    result = adfuller(df.values)
    print('ADF Statistics: %f' % result[0])
    print('p-value: %f' % result[1])
    print('Critical values:')
    for key, value in result[4].items():
        print('\t%s: %.3f' % (key, value))

In [15]:
print('ADF Test: Apple time series')
adf_test(df_train['apple'])
print('ADF Test: Walmart time series')
adf_test(df_train['walmart'])
print('ADF Test: Tesla time series')
adf_test(df_train['tesla'])

ADF Test: Apple time series
ADF Statistics: -2.793474
p-value: 0.059212
Critical values:
	1%: -3.480
	5%: -2.883
	10%: -2.578
ADF Test: Walmart time series
ADF Statistics: -1.407972
p-value: 0.578448
Critical values:
	1%: -3.478
	5%: -2.882
	10%: -2.578
ADF Test: Tesla time series
ADF Statistics: -1.195298
p-value: 0.675615
Critical values:
	1%: -3.482
	5%: -2.884
	10%: -2.579


In [17]:
def kpss_test(df):    
    statistic, p_value, n_lags, critical_values = kpss(df.values)
    
    print(f'KPSS Statistic: {statistic}')
    print(f'p-value: {p_value}')
    print(f'num lags: {n_lags}')
    print('Critial Values:')
    for key, value in critical_values.items():
        print(f'   {key} : {value}')

In [18]:
print('KPSS Test: Apple time series')
kpss_test(df_train['apple'])
print('KPSS Test: Walmart time series')
kpss_test(df_train['walmart'])
print('KPSS Test: Tesla time series')
kpss_test(df_train['tesla'])

KPSS Test: Apple time series
KPSS Statistic: 1.4799049926667052
p-value: 0.01
num lags: 8
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739
KPSS Test: Walmart time series
KPSS Statistic: 1.5504555338108945
p-value: 0.01
num lags: 8
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739
KPSS Test: Tesla time series
KPSS Statistic: 1.2070673514705184
p-value: 0.01
num lags: 8
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.




In [19]:
df_train_transformed = df_train.diff().dropna()

fig = px.line(df_train_transformed, facet_col="company", facet_col_wrap=1)
fig.update_yaxes(matches=None)
fig.show()

In [20]:
print('ADF Test: Apple time series transformed')
adf_test(df_train_transformed['apple'])
print('ADF Test: Walmart time series transformed')
adf_test(df_train_transformed['walmart'])
print('ADF Test: Tesla time series transformed')
adf_test(df_train_transformed['tesla'])

ADF Test: Apple time series transformed
ADF Statistics: -3.799168
p-value: 0.002916
Critical values:
	1%: -3.480
	5%: -2.883
	10%: -2.578
ADF Test: Walmart time series transformed
ADF Statistics: -10.306569
p-value: 0.000000
Critical values:
	1%: -3.478
	5%: -2.882
	10%: -2.578
ADF Test: Tesla time series transformed
ADF Statistics: -2.927547
p-value: 0.042237
Critical values:
	1%: -3.482
	5%: -2.884
	10%: -2.579


In [21]:
print('KPSS Test: Apple time series transformed')
kpss_test(df_train_transformed['apple'])
print('KPSS Test: Walmart time series transformed')
kpss_test(df_train_transformed['walmart'])
print('KPSS Test: Tesla time series transformed')
kpss_test(df_train_transformed['tesla'])

KPSS Test: Apple time series transformed
KPSS Statistic: 0.30642603993334216
p-value: 0.1
num lags: 7
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739
KPSS Test: Walmart time series transformed
KPSS Statistic: 0.19811078226585627
p-value: 0.1
num lags: 8
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739
KPSS Test: Tesla time series transformed
KPSS Statistic: 0.09114707308088876
p-value: 0.1
num lags: 17
Critial Values:
   10% : 0.347
   5% : 0.463
   2.5% : 0.574
   1% : 0.739



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.




In [25]:
model = VAR(df_train_transformed)


No frequency information was provided, so inferred frequency -1MS will be used.


A date index has been provided, but it is not monotonic and so will be ignored when e.g. forecasting.



In [27]:
for i in [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]:
    result = model.fit(i)
    print('Lag Order =', i)
    print('AIC : ', result.aic)
    print('BIC : ', result.bic)
    print('FPE : ', result.fpe)
    print('HQIC: ', result.hqic, '\n')

Lag Order = 1
AIC :  13.121845837129593
BIC :  13.371634039180403
FPE :  499763.6483155377
HQIC:  13.223349623202385 

Lag Order = 2
AIC :  13.018387880139725
BIC :  13.457564885089665
FPE :  450734.1429337017
HQIC:  13.196854225926092 

Lag Order = 3
AIC :  12.924920345350833
BIC :  13.555272293052827
FPE :  410713.6140093425
HQIC:  13.181076523988663 

Lag Order = 4
AIC :  12.765344110806351
BIC :  13.58868571794374
FPE :  350455.45003947406
HQIC:  13.099928235318213 

Lag Order = 5
AIC :  12.679059830627413
BIC :  13.697235025464702
FPE :  321966.3544416425
HQIC:  13.092821062630037 

Lag Order = 6
AIC :  12.592854593217835
BIC :  13.80773716819742
FPE :  296040.6198897234
HQIC:  13.086553356094514 

Lag Order = 7
AIC :  12.578950684208595
BIC :  13.992444966992267
FPE :  292889.94491444435
HQIC:  13.153358880821687 

Lag Order = 8
AIC :  12.557421041800211
BIC :  14.171462585377117
FPE :  287906.2769831479
HQIC:  13.213322276819387 

Lag Order = 9
AIC :  12.299921356873089
BIC :  1

In [28]:
results = model.fit(maxlags=15, ic='aic')
results.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Sun, 25, Feb, 2024
Time:                     21:40:09
--------------------------------------------------------------------
No. of Equations:         3.00000    BIC:                    14.5979
Nobs:                     128.000    HQIC:                   12.7724
Log likelihood:          -1144.35    FPE:                    111728.
AIC:                      11.5231    Det(Omega_mle):         44477.8
--------------------------------------------------------------------
Results for equation apple
                 coefficient       std. error           t-stat            prob
------------------------------------------------------------------------------
const              -0.699238         0.584786           -1.196           0.232
L1.apple            0.014847         0.140038            0.106           0.916
L1.walmart         -0.113138         0.105005           -1.077      

In [31]:
out = durbin_watson(results.resid)

In [32]:
for col, val in zip(df.columns, out):
    print(col, ':', round(val, 2))

apple : 2.12
walmart : 1.9
tesla : 2.13


In [34]:
maxlag=15
test = 'ssr_chi2test'

In [35]:
def grangers_causation_matrix(data, variables, test='ssr_chi2test', verbose=False):    
   
    df = pd.DataFrame(np.zeros((len(variables), len(variables))), columns=variables, index=variables)
    for c in df.columns:
        for r in df.index:
            test_result = grangercausalitytests(data[[r, c]], maxlag=maxlag, verbose=False)
            p_values = [round(test_result[i+1][0][test][1],4) for i in range(maxlag)]
            if verbose: print(f'Y = {r}, X = {c}, P Values = {p_values}')
            min_p_value = np.min(p_values)
            df.loc[r, c] = min_p_value
    df.columns = [var + '_x' for var in variables]
    df.index = [var + '_y' for var in variables]
    return df

In [38]:
grangers_causation_matrix(df_train_transformed, variables = df_train_transformed.columns)


verbose is deprecated since functions should not print results


verbose is deprecated since functions should not print results


verbose is deprecated since functions should not print results


verbose is deprecated since functions should not print results


verbose is deprecated since functions should not print results


verbose is deprecated since functions should not print results


verbose is deprecated since functions should not print results


verbose is deprecated since functions should not print results


verbose is deprecated since functions should not print results



Unnamed: 0,apple_x,walmart_x,tesla_x
apple_y,1.0,0.0003,0.0
walmart_y,0.0,1.0,0.0
tesla_y,0.0,0.0,1.0


In [39]:
lag_order = results.k_ar

df_input = df_train_transformed.values[-lag_order:]
df_forecast = results.forecast(y=df_input, steps=n_obs)
df_forecast = (pd.DataFrame(df_forecast, index=df_test.index, columns=df_test.columns + '_pred'))

In [40]:
def invert_transformation(df, pred):
    forecast = df_forecast.copy()
    columns = df.columns
    for col in columns:
        forecast[str(col)+'_pred'] = df[col].iloc[-1] + forecast[str(col)+'_pred'].cumsum()
    return forecast

In [41]:
output = invert_transformation(df_train, df_forecast)

combined = pd.concat([output['apple_pred'], df_test['apple'], output['walmart_pred'], df_test['walmart'], output['tesla_pred'], df_test['tesla']], axis=1)

In [44]:
rmse = mean_squared_error(combined['apple_pred'], combined['apple'], squared=False)
mae = mean_absolute_error(combined['apple_pred'], combined['apple'])

print('Forecast accuracy of Apple')
print('RMSE: ', round(rmse,2))
print('MAE: ', round(mae,2))

Forecast accuracy of Apple
RMSE:  4.37
MAE:  3.67



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



In [45]:
rmse = mean_squared_error(combined['walmart_pred'], combined['walmart'], squared=False)
mae = mean_absolute_error(combined['walmart_pred'], combined['walmart'])

print('Forecast accuracy of Walmart')
print('RMSE: ', round(rmse,2))
print('MAE: ', round(mae,2))

Forecast accuracy of Walmart
RMSE:  10.5
MAE:  9.71



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



In [46]:
rmse = mean_squared_error(combined['tesla_pred'], combined['tesla'], squared=False)
mae = mean_absolute_error(combined['tesla_pred'], combined['tesla'])

print('Forecast accuracy of Tesla')
print('RMSE: ', round(rmse,2))
print('MAE: ', round(mae,2))

Forecast accuracy of Tesla
RMSE:  11.66
MAE:  9.66



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.

