File size: 5,411 Bytes
05a3e2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from datetime import datetime, timedelta
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
import model_utils as mu

def _random_train_test_split(frame, train_fraction):
    """Randomly split *frame* row-wise into ``(train, test)`` DataFrames.

    ``int(len(frame) * train_fraction)`` row positions are drawn without
    replacement from every position except the last one, so the final row
    always lands in the test part.  Rows are selected by position, so the
    split does not depend on the labels or name of the frame's index.
    Both parts keep the original row order and columns.

    Raises:
        ValueError: propagated from ``random.sample`` when more positions
            are requested than are available.
    """
    import random  # local import: the module does not import it at file level

    n_rows = len(frame)
    train_count = int(n_rows * train_fraction)
    # range(n_rows - 1) deliberately leaves out the last position.
    train_positions = random.sample(range(n_rows - 1), train_count)

    in_train = np.zeros(n_rows, dtype=bool)
    in_train[np.asarray(train_positions, dtype=int)] = True
    return frame.iloc[in_train], frame.iloc[~in_train]


def _forecast_last_row(train, test, n_estimators):
    """Fit a random forest on *train* and predict 'prices' for the last test row.

    Both frames must share the same columns, including ``'prices'``.  The
    value is returned on the original (unscaled) price scale.
    """
    # One scaler, fitted on the training data only, is used for both frames
    # and for the inverse transform so that all three share the same mapping.
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = pd.DataFrame(
        scaler.fit_transform(train.values.astype('float32')),
        columns=train.columns,
    )
    test_scaled = pd.DataFrame(
        scaler.transform(test.values.astype('float32')),
        columns=test.columns,
    )

    # The target column must not be part of the features, otherwise the
    # model just learns to echo it back.
    rf = RandomForestRegressor(n_estimators=n_estimators)
    rf.fit(train_scaled.drop(columns='prices'), train_scaled['prices'])
    test_scaled['prices'] = rf.predict(test_scaled.drop(columns='prices'))

    # inverse_transform needs the full row layout the scaler was fitted on.
    restored = pd.DataFrame(
        scaler.inverse_transform(test_scaled.values.astype('float')),
        columns=test.columns,
    )
    return restored['prices'].iloc[-1]


def _rolling_window_accuracy(frame, n_estimators):
    """Score one-step-ahead predictions over an expanding training window.

    The initial training share depends on the number of rows (0.9 above 500
    rows, 0.8 above 200, otherwise 0.7).  For every remaining row a fresh
    forest is fitted on all rows before it and asked to predict that single
    row's ``'prices'`` from the other columns.

    Returns:
        ``(accuracy, result)`` where *accuracy* is the R^2 score of the
        predictions against the actual values and *result* is a DataFrame
        with columns ``'prediction'`` and ``'data'``.
    """
    from sklearn.metrics import r2_score

    n_rows = len(frame)
    if n_rows > 500:
        train_size = 0.9
    elif n_rows > 200:
        train_size = 0.8
    else:
        train_size = 0.7
    first_test_row = int(n_rows * train_size)
    window_length = int(n_rows - n_rows * train_size)

    predicted = []
    actual = []
    for offset in range(window_length):
        split = first_test_row + offset
        train_part = frame.iloc[:split]
        test_part = frame.iloc[split:split + 1]

        rf = RandomForestRegressor(n_estimators=n_estimators)
        rf.fit(train_part.drop(columns='prices'), train_part['prices'])
        predicted.extend(rf.predict(test_part.drop(columns='prices')))
        actual.extend(test_part['prices'].to_numpy())

    predicted = np.asarray(predicted, dtype=float)
    actual = np.asarray(actual, dtype=float)
    # r2_score is not symmetric: ground truth first, predictions second.
    accuracy = r2_score(actual, predicted)
    result = pd.DataFrame({'prediction': predicted, 'data': actual})
    return accuracy, result


def model_run(df_all, *, n_estimators=1000):
    """Run the random forest model and predict tomorrow's cryptocurrency price.

    Steps:
      1. ``mu.data_transform`` reframes *df_all* for supervised learning; it
         is expected to return a frame with a ``'prices'`` column plus the
         final frame that receives the forecast.  The reframed data is
         written to ``reframed_lags.csv``.
      2. The reframed rows are split randomly 80/20; the last row is always
         kept in the test part and its predicted price is the forecast.
      3. The forecast is written into the ``'prices'`` column of the last
         row of a copy of the final frame.
      4. A rolling-window evaluation produces an R^2 score and a table of
         predictions vs. actual values, saved to ``result_rf.csv``.

    Args:
        df_all: input data handed to ``mu.data_transform``.
        n_estimators: number of trees in every random forest that is fitted
            (keyword-only, defaults to 1000).

    Returns:
        Tuple ``(df_with_forecast, accuracy, result_rf)``.
    """
    first_day_future = pd.to_datetime(datetime.now() + timedelta(days=1))

    # ------------------------- dataset manipulation for supervised learning
    reframed_lags, df_final = mu.data_transform(df_all, first_day_future)
    print('I have transformed the dataset into the frame for supervised learning')
    reframed_lags.to_csv('reframed_lags.csv')

    # ------------------------- train/test split
    train, test = _random_train_test_split(reframed_lags, train_fraction=0.8)
    print('I have split the dataset into training and testing samples')

    # ------------------------- model and forecast
    forecast_price = _forecast_last_row(train, test, n_estimators)

    df_with_forecast = df_final.copy()
    is_last = df_with_forecast.index == df_with_forecast.index[-1]
    df_with_forecast.loc[is_last, 'prices'] = forecast_price
    print('Final result')
    print(df_with_forecast)

    # ------------------------- model accuracy (rolling window)
    accuracy, result_rf = _rolling_window_accuracy(reframed_lags, n_estimators)
    result_rf.to_csv('result_rf.csv')
    return df_with_forecast, accuracy, result_rf