# Krypto1 / app_crypto_rf_model.py
from datetime import datetime, timedelta
import random

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler

import model_utils as mu
def model_run(df_all):
    """Prediction function that runs a random forest model and predicts tomorrow's cryptocurrency price."""
    first_day_future = pd.to_datetime(datetime.now() + timedelta(days=1))
    # ----------------------------------------- DATASET MANIPULATION FOR SUPERVISED LEARNING --------------------------------------------
    reframed_lags, df_final = mu.data_transform(df_all, first_day_future)
    print('I have transformed the dataset into the frame for supervised learning')
    reframed_lags.to_csv('reframed_lags.csv')
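    # Assumption about model_utils.data_transform: reframed_lags is the lagged
    # supervised-learning frame (one row per day, including a placeholder row for
    # first_day_future) and df_final is the price series extended with that future day.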
    # ----------------------------------------- TRAIN/TEST SPLIT ------------------------------------------------------
    # Randomly split the reframed data into train/test samples with a 0.8 train ratio;
    # the same randomly drawn row positions are reused for every currency's chunk.
    train_size = 0.8
    df_cut1 = reframed_lags.reset_index().iloc[:, 1:]
    train_value = int(len(df_cut1) * train_size)
    first_random = random.sample(range(len(df_cut1) - 1), train_value)
    train_bulk = np.sort(first_random)  # keep the same random positions for all subsequent chunks
    df_cut = reframed_lags.reset_index()
    train_sample = df_cut.loc[df_cut['index'].isin(train_bulk)]
    test_sample = df_cut.loc[~df_cut['index'].isin(train_bulk)]
    test = test_sample.iloc[:, 1:]
    train = train_sample.iloc[:, 1:]
    print('I have split the dataset into training and testing samples')
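    # Note: sampling positions from range(len(df_cut1) - 1) excludes the final (future)
    # row, so it always ends up in the test sample. The membership test on the 'index'
    # column assumes reframed_lags carries a default 0..n-1 RangeIndex.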
    # ----------------------------------- RE-SCALE FOR SUPERVISED LEARNING
    # TRAIN RESCALE
    # normalize features to the (0, 1) range for supervised learning
    scaler_train = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler_train.fit_transform(train.values.astype('float32'))
    df_train = pd.DataFrame(scaled)
    df_train.columns = train.columns  # restore column names
    # TEST RESCALE
    scaler_test = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler_test.fit_transform(test.values.astype('float32'))
    df_test = pd.DataFrame(scaled)
    df_test.columns = test.columns  # restore column names
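    # A separate scaler is fitted on the test sample so that scaler_test.inverse_transform
    # can later map the scaled predictions back to price units; this also means the test
    # sample is scaled independently of the training sample.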
    # ----------------------------------- MODEL
    # define features: the target column 'prices' is dropped so the model cannot see the
    # value it is asked to predict (consistent with the rolling-window evaluation below)
    train_features = df_train.drop(columns='prices').values
    test_features = df_test.drop(columns='prices').values
    # define labels
    train_labels = df_train['prices'].values
    test_labels = df_test['prices'].values
    # define a baseline prediction (the last values) for evaluating prediction accuracy
    baseline_preds = pd.DataFrame(test_features).iloc[:, 0]
    # calculate errors for the baseline prediction
    baseline_errors = abs(baseline_preds - test_labels)
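    # baseline_preds / baseline_errors are kept for reference only; they are not used
    # in the accuracy figures returned below.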
    # instantiate the model with 1000 decision trees and fit it on the training sample
    rf = RandomForestRegressor(n_estimators=1000)
    rf.fit(train_features, train_labels)
    prediction_rf = rf.predict(test_features)
    predictions = prediction_rf
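    # The predictions are still on the scaler's (0, 1) range; the block below writes them
    # back into df_test and inverts the scaling to recover prices in the original units.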
    # ----------------------------------- MODEL OUTPUT TRANSFORMATION
    # write the scaled predictions into the test frame and invert the scaling
    df_test['prices'] = predictions
    prediction_transformed = pd.DataFrame(scaler_test.inverse_transform(df_test.values.astype('float')))
    prediction_transformed.columns = test.columns
    # data with forecast: copy the original frame and fill in the predicted price for the future day
    df_with_forecast = df_final.copy()
    df_with_forecast.loc[df_with_forecast.index == df_with_forecast.index[-1], 'prices'] = prediction_transformed['prices'].iloc[-1]
    print('Final result')
    print(df_with_forecast)
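    # Assumption: the last row of df_final is the placeholder for first_day_future added by
    # model_utils.data_transform, so overwriting its 'prices' entry yields the forecast.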
    # ----------------------------------- MODEL ACCURACY
    # Accuracy is measured on the untransformed (original-scale) data in reframed_lags,
    # using a rolling (expanding-window) one-step-ahead forecast.
    # The train ratio grows with the amount of available history:
    if len(reframed_lags) > 500:
        train_size = 0.9
    elif len(reframed_lags) > 200:
        train_size = 0.8
    else:
        train_size = 0.7
    predictions = []
    test_labels_all = []
    window_length = int(len(reframed_lags) - len(reframed_lags) * train_size)
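    # Walk-forward loop: at each step the training window grows by one day and the model
    # predicts only the single following day, so every prediction is out of sample.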
    for i in range(0, window_length):
        train_accuracy = reframed_lags.iloc[0:int(len(reframed_lags) * train_size) + i, :]
        test_accuracy = reframed_lags.iloc[len(train_accuracy):len(train_accuracy) + 1, :]
        train_features_accuracy = train_accuracy.drop(columns='prices')
        test_features_accuracy = test_accuracy.drop(columns='prices')
        train_labels_accuracy = train_accuracy['prices']
        test_labels_accuracy = test_accuracy['prices']
        rf = RandomForestRegressor(n_estimators=1000)
        rf.fit(train_features_accuracy, train_labels_accuracy)
        prediction_rf = rf.predict(test_features_accuracy)
        predictions = np.append(predictions, prediction_rf)
        test_labels_all = np.append(test_labels_all, test_labels_accuracy)
    # calculate accuracy as R^2 between the observed prices and the walk-forward predictions
    accuracy = r2_score(test_labels_all, predictions)
    result_rf = pd.DataFrame({'prediction': predictions, 'data': test_labels_all})
    result_rf.to_csv('result_rf.csv')
    return df_with_forecast, accuracy, result_rf
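

# Hypothetical usage sketch, assuming df_all is a daily price DataFrame with a 'prices'
# column indexed by date (which model_utils.data_transform appears to expect); the
# synthetic demo data below is illustrative only.
if __name__ == '__main__':
    dates = pd.date_range(end=datetime.now(), periods=300, freq='D')
    demo_prices = pd.Series(np.random.rand(len(dates)) * 100, index=dates, name='prices')
    df_all_demo = demo_prices.to_frame()
    df_with_forecast, accuracy, result_rf = model_run(df_all_demo)
    print(f'Rolling-window R2: {accuracy:.3f}')
    print(df_with_forecast.tail())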