from datetime import datetime, timedelta
from math import sqrt
import random

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

import model_utils as mu


def _split_train_test(reframed_lags, train_size):
    """Randomly split the rows of *reframed_lags* into train and test frames.

    ``int(len * train_size)`` row positions are drawn without replacement
    from every position except the last one, so the final row always ends
    up as the last row of the test frame (its prediction is later used as
    the forecast).

    The split is positional, so it does not depend on the index labels of
    *reframed_lags* (the index may be named, non-integer, or non-contiguous).
    Sampling uses the global ``random`` state; seed it beforehand for
    reproducible splits.

    Raises:
        ValueError: if there are too few rows to obtain a training sample.
    """
    n_rows = len(reframed_lags)
    n_train = int(n_rows * train_size)
    if n_train == 0:
        raise ValueError(
            f"Need at least 2 rows to build a training sample, got {n_rows}"
        )

    # The last position is deliberately excluded from the candidates.
    train_positions = np.array(
        sorted(random.sample(range(n_rows - 1), n_train)), dtype=int
    )
    is_train = np.zeros(n_rows, dtype=bool)
    is_train[train_positions] = True

    return reframed_lags.iloc[is_train], reframed_lags.iloc[~is_train]


def _forecast_last_price(train, test, n_estimators):
    """Fit a random forest on *train* and return the price for the last *test* row.

    Both frames are min-max scaled with a single scaler fitted on *train*
    only; the prediction is mapped back to the original price scale with
    that same scaler. The ``'prices'`` column is the target and is excluded
    from the features.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = pd.DataFrame(
        scaler.fit_transform(train.values.astype('float32')),
        columns=train.columns,
    )
    # Re-use the train scaler: fitting a second scaler on the test rows would
    # put features and predictions on a different scale than the model saw.
    test_scaled = pd.DataFrame(
        scaler.transform(test.values.astype('float32')),
        columns=test.columns,
    )

    model = RandomForestRegressor(n_estimators=n_estimators)
    model.fit(
        train_scaled.drop(columns='prices').values,
        train_scaled['prices'].values,
    )

    # Put the predictions into the target column so that the whole matrix can
    # be passed through the scaler's inverse transform.
    test_scaled['prices'] = model.predict(
        test_scaled.drop(columns='prices').values
    )
    restored = pd.DataFrame(
        scaler.inverse_transform(test_scaled.values.astype('float')),
        columns=test.columns,
    )
    return restored['prices'].iloc[-1]


def _rolling_window_accuracy(reframed_lags, n_estimators):
    """Evaluate one-step-ahead predictions over an expanding training window.

    The initial training share depends on the number of rows (0.9 above 500
    rows, 0.8 above 200 rows, otherwise 0.7). For every remaining row a fresh
    forest is fitted on all preceding rows and asked to predict that row.

    Returns:
        ``(r2, result)`` where ``r2`` is the R^2 score of the predictions
        against the observed prices and ``result`` is a DataFrame with the
        columns ``'prediction'`` and ``'data'``.
    """
    n_rows = len(reframed_lags)
    if n_rows > 500:
        train_size = 0.9
    elif n_rows > 200:
        train_size = 0.8
    else:
        train_size = 0.7

    first_test_pos = int(n_rows * train_size)
    window_length = int(n_rows - n_rows * train_size)

    predicted = []
    observed = []
    for step in range(window_length):
        split_at = first_test_pos + step
        history = reframed_lags.iloc[:split_at, :]
        next_row = reframed_lags.iloc[split_at:split_at + 1, :]

        model = RandomForestRegressor(n_estimators=n_estimators)
        model.fit(history.drop(columns='prices'), history['prices'])

        predicted.extend(model.predict(next_row.drop(columns='prices')))
        observed.extend(next_row['prices'])

    predicted = np.asarray(predicted, dtype=float)
    observed = np.asarray(observed, dtype=float)

    # r2_score is not symmetric: ground truth first, predictions second.
    accuracy = r2_score(observed, predicted)
    result = pd.DataFrame({'prediction': predicted, 'data': observed})
    return accuracy, result


def model_run(df_all, *, n_estimators=1000):
    """Run the random forest model and forecast tomorrow's cryptocurrency price.

    Steps:
      1. ``mu.data_transform`` reshapes *df_all* for supervised learning,
         given tomorrow's timestamp.
      2. The reframed rows are randomly split 80/20 (the last row is always
         kept in the test sample), scaled, and used to fit a forest whose
         prediction for the last row becomes the forecast.
      3. A rolling-window evaluation produces an R^2 accuracy figure.

    Side effects: writes ``reframed_lags.csv`` and ``result_rf.csv`` to the
    current working directory and prints progress messages.

    Args:
        df_all: input data, passed through unchanged to
            ``mu.data_transform``.
        n_estimators: number of trees per forest (keyword-only).

    Returns:
        ``(df_with_forecast, accuracy, result_rf)``: a copy of the second
        frame returned by ``mu.data_transform`` with the forecast written
        into ``'prices'`` at its last index label, the rolling-window R^2
        score, and a DataFrame of the rolling predictions next to the
        observed values.
    """
    first_day_future = pd.to_datetime(datetime.now() + timedelta(days=1))

    # ---------------- DATASET MANIPULATION FOR SUPERVISED LEARNING ----------
    # NOTE(review): presumably the last row of reframed_lags is the future
    # day to be forecast -- confirm against mu.data_transform.
    reframed_lags, df_final = mu.data_transform(df_all, first_day_future)
    print('I have transformed the dataset into the frame for supervised learning')
    reframed_lags.to_csv('reframed_lags.csv')

    # ---------------- TRAIN/TEST SPLIT --------------------------------------
    print('tady')  # leftover debug marker, kept so the output is unchanged
    train, test = _split_train_test(reframed_lags, train_size=0.8)
    print('I have split the dataset into training and testing samples')

    # ---------------- MODEL + OUTPUT TRANSFORMATION -------------------------
    forecast = _forecast_last_price(train, test, n_estimators)

    df_with_forecast = df_final.copy()
    last_label_mask = df_with_forecast.index == df_with_forecast.index[-1]
    df_with_forecast.loc[last_label_mask, 'prices'] = forecast
    print('Final result')
    print(df_with_forecast)

    # ---------------- MODEL ACCURACY ----------------------------------------
    accuracy, result_rf = _rolling_window_accuracy(reframed_lags, n_estimators)
    result_rf.to_csv('result_rf.csv')

    return df_with_forecast, accuracy, result_rf