Spaces:

KatGaw
/

Krypto1

Sleeping

File size: 5,411 Bytes

05a3e2c

from datetime import datetime, timedelta
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
import model_utils as mu

def _split_train_test(frame, train_size=0.8):
    """Randomly split the rows of *frame* into a train and a test frame.

    Rows are chosen by position, so the split does not depend on what the
    index labels of *frame* happen to be. The last row is never eligible for
    the training sample and therefore always ends up as the last test row;
    the caller reads the forecast from that row.

    Args:
        frame: DataFrame to split.
        train_size: Fraction of rows drawn (without replacement) for training.

    Returns:
        Tuple ``(train, test)``, both in the original row order.

    Raises:
        ValueError: If *frame* has fewer than two rows.
    """
    import random

    n_rows = len(frame)
    if n_rows < 2:
        raise ValueError(
            'Need at least 2 rows to build a train/test split, got %d' % n_rows
        )

    n_train = int(n_rows * train_size)
    # range(n_rows - 1) deliberately excludes the last row from sampling.
    train_positions = random.sample(range(n_rows - 1), n_train)

    in_train = np.zeros(n_rows, dtype=bool)
    in_train[train_positions] = True
    return frame.iloc[in_train], frame.iloc[~in_train]


def _forecast_last_test_row(train, test):
    """Fit a random forest on *train* and predict 'prices' for the last test row.

    All columns are min-max scaled with a scaler fitted on the training rows
    only; the same scaler is applied to the test rows and used to map the
    prediction back to the original price scale. The 'prices' column is the
    label and is excluded from the model features.

    Args:
        train: Training rows; must contain a 'prices' column.
        test: Test rows with the same columns as *train*.

    Returns:
        The predicted price for the last row of *test*, in original units.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = pd.DataFrame(
        scaler.fit_transform(train.values.astype('float32')),
        columns=train.columns,
    )
    # Reuse the train scaler so test rows live on the scale the model learned.
    test_scaled = pd.DataFrame(
        scaler.transform(test.values.astype('float32')),
        columns=test.columns,
    )

    # The label must not be part of the features, otherwise the model simply
    # reads back the value it is supposed to predict.
    feature_columns = [col for col in train.columns if col != 'prices']

    # Instantiate model with 1000 decision trees
    rf = RandomForestRegressor(n_estimators=1000)
    rf.fit(train_scaled[feature_columns].values, train_scaled['prices'].values)

    # Put the predictions into the label column so that the full-width
    # inverse transform brings them back to the original price scale.
    test_scaled['prices'] = rf.predict(test_scaled[feature_columns].values)
    restored = scaler.inverse_transform(test_scaled.values.astype('float'))

    price_position = list(test.columns).index('prices')
    return restored[-1, price_position]


def _rolling_accuracy(frame):
    """Evaluate one-step-ahead forecasts on the tail of *frame*.

    For every step a fresh random forest is trained on all rows before the
    cutoff and asked to predict the single row at the cutoff; the cutoff then
    moves forward by one row. The initial training share depends on the
    number of rows (0.9 above 500 rows, 0.8 above 200 rows, otherwise 0.7).

    NOTE(review): the evaluated tail can include the very last row of
    *frame*; if that row is the not-yet-known future day, its label is
    whatever `mu.data_transform` put there -- confirm this is intended.

    Args:
        frame: DataFrame with a 'prices' label column; all other columns are
            used as features.

    Returns:
        Tuple ``(accuracy, result)`` where ``accuracy`` is the R^2 score of
        the rolling predictions (NaN when fewer than two rows were evaluated,
        since R^2 is undefined there) and ``result`` is a DataFrame with the
        columns 'prediction' and 'data'.
    """
    from sklearn.metrics import r2_score

    n_rows = len(frame)
    if n_rows > 500:
        train_size = 0.9
    elif n_rows > 200:
        train_size = 0.8
    else:
        train_size = 0.7

    first_cutoff = int(n_rows * train_size)
    window_length = int(n_rows - n_rows * train_size)

    # Loop-invariant: separate features and labels once.
    features = frame.drop(columns='prices')
    labels = frame['prices']

    predicted = []
    actual = []
    for step in range(window_length):
        cutoff = first_cutoff + step
        rf = RandomForestRegressor(n_estimators=1000)
        rf.fit(features.iloc[:cutoff], labels.iloc[:cutoff])
        predicted.extend(rf.predict(features.iloc[cutoff:cutoff + 1]))
        actual.extend(labels.iloc[cutoff:cutoff + 1])

    predicted = np.asarray(predicted, dtype=float)
    actual = np.asarray(actual, dtype=float)

    if len(actual) < 2:
        accuracy = float('nan')
    else:
        # r2_score expects (y_true, y_pred); the score is not symmetric.
        accuracy = r2_score(actual, predicted)

    result = pd.DataFrame({'prediction': predicted, 'data': actual})
    return accuracy, result


def model_run(df_all):
    """Run the random forest model and predict tomorrow's cryptocurrency price.

    The input is turned into a supervised-learning frame by
    `mu.data_transform`, randomly split into train/test samples, and a random
    forest is fitted to forecast the 'prices' value of the last row. The
    forecast is written into the last row of a copy of the frame returned by
    `mu.data_transform`. Afterwards the model is scored with a rolling
    one-step-ahead evaluation.

    Side effects: writes 'reframed_lags.csv' and 'result_rf.csv' to the
    current working directory and prints progress messages.

    Args:
        df_all: Input data handed to `mu.data_transform` (presumably the
            price history of the currencies -- confirm against model_utils).

    Returns:
        Tuple ``(df_with_forecast, accuracy, result_rf)``:
        the frame with the forecast in the 'prices' column of its last row,
        the R^2 score of the rolling evaluation, and a DataFrame holding the
        rolling predictions ('prediction') next to the observed values
        ('data').

    Raises:
        ValueError: If the transformed frame has fewer than two rows.
    """
    first_day_future = pd.to_datetime(datetime.now() + timedelta(days=1))

    # ---------------- DATASET MANIPULATION FOR SUPERVISED LEARNING ----------------
    reframed_lags, df_final = mu.data_transform(df_all, first_day_future)
    print('I have transformed the dataset into the frame for supervised learning')
    reframed_lags.to_csv('reframed_lags.csv')

    # ---------------- TRAIN/TEST SPLIT ----------------
    print('tady')  # debug marker kept from the original implementation
    train, test = _split_train_test(reframed_lags, train_size=0.8)
    print('I have split the dataset into training and testing samples')

    # ---------------- MODEL + OUTPUT TRANSFORMATION ----------------
    forecast = _forecast_last_test_row(train, test)

    # data with forecast
    df_with_forecast = df_final.copy()
    is_last_row = df_with_forecast.index == df_with_forecast.index[-1]
    df_with_forecast.loc[is_last_row, 'prices'] = forecast
    print('Final result')
    print(df_with_forecast)

    # ---------------- MODEL ACCURACY ----------------
    accuracy, result_rf = _rolling_accuracy(reframed_lags)
    result_rf.to_csv('result_rf.csv')
    return df_with_forecast, accuracy, result_rf