# File size: 2,647 Bytes
# 96edc51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pandas as pd
import numpy as np

def _interpolate_numeric(frame):
    """Linearly interpolate the numeric columns of ``frame``; leave the rest untouched."""
    is_numeric = np.array(
        [pd.api.types.is_numeric_dtype(dtype) for dtype in frame.dtypes],
        dtype=bool,
    )
    if not is_numeric.any():
        # Nothing to interpolate; calling interpolate() on an all-object frame
        # would raise TypeError.
        return frame

    filled = frame.loc[:, is_numeric].interpolate(
        method='linear', limit_direction='both', limit=100
    )
    if is_numeric.all():
        return filled

    untouched = frame.loc[:, ~is_numeric]
    # Reassemble by position rather than by label: column labels may be
    # duplicated (e.g. two 'id' columns), which rules out label-based assignment.
    # argsort of a permutation yields its inverse, restoring the original order.
    original_order = np.argsort(
        np.concatenate([np.flatnonzero(is_numeric), np.flatnonzero(~is_numeric)])
    )
    return pd.concat([filled, untouched], axis=1).iloc[:, original_order]


def data_transform(df_all, first_day_future):
    """Build the feature frames used to predict the first future day.

    Steps, in order:

    1. Cast ``CPI`` and ``Employment`` to float.
    2. Append one extra row labelled ``first_day_future`` and shift every
       column down by one row, so each date carries the previous row's values
       and the future date carries the last observed values.
    3. Fill gaps: linear interpolation on numeric columns, then ``bfill`` /
       ``ffill`` on everything.
    4. Replace ``id`` by a constant ``name`` column plus its dummy ``name_no``,
       add ``Day`` / ``Month`` / ``Year`` and one ``day_<n>`` dummy per distinct
       day of month.
    5. Add ``lag1`` / ``lag2``: ``prices`` rotated by one and two rows (the
       first rows wrap around to the last prices).

    Parameters
    ----------
    df_all : pandas.DataFrame
        Must contain the columns ``CPI``, ``Employment``, ``prices`` and ``id``
        (``id`` may appear once or as duplicated column labels; the first one
        is used) and at least one row. Index labels must be parseable by
        ``pandas.to_datetime``. A column named ``index`` is ignored.
        NOTE: ``CPI`` and ``Employment`` are cast to float *in place* on this
        object; that side effect is kept because callers may rely on it.
    first_day_future : str or datetime-like
        Date of the row to append; anything ``pandas.to_datetime`` accepts.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``reframed_lags`` - model features with a default integer index
        (no ``name`` / ``date`` columns, with ``lag1`` / ``lag2``), and
        ``df_final`` - the filled frame indexed by UTC timestamps, including
        ``name``, ``name_no``, ``Day``, ``Month`` and ``Year``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    IndexError
        If ``df_all`` has no rows or ``id`` holds no non-null value.
    """
    df_all['CPI'] = df_all['CPI'].astype('float')
    df_all['Employment'] = df_all['Employment'].astype('float')

    # Local rebinding only: the caller's frame keeps its 'index' column.
    if 'index' in df_all.columns:
        df_all = df_all.drop(columns='index')

    # Append the future row. Its values are discarded by the shift() below, so
    # only its label matters; copying the last row keeps every column's dtype
    # (a transposed/object-typed row would turn the whole frame into object
    # dtype and make interpolate() fail).
    future_label = pd.to_datetime(first_day_future).strftime("%Y-%m-%d")
    future_row = df_all.iloc[[-1]].copy()
    future_row.index = [future_label]

    df_with_future = pd.concat([df_all, future_row], axis=0)
    df_with_future.index.names = ['date']
    df_with_future.index = pd.to_datetime(df_with_future.index).strftime("%Y-%m-%d")
    df_with_future = df_with_future.shift()

    df_final = _interpolate_numeric(df_with_future).bfill().ffill()

    # 'id' is a DataFrame when the label is duplicated, a Series otherwise.
    ids = df_final['id']
    if isinstance(ids, pd.DataFrame):
        ids = ids.iloc[:, 0]
    # .iloc[0] instead of [0]: integer keys on a non-integer index are no
    # longer treated as positions by recent pandas versions.
    df_final['name'] = np.repeat(ids.dropna().iloc[0], len(df_final))
    df_final = df_final.drop(columns='id')

    # Data transformation: coin dummy, time variables, day-of-month dummies.
    df_final['name_no'] = pd.get_dummies(df_final['name'], dtype='int')
    df_final.index = pd.to_datetime(df_final.index, utc=True)
    df_final['Day'] = df_final.index.day
    df_final['Month'] = df_final.index.month
    df_final['Year'] = df_final.index.year
    seasonal_dummy = pd.get_dummies(df_final.index.day, dtype='int')
    seasonal_dummy.index = df_final.index
    seasonal_dummy.columns = [str('day_' + str(value)) for value in seasonal_dummy.columns]
    reframed = pd.concat([df_final, seasonal_dummy], axis=1).drop(columns='name')
    # Debug output kept from the original implementation.
    print(reframed.iloc[-5:, :])
    reframed = reframed.reset_index().drop(columns=['date'])

    # Lag features: lag1[i] = prices[i-1] and lag2[i] = prices[i-2], with the
    # leading rows wrapping around to the last prices. This is exactly a
    # circular shift, so np.roll replaces the former row-by-row .loc loop.
    reframed_lags = reframed.copy()
    price_values = reframed_lags['prices'].to_numpy()
    for column, periods in (('lag1', 1), ('lag2', 2)):
        rolled = pd.Series(np.roll(price_values, periods), index=reframed_lags.index)
        # infer_objects() gives a numeric column even if 'prices' is object-typed.
        reframed_lags[column] = rolled.infer_objects()
    return reframed_lags, df_final