juncliu committed on
Commit
5928f02
·
1 Parent(s): 4b38e69

unify freq

Browse files
Files changed (1) hide show
  1. src/utils.py +41 -5
src/utils.py CHANGED
@@ -56,6 +56,29 @@ def format_df(df):
56
  # make sure the data type is float
57
  df.iloc[:, 1:] = df.iloc[:, 1:].astype(float)
58
  return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  def pivot_existed_df(df, tab_name):
60
  df = df.reset_index()
61
  if tab_name == 'univariate':
@@ -128,6 +151,8 @@ def get_grouped_dfs(root_dir='results', ds_properties='results/dataset_propertie
128
  else:
129
  df.loc[df['dataset'] == dataset, key] = dataset_properties_dict[dataset][key]
130
 
 
 
131
  # standardize by seasonal naive
132
  df = standardize_df(df)
133
  metric_columns = ['eval_metrics/MSE[mean]', 'eval_metrics/MSE[0.5]', 'eval_metrics/MAE[0.5]',
@@ -179,6 +204,13 @@ def standardize_df(df):
179
  # 6. Create a new df with standardized results
180
  original_df = df.copy()
181
  # 1. Get all the unique dataset names
 
 
 
 
 
 
 
182
  dataset_names = df['dataset'].unique()
183
  # 2. For each dataset name, get all the unique frequencies and term lengths
184
  for dataset in dataset_names:
@@ -191,11 +223,15 @@ def standardize_df(df):
191
  (df['dataset'] == dataset) & (df['frequency'] == frequency) & (df['term_length'] == term_length) & (
192
  df['model'] == 'Seasonal_Naive')]
193
  for metric in metric_columns:
194
- # 5. For each model name, dataset name, frequency, and term length, divide the model results by the seasonal_naive results
195
- df.loc[(df['dataset'] == dataset) & (df['frequency'] == frequency) & (
196
- df['term_length'] == term_length), metric] = df[(df['dataset'] == dataset) & (
197
- df['frequency'] == frequency) & (df['term_length'] == term_length)][metric] / \
198
- seasonal_naive_results[metric].values[0]
 
 
 
 
199
  # df[(df['dataset'] == 'bitbrains_fast_storage') & (df['model'] == 'seasonal_naive')]
200
  return df
201
 
 
56
  # make sure the data type is float
57
  df.iloc[:, 1:] = df.iloc[:, 1:].astype(float)
58
  return df
59
+
60
def unify_freq(df):
    """Collapse pandas-style frequency aliases into human-readable labels.

    The ``frequency`` column is cleaned in two steps -- every run of digits
    is removed (``'15T'`` -> ``'T'``) and anything from the first ``'-'``
    onwards is dropped (``'W-SUN'`` -> ``'W'``) -- and the remaining base
    alias is then translated to a label such as ``'Hourly'``.

    Both the legacy single-letter aliases and the newer pandas spellings
    (``'min'``, ``'h'``, ``'s'``, ``'ME'``, ``'QE'``, ``'YE'``) as well as
    the start-anchored variants (``'MS'``, ``'QS'``, ``'YS'``, ``'AS'``)
    are recognised, so the same cadence always ends up under one label.
    Aliases that are not in the table are left as their cleaned base form.

    Args:
        df: DataFrame with a string-valued ``frequency`` column.

    Returns:
        The same DataFrame object; its ``frequency`` column is overwritten
        in place.

    Raises:
        KeyError: if ``df`` has no ``frequency`` column.
    """
    # Strip the multiple in front of the alias, e.g. '5T' -> 'T'.
    base_alias = df['frequency'].str.replace(r'\d+', '', regex=True)
    # Drop the anchor suffix, e.g. 'W-SUN' -> 'W', 'A-DEC' -> 'A'.
    base_alias = base_alias.str.split('-').str[0]

    # Matching is exact and case-sensitive on purpose: in pandas 'M'/'MS'
    # are monthly while 'ms' is milliseconds, so case folding would be wrong.
    freq_conversion = {
        # Legacy aliases (the original table).
        'T': 'Minutely',
        'H': 'Hourly',
        'D': 'Daily',
        'W': 'Weekly',
        'M': 'Monthly',
        'Q': 'Quarterly',
        'Y': 'Yearly',
        'A': 'Yearly',
        'S': 'Secondly',
        # Current pandas aliases and start-anchored variants.
        'min': 'Minutely',
        'h': 'Hourly',
        's': 'Secondly',
        'ME': 'Monthly',
        'MS': 'Monthly',
        'QE': 'Quarterly',
        'QS': 'Quarterly',
        'YE': 'Yearly',
        'YS': 'Yearly',
        'AS': 'Yearly',
    }

    # ``replace`` (not ``map``) so that unknown aliases pass through unchanged.
    df['frequency'] = base_alias.replace(freq_conversion)
    return df
82
  def pivot_existed_df(df, tab_name):
83
  df = df.reset_index()
84
  if tab_name == 'univariate':
 
151
  else:
152
  df.loc[df['dataset'] == dataset, key] = dataset_properties_dict[dataset][key]
153
 
154
+ # unify the frequency
155
+ df = unify_freq(df)
156
  # standardize by seasonal naive
157
  df = standardize_df(df)
158
  metric_columns = ['eval_metrics/MSE[mean]', 'eval_metrics/MSE[0.5]', 'eval_metrics/MAE[0.5]',
 
204
  # 6. Create a new df with standardized results
205
  original_df = df.copy()
206
  # 1. Get all the unique dataset names
207
+ dataset_corrections = {
208
+ "saugeenday": "saugeen",
209
+ "temperature_rain_with_missing": "temperature_rain",
210
+ "kdd_cup_2018_with_missing": "kdd_cup_2018",
211
+ "car_parts_with_missing": "car_parts",
212
+ }
213
+ df['dataset'] = df['dataset'].replace(dataset_corrections)
214
  dataset_names = df['dataset'].unique()
215
  # 2. For each dataset name, get all the unique frequencies and term lengths
216
  for dataset in dataset_names:
 
223
  (df['dataset'] == dataset) & (df['frequency'] == frequency) & (df['term_length'] == term_length) & (
224
  df['model'] == 'Seasonal_Naive')]
225
  for metric in metric_columns:
226
+ try:
227
+ # 5. For each model name, dataset name, frequency, and term length, divide the model results by the seasonal_naive results
228
+ df.loc[(df['dataset'] == dataset) & (df['frequency'] == frequency) & (
229
+ df['term_length'] == term_length), metric] = df[(df['dataset'] == dataset) & (
230
+ df['frequency'] == frequency) & (df['term_length'] == term_length)][metric] / \
231
+ seasonal_naive_results[metric].values[0]
232
+ except Exception:
233
+ print(f"Error: {dataset} {term_length} {frequency} {metric}")
234
+ ipdb.set_trace()
235
  # df[(df['dataset'] == 'bitbrains_fast_storage') & (df['model'] == 'seasonal_naive')]
236
  return df
237