benediktstroebl committed on
Commit
b69a733
·
verified ·
1 Parent(s): e92240d

Upload 2 files

Browse files
Files changed (2) hide show
  1. utils/db.py +16 -7
  2. utils/viz.py +2 -2
utils/db.py CHANGED
@@ -181,7 +181,7 @@ class TracePreprocessor:
181
  # if there is a failure report, return the first one
182
  return pickle.loads(df['failure_report'][0])
183
 
184
- def _calculate_ci(self, data, confidence=0.95):
185
  data = data[np.isfinite(data)]
186
 
187
  if len(data) < 2:
@@ -189,9 +189,15 @@ class TracePreprocessor:
189
  n = len(data)
190
 
191
  mean = np.mean(data)
192
- sem = stats.sem(data)
193
 
194
- ci = stats.t.interval(confidence, n-1, loc=mean, scale=sem)
 
 
 
 
 
 
 
195
  return mean, ci[0], ci[1]
196
 
197
  def get_parsed_results(self, benchmark_name, aggregate=True):
@@ -222,12 +228,15 @@ class TracePreprocessor:
222
  agent_df = df[df['agent_name'] == agent_name]
223
 
224
  if len(agent_df) > 1:
225
- accuracy_mean, accuracy_lower, accuracy_upper = self._calculate_ci(agent_df['accuracy'])
226
- cost_mean, cost_lower, cost_upper = self._calculate_ci(agent_df['total_cost'])
227
 
228
  # format the confidence interval with +/- sign
229
- accuracy_ci = f"± {abs(accuracy_mean - accuracy_lower):.3f}"
230
- cost_ci = f"± {abs(cost_mean - cost_lower):.3f}"
 
 
 
231
 
232
  df.loc[df['agent_name'] == agent_name, 'acc_ci'] = accuracy_ci
233
  df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
 
181
  # if there is a failure report, return the first one
182
  return pickle.loads(df['failure_report'][0])
183
 
184
+ def _calculate_ci(self, data, confidence=0.95, type='minmax'):
185
  data = data[np.isfinite(data)]
186
 
187
  if len(data) < 2:
 
189
  n = len(data)
190
 
191
  mean = np.mean(data)
 
192
 
193
+ if type == 't':
194
+ sem = stats.sem(data)
195
+ ci = stats.t.interval(confidence, n-1, loc=mean, scale=sem)
196
+
197
+ elif type == 'minmax':
198
+ min = np.min(data)
199
+ max = np.max(data)
200
+ ci = (min, max)
201
  return mean, ci[0], ci[1]
202
 
203
  def get_parsed_results(self, benchmark_name, aggregate=True):
 
228
  agent_df = df[df['agent_name'] == agent_name]
229
 
230
  if len(agent_df) > 1:
231
+ accuracy_mean, accuracy_lower, accuracy_upper = self._calculate_ci(agent_df['accuracy'], type='minmax')
232
+ cost_mean, cost_lower, cost_upper = self._calculate_ci(agent_df['total_cost'], type='minmax')
233
 
234
  # format the confidence interval with +/- sign
235
+ # accuracy_ci = f"± {abs(accuracy_mean - accuracy_lower):.3f}"
236
+ # cost_ci = f"± {abs(cost_mean - cost_lower):.3f}"
237
+
238
+ accuracy_ci = f"-{abs(accuracy_mean - accuracy_lower):.3f}/+{abs(accuracy_mean - accuracy_upper):.3f}"
239
+ cost_ci = f"-{abs(cost_mean - cost_lower):.3f}/+{abs(cost_mean - cost_upper):.3f}"
240
 
241
  df.loc[df['agent_name'] == agent_name, 'acc_ci'] = accuracy_ci
242
  df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
utils/viz.py CHANGED
@@ -202,8 +202,8 @@ def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str =
202
 
203
  if len(agent_data) > 1:
204
  # Calculate 95% confidence intervals
205
- ci_x = stats.t.interval(0.95, len(agent_data[x])-1, loc=np.mean(agent_data[x]), scale=stats.sem(agent_data[x]))
206
- ci_y = stats.t.interval(0.95, len(agent_data[y])-1, loc=np.mean(agent_data[y]), scale=stats.sem(agent_data[y]))
207
 
208
  # # Add error bars for x (cost)
209
  # fig.add_trace(go.Scatter(
 
202
 
203
  if len(agent_data) > 1:
204
  # Calculate 95% confidence intervals
205
+ # ci_x = stats.t.interval(0.95, len(agent_data[x])-1, loc=np.mean(agent_data[x]), scale=stats.sem(agent_data[x]))
206
+ # ci_y = stats.t.interval(0.95, len(agent_data[y])-1, loc=np.mean(agent_data[y]), scale=stats.sem(agent_data[y]))
207
 
208
  # # Add error bars for x (cost)
209
  # fig.add_trace(go.Scatter(