zhen-dong-nexusflow committed on
Commit
7e5b9e1
·
1 Parent(s): 30f0ed5

add GPT3.5 results

Browse files
Files changed (1) hide show
  1. app.py +21 -17
app.py CHANGED
@@ -47,15 +47,15 @@ hover_css = """
47
 
48
  # Updated results reflecting the new screenshot
49
  RESULTS = {
50
- 'Climate': {"GPT4": 0.6808, "NexusRaven-V2": 0.7234},
51
- 'Heldout_Combined': {"GPT4": 0.4814, "NexusRaven-V2": 0.5990},
52
- 'Places_API': {"GPT4": 0.3541, "NexusRaven-V2": 0.5000},
53
- 'OTX': {"GPT4": 0.9130, "NexusRaven-V2": 0.9021},
54
- 'VirusTotal': {"GPT4": 0.8940, "NexusRaven-V2": 0.7815},
55
- 'VT_Multi_Dependency': {"GPT4": 0.3469, "NexusRaven-V2": 0.3673},
56
- 'VT_Multi_Disconnected': {"GPT4": 0.2380, "NexusRaven-V2": 0.3809},
57
- 'CVECPE': {"GPT4": 0.5769, "NexusRaven-V2": 0.4480},
58
- 'CVECPE_Multi_Dependency': {"GPT4": 0.1071, "NexusRaven-V2": 0.1607},
59
  }
60
 
61
  SAMPLES = {
@@ -169,9 +169,9 @@ def calculate_averages(results):
169
  difficult_tasks_avg = pd.DataFrame({k: results[k] for k in difficult_tasks}).mean(axis=1)
170
 
171
  avg_data = pd.DataFrame({
172
- 'All Tasks': all_tasks_avg,
173
- 'Tasks with Single Call (simple)': simple_tasks_avg,
174
- 'Tasks with Nested/Parallel Calls (challenging)': difficult_tasks_avg
175
  }).reset_index().rename(columns={'index': 'Model'})
176
 
177
  return avg_data
@@ -207,6 +207,7 @@ def calculate_capability_scores(results, type):
207
  capability_data = pd.DataFrame({
208
  'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
209
  'GPT4': [single_calls_avg['GPT4'], nested_calls_avg['GPT4'], parallel_calls_avg['GPT4']],
 
210
  'NexusRaven-V2': [single_calls_avg['NexusRaven-V2'], nested_calls_avg['NexusRaven-V2'], parallel_calls_avg['NexusRaven-V2']]
211
  }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
212
  elif type == "many apis many args":
@@ -221,10 +222,13 @@ def calculate_capability_scores(results, type):
221
  vt_multi_disconnected_avg = pd.DataFrame({k: results[k] for k in vt_multi_disconnected}).mean(axis=1)
222
 
223
  capability_data = pd.DataFrame({
224
- 'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', 'CVECPE (Single)', 'CVECPE_Multi (Nested)', 'Places (Nested)', 'Climate (Parallel)', 'Stack (Mostly Single)'],
225
  'GPT4': [otx_avg['GPT4'], virustotal_avg['GPT4'], vt_multi_dependency_avg['GPT4'], vt_multi_disconnected_avg['GPT4'], cvecpe_avg['GPT4'], cvecpe_multi_dependency_avg['GPT4'], places_avg['GPT4'], climate_avg['GPT4'], heldout_avg['GPT4']],
 
226
  'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
227
  places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']]
 
 
228
  }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
229
 
230
  return capability_data
@@ -314,13 +318,13 @@ with gr.Blocks(theme="dark") as demo: # Set the theme here
314
  for key, value in RESULTS.items():
315
  tab_names = {
316
  'OTX': 'OTX (Single)',
317
- 'CVECPE': 'CVECPE (Single)',
318
  'VirusTotal': 'VirusTotal (Single)',
319
  'VT_Multi_Dependency': 'VT_Multi (Nested)',
320
  'Places_API': 'Places (Nested)',
321
- 'CVECPE_Multi_Dependency': 'CVECPE_Multi (Nested)',
322
  'Heldout_Combined': 'Stack (Mostly Single)',
323
- 'Climate': 'Climate (Parallel)',
324
  'VT_Multi_Disconnected': 'VT_Multi (Parallel)'
325
  }
326
 
@@ -383,5 +387,5 @@ with gr.Blocks(theme="dark") as demo: # Set the theme here
383
  )
384
 
385
 
386
- demo.launch(allowed_paths=["logo.png", "raven.png"])
387
 
 
47
 
48
  # Updated results reflecting the new screenshot
49
  RESULTS = {
50
+ 'Climate': {"GPT4": 0.6809, "GPT3.5": 0.2553, "NexusRaven-V2": 0.7021, "Gorilla": 0.0213},
51
+ 'Heldout_Combined': {"GPT4": 0.4814, "GPT3.5": 0.4495, "NexusRaven-V2": 0.5990},
52
+ 'Places_API': {"GPT4": 0.4375, "GPT3.5": 0.2500, "NexusRaven-V2": 0.5000, "Gorilla": 0.0208},
53
+ 'OTX': {"GPT4": 0.9022, "GPT3.5": 0.8913, "NexusRaven-V2": 0.9022, "Gorilla": 0.2935},
54
+ 'VirusTotal': {"GPT4": 0.8800, "GPT3.5": 0.8100, "NexusRaven-V2": 0.8013, "Gorilla": 0.0728},
55
+ 'VT_Multi_Dependency': {"GPT4": 0.3673, "GPT3.5": 0.0204, "NexusRaven-V2": 0.3878, "Gorilla": 0.0000},
56
+ 'VT_Multi_Disconnected': {"GPT4": 0.2857, "GPT3.5": 0.1429, "NexusRaven-V2": 0.4286, "Gorilla": 0.0000},
57
+ 'CVECPE': {"GPT4": 0.7700, "GPT3.5": 0.4800, "NexusRaven-V2": 0.6667, "Gorilla": 0.0897},
58
+ 'CVECPE_Multi_Dependency': {"GPT4": 0.0714, "GPT3.5": 0.0714, "NexusRaven-V2": 0.2500, "Gorilla": 0.0000},
59
  }
60
 
61
  SAMPLES = {
 
169
  difficult_tasks_avg = pd.DataFrame({k: results[k] for k in difficult_tasks}).mean(axis=1)
170
 
171
  avg_data = pd.DataFrame({
172
+ 'All Tasks': all_tasks_avg[:-1],
173
+ 'Tasks with Single Call (simple)': simple_tasks_avg[:-1],
174
+ 'Tasks with Nested/Parallel Calls (challenging)': difficult_tasks_avg[:-1]
175
  }).reset_index().rename(columns={'index': 'Model'})
176
 
177
  return avg_data
 
207
  capability_data = pd.DataFrame({
208
  'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
209
  'GPT4': [single_calls_avg['GPT4'], nested_calls_avg['GPT4'], parallel_calls_avg['GPT4']],
210
+ 'GPT3.5': [single_calls_avg['GPT3.5'], nested_calls_avg['GPT3.5'], parallel_calls_avg['GPT3.5']],
211
  'NexusRaven-V2': [single_calls_avg['NexusRaven-V2'], nested_calls_avg['NexusRaven-V2'], parallel_calls_avg['NexusRaven-V2']]
212
  }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
213
  elif type == "many apis many args":
 
222
  vt_multi_disconnected_avg = pd.DataFrame({k: results[k] for k in vt_multi_disconnected}).mean(axis=1)
223
 
224
  capability_data = pd.DataFrame({
225
+ 'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', 'NVDLibrary (Single)', 'NVDLibrary_Multi (Nested)', 'Places (Nested)', 'Climate (Nested/Parallel)', 'Stack (Mostly Single)'],
226
  'GPT4': [otx_avg['GPT4'], virustotal_avg['GPT4'], vt_multi_dependency_avg['GPT4'], vt_multi_disconnected_avg['GPT4'], cvecpe_avg['GPT4'], cvecpe_multi_dependency_avg['GPT4'], places_avg['GPT4'], climate_avg['GPT4'], heldout_avg['GPT4']],
227
+ 'GPT3.5': [otx_avg['GPT3.5'], virustotal_avg['GPT3.5'], vt_multi_dependency_avg['GPT3.5'], vt_multi_disconnected_avg['GPT3.5'], cvecpe_avg['GPT3.5'], cvecpe_multi_dependency_avg['GPT3.5'], places_avg['GPT3.5'], climate_avg['GPT3.5'], heldout_avg['GPT3.5']],
228
  'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
229
  places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']]
230
+ # 'Gorilla': [otx_avg['Gorilla'], virustotal_avg['Gorilla'], vt_multi_dependency_avg['Gorilla'], vt_multi_disconnected_avg['Gorilla'], cvecpe_avg['Gorilla'], cvecpe_multi_dependency_avg['Gorilla'],
231
+ # places_avg['Gorilla'], climate_avg['Gorilla'], 0]
232
  }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
233
 
234
  return capability_data
 
318
  for key, value in RESULTS.items():
319
  tab_names = {
320
  'OTX': 'OTX (Single)',
321
+ 'CVECPE': 'NVDLibrary (Single)',
322
  'VirusTotal': 'VirusTotal (Single)',
323
  'VT_Multi_Dependency': 'VT_Multi (Nested)',
324
  'Places_API': 'Places (Nested)',
325
+ 'CVECPE_Multi_Dependency': 'NVDLibrary_Multi (Nested)',
326
  'Heldout_Combined': 'Stack (Mostly Single)',
327
+ 'Climate': 'Climate (Nested/Parallel)',
328
  'VT_Multi_Disconnected': 'VT_Multi (Parallel)'
329
  }
330
 
 
387
  )
388
 
389
 
390
+ demo.launch(share=True, allowed_paths=["logo.png", "raven.png"])
391