Commit
·
7e5b9e1
1
Parent(s):
30f0ed5
add GPT3.5 results
Browse files
app.py
CHANGED
@@ -47,15 +47,15 @@ hover_css = """
|
|
47 |
|
48 |
# Updated results reflecting the new screenshot
|
49 |
RESULTS = {
|
50 |
-
'Climate': {"GPT4": 0.
|
51 |
-
'Heldout_Combined': {"GPT4": 0.4814, "NexusRaven-V2": 0.5990},
|
52 |
-
'Places_API': {"GPT4": 0.
|
53 |
-
'OTX': {"GPT4": 0.
|
54 |
-
'VirusTotal': {"GPT4": 0.
|
55 |
-
'VT_Multi_Dependency': {"GPT4": 0.
|
56 |
-
'VT_Multi_Disconnected': {"GPT4": 0.
|
57 |
-
'CVECPE': {"GPT4": 0.
|
58 |
-
'CVECPE_Multi_Dependency': {"GPT4": 0.
|
59 |
}
|
60 |
|
61 |
SAMPLES = {
|
@@ -169,9 +169,9 @@ def calculate_averages(results):
|
|
169 |
difficult_tasks_avg = pd.DataFrame({k: results[k] for k in difficult_tasks}).mean(axis=1)
|
170 |
|
171 |
avg_data = pd.DataFrame({
|
172 |
-
'All Tasks': all_tasks_avg,
|
173 |
-
'Tasks with Single Call (simple)': simple_tasks_avg,
|
174 |
-
'Tasks with Nested/Parallel Calls (challenging)': difficult_tasks_avg
|
175 |
}).reset_index().rename(columns={'index': 'Model'})
|
176 |
|
177 |
return avg_data
|
@@ -207,6 +207,7 @@ def calculate_capability_scores(results, type):
|
|
207 |
capability_data = pd.DataFrame({
|
208 |
'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
|
209 |
'GPT4': [single_calls_avg['GPT4'], nested_calls_avg['GPT4'], parallel_calls_avg['GPT4']],
|
|
|
210 |
'NexusRaven-V2': [single_calls_avg['NexusRaven-V2'], nested_calls_avg['NexusRaven-V2'], parallel_calls_avg['NexusRaven-V2']]
|
211 |
}).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
|
212 |
elif type == "many apis many args":
|
@@ -221,10 +222,13 @@ def calculate_capability_scores(results, type):
|
|
221 |
vt_multi_disconnected_avg = pd.DataFrame({k: results[k] for k in vt_multi_disconnected}).mean(axis=1)
|
222 |
|
223 |
capability_data = pd.DataFrame({
|
224 |
-
'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', '
|
225 |
'GPT4': [otx_avg['GPT4'], virustotal_avg['GPT4'], vt_multi_dependency_avg['GPT4'], vt_multi_disconnected_avg['GPT4'], cvecpe_avg['GPT4'], cvecpe_multi_dependency_avg['GPT4'], places_avg['GPT4'], climate_avg['GPT4'], heldout_avg['GPT4']],
|
|
|
226 |
'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
|
227 |
places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']]
|
|
|
|
|
228 |
}).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
|
229 |
|
230 |
return capability_data
|
@@ -314,13 +318,13 @@ with gr.Blocks(theme="dark") as demo: # Set the theme here
|
|
314 |
for key, value in RESULTS.items():
|
315 |
tab_names = {
|
316 |
'OTX': 'OTX (Single)',
|
317 |
-
'CVECPE': '
|
318 |
'VirusTotal': 'VirusTotal (Single)',
|
319 |
'VT_Multi_Dependency': 'VT_Multi (Nested)',
|
320 |
'Places_API': 'Places (Nested)',
|
321 |
-
'CVECPE_Multi_Dependency': '
|
322 |
'Heldout_Combined': 'Stack (Mostly Single)',
|
323 |
-
'Climate': 'Climate (Parallel)',
|
324 |
'VT_Multi_Disconnected': 'VT_Multi (Parallel)'
|
325 |
}
|
326 |
|
@@ -383,5 +387,5 @@ with gr.Blocks(theme="dark") as demo: # Set the theme here
|
|
383 |
)
|
384 |
|
385 |
|
386 |
-
demo.launch(allowed_paths=["logo.png", "raven.png"])
|
387 |
|
|
|
47 |
|
48 |
# Updated results reflecting the new screenshot
|
49 |
RESULTS = {
|
50 |
+
'Climate': {"GPT4": 0.6809, "GPT3.5": 0.2553, "NexusRaven-V2": 0.7021, "Gorilla": 0.0213},
|
51 |
+
'Heldout_Combined': {"GPT4": 0.4814, "GPT3.5": 0.4495, "NexusRaven-V2": 0.5990},
|
52 |
+
'Places_API': {"GPT4": 0.4375, "GPT3.5": 0.2500, "NexusRaven-V2": 0.5000, "Gorilla": 0.0208},
|
53 |
+
'OTX': {"GPT4": 0.9022, "GPT3.5": 0.8913, "NexusRaven-V2": 0.9022, "Gorilla": 0.2935},
|
54 |
+
'VirusTotal': {"GPT4": 0.8800, "GPT3.5": 0.8100, "NexusRaven-V2": 0.8013, "Gorilla": 0.0728},
|
55 |
+
'VT_Multi_Dependency': {"GPT4": 0.3673, "GPT3.5": 0.0204, "NexusRaven-V2": 0.3878, "Gorilla": 0.0000},
|
56 |
+
'VT_Multi_Disconnected': {"GPT4": 0.2857, "GPT3.5": 0.1429, "NexusRaven-V2": 0.4286, "Gorilla": 0.0000},
|
57 |
+
'CVECPE': {"GPT4": 0.7700, "GPT3.5": 0.4800, "NexusRaven-V2": 0.6667, "Gorilla": 0.0897},
|
58 |
+
'CVECPE_Multi_Dependency': {"GPT4": 0.0714, "GPT3.5": 0.0714, "NexusRaven-V2": 0.2500, "Gorilla": 0.0000},
|
59 |
}
|
60 |
|
61 |
SAMPLES = {
|
|
|
169 |
difficult_tasks_avg = pd.DataFrame({k: results[k] for k in difficult_tasks}).mean(axis=1)
|
170 |
|
171 |
avg_data = pd.DataFrame({
|
172 |
+
'All Tasks': all_tasks_avg[:-1],
|
173 |
+
'Tasks with Single Call (simple)': simple_tasks_avg[:-1],
|
174 |
+
'Tasks with Nested/Parallel Calls (challenging)': difficult_tasks_avg[:-1]
|
175 |
}).reset_index().rename(columns={'index': 'Model'})
|
176 |
|
177 |
return avg_data
|
|
|
207 |
capability_data = pd.DataFrame({
|
208 |
'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
|
209 |
'GPT4': [single_calls_avg['GPT4'], nested_calls_avg['GPT4'], parallel_calls_avg['GPT4']],
|
210 |
+
'GPT3.5': [single_calls_avg['GPT3.5'], nested_calls_avg['GPT3.5'], parallel_calls_avg['GPT3.5']],
|
211 |
'NexusRaven-V2': [single_calls_avg['NexusRaven-V2'], nested_calls_avg['NexusRaven-V2'], parallel_calls_avg['NexusRaven-V2']]
|
212 |
}).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
|
213 |
elif type == "many apis many args":
|
|
|
222 |
vt_multi_disconnected_avg = pd.DataFrame({k: results[k] for k in vt_multi_disconnected}).mean(axis=1)
|
223 |
|
224 |
capability_data = pd.DataFrame({
|
225 |
+
'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', 'NVDLibrary (Single)', 'NVDLibrary_Multi (Nested)', 'Places (Nested)', 'Climate (Nested/Parallel)', 'Stack (Mostly Single)'],
|
226 |
'GPT4': [otx_avg['GPT4'], virustotal_avg['GPT4'], vt_multi_dependency_avg['GPT4'], vt_multi_disconnected_avg['GPT4'], cvecpe_avg['GPT4'], cvecpe_multi_dependency_avg['GPT4'], places_avg['GPT4'], climate_avg['GPT4'], heldout_avg['GPT4']],
|
227 |
+
'GPT3.5': [otx_avg['GPT3.5'], virustotal_avg['GPT3.5'], vt_multi_dependency_avg['GPT3.5'], vt_multi_disconnected_avg['GPT3.5'], cvecpe_avg['GPT3.5'], cvecpe_multi_dependency_avg['GPT3.5'], places_avg['GPT3.5'], climate_avg['GPT3.5'], heldout_avg['GPT3.5']],
|
228 |
'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
|
229 |
places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']]
|
230 |
+
# 'Gorilla': [otx_avg['Gorilla'], virustotal_avg['Gorilla'], vt_multi_dependency_avg['Gorilla'], vt_multi_disconnected_avg['Gorilla'], cvecpe_avg['Gorilla'], cvecpe_multi_dependency_avg['Gorilla'],
|
231 |
+
# places_avg['Gorilla'], climate_avg['Gorilla'], 0]
|
232 |
}).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
|
233 |
|
234 |
return capability_data
|
|
|
318 |
for key, value in RESULTS.items():
|
319 |
tab_names = {
|
320 |
'OTX': 'OTX (Single)',
|
321 |
+
'CVECPE': 'NVDLibrary (Single)',
|
322 |
'VirusTotal': 'VirusTotal (Single)',
|
323 |
'VT_Multi_Dependency': 'VT_Multi (Nested)',
|
324 |
'Places_API': 'Places (Nested)',
|
325 |
+
'CVECPE_Multi_Dependency': 'NVDLibrary_Multi (Nested)',
|
326 |
'Heldout_Combined': 'Stack (Mostly Single)',
|
327 |
+
'Climate': 'Climate (Nested/Parallel)',
|
328 |
'VT_Multi_Disconnected': 'VT_Multi (Parallel)'
|
329 |
}
|
330 |
|
|
|
387 |
)
|
388 |
|
389 |
|
390 |
+
demo.launch(share=True, allowed_paths=["logo.png", "raven.png"])
|
391 |
|