Spaces:

Nexusflow
/

Nexus_Function_Calling_Leaderboard

Running

App Files Community

zhen-dong-nexusflow committed on Dec 5, 2023

Commit

7e5b9e1

1 Parent(s): 30f0ed5

add GPT3.5 results

Browse files

Files changed (1) hide show

app.py +21 -17

app.py CHANGED Viewed

@@ -47,15 +47,15 @@ hover_css = """
 # Updated results reflecting the new screenshot
 RESULTS = {
-    'Climate': {"GPT4": 0.6808, "NexusRaven-V2": 0.7234},
-    'Heldout_Combined': {"GPT4": 0.4814, "NexusRaven-V2": 0.5990},
-    'Places_API': {"GPT4": 0.3541, "NexusRaven-V2": 0.5000},
-    'OTX': {"GPT4": 0.9130, "NexusRaven-V2": 0.9021},
-    'VirusTotal': {"GPT4": 0.8940, "NexusRaven-V2": 0.7815},
-    'VT_Multi_Dependency': {"GPT4": 0.3469, "NexusRaven-V2": 0.3673},
-    'VT_Multi_Disconnected': {"GPT4": 0.2380, "NexusRaven-V2": 0.3809},
-    'CVECPE': {"GPT4": 0.5769, "NexusRaven-V2": 0.4480},
-    'CVECPE_Multi_Dependency': {"GPT4": 0.1071, "NexusRaven-V2": 0.1607},
 }
 SAMPLES = {
@@ -169,9 +169,9 @@ def calculate_averages(results):
     difficult_tasks_avg = pd.DataFrame({k: results[k] for k in difficult_tasks}).mean(axis=1)
     avg_data = pd.DataFrame({
-        'All Tasks': all_tasks_avg,
-        'Tasks with Single Call (simple)': simple_tasks_avg,
-        'Tasks with Nested/Parallel Calls (challenging)': difficult_tasks_avg
     }).reset_index().rename(columns={'index': 'Model'})
     return avg_data
@@ -207,6 +207,7 @@ def calculate_capability_scores(results, type):
         capability_data = pd.DataFrame({
             'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
             'GPT4': [single_calls_avg['GPT4'], nested_calls_avg['GPT4'], parallel_calls_avg['GPT4']],
             'NexusRaven-V2': [single_calls_avg['NexusRaven-V2'], nested_calls_avg['NexusRaven-V2'], parallel_calls_avg['NexusRaven-V2']]
         }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
     elif type == "many apis many args":
@@ -221,10 +222,13 @@ def calculate_capability_scores(results, type):
         vt_multi_disconnected_avg = pd.DataFrame({k: results[k] for k in vt_multi_disconnected}).mean(axis=1)
         capability_data = pd.DataFrame({
-            'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', 'CVECPE (Single)', 'CVECPE_Multi (Nested)', 'Places (Nested)', 'Climate (Parallel)', 'Stack (Mostly Single)'],
             'GPT4': [otx_avg['GPT4'], virustotal_avg['GPT4'], vt_multi_dependency_avg['GPT4'], vt_multi_disconnected_avg['GPT4'], cvecpe_avg['GPT4'], cvecpe_multi_dependency_avg['GPT4'], places_avg['GPT4'], climate_avg['GPT4'], heldout_avg['GPT4']],
             'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
             places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']]
         }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
     return capability_data
@@ -314,13 +318,13 @@ with gr.Blocks(theme="dark") as demo:  # Set the theme here
     for key, value in RESULTS.items():
         tab_names = {
             'OTX': 'OTX (Single)',
-            'CVECPE': 'CVECPE (Single)',
             'VirusTotal': 'VirusTotal (Single)',
             'VT_Multi_Dependency': 'VT_Multi (Nested)',
             'Places_API': 'Places (Nested)',
-            'CVECPE_Multi_Dependency': 'CVECPE_Multi (Nested)',
             'Heldout_Combined': 'Stack (Mostly Single)',
-            'Climate': 'Climate (Parallel)',
             'VT_Multi_Disconnected': 'VT_Multi (Parallel)'
             }
@@ -383,5 +387,5 @@ with gr.Blocks(theme="dark") as demo:  # Set the theme here
         )
-demo.launch(allowed_paths=["logo.png", "raven.png"])

 # Updated results reflecting the new screenshot
 RESULTS = {
+    'Climate': {"GPT4": 0.6809, "GPT3.5": 0.2553, "NexusRaven-V2": 0.7021, "Gorilla": 0.0213},
+    'Heldout_Combined': {"GPT4": 0.4814, "GPT3.5": 0.4495, "NexusRaven-V2": 0.5990},
+    'Places_API': {"GPT4": 0.4375, "GPT3.5": 0.2500, "NexusRaven-V2": 0.5000, "Gorilla": 0.0208},
+    'OTX': {"GPT4": 0.9022, "GPT3.5": 0.8913, "NexusRaven-V2": 0.9022, "Gorilla": 0.2935},
+    'VirusTotal': {"GPT4": 0.8800, "GPT3.5": 0.8100, "NexusRaven-V2": 0.8013, "Gorilla": 0.0728},
+    'VT_Multi_Dependency': {"GPT4": 0.3673, "GPT3.5": 0.0204, "NexusRaven-V2": 0.3878, "Gorilla": 0.0000},
+    'VT_Multi_Disconnected': {"GPT4": 0.2857, "GPT3.5": 0.1429, "NexusRaven-V2": 0.4286, "Gorilla": 0.0000},
+    'CVECPE': {"GPT4": 0.7700, "GPT3.5": 0.4800, "NexusRaven-V2": 0.6667, "Gorilla": 0.0897},
+    'CVECPE_Multi_Dependency': {"GPT4": 0.0714, "GPT3.5": 0.0714, "NexusRaven-V2": 0.2500, "Gorilla": 0.0000},
 }
 SAMPLES = {
     difficult_tasks_avg = pd.DataFrame({k: results[k] for k in difficult_tasks}).mean(axis=1)
     avg_data = pd.DataFrame({
+        'All Tasks': all_tasks_avg[:-1],
+        'Tasks with Single Call (simple)': simple_tasks_avg[:-1],
+        'Tasks with Nested/Parallel Calls (challenging)': difficult_tasks_avg[:-1]
     }).reset_index().rename(columns={'index': 'Model'})
     return avg_data
         capability_data = pd.DataFrame({
             'Capability': ['Single Calls', 'Nested Calls', 'Parallel Calls'],
             'GPT4': [single_calls_avg['GPT4'], nested_calls_avg['GPT4'], parallel_calls_avg['GPT4']],
+            'GPT3.5': [single_calls_avg['GPT3.5'], nested_calls_avg['GPT3.5'], parallel_calls_avg['GPT3.5']],
             'NexusRaven-V2': [single_calls_avg['NexusRaven-V2'], nested_calls_avg['NexusRaven-V2'], parallel_calls_avg['NexusRaven-V2']]
         }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
     elif type == "many apis many args":
         vt_multi_disconnected_avg = pd.DataFrame({k: results[k] for k in vt_multi_disconnected}).mean(axis=1)
         capability_data = pd.DataFrame({
+            'Capability': ['OTX (Single)', 'VirusTotal (Single)', 'VT_Multi (Nested)', 'VT_Multi (Parallel)', 'NVDLibrary (Single)', 'NVDLibrary_Multi (Nested)', 'Places (Nested)', 'Climate (Nested/Parallel)', 'Stack (Mostly Single)'],
             'GPT4': [otx_avg['GPT4'], virustotal_avg['GPT4'], vt_multi_dependency_avg['GPT4'], vt_multi_disconnected_avg['GPT4'], cvecpe_avg['GPT4'], cvecpe_multi_dependency_avg['GPT4'], places_avg['GPT4'], climate_avg['GPT4'], heldout_avg['GPT4']],
+            'GPT3.5': [otx_avg['GPT3.5'], virustotal_avg['GPT3.5'], vt_multi_dependency_avg['GPT3.5'], vt_multi_disconnected_avg['GPT3.5'], cvecpe_avg['GPT3.5'], cvecpe_multi_dependency_avg['GPT3.5'], places_avg['GPT3.5'], climate_avg['GPT3.5'], heldout_avg['GPT3.5']],
             'NexusRaven-V2': [otx_avg['NexusRaven-V2'], virustotal_avg['NexusRaven-V2'], vt_multi_dependency_avg['NexusRaven-V2'], vt_multi_disconnected_avg['NexusRaven-V2'], cvecpe_avg['NexusRaven-V2'], cvecpe_multi_dependency_avg['NexusRaven-V2'],
             places_avg['NexusRaven-V2'], climate_avg['NexusRaven-V2'], heldout_avg['NexusRaven-V2']]
+            # 'Gorilla': [otx_avg['Gorilla'], virustotal_avg['Gorilla'], vt_multi_dependency_avg['Gorilla'], vt_multi_disconnected_avg['Gorilla'], cvecpe_avg['Gorilla'], cvecpe_multi_dependency_avg['Gorilla'],
+            # places_avg['Gorilla'], climate_avg['Gorilla'], 0]
         }).melt(id_vars=['Capability'], var_name='Model', value_name='Score')
     return capability_data
     for key, value in RESULTS.items():
         tab_names = {
             'OTX': 'OTX (Single)',
+            'CVECPE': 'NVDLibrary (Single)',
             'VirusTotal': 'VirusTotal (Single)',
             'VT_Multi_Dependency': 'VT_Multi (Nested)',
             'Places_API': 'Places (Nested)',
+            'CVECPE_Multi_Dependency': 'NVDLibrary_Multi (Nested)',
             'Heldout_Combined': 'Stack (Mostly Single)',
+            'Climate': 'Climate (Nested/Parallel)',
             'VT_Multi_Disconnected': 'VT_Multi (Parallel)'
             }
         )
+demo.launch(share=True, allowed_paths=["logo.png", "raven.png"])