benediktstroebl committed
Commit a60497d · verified · 1 Parent(s): 7db4465

Upload app.py

Files changed (1)
  1. app.py +836 -329
app.py CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
6
  import pandas as pd
7
  import os
8
  import json
9
- from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart, create_task_success_heatmap
10
  from utils.processing import check_and_process_uploads
11
  from huggingface_hub import snapshot_download
12
  from apscheduler.schedulers.background import BackgroundScheduler
@@ -48,8 +48,8 @@ def get_analyzed_traces(agent_name, benchmark_name):
48
  def get_failure_report(agent_name, benchmark_name):
49
  return preprocessor.get_failure_report(agent_name, benchmark_name)
50
 
51
- def parse_json_files(folder_path, benchmark_name):
52
- return preprocessor.get_parsed_results(benchmark_name)
53
 
54
  def update_agent_dropdown(benchmark_name, metric):
55
  df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
@@ -463,7 +463,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
463
  with gr.Row():
464
  with gr.Column(scale=2):
465
  Leaderboard(
466
- value=parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'),
467
  select_columns=SelectColumns(
468
  default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified"],
469
  cant_deselect=["Agent Name"],
@@ -472,15 +472,15 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
472
  hide_columns=config.USACO_HIDE_COLUMNS,
473
  search_columns=config.USACO_SEARCH_COLUMNS,
474
  )
 
475
  with gr.Row():
476
  gr.Markdown("### Accuracy vs. Cost for USACO agents")
477
  with gr.Row():
478
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
479
 
480
- gr.Markdown("")
481
- gr.Markdown("")
482
  gr.Markdown("## Task success heatmap")
483
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least")
484
  with gr.Row():
485
  task_success_heatmap = gr.Plot()
486
  demo.load(
@@ -624,400 +624,907 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
624
  inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
625
  outputs=[raw_call_details])
626
 
627
-
628
- with gr.Tab("SWE-Bench Verified"):
629
  gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. The We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
630
  with gr.Row():
631
  with gr.Column(scale=2):
632
  Leaderboard(
633
- value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'),
634
  select_columns=SelectColumns(
635
  default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
636
  cant_deselect=["Agent Name"],
637
  label="Select Columns to Display:",
638
  ),
639
  hide_columns=config.SWEBENCH_HIDE_COLUMNS,
640
- search_columns=config.SWEBENCH_SEARCH_COLUMNS
641
  )
 
642
  with gr.Row():
643
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
 
 
644
 
645
- gr.Markdown("")
646
- gr.Markdown("")
647
  gr.Markdown("## Task success heatmap")
 
648
  with gr.Row():
649
  task_success_heatmap = gr.Plot()
650
  demo.load(
651
  lambda: create_task_success_heatmap(
652
  preprocessor.get_task_success_data('swebench_verified'),
653
- 'SWEBench Verified'
654
  ),
655
  outputs=[task_success_heatmap]
656
  )
657
-
658
- gr.Markdown("")
659
- gr.Markdown("")
660
- gr.Markdown("## Failure report for each agent")
661
- with gr.Row():
662
- with gr.Column(scale=1):
663
- failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
664
- with gr.Row():
665
- with gr.Column(scale=1):
666
- failure_categories_overview = gr.Markdown()
667
-
668
- with gr.Column(scale=1):
669
- failure_categories_chart = gr.Plot()
670
 
671
- # Initialize the failure report agent dropdown with all agents
672
- demo.load(update_agent_dropdown,
673
- inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
674
- outputs=[failure_report_agent_dropdown])
675
-
676
- # Update failure report when agent is selected
677
- failure_report_agent_dropdown.change(update_failure_report,
678
- inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
679
- outputs=[failure_categories_overview, failure_categories_chart])
680
-
681
- gr.Markdown("")
682
- gr.Markdown("")
683
- gr.Markdown("## Agent monitor")
684
- with gr.Row():
685
- with gr.Column(scale=1):
686
- agent_dropdown = gr.Dropdown(label="Select Agent")
687
- with gr.Column(scale=1):
688
- task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
689
- with gr.Row():
690
- task_overview = gr.Markdown()
691
- with gr.Row():
692
- flow_chart = gr.Plot(label="Task Flow")
693
-
694
- # Initialize the agent dropdown with the best agent
695
- demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
696
- demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
697
-
698
- agent_dropdown.change(update_task_analysis,
699
- inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
700
- outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
701
- task_dropdown.change(update_task_details,
702
- inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
703
- outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
704
 
705
  gr.Markdown("## Raw predictions")
706
- with gr.Row():
707
- with gr.Column(scale=1):
708
- raw_agent_dropdown = gr.Dropdown(label="Select Agent")
709
- with gr.Column(scale=1):
710
- raw_task_dropdown = gr.Dropdown(label="Select Task")
711
- with gr.Column(scale=1):
712
- raw_step_dropdown = gr.Dropdown(label="Select Step")
713
-
714
- with gr.Row():
715
- raw_call_details = gr.HTML()
716
-
717
- def update_raw_task_dropdown(agent_name):
718
- analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
719
- if not analyzed_traces:
720
- return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
721
- task_ids = list(analyzed_traces.keys())
722
- steps = analyzed_traces[task_ids[0]]['steps']
723
- return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
724
-
725
- def update_raw_step_dropdown(agent_name, task_id):
726
- analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
727
- if not analyzed_traces or task_id not in analyzed_traces:
728
- return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
729
- steps = analyzed_traces[task_id]['steps']
730
- return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
731
-
732
- def update_raw_call_details(agent_name, task_id, step_index):
733
- analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
734
- if not analyzed_traces or task_id not in analyzed_traces:
735
- return "No data available for this selection."
736
- steps = analyzed_traces[task_id]['steps']
737
- if step_index is None:
738
- return "Invalid step selection."
739
- step = steps[step_index]
740
- return format_call_info(step, step_index)
741
-
742
- # Initialize the raw agent dropdown with all agents
743
- demo.load(update_agent_dropdown,
744
- inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
745
- outputs=[raw_agent_dropdown])
746
- demo.load(update_raw_task_dropdown,
747
- inputs=[raw_agent_dropdown],
748
- outputs=[raw_task_dropdown, raw_step_dropdown])
749
- demo.load(update_raw_call_details,
750
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
751
- outputs=[raw_call_details])
752
-
753
- raw_agent_dropdown.change(update_raw_task_dropdown,
754
- inputs=[raw_agent_dropdown],
755
- outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
756
- raw_task_dropdown.change(update_raw_step_dropdown,
757
- inputs=[raw_agent_dropdown, raw_task_dropdown],
758
- outputs=[raw_step_dropdown, raw_call_details])
759
- raw_step_dropdown.change(update_raw_call_details,
760
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
761
- outputs=[raw_call_details])
762
-
763
- with gr.Tab("SWE-Bench Lite"):
 
 
764
  gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Lite is a subset of 300 tasks of the original SWE-bench. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
765
  with gr.Row():
766
  with gr.Column(scale=2):
767
  Leaderboard(
768
- value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
769
  select_columns=SelectColumns(
770
  default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
771
  cant_deselect=["Agent Name"],
772
  label="Select Columns to Display:",
773
  ),
 
774
  search_columns=config.SWEBENCH_SEARCH_COLUMNS,
775
- hide_columns=config.SWEBENCH_HIDE_COLUMNS
776
  )
 
777
  with gr.Row():
778
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
779
-
780
- gr.Markdown("")
781
- gr.Markdown("")
 
782
  gr.Markdown("## Task success heatmap")
 
783
  with gr.Row():
784
  task_success_heatmap = gr.Plot()
785
  demo.load(
786
  lambda: create_task_success_heatmap(
787
  preprocessor.get_task_success_data('swebench_lite'),
788
- 'SWEBench Lite'
789
  ),
790
  outputs=[task_success_heatmap]
791
  )
792
-
793
- gr.Markdown("")
794
- gr.Markdown("")
795
- gr.Markdown("## Failure report for each agent")
796
- with gr.Row():
797
- with gr.Column(scale=1):
798
- failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
799
- with gr.Row():
800
- with gr.Column(scale=1):
801
- failure_categories_overview = gr.Markdown()
802
-
803
- with gr.Column(scale=1):
804
- failure_categories_chart = gr.Plot()
805
 
806
- # Initialize the failure report agent dropdown with all agents
807
- demo.load(update_agent_dropdown,
808
- inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
809
- outputs=[failure_report_agent_dropdown])
810
-
811
- # Update failure report when agent is selected
812
- failure_report_agent_dropdown.change(update_failure_report,
813
- inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
814
- outputs=[failure_categories_overview, failure_categories_chart])
815
-
816
- gr.Markdown("")
817
- gr.Markdown("")
818
- gr.Markdown("## Agent monitor")
819
- with gr.Row():
820
- with gr.Column(scale=1):
821
- agent_dropdown = gr.Dropdown(label="Select Agent")
822
- with gr.Column(scale=1):
823
- task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
824
- with gr.Row():
825
- task_overview = gr.Markdown()
826
- with gr.Row():
827
- flow_chart = gr.Plot(label="Task Flow")
828
 
829
- # Initialize the agent dropdown with the best agent
830
- demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
831
- demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
832
 
833
- agent_dropdown.change(update_task_analysis,
834
- inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown],
835
- outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
836
- task_dropdown.change(update_task_details,
837
- inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
838
- outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
839
 
840
-
841
- gr.Markdown("## Raw predictions")
842
- with gr.Row():
843
- with gr.Column(scale=1):
844
- raw_agent_dropdown = gr.Dropdown(label="Select Agent")
845
- with gr.Column(scale=1):
846
- raw_task_dropdown = gr.Dropdown(label="Select Task")
847
- with gr.Column(scale=1):
848
- raw_step_dropdown = gr.Dropdown(label="Select Step")
849
-
850
- with gr.Row():
851
- raw_call_details = gr.HTML()
852
-
853
- def update_raw_task_dropdown(agent_name):
854
- analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
855
- if not analyzed_traces:
856
- return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
857
- task_ids = list(analyzed_traces.keys())
858
- steps = analyzed_traces[task_ids[0]]['steps']
859
- return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
860
-
861
- def update_raw_step_dropdown(agent_name, task_id):
862
- analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
863
- if not analyzed_traces or task_id not in analyzed_traces:
864
- return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
865
- steps = analyzed_traces[task_id]['steps']
866
- return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
867
-
868
- def update_raw_call_details(agent_name, task_id, step_index):
869
- analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
870
- if not analyzed_traces or task_id not in analyzed_traces:
871
- return "No data available for this selection."
872
- steps = analyzed_traces[task_id]['steps']
873
- if step_index is None:
874
- return "Invalid step selection."
875
- step = steps[step_index]
876
- return format_call_info(step, step_index)
877
-
878
- # Initialize the raw agent dropdown with all agents
879
- demo.load(update_agent_dropdown,
880
- inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
881
- outputs=[raw_agent_dropdown])
882
- demo.load(update_raw_task_dropdown,
883
- inputs=[raw_agent_dropdown],
884
- outputs=[raw_task_dropdown, raw_step_dropdown])
885
- demo.load(update_raw_call_details,
886
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
887
- outputs=[raw_call_details])
888
-
889
- raw_agent_dropdown.change(update_raw_task_dropdown,
890
- inputs=[raw_agent_dropdown],
891
- outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
892
- raw_task_dropdown.change(update_raw_step_dropdown,
893
- inputs=[raw_agent_dropdown, raw_task_dropdown],
894
- outputs=[raw_step_dropdown, raw_call_details])
895
- raw_step_dropdown.change(update_raw_call_details,
896
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
897
- outputs=[raw_call_details])
898
 
 
899
  with gr.Tab("MLAgentBench"):
900
  gr.Markdown("""MLAgentBench is a suite of end-to-end Machine Learning (ML) experimentation tasks, where the agent aims to take a given dataset and a machine learning task description and autonomously develop or improve an ML model. We are currently actively developing this platform and this benchmark is not fully implemented yet. In particular, we only include one agent and a subset of tasks for this benchmark.""")
901
  with gr.Row():
902
  with gr.Column(scale=2):
903
  Leaderboard(
904
- value=parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'),
905
  select_columns=SelectColumns(
906
  default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS + ["Verified"],
907
  cant_deselect=["Agent Name"],
908
  label="Select Columns to Display:",
909
  ),
910
- search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
911
  hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
 
912
  )
 
913
  with gr.Row():
914
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
915
-
916
- gr.Markdown("")
917
- gr.Markdown("")
918
- gr.Markdown("## Failure report for each agent")
919
- with gr.Row():
920
- with gr.Column(scale=1):
921
- failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
922
  with gr.Row():
923
- with gr.Column(scale=1):
924
- failure_categories_overview = gr.Markdown()
925
 
926
- with gr.Column(scale=1):
927
- failure_categories_chart = gr.Plot()
928
 
929
- # Initialize the failure report agent dropdown with all agents
930
- demo.load(update_agent_dropdown,
931
- inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
932
- outputs=[failure_report_agent_dropdown])
933
 
934
- # Update failure report when agent is selected
935
- failure_report_agent_dropdown.change(update_failure_report,
936
- inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
937
- outputs=[failure_categories_overview, failure_categories_chart])
938
 
939
- gr.Markdown("")
940
- gr.Markdown("")
941
- gr.Markdown("## Agent monitor")
942
- with gr.Row():
943
- with gr.Column(scale=1):
944
- agent_dropdown = gr.Dropdown(label="Select Agent")
945
- with gr.Column(scale=1):
946
- task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
947
- with gr.Row():
948
- task_overview = gr.Markdown()
949
- with gr.Row():
950
- flow_chart = gr.Plot(label="Task Flow")
951
 
952
- # Initialize the agent dropdown with the best agent
953
- demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], outputs=[agent_dropdown])
954
- demo.load(update_task_analysis, inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
955
 
956
- agent_dropdown.change(update_task_analysis,
957
- inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown],
958
- outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
959
- task_dropdown.change(update_task_details,
960
- inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
961
- outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
962
 
963
 
964
- gr.Markdown("## Raw predictions")
965
- with gr.Row():
966
- with gr.Column(scale=1):
967
- raw_agent_dropdown = gr.Dropdown(label="Select Agent")
968
- with gr.Column(scale=1):
969
- raw_task_dropdown = gr.Dropdown(label="Select Task")
970
- with gr.Column(scale=1):
971
- raw_step_dropdown = gr.Dropdown(label="Select Step")
 
972
 
973
- with gr.Row():
974
- raw_call_details = gr.HTML()
975
 
976
- def update_raw_task_dropdown(agent_name):
977
- analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
978
- if not analyzed_traces:
979
- return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
980
- task_ids = list(analyzed_traces.keys())
981
- steps = analyzed_traces[task_ids[0]]['steps']
982
- return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
983
-
984
- def update_raw_step_dropdown(agent_name, task_id):
985
- analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
986
- if not analyzed_traces or task_id not in analyzed_traces:
987
- return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
988
- steps = analyzed_traces[task_id]['steps']
989
- return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
990
-
991
- def update_raw_call_details(agent_name, task_id, step_index):
992
- analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
993
- if not analyzed_traces or task_id not in analyzed_traces:
994
- return "No data available for this selection."
995
- steps = analyzed_traces[task_id]['steps']
996
- if step_index is None:
997
- return "Invalid step selection."
998
- step = steps[step_index]
999
- return format_call_info(step, step_index)
1000
-
1001
- # Initialize the raw agent dropdown with all agents
1002
- demo.load(update_agent_dropdown,
1003
- inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
1004
- outputs=[raw_agent_dropdown])
1005
- demo.load(update_raw_task_dropdown,
1006
- inputs=[raw_agent_dropdown],
1007
- outputs=[raw_task_dropdown, raw_step_dropdown])
1008
- demo.load(update_raw_call_details,
1009
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
1010
- outputs=[raw_call_details])
1011
-
1012
- raw_agent_dropdown.change(update_raw_task_dropdown,
1013
- inputs=[raw_agent_dropdown],
1014
- outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
1015
- raw_task_dropdown.change(update_raw_step_dropdown,
1016
- inputs=[raw_agent_dropdown, raw_task_dropdown],
1017
- outputs=[raw_step_dropdown, raw_call_details])
1018
- raw_step_dropdown.change(update_raw_call_details,
1019
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
1020
- outputs=[raw_call_details])
1021
 
1022
 
1023
  with gr.Tab("About"):
@@ -1044,12 +1551,12 @@ async def main():
1044
  # Download the results from the Hugging Face Hub
1045
  # await asyncio.to_thread(download_latest_results)
1046
 
1047
- # Check for new uploads and process them
1048
  # await check_and_process_uploads()
1049
 
1050
  scheduler = AsyncIOScheduler()
1051
  scheduler.add_job(restart_space, "interval", hours=1)
1052
- scheduler.add_job(download_latest_results, "interval", hours=1)
1053
  # scheduler.add_job(check_and_process_uploads, "interval", hours=1)
1054
  scheduler.start()
1055
 
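For reference, the scheduling hunk above keeps the hourly Space restart and drops the hourly results download. A minimal self-contained sketch of that APScheduler pattern (restart_space here is a stand-in for the job defined in app.py):

```python
import asyncio
from apscheduler.schedulers.asyncio import AsyncIOScheduler

def restart_space():
    # Stand-in for the real restart job defined in app.py.
    print("restarting space...")

async def main():
    scheduler = AsyncIOScheduler()
    # Hourly restart job; the removed download_latest_results job was
    # registered with the same add_job signature before this commit.
    scheduler.add_job(restart_space, "interval", hours=1)
    scheduler.start()
    await asyncio.Event().wait()  # keep the event loop alive for the scheduler

if __name__ == "__main__":
    asyncio.run(main())
```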
 
6
  import pandas as pd
7
  import os
8
  import json
9
+ from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart, create_task_success_heatmap, create_leaderboard
10
  from utils.processing import check_and_process_uploads
11
  from huggingface_hub import snapshot_download
12
  from apscheduler.schedulers.background import BackgroundScheduler
 
48
  def get_failure_report(agent_name, benchmark_name):
49
  return preprocessor.get_failure_report(agent_name, benchmark_name)
50
 
51
+ def parse_json_files(folder_path, benchmark_name, aggregate=True):
52
+ return preprocessor.get_parsed_results(benchmark_name, aggregate=aggregate)
53
 
54
  def update_agent_dropdown(benchmark_name, metric):
55
  df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
 
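The new `aggregate` flag, together with the `create_leaderboard(..., ci_metrics=["Accuracy", "Total Cost"])` calls and the "95% CIs calculated using Student's t-distribution" footnotes below, indicates that repeated runs per agent are now collapsed into a mean plus a 95% confidence interval for the leaderboard, while the scatter plots keep the raw per-run rows (`aggregate=False`). A rough sketch of how such an interval can be computed with Student's t-distribution; the helper name and column layout are assumptions, not the actual utils implementation:

```python
import pandas as pd
from scipy import stats

def mean_with_t_ci(df: pd.DataFrame, metric: str, confidence: float = 0.95) -> pd.DataFrame:
    """Hypothetical sketch: collapse repeated runs per agent into mean + 95% CI half-width."""
    def ci_half_width(x: pd.Series) -> float:
        n = len(x)
        if n < 2:
            return 0.0  # a single run gives no spread to estimate
        # two-sided t critical value with n - 1 degrees of freedom
        t_crit = stats.t.ppf((1 + confidence) / 2, df=n - 1)
        return t_crit * x.sem()  # sem = sample std dev / sqrt(n)

    grouped = df.groupby("Agent Name")[metric]
    out = grouped.mean().to_frame(metric)
    out[f"{metric} 95% CI"] = grouped.apply(ci_half_width)
    return out.reset_index()

# Agent A was run three times, agent B once.
runs = pd.DataFrame({
    "Agent Name": ["A", "A", "A", "B"],
    "Accuracy": [0.52, 0.55, 0.49, 0.61],
})
print(mean_with_t_ci(runs, "Accuracy"))
```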
463
  with gr.Row():
464
  with gr.Column(scale=2):
465
  Leaderboard(
466
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), ci_metrics=["Accuracy", "Total Cost"]),
467
  select_columns=SelectColumns(
468
  default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified"],
469
  cant_deselect=["Agent Name"],
 
472
  hide_columns=config.USACO_HIDE_COLUMNS,
473
  search_columns=config.USACO_SEARCH_COLUMNS,
474
  )
475
+ gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
476
  with gr.Row():
477
  gr.Markdown("### Accuracy vs. Cost for USACO agents")
478
  with gr.Row():
479
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
480
 
481
+ gr.HTML('<div style="height: 30px;"></div>')
 
482
  gr.Markdown("## Task success heatmap")
483
+ gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
484
  with gr.Row():
485
  task_success_heatmap = gr.Plot()
486
  demo.load(
 
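For context, `create_task_success_heatmap` receives a per-task success matrix via `preprocessor.get_task_success_data(...)`. A stand-alone sketch of that kind of agent-by-task plot, assuming long-format input with agent/task/success columns (the real utils.viz function may differ):

```python
import pandas as pd
import plotly.express as px

def task_success_heatmap_sketch(df: pd.DataFrame, title: str):
    # Keep the best run per agent/task pair, matching the "highest score" note above.
    grid = df.pivot_table(index="agent", columns="task", values="success", aggfunc="max")
    # Agents sorted by overall accuracy (best on top); tasks sorted so the
    # most-solved tasks appear on the left.
    grid = grid.loc[grid.mean(axis=1).sort_values(ascending=False).index,
                    grid.mean(axis=0).sort_values(ascending=False).index]
    return px.imshow(grid, color_continuous_scale="Greens", title=title,
                     labels=dict(x="Task", y="Agent", color="Solved"))

toy = pd.DataFrame({
    "agent": ["A", "A", "B", "B"],
    "task": ["t1", "t2", "t1", "t2"],
    "success": [1, 0, 1, 1],
})
fig = task_success_heatmap_sketch(toy, "USACO")
```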
624
  inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
625
  outputs=[raw_call_details])
626
 
627
+ with gr.Tab("SWE-bench Verified"):
 
628
  gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. The We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
629
  with gr.Row():
630
  with gr.Column(scale=2):
631
  Leaderboard(
632
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), ci_metrics=["Accuracy", "Total Cost"]),
633
  select_columns=SelectColumns(
634
  default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
635
  cant_deselect=["Agent Name"],
636
  label="Select Columns to Display:",
637
  ),
638
  hide_columns=config.SWEBENCH_HIDE_COLUMNS,
639
+ search_columns=config.SWEBENCH_SEARCH_COLUMNS,
640
  )
641
+ gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
642
  with gr.Row():
643
+ gr.Markdown("### Accuracy vs. Cost for SWE-bench agents")
644
+ with gr.Row():
645
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
646
 
647
+ gr.HTML('<div style="height: 30px;"></div>')
 
648
  gr.Markdown("## Task success heatmap")
649
+ gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in SWE-bench are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
650
  with gr.Row():
651
  task_success_heatmap = gr.Plot()
652
  demo.load(
653
  lambda: create_task_success_heatmap(
654
  preprocessor.get_task_success_data('swebench_verified'),
655
+ 'SWE-bench Verified'
656
  ),
657
  outputs=[task_success_heatmap]
658
  )
659
 
660
+ gr.HTML("""
661
+ <style>
662
+ .grouped-section {
663
+ border: 2px solid #dee2e6; /* Color matching unactivated tabs */
664
+ border-radius: 10px;
665
+ padding: 30px;
666
+ margin-top: 40px;
667
+ margin-bottom: 40px;
668
+ position: relative;
669
+ }
670
+
671
+ .grouped-section-title {
672
+ font-size: 1.7em;
673
+ font-weight: bold;
674
+ color: #2c3e50;
675
+ margin-bottom: 20px;
676
+ padding-bottom: 10px;
677
+ border-bottom: 2px solid #dee2e6;
678
+ }
679
+ </style>
680
+ """)
681
+ with gr.Group(elem_classes=["grouped-section"]):
682
+ gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor")
683
+
684
+ gr.HTML('<div style="height: 10px;"></div>')
685
+ gr.Markdown("## Failure report for each agent")
686
+ gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
687
+ gr.HTML('<div style="height: 10px;"></div>')
688
+ with gr.Row():
689
+ with gr.Column(scale=1):
690
+ failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
691
+ gr.HTML('<div style="height: 10px;"></div>')
692
+ with gr.Row():
693
+ with gr.Column(scale=1):
694
+ failure_categories_overview = gr.Markdown()
695
+
696
+ with gr.Column(scale=1):
697
+ failure_categories_chart = gr.Plot()
698
+
699
+ # Initialize the failure report agent dropdown with all agents
700
+ demo.load(update_agent_dropdown,
701
+ inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
702
+ outputs=[failure_report_agent_dropdown])
703
+
704
+ # Update failure report when agent is selected
705
+ failure_report_agent_dropdown.change(update_failure_report,
706
+ inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
707
+ outputs=[failure_categories_overview, failure_categories_chart])
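The failure report pairs a markdown overview with a category chart; the import list in this commit pulls in `create_bar_chart` from utils.viz for that purpose. A toy sketch of such a chart (the category names are invented, and the real data comes from the LLM-generated failure reports rather than a hard-coded dict):

```python
import plotly.express as px

# Invented counts standing in for one agent's LLM-assigned failure categories.
failure_counts = {
    "Incorrect patch location": 12,
    "Test timeout": 5,
    "Malformed edit command": 3,
}

fig = px.bar(
    x=list(failure_counts.keys()),
    y=list(failure_counts.values()),
    labels={"x": "Failure category", "y": "Number of tasks"},
    title="Failure categories",
)
```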
708
+
709
+ gr.HTML('<div style="height: 30px;"></div>')
710
+ gr.Markdown("## Task overview")
711
+ gr.HTML('<div style="height: 10px;"></div>')
712
+ with gr.Row():
713
+ with gr.Column(scale=1):
714
+ agent_dropdown = gr.Dropdown(label="Select Agent")
715
+ with gr.Column(scale=1):
716
+ task_dropdown = gr.Dropdown(label="Select SWE-bench Verified Task")
717
+ gr.HTML('<div style="height: 10px;"></div>')
718
+ with gr.Row():
719
+ task_overview = gr.Markdown()
720
+ with gr.Row():
721
+ flow_chart = gr.Plot(label="Task Flow")
722
+
723
+ # Initialize the agent dropdown with the best agent
724
+ demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
725
+ demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
726
+
727
+ agent_dropdown.change(update_task_analysis,
728
+ inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
729
+ outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
730
+ task_dropdown.change(update_task_details,
731
+ inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
732
+ outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
733
 
734
  gr.Markdown("## Raw predictions")
735
+ gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.')
736
+ with gr.Accordion("Expand to inspect raw predictions of agents...", open=False):
737
+ with gr.Row():
738
+ with gr.Column(scale=1):
739
+ raw_agent_dropdown = gr.Dropdown(label="Select Agent")
740
+ with gr.Column(scale=1):
741
+ raw_task_dropdown = gr.Dropdown(label="Select Task")
742
+ with gr.Column(scale=1):
743
+ raw_step_dropdown = gr.Dropdown(label="Select Step")
744
+ with gr.Row():
745
+ raw_call_details = gr.HTML()
746
+
747
+ def update_raw_task_dropdown(agent_name):
748
+ analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
749
+ if not analyzed_traces:
750
+ return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
751
+ task_ids = list(analyzed_traces.keys())
752
+ steps = analyzed_traces[task_ids[0]]['steps']
753
+ return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "swebench_verified")[task_ids[0]]['steps'][0], 0)
754
+
755
+ def update_raw_step_dropdown(agent_name, task_id):
756
+ analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
757
+ if not analyzed_traces or task_id not in analyzed_traces:
758
+ return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
759
+ steps = analyzed_traces[task_id]['steps']
760
+ return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
761
+
762
+ def update_raw_call_details(agent_name, task_id, step_index):
763
+ analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
764
+ if not analyzed_traces or task_id not in analyzed_traces:
765
+ return "No data available for this selection."
766
+ steps = analyzed_traces[task_id]['steps']
767
+ if step_index is None:
768
+ return "Invalid step selection."
769
+ step = steps[step_index]
770
+ return format_call_info(step, step_index)
771
+
772
+ # Initialize the raw agent dropdown with all agents
773
+ demo.load(update_agent_dropdown,
774
+ inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
775
+ outputs=[raw_agent_dropdown])
776
+ demo.load(update_raw_task_dropdown,
777
+ inputs=[raw_agent_dropdown],
778
+ outputs=[raw_task_dropdown, raw_step_dropdown])
779
+ demo.load(update_raw_call_details,
780
+ inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
781
+ outputs=[raw_call_details])
782
+
783
+ raw_agent_dropdown.change(update_raw_task_dropdown,
784
+ inputs=[raw_agent_dropdown],
785
+ outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
786
+ raw_task_dropdown.change(update_raw_step_dropdown,
787
+ inputs=[raw_agent_dropdown, raw_task_dropdown],
788
+ outputs=[raw_step_dropdown, raw_call_details])
789
+ raw_step_dropdown.change(update_raw_call_details,
790
+ inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
791
+ outputs=[raw_call_details])
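The wiring above forms a cascade: choosing an agent repopulates the task dropdown, choosing a task repopulates the step dropdown, and each selection refreshes the call details. A stripped-down sketch of that pattern with toy data (not the app's actual handlers):

```python
import gradio as gr

# Toy traces: agent -> task -> number of steps.
TRACES = {"agent-1": {"task-a": 3, "task-b": 2}, "agent-2": {"task-c": 1}}

def on_agent(agent):
    tasks = list(TRACES.get(agent, {}))
    return gr.Dropdown(choices=tasks, value=tasks[0] if tasks else None)

def on_task(agent, task):
    n = TRACES.get(agent, {}).get(task, 0)
    return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(n)],
                       value=0 if n else None)

with gr.Blocks() as cascade_demo:
    agent = gr.Dropdown(choices=list(TRACES), label="Select Agent")
    task = gr.Dropdown(label="Select Task")
    step = gr.Dropdown(label="Select Step")
    # Each change event feeds the next dropdown, as in app.py's handlers.
    agent.change(on_agent, inputs=[agent], outputs=[task])
    task.change(on_task, inputs=[agent, task], outputs=[step])

# cascade_demo.launch()
```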
792
+
793
+
794
+ with gr.Tab("SWE-bench Lite"):
795
  gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Lite is a subset of 300 tasks of the original SWE-bench. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
796
  with gr.Row():
797
  with gr.Column(scale=2):
798
  Leaderboard(
799
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), ci_metrics=["Accuracy", "Total Cost"]),
800
  select_columns=SelectColumns(
801
  default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
802
  cant_deselect=["Agent Name"],
803
  label="Select Columns to Display:",
804
  ),
805
+ hide_columns=config.SWEBENCH_HIDE_COLUMNS,
806
  search_columns=config.SWEBENCH_SEARCH_COLUMNS,
 
807
  )
808
+ gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
809
  with gr.Row():
810
+ gr.Markdown("### Accuracy vs. Cost for SWE-bench agents")
811
+ with gr.Row():
812
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
813
+
814
+ gr.HTML('<div style="height: 30px;"></div>')
815
  gr.Markdown("## Task success heatmap")
816
+ gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in SWE-bench are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
817
  with gr.Row():
818
  task_success_heatmap = gr.Plot()
819
  demo.load(
820
  lambda: create_task_success_heatmap(
821
  preprocessor.get_task_success_data('swebench_lite'),
822
+ 'SWE-bench Lite'
823
  ),
824
  outputs=[task_success_heatmap]
825
  )
826
 
827
+ gr.HTML("""
828
+ <style>
829
+ .grouped-section {
830
+ border: 2px solid #dee2e6; /* Color matching unactivated tabs */
831
+ border-radius: 10px;
832
+ padding: 30px;
833
+ margin-top: 40px;
834
+ margin-bottom: 40px;
835
+ position: relative;
836
+ }
837
 
838
+ .grouped-section-title {
839
+ font-size: 1.7em;
840
+ font-weight: bold;
841
+ color: #2c3e50;
842
+ margin-bottom: 20px;
843
+ padding-bottom: 10px;
844
+ border-bottom: 2px solid #dee2e6;
845
+ }
846
+ </style>
847
+ """)
848
+ with gr.Group(elem_classes=["grouped-section"]):
849
+ gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor")
850
 
851
+ gr.HTML('<div style="height: 10px;"></div>')
852
+ gr.Markdown("## Failure report for each agent")
853
+ gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
854
+ gr.HTML('<div style="height: 10px;"></div>')
855
+ with gr.Row():
856
+ with gr.Column(scale=1):
857
+ failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
858
+ gr.HTML('<div style="height: 10px;"></div>')
859
+ with gr.Row():
860
+ with gr.Column(scale=1):
861
+ failure_categories_overview = gr.Markdown()
862
+
863
+ with gr.Column(scale=1):
864
+ failure_categories_chart = gr.Plot()
865
 
866
+ # Initialize the failure report agent dropdown with all agents
867
+ demo.load(update_agent_dropdown,
868
+ inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
869
+ outputs=[failure_report_agent_dropdown])
870
+
871
+ # Update failure report when agent is selected
872
+ failure_report_agent_dropdown.change(update_failure_report,
873
+ inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
874
+ outputs=[failure_categories_overview, failure_categories_chart])
875
+
876
+ gr.HTML('<div style="height: 30px;"></div>')
877
+ gr.Markdown("## Task overview")
878
+ gr.HTML('<div style="height: 10px;"></div>')
879
+ with gr.Row():
880
+ with gr.Column(scale=1):
881
+ agent_dropdown = gr.Dropdown(label="Select Agent")
882
+ with gr.Column(scale=1):
883
+ task_dropdown = gr.Dropdown(label="Select SWE-bench Lite Task")
884
+ gr.HTML('<div style="height: 10px;"></div>')
885
+ with gr.Row():
886
+ task_overview = gr.Markdown()
887
+ with gr.Row():
888
+ flow_chart = gr.Plot(label="Task Flow")
889
+
890
+ # Initialize the agent dropdown with the best agent
891
+ demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
892
+ demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
893
+
894
+ agent_dropdown.change(update_task_analysis,
895
+ inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown],
896
+ outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
897
+ task_dropdown.change(update_task_details,
898
+ inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
899
+ outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
900
 
901
+ gr.Markdown("## Raw predictions")
902
+ gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.')
903
+ with gr.Accordion("Expand to inspect raw predictions of agents...", open=False):
904
+ with gr.Row():
905
+ with gr.Column(scale=1):
906
+ raw_agent_dropdown = gr.Dropdown(label="Select Agent")
907
+ with gr.Column(scale=1):
908
+ raw_task_dropdown = gr.Dropdown(label="Select Task")
909
+ with gr.Column(scale=1):
910
+ raw_step_dropdown = gr.Dropdown(label="Select Step")
911
+ with gr.Row():
912
+ raw_call_details = gr.HTML()
913
+
914
+ def update_raw_task_dropdown(agent_name):
915
+ analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
916
+ if not analyzed_traces:
917
+ return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
918
+ task_ids = list(analyzed_traces.keys())
919
+ steps = analyzed_traces[task_ids[0]]['steps']
920
+ return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "swebench_lite")[task_ids[0]]['steps'][0], 0)
921
+
922
+ def update_raw_step_dropdown(agent_name, task_id):
923
+ analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
924
+ if not analyzed_traces or task_id not in analyzed_traces:
925
+ return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
926
+ steps = analyzed_traces[task_id]['steps']
927
+ return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
928
+
929
+ def update_raw_call_details(agent_name, task_id, step_index):
930
+ analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
931
+ if not analyzed_traces or task_id not in analyzed_traces:
932
+ return "No data available for this selection."
933
+ steps = analyzed_traces[task_id]['steps']
934
+ if step_index is None:
935
+ return "Invalid step selection."
936
+ step = steps[step_index]
937
+ return format_call_info(step, step_index)
938
+
939
+ # Initialize the raw agent dropdown with all agents
940
+ demo.load(update_agent_dropdown,
941
+ inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
942
+ outputs=[raw_agent_dropdown])
943
+ demo.load(update_raw_task_dropdown,
944
+ inputs=[raw_agent_dropdown],
945
+ outputs=[raw_task_dropdown, raw_step_dropdown])
946
+ demo.load(update_raw_call_details,
947
+ inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
948
+ outputs=[raw_call_details])
949
+
950
+ raw_agent_dropdown.change(update_raw_task_dropdown,
951
+ inputs=[raw_agent_dropdown],
952
+ outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
953
+ raw_task_dropdown.change(update_raw_step_dropdown,
954
+ inputs=[raw_agent_dropdown, raw_task_dropdown],
955
+ outputs=[raw_step_dropdown, raw_call_details])
956
+ raw_step_dropdown.change(update_raw_call_details,
957
+ inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
958
+ outputs=[raw_call_details])
959
+
960
+
961
+
962
  with gr.Tab("MLAgentBench"):
963
  gr.Markdown("""MLAgentBench is a suite of end-to-end Machine Learning (ML) experimentation tasks, where the agent aims to take a given dataset and a machine learning task description and autonomously develop or improve an ML model. We are currently actively developing this platform and this benchmark is not fully implemented yet. In particular, we only include one agent and a subset of tasks for this benchmark.""")
964
  with gr.Row():
965
  with gr.Column(scale=2):
966
  Leaderboard(
967
+ value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench')),
968
  select_columns=SelectColumns(
969
  default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS + ["Verified"],
970
  cant_deselect=["Agent Name"],
971
  label="Select Columns to Display:",
972
  ),
 
973
  hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
974
+ search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
975
  )
976
+ gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
977
  with gr.Row():
978
+ gr.Markdown("### Accuracy vs. Cost for MLAgentBench agents")
979
  with gr.Row():
980
+ scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench', aggregate=False), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
 
981
 
982
+ # gr.HTML('<div style="height: 30px;"></div>')
983
+ # gr.Markdown("## Task success heatmap")
984
+ # gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
985
+ # with gr.Row():
986
+ # task_success_heatmap = gr.Plot()
987
+ # demo.load(
988
+ # lambda: create_task_success_heatmap(
989
+ # preprocessor.get_task_success_data('usaco'),
990
+ # 'USACO'
991
+ # ),
992
+ # outputs=[task_success_heatmap]
993
+ # )
994
 
995
+ gr.HTML("""
996
+ <style>
997
+ .grouped-section {
998
+ border: 2px solid #dee2e6; /* Color matching unactivated tabs */
999
+ border-radius: 10px;
1000
+ padding: 30px;
1001
+ margin-top: 40px;
1002
+ margin-bottom: 40px;
1003
+ position: relative;
1004
+ }
1005
+
1006
+ .grouped-section-title {
1007
+ font-size: 1.7em;
1008
+ font-weight: bold;
1009
+ color: #2c3e50;
1010
+ margin-bottom: 20px;
1011
+ padding-bottom: 10px;
1012
+ border-bottom: 2px solid #dee2e6;
1013
+ }
1014
+ </style>
1015
+ """)
1016
+ with gr.Group(elem_classes=["grouped-section"]):
1017
+ gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor")
1018
+
1019
+ # gr.HTML('<div style="height: 10px;"></div>')
1020
+ # gr.Markdown("## Failure report for each agent")
1021
+ # gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
1022
+ # gr.HTML('<div style="height: 10px;"></div>')
1023
+ # with gr.Row():
1024
+ # with gr.Column(scale=1):
1025
+ # failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
1026
+ # gr.HTML('<div style="height: 10px;"></div>')
1027
+ # with gr.Row():
1028
+ # with gr.Column(scale=1):
1029
+ # failure_categories_overview = gr.Markdown()
1030
+
1031
+ # with gr.Column(scale=1):
1032
+ # failure_categories_chart = gr.Plot()
1033
+
1034
+ # # Initialize the failure report agent dropdown with all agents
1035
+ # demo.load(update_agent_dropdown,
1036
+ # inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
1037
+ # outputs=[failure_report_agent_dropdown])
1038
+
1039
+ # # Update failure report when agent is selected
1040
+ # failure_report_agent_dropdown.change(update_failure_report,
1041
+ # inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
1042
+ # outputs=[failure_categories_overview, failure_categories_chart])
1043
+
1044
+ gr.HTML('<div style="height: 30px;"></div>')
1045
+ gr.Markdown("## Task overview")
1046
+ gr.HTML('<div style="height: 10px;"></div>')
1047
+ with gr.Row():
1048
+ with gr.Column(scale=1):
1049
+ agent_dropdown = gr.Dropdown(label="Select Agent")
1050
+ with gr.Column(scale=1):
1051
+ task_dropdown = gr.Dropdown(label="Select MLAgentBench Task")
1052
+ gr.HTML('<div style="height: 10px;"></div>')
1053
+ with gr.Row():
1054
+ task_overview = gr.Markdown()
1055
+ with gr.Row():
1056
+ flow_chart = gr.Plot(label="Task Flow")
1057
+
1058
+ # Initialize the agent dropdown with the best agent
1059
+ demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], outputs=[agent_dropdown])
1060
+ demo.load(update_task_analysis, inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
1061
+
1062
+ agent_dropdown.change(update_task_analysis,
1063
+ inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown],
1064
+ outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
1065
+ task_dropdown.change(update_task_details,
1066
+ inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
1067
+ outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
1068
+
1069
+ gr.Markdown("## Raw predictions")
1070
+ gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.')
1071
+ with gr.Accordion("Expand to inspect raw predictions of agents...", open=False):
1072
+ with gr.Row():
1073
+ with gr.Column(scale=1):
1074
+ raw_agent_dropdown = gr.Dropdown(label="Select Agent")
1075
+ with gr.Column(scale=1):
1076
+ raw_task_dropdown = gr.Dropdown(label="Select Task")
1077
+ with gr.Column(scale=1):
1078
+ raw_step_dropdown = gr.Dropdown(label="Select Step")
1079
+ with gr.Row():
1080
+ raw_call_details = gr.HTML()
1081
+
1082
+ def update_raw_task_dropdown(agent_name):
1083
+ analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
1084
+ if not analyzed_traces:
1085
+ return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
1086
+ task_ids = list(analyzed_traces.keys())
1087
+ steps = analyzed_traces[task_ids[0]]['steps']
1088
+ return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "mlagentbench")[task_ids[0]]['steps'][0], 0)
1089
+
1090
+ def update_raw_step_dropdown(agent_name, task_id):
1091
+ analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
1092
+ if not analyzed_traces or task_id not in analyzed_traces:
1093
+ return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
1094
+ steps = analyzed_traces[task_id]['steps']
1095
+ return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
1096
+
1097
+ def update_raw_call_details(agent_name, task_id, step_index):
1098
+ analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
1099
+ if not analyzed_traces or task_id not in analyzed_traces:
1100
+ return "No data available for this selection."
1101
+ steps = analyzed_traces[task_id]['steps']
1102
+ if step_index is None:
1103
+ return "Invalid step selection."
1104
+ step = steps[step_index]
1105
+ return format_call_info(step, step_index)
1106
+
1107
+ # Initialize the raw agent dropdown with all agents
1108
+ demo.load(update_agent_dropdown,
1109
+ inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
1110
+ outputs=[raw_agent_dropdown])
1111
+ demo.load(update_raw_task_dropdown,
1112
+ inputs=[raw_agent_dropdown],
1113
+ outputs=[raw_task_dropdown, raw_step_dropdown])
1114
+ demo.load(update_raw_call_details,
1115
+ inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
1116
+ outputs=[raw_call_details])
1117
+
1118
+ raw_agent_dropdown.change(update_raw_task_dropdown,
1119
+ inputs=[raw_agent_dropdown],
1120
+ outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
1121
+ raw_task_dropdown.change(update_raw_step_dropdown,
1122
+ inputs=[raw_agent_dropdown, raw_task_dropdown],
1123
+ outputs=[raw_step_dropdown, raw_call_details])
1124
+ raw_step_dropdown.change(update_raw_call_details,
1125
+ inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
1126
+ outputs=[raw_call_details])
1127
+
1128
+
1129
+ # with gr.Tab("SWE-Bench Verified"):
1130
+ # gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. The We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
1131
+ # with gr.Row():
1132
+ # with gr.Column(scale=2):
1133
+ # Leaderboard(
1134
+ # value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'),
1135
+ # select_columns=SelectColumns(
1136
+ # default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
1137
+ # cant_deselect=["Agent Name"],
1138
+ # label="Select Columns to Display:",
1139
+ # ),
1140
+ # hide_columns=config.SWEBENCH_HIDE_COLUMNS,
1141
+ # search_columns=config.SWEBENCH_SEARCH_COLUMNS
1142
+ # )
1143
+ # gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
1144
+ # with gr.Row():
1145
+ # scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1146
 
1147
+ # gr.Markdown("")
1148
+ # gr.Markdown("")
1149
+ # gr.Markdown("## Task success heatmap")
1150
+ # with gr.Row():
1151
+ # task_success_heatmap = gr.Plot()
1152
+ # demo.load(
1153
+ # lambda: create_task_success_heatmap(
1154
+ # preprocessor.get_task_success_data('swebench_verified'),
1155
+ # 'SWEBench Verified'
1156
+ # ),
1157
+ # outputs=[task_success_heatmap]
1158
+ # )
1159
 
1160
+ # gr.Markdown("")
1161
+ # gr.Markdown("")
1162
+ # gr.Markdown("## Failure report for each agent")
1163
+ # with gr.Row():
1164
+ # with gr.Column(scale=1):
1165
+ # failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
1166
+ # with gr.Row():
1167
+ # with gr.Column(scale=1):
1168
+ # failure_categories_overview = gr.Markdown()
1169
+
1170
+ # with gr.Column(scale=1):
1171
+ # failure_categories_chart = gr.Plot()
1172
 
1173
+ # # Initialize the failure report agent dropdown with all agents
+ # demo.load(update_agent_dropdown,
+ #     inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+ #     outputs=[failure_report_agent_dropdown])
+
+ # # Update failure report when agent is selected
+ # failure_report_agent_dropdown.change(update_failure_report,
+ #     inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
+ #     outputs=[failure_categories_overview, failure_categories_chart])
+
+ # gr.Markdown("")
+ # gr.Markdown("")
+ # gr.Markdown("## Agent monitor")
+ # with gr.Row():
+ #     with gr.Column(scale=1):
+ #         agent_dropdown = gr.Dropdown(label="Select Agent")
+ #     with gr.Column(scale=1):
+ #         task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
+ # with gr.Row():
+ #     task_overview = gr.Markdown()
+ # with gr.Row():
+ #     flow_chart = gr.Plot(label="Task Flow")
+
+ # # Initialize the agent dropdown with the best agent
+ # demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+ # demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+ # agent_dropdown.change(update_task_analysis,
+ #     inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
+ #     outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+ # task_dropdown.change(update_task_details,
+ #     inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
+ #     outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+
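+ # # Monitor wiring recap: choosing an agent re-runs update_task_analysis to
+ # # refresh the overview, flow chart, and task list; choosing a task re-runs
+ # # update_task_details for that task.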
+ # gr.Markdown("## Raw predictions")
1208
+ # with gr.Row():
1209
+ # with gr.Column(scale=1):
1210
+ # raw_agent_dropdown = gr.Dropdown(label="Select Agent")
1211
+ # with gr.Column(scale=1):
1212
+ # raw_task_dropdown = gr.Dropdown(label="Select Task")
1213
+ # with gr.Column(scale=1):
1214
+ # raw_step_dropdown = gr.Dropdown(label="Select Step")
1215
+
1216
+ # with gr.Row():
1217
+ # raw_call_details = gr.HTML()
1218
+
1219
+ # def update_raw_task_dropdown(agent_name):
1220
+ # analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
1221
+ # if not analyzed_traces:
1222
+ # return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
1223
+ # task_ids = list(analyzed_traces.keys())
1224
+ # steps = analyzed_traces[task_ids[0]]['steps']
1225
+ # return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
1226
+
1227
+ # def update_raw_step_dropdown(agent_name, task_id):
1228
+ # analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
1229
+ # if not analyzed_traces or task_id not in analyzed_traces:
1230
+ # return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
1231
+ # steps = analyzed_traces[task_id]['steps']
1232
+ # return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
1233
+
1234
+ # def update_raw_call_details(agent_name, task_id, step_index):
1235
+ # analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
1236
+ # if not analyzed_traces or task_id not in analyzed_traces:
1237
+ # return "No data available for this selection."
1238
+ # steps = analyzed_traces[task_id]['steps']
1239
+ # if step_index is None:
1240
+ # return "Invalid step selection."
1241
+ # step = steps[step_index]
1242
+ # return format_call_info(step, step_index)
1243
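+
+ # # The three dropdowns above cascade: picking an agent repopulates the task
+ # # list, picking a task repopulates the step list, and picking a step renders
+ # # that step's LLM call via format_call_info.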
+
+ # # Initialize the raw agent dropdown with all agents
+ # demo.load(update_agent_dropdown,
+ #     inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+ #     outputs=[raw_agent_dropdown])
+ # demo.load(update_raw_task_dropdown,
+ #     inputs=[raw_agent_dropdown],
+ #     outputs=[raw_task_dropdown, raw_step_dropdown])
+ # demo.load(update_raw_call_details,
+ #     inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+ #     outputs=[raw_call_details])
+
+ # raw_agent_dropdown.change(update_raw_task_dropdown,
+ #     inputs=[raw_agent_dropdown],
+ #     outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
+ # raw_task_dropdown.change(update_raw_step_dropdown,
+ #     inputs=[raw_agent_dropdown, raw_task_dropdown],
+ #     outputs=[raw_step_dropdown, raw_call_details])
+ # raw_step_dropdown.change(update_raw_call_details,
+ #     inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+ #     outputs=[raw_call_details])
+
+ # with gr.Tab("SWE-Bench Lite"):
1266
+ # gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Lite is a subset of 300 tasks of the original SWE-bench. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
1267
+ # with gr.Row():
1268
+ # with gr.Column(scale=2):
1269
+ # Leaderboard(
1270
+ # value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), ci_metrics=['Accuracy', 'Total Cost']),
1271
+ # select_columns=SelectColumns(
1272
+ # default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
1273
+ # cant_deselect=["Agent Name"],
1274
+ # label="Select Columns to Display:",
1275
+ # ),
1276
+ # search_columns=config.SWEBENCH_SEARCH_COLUMNS,
1277
+ # hide_columns=config.SWEBENCH_HIDE_COLUMNS
1278
+ # )
1279
+ # # make right aligned markdown
1280
+ # gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
1281
+ # with gr.Row():
1282
+ # scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite', aggregate=True), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
1283
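+
+ #     # Unlike the Verified tab, this leaderboard wraps parse_json_files in
+ #     # create_leaderboard with ci_metrics=['Accuracy', 'Total Cost'], and the
+ #     # scatter plot aggregates repeated runs (aggregate=True) before plotting.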
+
+ #     gr.Markdown("")
+ #     gr.Markdown("")
+ #     gr.Markdown("## Task success heatmap")
+ #     with gr.Row():
+ #         task_success_heatmap = gr.Plot()
+ #     demo.load(
+ #         lambda: create_task_success_heatmap(
+ #             preprocessor.get_task_success_data('swebench_lite'),
+ #             'SWEBench Lite'
+ #         ),
+ #         outputs=[task_success_heatmap]
+ #     )
+
+ #     gr.Markdown("")
+ #     gr.Markdown("")
+ #     gr.Markdown("## Failure report for each agent")
+ #     with gr.Row():
+ #         with gr.Column(scale=1):
+ #             failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+ #     with gr.Row():
+ #         with gr.Column(scale=1):
+ #             failure_categories_overview = gr.Markdown()
+
+ #         with gr.Column(scale=1):
+ #             failure_categories_chart = gr.Plot()

+ #     # Initialize the failure report agent dropdown with all agents
+ #     demo.load(update_agent_dropdown,
+ #         inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+ #         outputs=[failure_report_agent_dropdown])
+
+ #     # Update failure report when agent is selected
+ #     failure_report_agent_dropdown.change(update_failure_report,
+ #         inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
+ #         outputs=[failure_categories_overview, failure_categories_chart])
+
+ #     gr.Markdown("")
+ #     gr.Markdown("")
+ #     gr.Markdown("## Agent monitor")
+ #     with gr.Row():
+ #         with gr.Column(scale=1):
+ #             agent_dropdown = gr.Dropdown(label="Select Agent")
+ #         with gr.Column(scale=1):
+ #             task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
+ #     with gr.Row():
+ #         task_overview = gr.Markdown()
+ #     with gr.Row():
+ #         flow_chart = gr.Plot(label="Task Flow")
+
+ #     # Initialize the agent dropdown with the best agent
+ #     demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+ #     demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+ #     agent_dropdown.change(update_task_analysis,
+ #         inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown],
+ #         outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+ #     task_dropdown.change(update_task_details,
+ #         inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
+ #         outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])

+
+ #     gr.Markdown("## Raw predictions")
+ #     with gr.Row():
+ #         with gr.Column(scale=1):
+ #             raw_agent_dropdown = gr.Dropdown(label="Select Agent")
+ #         with gr.Column(scale=1):
+ #             raw_task_dropdown = gr.Dropdown(label="Select Task")
+ #         with gr.Column(scale=1):
+ #             raw_step_dropdown = gr.Dropdown(label="Select Step")
+
+ #     with gr.Row():
+ #         raw_call_details = gr.HTML()
+
+ #     def update_raw_task_dropdown(agent_name):
+ #         analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
+ #         if not analyzed_traces:
+ #             return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
+ #         task_ids = list(analyzed_traces.keys())
+ #         steps = analyzed_traces[task_ids[0]]['steps']
+ #         return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
+
+ #     def update_raw_step_dropdown(agent_name, task_id):
+ #         analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
+ #         if not analyzed_traces or task_id not in analyzed_traces:
+ #             return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
+ #         steps = analyzed_traces[task_id]['steps']
+ #         return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
+
+ #     def update_raw_call_details(agent_name, task_id, step_index):
+ #         analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
+ #         if not analyzed_traces or task_id not in analyzed_traces:
+ #             return "No data available for this selection."
+ #         steps = analyzed_traces[task_id]['steps']
+ #         if step_index is None:
+ #             return "Invalid step selection."
+ #         step = steps[step_index]
+ #         return format_call_info(step, step_index)
+
+ #     # Initialize the raw agent dropdown with all agents
+ #     demo.load(update_agent_dropdown,
+ #         inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+ #         outputs=[raw_agent_dropdown])
+ #     demo.load(update_raw_task_dropdown,
+ #         inputs=[raw_agent_dropdown],
+ #         outputs=[raw_task_dropdown, raw_step_dropdown])
+ #     demo.load(update_raw_call_details,
+ #         inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+ #         outputs=[raw_call_details])
+
+ #     raw_agent_dropdown.change(update_raw_task_dropdown,
+ #         inputs=[raw_agent_dropdown],
+ #         outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
+ #     raw_task_dropdown.change(update_raw_step_dropdown,
+ #         inputs=[raw_agent_dropdown, raw_task_dropdown],
+ #         outputs=[raw_step_dropdown, raw_call_details])
+ #     raw_step_dropdown.change(update_raw_call_details,
+ #         inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+ #         outputs=[raw_call_details])
+

+
+ # with gr.Tab("MLAgentBench"):
+ #     gr.Markdown("""MLAgentBench is a suite of end-to-end Machine Learning (ML) experimentation tasks, where the agent aims to take a given dataset and a machine learning task description and autonomously develop or improve an ML model. We are currently actively developing this platform, and this benchmark is not fully implemented yet. In particular, we only include one agent and a subset of tasks for this benchmark.""")
+ #     with gr.Row():
+ #         with gr.Column(scale=2):
+ #             Leaderboard(
+ #                 value=parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'),
+ #                 select_columns=SelectColumns(
+ #                     default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS + ["Verified"],
+ #                     cant_deselect=["Agent Name"],
+ #                     label="Select Columns to Display:",
+ #                 ),
+ #                 search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
+ #                 hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
+ #             )
+ #     gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
+ #     with gr.Row():
+ #         scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench', aggregate=False), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
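+
+ #     # MLAgentBench reports an "Overall Score" rather than "Accuracy", so the
+ #     # hidden metric textboxes in this tab pass "Overall Score" when
+ #     # initializing the agent dropdowns.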
+
+ #     gr.Markdown("")
+ #     gr.Markdown("")
+ #     gr.Markdown("## Failure report for each agent")
+ #     with gr.Row():
+ #         with gr.Column(scale=1):
+ #             failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+ #     with gr.Row():
+ #         with gr.Column(scale=1):
+ #             failure_categories_overview = gr.Markdown()

+ #         with gr.Column(scale=1):
+ #             failure_categories_chart = gr.Plot()
+
+ #     # Initialize the failure report agent dropdown with all agents
+ #     demo.load(update_agent_dropdown,
+ #         inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
+ #         outputs=[failure_report_agent_dropdown])
+
+ #     # Update failure report when agent is selected
+ #     failure_report_agent_dropdown.change(update_failure_report,
+ #         inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
+ #         outputs=[failure_categories_overview, failure_categories_chart])
+
+ #     gr.Markdown("")
+ #     gr.Markdown("")
+ #     gr.Markdown("## Agent monitor")
+ #     with gr.Row():
+ #         with gr.Column(scale=1):
+ #             agent_dropdown = gr.Dropdown(label="Select Agent")
+ #         with gr.Column(scale=1):
+ #             task_dropdown = gr.Dropdown(label="Select MLAgentBench Task")
+ #     with gr.Row():
+ #         task_overview = gr.Markdown()
+ #     with gr.Row():
+ #         flow_chart = gr.Plot(label="Task Flow")
+
+ #     # Initialize the agent dropdown with the best agent
+ #     demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], outputs=[agent_dropdown])
+ #     demo.load(update_task_analysis, inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+ #     agent_dropdown.change(update_task_analysis,
+ #         inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown],
+ #         outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+ #     task_dropdown.change(update_task_details,
+ #         inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
+ #         outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+
+
+ # gr.Markdown("## Raw predictions")
1472
+ # with gr.Row():
1473
+ # with gr.Column(scale=1):
1474
+ # raw_agent_dropdown = gr.Dropdown(label="Select Agent")
1475
+ # with gr.Column(scale=1):
1476
+ # raw_task_dropdown = gr.Dropdown(label="Select Task")
1477
+ # with gr.Column(scale=1):
1478
+ # raw_step_dropdown = gr.Dropdown(label="Select Step")
1479
+
1480
+ # with gr.Row():
1481
+ # raw_call_details = gr.HTML()
1482
 
1483
+ # def update_raw_task_dropdown(agent_name):
1484
+ # analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
1485
+ # if not analyzed_traces:
1486
+ # return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
1487
+ # task_ids = list(analyzed_traces.keys())
1488
+ # steps = analyzed_traces[task_ids[0]]['steps']
1489
+ # return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
1490
+
1491
+ # def update_raw_step_dropdown(agent_name, task_id):
1492
+ # analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
1493
+ # if not analyzed_traces or task_id not in analyzed_traces:
1494
+ # return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
1495
+ # steps = analyzed_traces[task_id]['steps']
1496
+ # return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
1497
+
1498
+ # def update_raw_call_details(agent_name, task_id, step_index):
1499
+ # analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
1500
+ # if not analyzed_traces or task_id not in analyzed_traces:
1501
+ # return "No data available for this selection."
1502
+ # steps = analyzed_traces[task_id]['steps']
1503
+ # if step_index is None:
1504
+ # return "Invalid step selection."
1505
+ # step = steps[step_index]
1506
+ # return format_call_info(step, step_index)
1507
+
1508
+ # # Initialize the raw agent dropdown with all agents
1509
+ # demo.load(update_agent_dropdown,
1510
+ # inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
1511
+ # outputs=[raw_agent_dropdown])
1512
+ # demo.load(update_raw_task_dropdown,
1513
+ # inputs=[raw_agent_dropdown],
1514
+ # outputs=[raw_task_dropdown, raw_step_dropdown])
1515
+ # demo.load(update_raw_call_details,
1516
+ # inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
1517
+ # outputs=[raw_call_details])
1518
+
1519
+ # raw_agent_dropdown.change(update_raw_task_dropdown,
1520
+ # inputs=[raw_agent_dropdown],
1521
+ # outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
1522
+ # raw_task_dropdown.change(update_raw_step_dropdown,
1523
+ # inputs=[raw_agent_dropdown, raw_task_dropdown],
1524
+ # outputs=[raw_step_dropdown, raw_call_details])
1525
+ # raw_step_dropdown.change(update_raw_call_details,
1526
+ # inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
1527
+ # outputs=[raw_call_details])
1528
 

  with gr.Tab("About"):

  # Download the results from the Hugging Face Hub
  # await asyncio.to_thread(download_latest_results)

+ # # Check for new uploads and process them
  # await check_and_process_uploads()

  scheduler = AsyncIOScheduler()
  scheduler.add_job(restart_space, "interval", hours=1)
+ # scheduler.add_job(download_latest_results, "interval", hours=1)
  # scheduler.add_job(check_and_process_uploads, "interval", hours=1)
  scheduler.start()
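+ # Note: restart_space is the only scheduled job currently enabled; the result
+ # download and upload-processing jobs remain commented out during development.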