Upload app.py

app.py CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
 import pandas as pd
 import os
 import json
-from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart, create_task_success_heatmap
 from utils.processing import check_and_process_uploads
 from huggingface_hub import snapshot_download
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -48,8 +48,8 @@ def get_analyzed_traces(agent_name, benchmark_name):
 def get_failure_report(agent_name, benchmark_name):
     return preprocessor.get_failure_report(agent_name, benchmark_name)

-def parse_json_files(folder_path, benchmark_name):
-    return preprocessor.get_parsed_results(benchmark_name)

 def update_agent_dropdown(benchmark_name, metric):
     df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
@@ -463,7 +463,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
-                    value=parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'),
                     select_columns=SelectColumns(
                         default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified"],
                         cant_deselect=["Agent Name"],
@@ -472,15 +472,15 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                     hide_columns=config.USACO_HIDE_COLUMNS,
                     search_columns=config.USACO_SEARCH_COLUMNS,
                 )
         with gr.Row():
             gr.Markdown("### Accuracy vs. Cost for USACO agents")
         with gr.Row():
-            scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))

-        gr.
-        gr.Markdown("")
         gr.Markdown("## Task success heatmap")
-        gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the fewest).")
         with gr.Row():
             task_success_heatmap = gr.Plot()
         demo.load(
@@ -624,400 +624,907 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                 inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                 outputs=[raw_call_details])

-
-    with gr.Tab("SWE-Bench Verified"):
         gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
-                    value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'),
                     select_columns=SelectColumns(
                         default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
                         cant_deselect=["Agent Name"],
                         label="Select Columns to Display:",
                     ),
                     hide_columns=config.SWEBENCH_HIDE_COLUMNS,
-                    search_columns=config.SWEBENCH_SEARCH_COLUMNS
                 )
         with gr.Row():
-
 
-        gr.
-        gr.Markdown("")
         gr.Markdown("## Task success heatmap")
         with gr.Row():
             task_success_heatmap = gr.Plot()
         demo.load(
             lambda: create_task_success_heatmap(
                 preprocessor.get_task_success_data('swebench_verified'),
-                'SWEBench Verified'
             ),
             outputs=[task_success_heatmap]
         )
-
-        gr.Markdown("")
-        gr.Markdown("")
-        gr.Markdown("## Failure report for each agent")
-        with gr.Row():
-            with gr.Column(scale=1):
-                failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
-        with gr.Row():
-            with gr.Column(scale=1):
-                failure_categories_overview = gr.Markdown()
-
-            with gr.Column(scale=1):
-                failure_categories_chart = gr.Plot()
 
-
 
         gr.Markdown("## Raw predictions")
-
         gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Lite is a subset of 300 tasks of the original SWE-bench. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
-                    value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
                     select_columns=SelectColumns(
                         default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
                         cant_deselect=["Agent Name"],
                         label="Select Columns to Display:",
                     ),
                     search_columns=config.SWEBENCH_SEARCH_COLUMNS,
-                    hide_columns=config.SWEBENCH_HIDE_COLUMNS
                 )
         with gr.Row():
-
         gr.Markdown("## Task success heatmap")
         with gr.Row():
             task_success_heatmap = gr.Plot()
         demo.load(
             lambda: create_task_success_heatmap(
                 preprocessor.get_task_success_data('swebench_lite'),
-                'SWEBench Lite'
             ),
             outputs=[task_success_heatmap]
         )
-
-        gr.Markdown("")
-        gr.Markdown("")
-        gr.Markdown("## Failure report for each agent")
-        with gr.Row():
-            with gr.Column(scale=1):
-                failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
-        with gr.Row():
-            with gr.Column(scale=1):
-                failure_categories_overview = gr.Markdown()
-
-            with gr.Column(scale=1):
-                failure_categories_chart = gr.Plot()
 
-
-        gr.Markdown("")
-        gr.Markdown("")
-        gr.Markdown("## Agent monitor")
-        with gr.Row():
-            with gr.Column(scale=1):
-                agent_dropdown = gr.Dropdown(label="Select Agent")
-            with gr.Column(scale=1):
-                task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
-        with gr.Row():
-            task_overview = gr.Markdown()
-        with gr.Row():
-            flow_chart = gr.Plot(label="Task Flow")
 
-
 
-
 
-
-                return "Invalid step selection."
-            step = steps[step_index]
-            return format_call_info(step, step_index)
-
-        # Initialize the raw agent dropdown with all agents
-        demo.load(update_agent_dropdown,
-                  inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
-                  outputs=[raw_agent_dropdown])
-        demo.load(update_raw_task_dropdown,
-                  inputs=[raw_agent_dropdown],
-                  outputs=[raw_task_dropdown, raw_step_dropdown])
-        demo.load(update_raw_call_details,
-                  inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
-                  outputs=[raw_call_details])
-
-        raw_agent_dropdown.change(update_raw_task_dropdown,
-                                  inputs=[raw_agent_dropdown],
-                                  outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
-        raw_task_dropdown.change(update_raw_step_dropdown,
-                                 inputs=[raw_agent_dropdown, raw_task_dropdown],
-                                 outputs=[raw_step_dropdown, raw_call_details])
-        raw_step_dropdown.change(update_raw_call_details,
-                                 inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
-                                 outputs=[raw_call_details])
 
     with gr.Tab("MLAgentBench"):
         gr.Markdown("""MLAgentBench is a suite of end-to-end Machine Learning (ML) experimentation tasks, where the agent aims to take a given dataset and a machine learning task description and autonomously develop or improve an ML model. We are currently actively developing this platform and this benchmark is not fully implemented yet. In particular, we only include one agent and a subset of tasks for this benchmark.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
-                    value=parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'),
                     select_columns=SelectColumns(
                         default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS + ["Verified"],
                         cant_deselect=["Agent Name"],
                         label="Select Columns to Display:",
                     ),
-                    search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
                     hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
                 )
         with gr.Row():
-
-        gr.Markdown("")
-        gr.Markdown("")
-        gr.Markdown("## Failure report for each agent")
-        with gr.Row():
-            with gr.Column(scale=1):
-                failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
         with gr.Row():
-
-            failure_categories_overview = gr.Markdown()
 
-
 
-
 
-
 
 
     with gr.Tab("About"):
@@ -1044,12 +1551,12 @@ async def main():
     # Download the results from the Hugging Face Hub
     # await asyncio.to_thread(download_latest_results)

-    # Check for new uploads and process them
     # await check_and_process_uploads()

     scheduler = AsyncIOScheduler()
     scheduler.add_job(restart_space, "interval", hours=1)
-    scheduler.add_job(download_latest_results, "interval", hours=1)
     # scheduler.add_job(check_and_process_uploads, "interval", hours=1)
     scheduler.start()
 
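The scheduler hunk keeps the hourly restart_space job and drops the hourly download_latest_results job. For reference, a self-contained sketch of the APScheduler pattern used here (the job body is a stand-in for the app's real restart call, not the app's code):

import asyncio
from apscheduler.schedulers.asyncio import AsyncIOScheduler

def restart_space():
    print("restarting space...")  # stand-in for the app's real restart call

async def main():
    scheduler = AsyncIOScheduler()
    scheduler.add_job(restart_space, "interval", hours=1)
    scheduler.start()
    await asyncio.Event().wait()  # keep the event loop alive so jobs can fire

# asyncio.run(main())

The added lines of the same hunks follow below.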
@@ -6,7 +6,7 @@ from pathlib import Path
 import pandas as pd
 import os
 import json
+from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart, create_task_success_heatmap, create_leaderboard
 from utils.processing import check_and_process_uploads
 from huggingface_hub import snapshot_download
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -48,8 +48,8 @@ def get_analyzed_traces(agent_name, benchmark_name):
 def get_failure_report(agent_name, benchmark_name):
     return preprocessor.get_failure_report(agent_name, benchmark_name)

+def parse_json_files(folder_path, benchmark_name, aggregate=True):
+    return preprocessor.get_parsed_results(benchmark_name, aggregate=aggregate)

 def update_agent_dropdown(benchmark_name, metric):
     df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
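preprocessor.get_parsed_results is defined outside this diff, so the effect of the new aggregate flag is not visible here. A plausible minimal sketch, assuming parsed results arrive as a pandas DataFrame with one row per run (the function name and column names are assumptions for illustration):

import pandas as pd

def get_parsed_results_sketch(df: pd.DataFrame, aggregate: bool = True) -> pd.DataFrame:
    """Hypothetical: one row per run; aggregate=True collapses runs per agent."""
    if not aggregate:
        return df
    return (df.groupby("Agent Name", as_index=False)
              .agg({"Accuracy": "mean", "Total Cost": "mean"}))

# Example: two runs of the same agent collapse into one leaderboard row.
runs = pd.DataFrame({"Agent Name": ["A", "A", "B"],
                     "Accuracy": [0.60, 0.64, 0.55],
                     "Total Cost": [1.10, 0.90, 2.00]})
print(get_parsed_results_sketch(runs))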
@@ -463,7 +463,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
+                    value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), ci_metrics=["Accuracy", "Total Cost"]),
                     select_columns=SelectColumns(
                         default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified"],
                         cant_deselect=["Agent Name"],
@@ -472,15 +472,15 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                     hide_columns=config.USACO_HIDE_COLUMNS,
                     search_columns=config.USACO_SEARCH_COLUMNS,
                 )
+        gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
         with gr.Row():
             gr.Markdown("### Accuracy vs. Cost for USACO agents")
         with gr.Row():
+            scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))

+        gr.HTML('<div style="height: 30px;"></div>')
         gr.Markdown("## Task success heatmap")
+        gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
         with gr.Row():
             task_success_heatmap = gr.Plot()
         demo.load(
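The ci_metrics argument and the "95% CIs" note suggest create_leaderboard attaches per-agent confidence intervals computed across repeated runs. A minimal sketch of a 95% CI via Student's t-distribution (the helper name is hypothetical; only the statistics are standard):

import numpy as np
from scipy import stats

def t_confidence_interval(values, confidence=0.95):
    # Mean and CI half-width for a small sample, Student's t with n-1 dof.
    values = np.asarray(values, dtype=float)
    n = len(values)
    mean = values.mean()
    if n < 2:
        return mean, float("nan")  # a single run has no spread to estimate
    margin = stats.sem(values) * stats.t.ppf((1 + confidence) / 2, df=n - 1)
    return mean, margin

mean, margin = t_confidence_interval([0.62, 0.58, 0.65])  # e.g. three runs
print(f"Accuracy: {mean:.3f} +/- {margin:.3f}")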
@@ -624,400 +624,907 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                 inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                 outputs=[raw_call_details])

+    with gr.Tab("SWE-bench Verified"):
         gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
+                    value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), ci_metrics=["Accuracy", "Total Cost"]),
                     select_columns=SelectColumns(
                         default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
                         cant_deselect=["Agent Name"],
                         label="Select Columns to Display:",
                     ),
                     hide_columns=config.SWEBENCH_HIDE_COLUMNS,
+                    search_columns=config.SWEBENCH_SEARCH_COLUMNS,
                 )
+        gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
         with gr.Row():
+            gr.Markdown("### Accuracy vs. Cost for SWE-bench agents")
+        with gr.Row():
+            scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))

+        gr.HTML('<div style="height: 30px;"></div>')
         gr.Markdown("## Task success heatmap")
+        gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in SWE-bench are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
         with gr.Row():
             task_success_heatmap = gr.Plot()
         demo.load(
             lambda: create_task_success_heatmap(
                 preprocessor.get_task_success_data('swebench_verified'),
+                'SWE-bench Verified'
             ),
             outputs=[task_success_heatmap]
         )
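create_scatter_plot comes from utils.viz, which this commit does not show. A rough Plotly Express stand-in matching the call signature used above (a sketch, not the app's implementation):

import pandas as pd
import plotly.express as px

def scatter_sketch(df: pd.DataFrame, x: str, y: str,
                   x_label: str, y_label: str, hover_fields: list):
    # One marker per run; hovering reveals the agent behind each point.
    fig = px.scatter(df, x=x, y=y, hover_data=hover_fields)
    fig.update_layout(xaxis_title=x_label, yaxis_title=y_label,
                      template="plotly_white")
    return fig

Passing aggregate=False here, in contrast to the leaderboard, plots every run as its own point rather than one averaged point per agent.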
 
+        gr.HTML("""
+        <style>
+            .grouped-section {
+                border: 2px solid #dee2e6;  /* color matching unactivated tabs */
+                border-radius: 10px;
+                padding: 30px;
+                margin-top: 40px;
+                margin-bottom: 40px;
+                position: relative;
+            }
+
+            .grouped-section-title {
+                font-size: 1.7em;
+                font-weight: bold;
+                color: #2c3e50;
+                margin-bottom: 20px;
+                padding-bottom: 10px;
+                border-bottom: 2px solid #dee2e6;
+            }
+        </style>
+        """)
+        with gr.Group(elem_classes=["grouped-section"]):
+            gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor")
+
+            gr.HTML('<div style="height: 10px;"></div>')
+            gr.Markdown("## Failure report for each agent")
+            gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
+            gr.HTML('<div style="height: 10px;"></div>')
+            with gr.Row():
+                with gr.Column(scale=1):
+                    failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+            gr.HTML('<div style="height: 10px;"></div>')
+            with gr.Row():
+                with gr.Column(scale=1):
+                    failure_categories_overview = gr.Markdown()
+
+                with gr.Column(scale=1):
+                    failure_categories_chart = gr.Plot()
+
+            # Initialize the failure report agent dropdown with all agents
+            demo.load(update_agent_dropdown,
+                      inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                      outputs=[failure_report_agent_dropdown])
+
+            # Update failure report when agent is selected
+            failure_report_agent_dropdown.change(update_failure_report,
+                                                 inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
+                                                 outputs=[failure_categories_overview, failure_categories_chart])
+
+            gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown("## Task overview")
+            gr.HTML('<div style="height: 10px;"></div>')
+            with gr.Row():
+                with gr.Column(scale=1):
+                    agent_dropdown = gr.Dropdown(label="Select Agent")
+                with gr.Column(scale=1):
+                    task_dropdown = gr.Dropdown(label="Select SWE-bench Verified Task")
+            gr.HTML('<div style="height: 10px;"></div>')
+            with gr.Row():
+                task_overview = gr.Markdown()
+            with gr.Row():
+                flow_chart = gr.Plot(label="Task Flow")
+
+            # Initialize the agent dropdown with the best agent
+            demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+            demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+            agent_dropdown.change(update_task_analysis,
+                                  inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
+                                  outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+            task_dropdown.change(update_task_details,
+                                 inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
+                                 outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
 
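update_failure_report is wired above but defined elsewhere in app.py. A sketch of its assumed shape — the report structure and its keys are guesses for illustration, not the app's actual schema (get_failure_report is the helper from the earlier hunk):

import plotly.express as px

def update_failure_report_sketch(agent_name, benchmark_name):
    report = get_failure_report(agent_name, benchmark_name)  # defined earlier in app.py
    if not report:
        return "No failure report available.", None
    cats = report["failure_categories"]  # assumed: [{name, description, count}, ...]
    overview = "\n\n".join(f"**{c['name']}**: {c['description']}" for c in cats)
    chart = px.bar(x=[c["name"] for c in cats], y=[c["count"] for c in cats],
                   labels={"x": "Failure category", "y": "Number of tasks"})
    return overview, chart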
         gr.Markdown("## Raw predictions")
+        gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.')
+        with gr.Accordion("Expand to inspect raw predictions of agents...", open=False):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    raw_agent_dropdown = gr.Dropdown(label="Select Agent")
+                with gr.Column(scale=1):
+                    raw_task_dropdown = gr.Dropdown(label="Select Task")
+                with gr.Column(scale=1):
+                    raw_step_dropdown = gr.Dropdown(label="Select Step")
+            with gr.Row():
+                raw_call_details = gr.HTML()
+
+            def update_raw_task_dropdown(agent_name):
+                analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
+                if not analyzed_traces:
+                    return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
+                task_ids = list(analyzed_traces.keys())
+                steps = analyzed_traces[task_ids[0]]['steps']
+                return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "swebench_verified")[task_ids[0]]['steps'][0], 0)
+
+            def update_raw_step_dropdown(agent_name, task_id):
+                analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
+                if not analyzed_traces or task_id not in analyzed_traces:
+                    return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
+                steps = analyzed_traces[task_id]['steps']
+                return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
+
+            def update_raw_call_details(agent_name, task_id, step_index):
+                analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
+                if not analyzed_traces or task_id not in analyzed_traces:
+                    return "No data available for this selection."
+                steps = analyzed_traces[task_id]['steps']
+                if step_index is None:
+                    return "Invalid step selection."
+                step = steps[step_index]
+                return format_call_info(step, step_index)
+
+            # Initialize the raw agent dropdown with all agents
+            demo.load(update_agent_dropdown,
+                      inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                      outputs=[raw_agent_dropdown])
+            demo.load(update_raw_task_dropdown,
+                      inputs=[raw_agent_dropdown],
+                      outputs=[raw_task_dropdown, raw_step_dropdown])
+            demo.load(update_raw_call_details,
+                      inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                      outputs=[raw_call_details])
+
+            raw_agent_dropdown.change(update_raw_task_dropdown,
+                                      inputs=[raw_agent_dropdown],
+                                      outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
+            raw_task_dropdown.change(update_raw_step_dropdown,
+                                     inputs=[raw_agent_dropdown, raw_task_dropdown],
+                                     outputs=[raw_step_dropdown, raw_call_details])
+            raw_step_dropdown.change(update_raw_call_details,
+                                     inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                                     outputs=[raw_call_details])
+
+
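format_call_info is also defined outside this diff. A hypothetical rendering helper consistent with how it is called above — the step keys (prompt, response, token counts) are assumptions about the trace schema, not its documented shape:

import html

def format_call_info_sketch(step: dict, step_index: int) -> str:
    # Assumed step keys; the real trace schema is not part of this diff.
    prompt = html.escape(str(step.get("prompt", "")))
    response = html.escape(str(step.get("response", "")))
    tokens_in = step.get("input_tokens", 0)
    tokens_out = step.get("output_tokens", 0)
    return (f"<h3>Step {step_index + 1}</h3>"
            f"<pre>{prompt}</pre><pre>{response}</pre>"
            f"<p>Tokens: {tokens_in} in / {tokens_out} out</p>")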
+    with gr.Tab("SWE-bench Lite"):
         gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Lite is a subset of 300 tasks of the original SWE-bench. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
+                    value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), ci_metrics=["Accuracy", "Total Cost"]),
                     select_columns=SelectColumns(
                         default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
                         cant_deselect=["Agent Name"],
                         label="Select Columns to Display:",
                     ),
+                    hide_columns=config.SWEBENCH_HIDE_COLUMNS,
                     search_columns=config.SWEBENCH_SEARCH_COLUMNS,
                 )
+        gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
         with gr.Row():
+            gr.Markdown("### Accuracy vs. Cost for SWE-bench agents")
+        with gr.Row():
+            scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
+
+        gr.HTML('<div style="height: 30px;"></div>')
         gr.Markdown("## Task success heatmap")
+        gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in SWE-bench are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
         with gr.Row():
             task_success_heatmap = gr.Plot()
         demo.load(
             lambda: create_task_success_heatmap(
                 preprocessor.get_task_success_data('swebench_lite'),
+                'SWE-bench Lite'
             ),
             outputs=[task_success_heatmap]
         )
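create_task_success_heatmap is not shown in this commit either. A sketch that reproduces the sorting described in the markdown above (agents by accuracy, tasks by number of solvers), assuming one row per (agent, task) with a binary success column — the column names are placeholders:

import pandas as pd
import plotly.express as px

def heatmap_sketch(df: pd.DataFrame, benchmark_name: str):
    # df is assumed to hold one row per (agent, task) with a 0/1 'Success' column.
    grid = df.pivot_table(index="Agent Name", columns="Task ID",
                          values="Success", aggfunc="max")  # best run per agent
    agents = grid.mean(axis=1).sort_values(ascending=False).index  # by accuracy
    tasks = grid.sum(axis=0).sort_values(ascending=False).index    # by # of solvers
    fig = px.imshow(grid.loc[agents, tasks],
                    color_continuous_scale=["white", "green"],
                    title=f"Task success heatmap: {benchmark_name}")
    return fig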
 
+        gr.HTML("""
+        <style>
+            .grouped-section {
+                border: 2px solid #dee2e6;  /* color matching unactivated tabs */
+                border-radius: 10px;
+                padding: 30px;
+                margin-top: 40px;
+                margin-bottom: 40px;
+                position: relative;
+            }
 
+            .grouped-section-title {
+                font-size: 1.7em;
+                font-weight: bold;
+                color: #2c3e50;
+                margin-bottom: 20px;
+                padding-bottom: 10px;
+                border-bottom: 2px solid #dee2e6;
+            }
+        </style>
+        """)
+        with gr.Group(elem_classes=["grouped-section"]):
+            gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor")
 
+            gr.HTML('<div style="height: 10px;"></div>')
+            gr.Markdown("## Failure report for each agent")
+            gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
+            gr.HTML('<div style="height: 10px;"></div>')
+            with gr.Row():
+                with gr.Column(scale=1):
+                    failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+            gr.HTML('<div style="height: 10px;"></div>')
+            with gr.Row():
+                with gr.Column(scale=1):
+                    failure_categories_overview = gr.Markdown()
+
+                with gr.Column(scale=1):
+                    failure_categories_chart = gr.Plot()
 
+            # Initialize the failure report agent dropdown with all agents
+            demo.load(update_agent_dropdown,
+                      inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                      outputs=[failure_report_agent_dropdown])
+
+            # Update failure report when agent is selected
+            failure_report_agent_dropdown.change(update_failure_report,
+                                                 inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
+                                                 outputs=[failure_categories_overview, failure_categories_chart])
+
+            gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown("## Task overview")
+            gr.HTML('<div style="height: 10px;"></div>')
+            with gr.Row():
+                with gr.Column(scale=1):
+                    agent_dropdown = gr.Dropdown(label="Select Agent")
+                with gr.Column(scale=1):
+                    task_dropdown = gr.Dropdown(label="Select SWE-bench Lite Task")
+            gr.HTML('<div style="height: 10px;"></div>')
+            with gr.Row():
+                task_overview = gr.Markdown()
+            with gr.Row():
+                flow_chart = gr.Plot(label="Task Flow")
+
+            # Initialize the agent dropdown with the best agent
+            demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+            demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+            agent_dropdown.change(update_task_analysis,
+                                  inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown],
+                                  outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+            task_dropdown.change(update_task_details,
+                                 inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
+                                 outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
 
+        gr.Markdown("## Raw predictions")
+        gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.')
+        with gr.Accordion("Expand to inspect raw predictions of agents...", open=False):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    raw_agent_dropdown = gr.Dropdown(label="Select Agent")
+                with gr.Column(scale=1):
+                    raw_task_dropdown = gr.Dropdown(label="Select Task")
+                with gr.Column(scale=1):
+                    raw_step_dropdown = gr.Dropdown(label="Select Step")
+            with gr.Row():
+                raw_call_details = gr.HTML()
+
+            def update_raw_task_dropdown(agent_name):
+                analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
+                if not analyzed_traces:
+                    return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
+                task_ids = list(analyzed_traces.keys())
+                steps = analyzed_traces[task_ids[0]]['steps']
+                return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "swebench_lite")[task_ids[0]]['steps'][0], 0)
+
+            def update_raw_step_dropdown(agent_name, task_id):
+                analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
+                if not analyzed_traces or task_id not in analyzed_traces:
+                    return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
+                steps = analyzed_traces[task_id]['steps']
+                return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
+
+            def update_raw_call_details(agent_name, task_id, step_index):
+                analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
+                if not analyzed_traces or task_id not in analyzed_traces:
+                    return "No data available for this selection."
+                steps = analyzed_traces[task_id]['steps']
+                if step_index is None:
+                    return "Invalid step selection."
+                step = steps[step_index]
+                return format_call_info(step, step_index)
+
+            # Initialize the raw agent dropdown with all agents
+            demo.load(update_agent_dropdown,
+                      inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                      outputs=[raw_agent_dropdown])
+            demo.load(update_raw_task_dropdown,
+                      inputs=[raw_agent_dropdown],
+                      outputs=[raw_task_dropdown, raw_step_dropdown])
+            demo.load(update_raw_call_details,
+                      inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                      outputs=[raw_call_details])
+
+            raw_agent_dropdown.change(update_raw_task_dropdown,
+                                      inputs=[raw_agent_dropdown],
+                                      outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
+            raw_task_dropdown.change(update_raw_step_dropdown,
+                                     inputs=[raw_agent_dropdown, raw_task_dropdown],
+                                     outputs=[raw_step_dropdown, raw_call_details])
+            raw_step_dropdown.change(update_raw_call_details,
+                                     inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                                     outputs=[raw_call_details])
+
+
+
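A recurring pattern above: gr.Textbox(value=..., visible=False) passed directly inside inputs=[...] acts as a constant argument to the callback, pinning each tab's wiring to its benchmark name. In isolation (a minimal sketch, not app code):

import gradio as gr

def show(benchmark: str) -> str:
    return f"Benchmark: {benchmark}"

with gr.Blocks() as mini:
    out = gr.Markdown()
    # The invisible Textbox never renders; it just pins the callback's argument.
    mini.load(show, inputs=[gr.Textbox(value="swebench_lite", visible=False)], outputs=[out])

# mini.launch()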
     with gr.Tab("MLAgentBench"):
         gr.Markdown("""MLAgentBench is a suite of end-to-end Machine Learning (ML) experimentation tasks, where the agent aims to take a given dataset and a machine learning task description and autonomously develop or improve an ML model. We are currently actively developing this platform and this benchmark is not fully implemented yet. In particular, we only include one agent and a subset of tasks for this benchmark.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
+                    value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench')),
                     select_columns=SelectColumns(
                         default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS + ["Verified"],
                         cant_deselect=["Agent Name"],
                         label="Select Columns to Display:",
                     ),
                     hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
+                    search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
                 )
+        gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
         with gr.Row():
+            gr.Markdown("### Accuracy vs. Cost for MLAgentBench agents")
         with gr.Row():
+            scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench', aggregate=False), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))

+        # gr.HTML('<div style="height: 30px;"></div>')
+        # gr.Markdown("## Task success heatmap")
+        # gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the fewest). For agents that have been run more than once, the run with the highest score is shown.")
+        # with gr.Row():
+        #     task_success_heatmap = gr.Plot()
+        # demo.load(
+        #     lambda: create_task_success_heatmap(
+        #         preprocessor.get_task_success_data('usaco'),
+        #         'USACO'
+        #     ),
+        #     outputs=[task_success_heatmap]
+        # )
 
+        gr.HTML("""
+        <style>
+            .grouped-section {
+                border: 2px solid #dee2e6;  /* color matching unactivated tabs */
+                border-radius: 10px;
+                padding: 30px;
+                margin-top: 40px;
+                margin-bottom: 40px;
+                position: relative;
+            }
+
+            .grouped-section-title {
+                font-size: 1.7em;
+                font-weight: bold;
+                color: #2c3e50;
+                margin-bottom: 20px;
+                padding-bottom: 10px;
+                border-bottom: 2px solid #dee2e6;
+            }
+        </style>
+        """)
+        with gr.Group(elem_classes=["grouped-section"]):
+            gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor")
+
+            # gr.HTML('<div style="height: 10px;"></div>')
+            # gr.Markdown("## Failure report for each agent")
+            # gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
+            # gr.HTML('<div style="height: 10px;"></div>')
+            # with gr.Row():
+            #     with gr.Column(scale=1):
+            #         failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+            # gr.HTML('<div style="height: 10px;"></div>')
+            # with gr.Row():
+            #     with gr.Column(scale=1):
+            #         failure_categories_overview = gr.Markdown()
+
+            #     with gr.Column(scale=1):
+            #         failure_categories_chart = gr.Plot()
+
+            # # Initialize the failure report agent dropdown with all agents
+            # demo.load(update_agent_dropdown,
+            #           inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
+            #           outputs=[failure_report_agent_dropdown])
+
+            # # Update failure report when agent is selected
+            # failure_report_agent_dropdown.change(update_failure_report,
+            #                                      inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
+            #                                      outputs=[failure_categories_overview, failure_categories_chart])
+
+            gr.HTML('<div style="height: 30px;"></div>')
+            gr.Markdown("## Task overview")
+            gr.HTML('<div style="height: 10px;"></div>')
+            with gr.Row():
+                with gr.Column(scale=1):
+                    agent_dropdown = gr.Dropdown(label="Select Agent")
+                with gr.Column(scale=1):
+                    task_dropdown = gr.Dropdown(label="Select MLAgentBench Task")
+            gr.HTML('<div style="height: 10px;"></div>')
+            with gr.Row():
+                task_overview = gr.Markdown()
+            with gr.Row():
+                flow_chart = gr.Plot(label="Task Flow")
+
+            # Initialize the agent dropdown with the best agent
+            demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], outputs=[agent_dropdown])
+            demo.load(update_task_analysis, inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+            agent_dropdown.change(update_task_analysis,
+                                  inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown],
+                                  outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+            task_dropdown.change(update_task_details,
+                                 inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
+                                 outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+
+        gr.Markdown("## Raw predictions")
+        gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.')
+        with gr.Accordion("Expand to inspect raw predictions of agents...", open=False):
+            with gr.Row():
+                with gr.Column(scale=1):
+                    raw_agent_dropdown = gr.Dropdown(label="Select Agent")
+                with gr.Column(scale=1):
+                    raw_task_dropdown = gr.Dropdown(label="Select Task")
+                with gr.Column(scale=1):
+                    raw_step_dropdown = gr.Dropdown(label="Select Step")
+            with gr.Row():
+                raw_call_details = gr.HTML()
+
+            def update_raw_task_dropdown(agent_name):
+                analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
+                if not analyzed_traces:
+                    return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
+                task_ids = list(analyzed_traces.keys())
+                steps = analyzed_traces[task_ids[0]]['steps']
+                return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "mlagentbench")[task_ids[0]]['steps'][0], 0)
+
+            def update_raw_step_dropdown(agent_name, task_id):
+                analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
+                if not analyzed_traces or task_id not in analyzed_traces:
+                    return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
+                steps = analyzed_traces[task_id]['steps']
+                return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
+
+            def update_raw_call_details(agent_name, task_id, step_index):
+                analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
+                if not analyzed_traces or task_id not in analyzed_traces:
+                    return "No data available for this selection."
+                steps = analyzed_traces[task_id]['steps']
+                if step_index is None:
+                    return "Invalid step selection."
+                step = steps[step_index]
+                return format_call_info(step, step_index)
+
+            # Initialize the raw agent dropdown with all agents
+            demo.load(update_agent_dropdown,
+                      inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
+                      outputs=[raw_agent_dropdown])
+            demo.load(update_raw_task_dropdown,
+                      inputs=[raw_agent_dropdown],
+                      outputs=[raw_task_dropdown, raw_step_dropdown])
+            demo.load(update_raw_call_details,
+                      inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                      outputs=[raw_call_details])
+
+            raw_agent_dropdown.change(update_raw_task_dropdown,
+                                      inputs=[raw_agent_dropdown],
+                                      outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
+            raw_task_dropdown.change(update_raw_step_dropdown,
+                                     inputs=[raw_agent_dropdown, raw_task_dropdown],
+                                     outputs=[raw_step_dropdown, raw_call_details])
+            raw_step_dropdown.change(update_raw_call_details,
+                                     inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                                     outputs=[raw_call_details])
+
+
1129 |
+
# with gr.Tab("SWE-Bench Verified"):
|
1130 |
+
# gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. The We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
|
1131 |
+
# with gr.Row():
|
1132 |
+
# with gr.Column(scale=2):
|
1133 |
+
# Leaderboard(
|
1134 |
+
# value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'),
|
1135 |
+
# select_columns=SelectColumns(
|
1136 |
+
# default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
|
1137 |
+
# cant_deselect=["Agent Name"],
|
1138 |
+
# label="Select Columns to Display:",
|
1139 |
+
# ),
|
1140 |
+
# hide_columns=config.SWEBENCH_HIDE_COLUMNS,
|
1141 |
+
# search_columns=config.SWEBENCH_SEARCH_COLUMNS
|
1142 |
+
# )
|
1143 |
+
# gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
|
1144 |
+
# with gr.Row():
|
1145 |
+
# scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
1146 |
|
1147 |
+
# gr.Markdown("")
|
1148 |
+
# gr.Markdown("")
|
1149 |
+
# gr.Markdown("## Task success heatmap")
|
1150 |
+
# with gr.Row():
|
1151 |
+
# task_success_heatmap = gr.Plot()
|
1152 |
+
# demo.load(
|
1153 |
+
# lambda: create_task_success_heatmap(
|
1154 |
+
# preprocessor.get_task_success_data('swebench_verified'),
|
1155 |
+
# 'SWEBench Verified'
|
1156 |
+
# ),
|
1157 |
+
# outputs=[task_success_heatmap]
|
1158 |
+
# )
|
1159 |
|
1160 |
+
# gr.Markdown("")
|
1161 |
+
# gr.Markdown("")
|
1162 |
+
# gr.Markdown("## Failure report for each agent")
|
1163 |
+
# with gr.Row():
|
1164 |
+
# with gr.Column(scale=1):
|
1165 |
+
# failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
1166 |
+
# with gr.Row():
|
1167 |
+
# with gr.Column(scale=1):
|
1168 |
+
# failure_categories_overview = gr.Markdown()
|
1169 |
+
|
1170 |
+
# with gr.Column(scale=1):
|
1171 |
+
# failure_categories_chart = gr.Plot()
|
1172 |
|
1173 |
+
# # Initialize the failure report agent dropdown with all agents
|
1174 |
+
# demo.load(update_agent_dropdown,
|
1175 |
+
# inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
|
1176 |
+
# outputs=[failure_report_agent_dropdown])
|
1177 |
+
|
1178 |
+
# # Update failure report when agent is selected
|
1179 |
+
# failure_report_agent_dropdown.change(update_failure_report,
|
1180 |
+
# inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
|
1181 |
+
# outputs=[failure_categories_overview, failure_categories_chart])
|
1182 |
+
|
1183 |
+
# gr.Markdown("")
|
1184 |
+
# gr.Markdown("")
|
1185 |
+
# gr.Markdown("## Agent monitor")
|
1186 |
+
# with gr.Row():
|
1187 |
+
# with gr.Column(scale=1):
|
1188 |
+
# agent_dropdown = gr.Dropdown(label="Select Agent")
|
1189 |
+
# with gr.Column(scale=1):
|
1190 |
+
# task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
|
1191 |
+
# with gr.Row():
|
1192 |
+
# task_overview = gr.Markdown()
|
1193 |
+
# with gr.Row():
|
1194 |
+
# flow_chart = gr.Plot(label="Task Flow")
|
1195 |
+
|
1196 |
+
# # Initialize the agent dropdown with the best agent
|
1197 |
+
# demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
|
1198 |
+
# demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
|
1199 |
+
|
1200 |
+
# agent_dropdown.change(update_task_analysis,
|
1201 |
+
# inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
|
1202 |
+
# outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
|
1203 |
+
# task_dropdown.change(update_task_details,
|
1204 |
+
# inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
|
1205 |
+
# outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
|
1206 |
+
|
1207 |
+
# gr.Markdown("## Raw predictions")
|
1208 |
+
# with gr.Row():
|
1209 |
+
# with gr.Column(scale=1):
|
1210 |
+
# raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
1211 |
+
# with gr.Column(scale=1):
|
1212 |
+
# raw_task_dropdown = gr.Dropdown(label="Select Task")
|
1213 |
+
# with gr.Column(scale=1):
|
1214 |
+
# raw_step_dropdown = gr.Dropdown(label="Select Step")
|
1215 |
+
|
1216 |
+
# with gr.Row():
|
1217 |
+
# raw_call_details = gr.HTML()
|
1218 |
+
|
1219 |
+
# def update_raw_task_dropdown(agent_name):
|
1220 |
+
# analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
|
1221 |
+
# if not analyzed_traces:
|
1222 |
+
# return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
|
1223 |
+
# task_ids = list(analyzed_traces.keys())
|
1224 |
+
# steps = analyzed_traces[task_ids[0]]['steps']
|
1225 |
+
# return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
|
1226 |
+
|
1227 |
+
# def update_raw_step_dropdown(agent_name, task_id):
|
1228 |
+
# analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
|
1229 |
+
# if not analyzed_traces or task_id not in analyzed_traces:
|
1230 |
+
# return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
|
1231 |
+
# steps = analyzed_traces[task_id]['steps']
|
1232 |
+
# return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
|
1233 |
+
|
1234 |
+
# def update_raw_call_details(agent_name, task_id, step_index):
|
1235 |
+
# analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
|
1236 |
+
# if not analyzed_traces or task_id not in analyzed_traces:
|
1237 |
+
# return "No data available for this selection."
|
1238 |
+
# steps = analyzed_traces[task_id]['steps']
|
1239 |
+
# if step_index is None:
|
1240 |
+
# return "Invalid step selection."
|
1241 |
+
# step = steps[step_index]
|
1242 |
+
# return format_call_info(step, step_index)
|
1243 |
+
|
1244 |
+
# # Initialize the raw agent dropdown with all agents
|
1245 |
+
# demo.load(update_agent_dropdown,
|
1246 |
+
# inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
|
1247 |
+
# outputs=[raw_agent_dropdown])
|
1248 |
+
# demo.load(update_raw_task_dropdown,
|
1249 |
+
# inputs=[raw_agent_dropdown],
|
1250 |
+
# outputs=[raw_task_dropdown, raw_step_dropdown])
|
1251 |
+
# demo.load(update_raw_call_details,
|
1252 |
+
# inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
1253 |
+
# outputs=[raw_call_details])
|
1254 |
+
|
1255 |
+
# raw_agent_dropdown.change(update_raw_task_dropdown,
|
1256 |
+
# inputs=[raw_agent_dropdown],
|
1257 |
+
# outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
|
1258 |
+
# raw_task_dropdown.change(update_raw_step_dropdown,
|
1259 |
+
# inputs=[raw_agent_dropdown, raw_task_dropdown],
|
1260 |
+
# outputs=[raw_step_dropdown, raw_call_details])
|
1261 |
+
# raw_step_dropdown.change(update_raw_call_details,
|
1262 |
+
# inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
1263 |
+
# outputs=[raw_call_details])
|
1264 |
+
|
1265 |
+
# with gr.Tab("SWE-Bench Lite"):
|
1266 |
+
# gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Lite is a subset of 300 tasks of the original SWE-bench. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
|
1267 |
+
# with gr.Row():
|
1268 |
+
# with gr.Column(scale=2):
|
1269 |
+
# Leaderboard(
|
1270 |
+
# value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), ci_metrics=['Accuracy', 'Total Cost']),
|
1271 |
+
# select_columns=SelectColumns(
|
1272 |
+
# default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
|
1273 |
+
# cant_deselect=["Agent Name"],
|
1274 |
+
# label="Select Columns to Display:",
|
1275 |
+
# ),
|
1276 |
+
# search_columns=config.SWEBENCH_SEARCH_COLUMNS,
|
1277 |
+
# hide_columns=config.SWEBENCH_HIDE_COLUMNS
|
1278 |
+
# )
|
1279 |
+
# # make right aligned markdown
|
1280 |
+
# gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
|
1281 |
+
# with gr.Row():
|
1282 |
+
# scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite', aggregate=True), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
|
1283 |
+
|
1284 |
+
# gr.Markdown("")
|
1285 |
+
# gr.Markdown("")
|
1286 |
+
# gr.Markdown("## Task success heatmap")
|
1287 |
+
# with gr.Row():
|
1288 |
+
# task_success_heatmap = gr.Plot()
|
1289 |
+
# demo.load(
|
1290 |
+
# lambda: create_task_success_heatmap(
|
1291 |
+
# preprocessor.get_task_success_data('swebench_lite'),
|
1292 |
+
# 'SWEBench Lite'
|
1293 |
+
# ),
|
1294 |
+
# outputs=[task_success_heatmap]
|
1295 |
+
# )
|
1296 |
+
|
1297 |
+
# gr.Markdown("")
|
1298 |
+
# gr.Markdown("")
|
1299 |
+
# gr.Markdown("## Failure report for each agent")
|
1300 |
+
# with gr.Row():
|
1301 |
+
# with gr.Column(scale=1):
|
1302 |
+
# failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
|
1303 |
+
# with gr.Row():
|
1304 |
+
# with gr.Column(scale=1):
|
1305 |
+
# failure_categories_overview = gr.Markdown()
|
1306 |
+
|
1307 |
+
# with gr.Column(scale=1):
|
1308 |
+
# failure_categories_chart = gr.Plot()
|
1309 |
|
1310 |
+
# # Initialize the failure report agent dropdown with all agents
|
1311 |
+
# demo.load(update_agent_dropdown,
|
1312 |
+
# inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
|
1313 |
+
# outputs=[failure_report_agent_dropdown])
|
1314 |
+
+    #     # Update failure report when agent is selected
+    #     failure_report_agent_dropdown.change(update_failure_report,
+    #                                          inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
+    #                                          outputs=[failure_categories_overview, failure_categories_chart])
+
+    #     gr.Markdown("")
+    #     gr.Markdown("")
+    #     gr.Markdown("## Agent monitor")
+    #     with gr.Row():
+    #         with gr.Column(scale=1):
+    #             agent_dropdown = gr.Dropdown(label="Select Agent")
+    #         with gr.Column(scale=1):
+    #             task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
+    #     with gr.Row():
+    #         task_overview = gr.Markdown()
+    #     with gr.Row():
+    #         flow_chart = gr.Plot(label="Task Flow")
+
+    #     # Initialize the agent dropdown with the best agent
+    #     demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+    #     demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+    #     agent_dropdown.change(update_task_analysis,
+    #                           inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown],
+    #                           outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+    #     task_dropdown.change(update_task_details,
+    #                          inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
+    #                          outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+
+    #     gr.Markdown("## Raw predictions")
+    #     with gr.Row():
+    #         with gr.Column(scale=1):
+    #             raw_agent_dropdown = gr.Dropdown(label="Select Agent")
+    #         with gr.Column(scale=1):
+    #             raw_task_dropdown = gr.Dropdown(label="Select Task")
+    #         with gr.Column(scale=1):
+    #             raw_step_dropdown = gr.Dropdown(label="Select Step")
+
+    #     with gr.Row():
+    #         raw_call_details = gr.HTML()
+
+    #     def update_raw_task_dropdown(agent_name):
+    #         analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
+    #         if not analyzed_traces:
+    #             return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
+    #         task_ids = list(analyzed_traces.keys())
+    #         steps = analyzed_traces[task_ids[0]]['steps']
+    #         return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
+
+    #     def update_raw_step_dropdown(agent_name, task_id):
+    #         analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
+    #         if not analyzed_traces or task_id not in analyzed_traces:
+    #             return gr.Dropdown(choices=[], label="Select Step"), "No data available."
+    #         steps = analyzed_traces[task_id]['steps']
+    #         return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
+
+    #     def update_raw_call_details(agent_name, task_id, step_index):
+    #         analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
+    #         if not analyzed_traces or task_id not in analyzed_traces:
+    #             return "No data available for this selection."
+    #         steps = analyzed_traces[task_id]['steps']
+    #         if step_index is None:
+    #             return "Invalid step selection."
+    #         step = steps[step_index]
+    #         return format_call_info(step, step_index)
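+    #     # analyzed_traces maps task_id -> {'steps': [...]}; format_call_info (defined
+    #     # elsewhere in this file) renders a single step as the HTML shown below.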
+
+    #     # Initialize the raw agent dropdown with all agents
+    #     demo.load(update_agent_dropdown,
+    #               inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+    #               outputs=[raw_agent_dropdown])
+    #     demo.load(update_raw_task_dropdown,
+    #               inputs=[raw_agent_dropdown],
+    #               outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
+    #     demo.load(update_raw_call_details,
+    #               inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+    #               outputs=[raw_call_details])
+
+    #     raw_agent_dropdown.change(update_raw_task_dropdown,
+    #                               inputs=[raw_agent_dropdown],
+    #                               outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
+    #     raw_task_dropdown.change(update_raw_step_dropdown,
+    #                              inputs=[raw_agent_dropdown, raw_task_dropdown],
+    #                              outputs=[raw_step_dropdown, raw_call_details])
+    #     raw_step_dropdown.change(update_raw_call_details,
+    #                              inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+    #                              outputs=[raw_call_details])
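+    #     # The dropdowns cascade: picking an agent repopulates tasks and steps, and
+    #     # every change re-renders the raw call details pane.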
+
+    # with gr.Tab("MLAgentBench"):
+    #     gr.Markdown("""MLAgentBench is a suite of end-to-end Machine Learning (ML) experimentation tasks, where the agent takes a given dataset and an ML task description and autonomously develops or improves an ML model. We are actively developing this platform, and this benchmark is not fully implemented yet. In particular, we only include one agent and a subset of tasks for this benchmark.""")
+    #     with gr.Row():
+    #         with gr.Column(scale=2):
+    #             Leaderboard(
+    #                 value=parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'),
+    #                 select_columns=SelectColumns(
+    #                     default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS + ["Verified"],
+    #                     cant_deselect=["Agent Name"],
+    #                     label="Select Columns to Display:",
+    #                 ),
+    #                 search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
+    #                 hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
+    #             )
+    #         gr.Markdown("""*95% CIs calculated using Student's t-distribution.*""", elem_classes=["text-right"])
+    #     with gr.Row():
+    #         scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench', aggregate=False), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
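+    #     # aggregate=False presumably keeps each run as its own point rather than
+    #     # averaging per agent; relevant here, since this benchmark has a single agent.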
+
+    #     gr.Markdown("")
+    #     gr.Markdown("")
+    #     gr.Markdown("## Failure report for each agent")
+    #     with gr.Row():
+    #         with gr.Column(scale=1):
+    #             failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+    #     with gr.Row():
+    #         with gr.Column(scale=1):
+    #             failure_categories_overview = gr.Markdown()
+    #         with gr.Column(scale=1):
+    #             failure_categories_chart = gr.Plot()
+
+    #     # Initialize the failure report agent dropdown with all agents
+    #     demo.load(update_agent_dropdown,
+    #               inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
+    #               outputs=[failure_report_agent_dropdown])
+
+    #     # Update failure report when agent is selected
+    #     failure_report_agent_dropdown.change(update_failure_report,
+    #                                          inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
+    #                                          outputs=[failure_categories_overview, failure_categories_chart])
+
+    #     gr.Markdown("")
+    #     gr.Markdown("")
+    #     gr.Markdown("## Agent monitor")
+    #     with gr.Row():
+    #         with gr.Column(scale=1):
+    #             agent_dropdown = gr.Dropdown(label="Select Agent")
+    #         with gr.Column(scale=1):
+    #             task_dropdown = gr.Dropdown(label="Select MLAgentBench Task")
+    #     with gr.Row():
+    #         task_overview = gr.Markdown()
+    #     with gr.Row():
+    #         flow_chart = gr.Plot(label="Task Flow")
+
+    #     # Initialize the agent dropdown with the best agent
+    #     demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], outputs=[agent_dropdown])
+    #     demo.load(update_task_analysis, inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+    #     agent_dropdown.change(update_task_analysis,
+    #                           inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown],
+    #                           outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+    #     task_dropdown.change(update_task_details,
+    #                          inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
+    #                          outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+
# gr.Markdown("## Raw predictions")
|
1472 |
+
# with gr.Row():
|
1473 |
+
# with gr.Column(scale=1):
|
1474 |
+
# raw_agent_dropdown = gr.Dropdown(label="Select Agent")
|
1475 |
+
# with gr.Column(scale=1):
|
1476 |
+
# raw_task_dropdown = gr.Dropdown(label="Select Task")
|
1477 |
+
# with gr.Column(scale=1):
|
1478 |
+
# raw_step_dropdown = gr.Dropdown(label="Select Step")
|
1479 |
+
|
1480 |
+
# with gr.Row():
|
1481 |
+
# raw_call_details = gr.HTML()
|
1482 |
|
1483 |
+
# def update_raw_task_dropdown(agent_name):
|
1484 |
+
# analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
|
1485 |
+
# if not analyzed_traces:
|
1486 |
+
# return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
|
1487 |
+
# task_ids = list(analyzed_traces.keys())
|
1488 |
+
# steps = analyzed_traces[task_ids[0]]['steps']
|
1489 |
+
# return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
|
1490 |
+
|
1491 |
+
# def update_raw_step_dropdown(agent_name, task_id):
|
1492 |
+
# analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
|
1493 |
+
# if not analyzed_traces or task_id not in analyzed_traces:
|
1494 |
+
# return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
|
1495 |
+
# steps = analyzed_traces[task_id]['steps']
|
1496 |
+
# return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
|
1497 |
+
|
1498 |
+
# def update_raw_call_details(agent_name, task_id, step_index):
|
1499 |
+
# analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
|
1500 |
+
# if not analyzed_traces or task_id not in analyzed_traces:
|
1501 |
+
# return "No data available for this selection."
|
1502 |
+
# steps = analyzed_traces[task_id]['steps']
|
1503 |
+
# if step_index is None:
|
1504 |
+
# return "Invalid step selection."
|
1505 |
+
# step = steps[step_index]
|
1506 |
+
# return format_call_info(step, step_index)
|
1507 |
+
|
1508 |
+
# # Initialize the raw agent dropdown with all agents
|
1509 |
+
# demo.load(update_agent_dropdown,
|
1510 |
+
# inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
|
1511 |
+
# outputs=[raw_agent_dropdown])
|
1512 |
+
# demo.load(update_raw_task_dropdown,
|
1513 |
+
# inputs=[raw_agent_dropdown],
|
1514 |
+
# outputs=[raw_task_dropdown, raw_step_dropdown])
|
1515 |
+
# demo.load(update_raw_call_details,
|
1516 |
+
# inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
1517 |
+
# outputs=[raw_call_details])
|
1518 |
+
|
1519 |
+
# raw_agent_dropdown.change(update_raw_task_dropdown,
|
1520 |
+
# inputs=[raw_agent_dropdown],
|
1521 |
+
# outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
|
1522 |
+
# raw_task_dropdown.change(update_raw_step_dropdown,
|
1523 |
+
# inputs=[raw_agent_dropdown, raw_task_dropdown],
|
1524 |
+
# outputs=[raw_step_dropdown, raw_call_details])
|
1525 |
+
# raw_step_dropdown.change(update_raw_call_details,
|
1526 |
+
# inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
|
1527 |
+
# outputs=[raw_call_details])
|
1528 |
|
1529 |
|
1530 |
with gr.Tab("About"):
|
|
|
1551 |
# Download the results from the Hugging Face Hub
|
1552 |
# await asyncio.to_thread(download_latest_results)
|
1553 |
|
1554 |
+
# # Check for new uploads and process them
|
1555 |
# await check_and_process_uploads()
|
1556 |
|
1557 |
scheduler = AsyncIOScheduler()
|
1558 |
scheduler.add_job(restart_space, "interval", hours=1)
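+# (The hourly restart is presumably what refreshes the leaderboard data, since the
+# download and upload-processing jobs below are commented out.)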
+# scheduler.add_job(download_latest_results, "interval", hours=1)
# scheduler.add_job(check_and_process_uploads, "interval", hours=1)
scheduler.start()