benediktstroebl committed on
Commit
19bb306
·
1 Parent(s): 3427022

added timestamp to task summary prompt for failure report and fixed failure report gradio issue

Browse files
Files changed (2) hide show
  1. agent_monitor/failure_report.py +2 -0
  2. app.py +5 -5
agent_monitor/failure_report.py CHANGED
@@ -6,6 +6,7 @@ from pydantic import BaseModel
6
  from abc import ABC, abstractmethod
7
  import json
8
  from typing import Dict, List
 
9
 
10
  class FailureCategory(BaseModel):
11
  category_id: int
@@ -105,6 +106,7 @@ async def summarize_task(task_id, calls, llm_client):
105
  Step {i}:
106
  Input: {call['inputs']}
107
  Output: {call['outputs']}
 
108
  """
109
 
110
  prompt = f"""
 
6
  from abc import ABC, abstractmethod
7
  import json
8
  from typing import Dict, List
9
+ from datetime import datetime
10
 
11
  class FailureCategory(BaseModel):
12
  category_id: int
 
106
  Step {i}:
107
  Input: {call['inputs']}
108
  Output: {call['outputs']}
109
+ Timestamp: {datetime.fromtimestamp(call_data['created_timestamp'])}
110
  """
111
 
112
  prompt = f"""
app.py CHANGED
@@ -216,8 +216,8 @@ def format_call_info(step, step_index):
216
  return formatted_info
217
 
218
 
219
- def update_failure_report(agent_name):
220
- failure_report = get_failure_report(agent_name, "swebench_lite")
221
  if not failure_report:
222
  return "No failure report available for this agent.", None
223
 
@@ -382,7 +382,7 @@ with gr.Blocks() as demo:
382
 
383
  # Update failure report when agent is selected
384
  failure_report_agent_dropdown.change(update_failure_report,
385
- inputs=[failure_report_agent_dropdown],
386
  outputs=[failure_categories_overview, failure_categories_chart])
387
 
388
  gr.Markdown("# Raw Predictions")
@@ -480,7 +480,7 @@ with gr.Blocks() as demo:
480
 
481
  # Update failure report when agent is selected
482
  failure_report_agent_dropdown.change(update_failure_report,
483
- inputs=[failure_report_agent_dropdown],
484
  outputs=[failure_categories_overview, failure_categories_chart])
485
 
486
  gr.Markdown("# Raw Predictions")
@@ -553,7 +553,7 @@ async def main():
553
  preprocess_traces()
554
 
555
  # # Download the results from the Hugging Face Hub
556
- # await asyncio.to_thread(download_latest_results)
557
 
558
  # Check for new uploads and process them
559
  await check_and_process_uploads()
 
216
  return formatted_info
217
 
218
 
219
+ def update_failure_report(agent_name, benchmark_name):
220
+ failure_report = get_failure_report(agent_name, benchmark_name)
221
  if not failure_report:
222
  return "No failure report available for this agent.", None
223
 
 
382
 
383
  # Update failure report when agent is selected
384
  failure_report_agent_dropdown.change(update_failure_report,
385
+ inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
386
  outputs=[failure_categories_overview, failure_categories_chart])
387
 
388
  gr.Markdown("# Raw Predictions")
 
480
 
481
  # Update failure report when agent is selected
482
  failure_report_agent_dropdown.change(update_failure_report,
483
+ inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
484
  outputs=[failure_categories_overview, failure_categories_chart])
485
 
486
  gr.Markdown("# Raw Predictions")
 
553
  preprocess_traces()
554
 
555
  # # Download the results from the Hugging Face Hub
556
+ await asyncio.to_thread(download_latest_results)
557
 
558
  # Check for new uploads and process them
559
  await check_and_process_uploads()