|
|
|
|
|
from pathlib import Path |
|
|
|
import click |
|
|
|
from agbenchmark.reports.processing.report_types import Report |
|
|
|
|
|
@click.command()
@click.argument(
    "report_json_file", type=click.Path(exists=True, dir_okay=False, path_type=Path)
)
def print_markdown_report(report_json_file: Path):
    """
    Generates a Markdown report from a given report.json file.

    The Markdown is written to stdout with ``click.echo`` in three parts: a
    header with run metadata, a passed/failed/unreliable summary, and one
    section per challenge listing every attempt.

    :param report_json_file: Path to the report.json file.
    :return: None; the report is printed rather than returned.
    """
    # JSON is UTF-8 by specification; don't depend on the platform's locale.
    report = Report.model_validate_json(report_json_file.read_text(encoding="utf-8"))

    _echo_report_header(report)
    click.echo()  # spacing

    # Fill in missing success rates first: both the summary and the
    # per-challenge sections read `test.metrics.success_percentage`.
    for test in report.tests.values():
        _ensure_success_percentage(test)

    _echo_summary(report.tests.values())
    click.echo()  # spacing

    click.echo("## Challenges")
    for test_name, test in report.tests.items():
        click.echo()  # spacing
        _echo_challenge(test_name, test)


def _format_timestamp(timestamp: str) -> str:
    """Keep the first 16 characters and turn the 'T' into a code-span break.

    The result is meant to be wrapped in backticks by the caller, so the
    replacement "` `" closes one code span and opens the next.
    Presumably the input is an ISO-8601 string ("YYYY-MM-DDTHH:MM...") -- confirm
    against the Report model.
    """
    return timestamp[:16].replace("T", "` `")


def _echo_report_header(report: Report) -> None:
    """Echo the report title and the run metadata bullets."""
    click.echo("# Benchmark Report")
    click.echo(f"- ⌛ **Run time:** `{report.metrics.run_time}`")
    # Two leading spaces nest the timestamps under the "Run time" bullet.
    click.echo(
        f"  - **Started at:** `{_format_timestamp(report.benchmark_start_time)}`"
    )
    if report.completion_time:
        click.echo(
            f"  - **Completed at:** `{_format_timestamp(report.completion_time)}`"
        )
    if report.metrics.total_cost:
        click.echo(f"- 💸 **Total cost:** `${round(report.metrics.total_cost, 2)}`")
    click.echo(
        f"- 🏅 **Highest achieved difficulty:** `{report.metrics.highest_difficulty}`"
    )
    click.echo(f"- ⚙️ **Command:** `{report.command}`")


def _ensure_success_percentage(test) -> None:
    """Compute `test.metrics.success_percentage` from the results if it is unset.

    A test without any results gets 0.0 instead of raising ZeroDivisionError.
    """
    if test.metrics.success_percentage is not None:
        return
    n_results = len(test.results)
    if n_results == 0:
        test.metrics.success_percentage = 0.0
        return
    n_successes = sum(float(r.success or 0) for r in test.results)
    test.metrics.success_percentage = n_successes * 100 / n_results


def _echo_summary(tests) -> None:
    """Echo how many tests passed (100%), failed (0%) or were unreliable."""
    n_successful = n_failed = n_unreliable = 0
    for test in tests:
        if test.metrics.success_percentage == 100.0:
            n_successful += 1
        elif test.metrics.success_percentage == 0.0:
            n_failed += 1
        else:
            n_unreliable += 1

    click.echo("## Summary")
    click.echo(f"- **`{n_successful}` passed** {'✅' * n_successful}")
    click.echo(f"- **`{n_failed}` failed** {'❌' * n_failed}")
    click.echo(f"- **`{n_unreliable}` unreliable** {'⚠️' * n_unreliable}")


def _echo_challenge(test_name: str, test) -> None:
    """Echo one challenge section: heading, facts, task details and attempts.

    Expects `test.metrics.success_percentage` to be a number already (see
    `_ensure_success_percentage`).
    """
    success_percentage = test.metrics.success_percentage
    if not test.metrics.attempted:
        result_indicator = "❔"
    elif success_percentage == 100.0:
        result_indicator = "✅"
    elif success_percentage > 0:
        result_indicator = "⚠️"
    else:
        result_indicator = "❌"

    click.echo(f"### {test_name} {result_indicator}")
    click.echo(f"{test.description}")

    click.echo()  # spacing

    click.echo(f"- **Attempted:** {'Yes 👍' if test.metrics.attempted else 'No 👎'}")
    n_passed = sum(1 for r in test.results if r.success)
    click.echo(
        f"- **Success rate:** {round(success_percentage)}% "
        f"({n_passed}/{len(test.results)})"
    )
    click.echo(f"- **Difficulty:** `{test.difficulty}`")
    click.echo(f"- **Categories:** `{'`, `'.join(test.category)}`")
    click.echo(
        "<details>\n<summary><strong>Task</strong> (click to expand)</summary>\n\n"
        f"{indent('> ', test.task)}\n\n"
        f"Reference answer:\n{indent('> ', test.answer)}\n"
        "</details>"
    )

    click.echo()  # spacing

    click.echo("\n#### Attempts")
    for i, attempt in enumerate(test.results, 1):
        _echo_attempt(i, attempt)


def _echo_attempt(number: int, attempt) -> None:
    """Echo one numbered attempt with its cost, failure reason and steps."""
    # Content nested under an "N. " list item must be indented by the width of
    # that marker, otherwise Markdown ends the list item.
    item_indent = 3 * " "

    verdict = "✅ Passed" if attempt.success else "❌ Failed"
    click.echo(
        f"\n{number}. **{verdict}** "
        f"in **{attempt.run_time}** "
        f"and **{quantify('step', attempt.n_steps)}**\n"
    )
    if attempt.cost is not None:
        click.echo(f"{item_indent}- **Cost:** `${round(attempt.cost, 3)}`")
    if attempt.fail_reason:
        click.echo(
            f"{item_indent}- **Failure reason:**\n"
            # Two more spaces put the block quote inside the bullet above.
            + indent(f"{item_indent}  > ", attempt.fail_reason)
            + "\n"
        )
    if attempt.steps:
        click.echo(
            indent(
                item_indent,
                "<details>\n<summary><strong>Steps</strong></summary>\n",
            )
        )
        for j, step in enumerate(attempt.steps, 1):
            click.echo()
            # Inner indent aligns continuation lines of the step output with
            # its "j. " marker; outer indent nests the step under the attempt.
            click.echo(
                indent(item_indent, f"{j}. {indent(item_indent, step.output, False)}")
            )
        click.echo("\n</details>")
|
|
|
|
|
def indent(indent: str, text: str, prefix_indent: bool = True) -> str:
    """Prefix every line of *text* with the string *indent*.

    :param indent: The prefix to insert (e.g. ``"> "`` or spaces).
    :param text: The text to indent; lines are separated by ``"\\n"``.
    :param prefix_indent: When False, the first line is left unprefixed and only
        the lines after each newline are indented.
    :return: The indented text.
    """
    # Inserting the prefix after every newline covers all lines but the first.
    continuation_indented = text.replace("\n", "\n" + indent)
    if prefix_indent:
        return indent + continuation_indented
    return continuation_indented
|
|
|
|
|
def quantify(noun: str, count: int, plural_suffix: str = "s") -> str:
    """Format *count* followed by *noun*, pluralized unless the count is 1.

    :param noun: The singular noun.
    :param count: The quantity to display.
    :param plural_suffix: Suffix appended to *noun* when *count* is not 1.
    :return: E.g. ``"1 step"`` or ``"3 steps"``.
    """
    suffix = "" if count == 1 else plural_suffix
    return f"{count} {noun}{suffix}"
|
|
|
|
|
if __name__ == "__main__":
    # Called without arguments: the click command decorator on
    # print_markdown_report takes `report_json_file` from the command line.
    print_markdown_report()
|
|