"""Accuracy metric for the Mathematics Aptitude Test of Heuristics (MATH) dataset.""" |
|
|
|
import datasets |
|
import math_equivalence |
|
|
|
import evaluate |
|
|
|
|
|
_CITATION = """\
@article{hendrycksmath2021,
    title={Measuring Mathematical Problem Solving With the MATH Dataset},
    author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora
            and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
    journal={arXiv preprint arXiv:2103.03874},
    year={2021}
}
"""

_DESCRIPTION = """\
This metric is used to assess performance on the Mathematics Aptitude Test of Heuristics (MATH) dataset.
It first canonicalizes the inputs (e.g., converting "1/2" to "\\frac{1}{2}") and then computes accuracy.
"""

_KWARGS_DESCRIPTION = r"""
Calculates accuracy after canonicalizing inputs.

Args:
    predictions: list of predictions to score. Each prediction
        is a string that contains natural language and LaTeX.
    references: list of references, one per prediction. Each
        reference is a string that contains natural language
        and LaTeX.
Returns:
    accuracy: accuracy after canonicalizing inputs
        (e.g., converting "1/2" to "\\frac{1}{2}")

Examples:
    >>> metric = evaluate.load("competition_math")
    >>> results = metric.compute(references=["\\frac{1}{2}"], predictions=["1/2"])
    >>> print(results)
    {'accuracy': 1.0}
"""


@datasets.utils.file_utils.add_end_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CompetitionMathMetric(evaluate.Metric):
    """Accuracy metric for the MATH dataset."""

    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            homepage="https://github.com/hendrycks/math",
            codebase_urls=["https://github.com/hendrycks/math"],
        )

    def _compute(self, predictions, references):
        """Returns the accuracy of the predictions after canonicalization."""
        n_correct = 0.0
        for prediction, reference in zip(predictions, references):
            # is_equiv canonicalizes both strings before comparing them.
            n_correct += 1.0 if math_equivalence.is_equiv(prediction, reference) else 0.0
        accuracy = n_correct / len(predictions)
        return {
            "accuracy": accuracy,
        }
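

# A minimal usage sketch, assuming the metric class can be instantiated
# directly in this version of `evaluate` (loading it by name via
# `evaluate.load("competition_math")` is the usual path once registered):
if __name__ == "__main__":
    metric = CompetitionMathMetric()
    # "1/2" should canonicalize to "\frac{1}{2}" and count as correct.
    print(metric.compute(predictions=["1/2"], references=["\\frac{1}{2}"]))
    # expected output: {'accuracy': 1.0}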