adding main description; skeleton of code
- anls.py +54 -85
- compute_score.py +38 -0
- requirements.txt +2 -0
anls.py
CHANGED
@@ -14,11 +14,10 @@
 """ANLS - Average Normalized Levenshtein Similarity"""

 import datasets
-import numpy as np
-from sklearn.metrics import mean_absolute_error
-
 import evaluate

+from compute_score import compute_score
+

 _CITATION = """\
 @article{,
@@ -34,105 +33,75 @@ _CITATION = """\
 """

 _DESCRIPTION = """\
-ANLS refer to the average normalized Levenshtein similarity
+ANLS refers to the average normalized Levenshtein similarity.
 """


 _KWARGS_DESCRIPTION = """
+Computes Average Normalized Levenshtein Similarity (ANLS).
 Args:
-    Defines aggregating of multiple output values. Array-like value defines weights used to average errors.
-        "raw_values" : Returns a full set of errors in case of multioutput input.
-        "uniform_average" : Errors of all outputs are averaged with uniform weight.
+    predictions: List of question-answers dictionaries with the following key-values:
+        - 'id': id of the question-answer pair as given in the references (see below)
+        - 'prediction_text': the text of the answer
+    references: List of question-answers dictionaries with the following key-values:
+        - 'id': id of the question-answer pair (see above),
+        - 'answers': a Dict in the SQuAD dataset format
+            {
+                'text': list of possible texts for the answer, as a list of strings
+                'answer_start': list of start positions for the answer, as a list of ints
+            }
+            Note that answer_start values are not taken into account to compute the metric.
 Returns:
-    If multioutput is "raw_values", then mean absolute percentage error is returned for each output separately. If multioutput is "uniform_average" or an ndarray of weights, then the weighted average of all output errors is returned.
-    MASE output is non-negative floating point. The best value is 0.0.
+    'anls_score': the ANLS score of the predicted answers versus the gold answers
 Examples:
-    >>> training = [5, 0.5, 4, 6, 3, 5, 2]
-    >>> results = mase_metric.compute(predictions=predictions, references=references, training=training)
-    >>> print(results)
-    {'mase': 0.18333333333333335}
-    If you're using multi-dimensional lists, then set the config as follows :
-    >>> mase_metric = evaluate.load("mase", "multilist")
-    >>> predictions = [[0, 2], [-1, 2], [8, -5]]
-    >>> references = [[0.5, 1], [-1, 1], [7, -6]]
-    >>> training = [[0.5, 1], [-1, 1], [7, -6]]
-    >>> results = mase_metric.compute(predictions=predictions, references=references, training=training)
+    >>> predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}]
+    >>> references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
+    >>> anls_metric = evaluate.load("anls")
+    >>> results = anls_metric.compute(predictions=predictions, references=references)
     >>> print(results)
-    >>> results = mase_metric.compute(predictions=predictions, references=references, training=training, multioutput='raw_values')
-    >>> print(results)
-    {'mase': array([0.10526316, 0.28571429])}
-    >>> results = mase_metric.compute(predictions=predictions, references=references, training=training, multioutput=[0.3, 0.7])
-    >>> print(results)
-    {'mase': 0.21935483870967742}
+    {'anls_score': 100.0}
 """


 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class
+class Anls(evaluate.Metric):
     def _info(self):
         return evaluate.MetricInfo(
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            features=datasets.Features(
+            features=datasets.Features(
+                {
+                    "predictions": {"id": datasets.Value("string"), "prediction_text": datasets.Value("string")},
+                    "references": {
+                        "id": datasets.Value("string"),
+                        "answers": datasets.features.Sequence(
+                            {
+                                "text": datasets.Value("string"),
+                                "answer_start": datasets.Value("int32"),
+                            }
+                        ),
+                    },
+                }
+            )
         )

-    def
+    def _compute(self, predictions, references):
+        prediction_dict = {prediction["id"]: prediction["prediction_text"] for prediction in predictions}
+        dataset = [
+            {
+                "paragraphs": [
+                    {
+                        "qas": [
+                            {
+                                "answers": [{"text": answer_text} for answer_text in ref["answers"]["text"]],
+                                "id": ref["id"],
+                            }
+                            for ref in references
+                        ]
+                    }
+                ]
             }
-        predictions,
-        references,
-        training,
-        periodicity=1,
-        sample_weight=None,
-        multioutput="uniform_average",
-    ):
-        y_pred_naive = training[:-periodicity]
-        mae_naive = mean_absolute_error(training[periodicity:], y_pred_naive, multioutput=multioutput)
-        mae_score = mean_absolute_error(
-            references,
-            predictions,
-            sample_weight=sample_weight,
-            multioutput=multioutput,
-        )
-        epsilon = np.finfo(np.float64).eps
-        mase_score = mae_score / np.maximum(mae_naive, epsilon)
-        return {"mase": mase_score}
+        ]
+        score = compute_score(dataset=dataset, predictions=prediction_dict)
+        return score
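For a quick check of the new interface, here is a minimal usage sketch mirroring the docstring example above. It assumes the metric script can be loaded under the name "anls" (for example from this repo or a local path); that name is an assumption, not something this commit sets up.

import evaluate

# Hypothetical load of this metric script; the identifier "anls" is an assumption.
anls_metric = evaluate.load("anls")
predictions = [{"prediction_text": "1976", "id": "56e10a3be3433e1400422b22"}]
references = [{"answers": {"answer_start": [97], "text": ["1976"]}, "id": "56e10a3be3433e1400422b22"}]
results = anls_metric.compute(predictions=predictions, references=references)
print(results)  # per the docstring example: {'anls_score': 100.0}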
compute_score.py
ADDED
@@ -0,0 +1,38 @@
+import sys
+
+from Levenshtein import ratio
+
+
+def anls_compute(prediction, ground_truth):
+    # Normalized Levenshtein similarity between the predicted answer and one gold answer, in [0, 1].
+    return ratio(prediction, ground_truth)
+
+
+def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
+    # A prediction is scored against the gold answer it matches best.
+    return max(metric_fn(prediction, ground_truth) for ground_truth in ground_truths)
+
+
+def compute_score(dataset, predictions):
+    anls_score = total = 0
+    for article in dataset:
+        for paragraph in article["paragraphs"]:
+            for qa in paragraph["qas"]:
+                total += 1
+                if qa["id"] not in predictions:
+                    message = "Unanswered question " + qa["id"] + " will receive score 0."
+                    print(message, file=sys.stderr)
+                    continue
+                ground_truths = list(map(lambda x: x["text"], qa["answers"]))
+                prediction = predictions[qa["id"]]
+                anls_score += metric_max_over_ground_truths(anls_compute, prediction, ground_truths)
+
+    # Average over all questions and scale to a percentage, matching the docstring example.
+    anls_score = 100.0 * anls_score / total
+    return {"anls_score": anls_score}
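The per-question score above uses the raw normalized similarity. Benchmark-style ANLS (e.g. ST-VQA/DocVQA) additionally applies a threshold so that answers that are too dissimilar score zero. That variant is not part of this commit; a hedged sketch of it, reusing Levenshtein.ratio as the similarity, could look like this:

from Levenshtein import ratio


def thresholded_anls(prediction, ground_truths, tau=0.5):
    # Illustrative sketch, not part of the commit: take the best normalized
    # similarity over all gold answers and zero out matches at or below the
    # threshold tau (0.5 is the commonly used value).
    best = max(ratio(prediction, gt) for gt in ground_truths)
    return best if best > tau else 0.0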
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+git+https://github.com/maxbachmann/python-Levenshtein.git