Viona committed on
Commit
6a4fac9
·
1 Parent(s): ced0e09

adding main description; skeleton of code

Browse files
Files changed (3) hide show
  1. anls.py +54 -85
  2. compute_score.py +38 -0
  3. requirements.txt +2 -0
anls.py CHANGED
@@ -14,11 +14,10 @@
14
  """ANLS - Average Normalized Levenshtein Similarity"""
15
 
16
  import datasets
17
- import numpy as np
18
- from sklearn.metrics import mean_absolute_error
19
-
20
  import evaluate
21
 
 
 
22
 
23
  _CITATION = """\
24
  @article{,
@@ -34,105 +33,75 @@ _CITATION = """\
34
  """
35
 
36
  _DESCRIPTION = """\
37
- ANLS refer to the average normalized Levenshtein similarity
38
  """
39
 
40
 
41
  _KWARGS_DESCRIPTION = """
 
42
  Args:
43
- predictions: array-like of shape (n_samples,) or (n_samples, n_outputs)
44
- Estimated target values.
45
- references: array-like of shape (n_samples,) or (n_samples, n_outputs)
46
- Ground truth (correct) target values.
47
- training: array-like of shape (n_train_samples,) or (n_train_samples, n_outputs)
48
- In sample training data for naive forecast.
49
- periodicity: int, default=1
50
- Seasonal periodicity of training data.
51
- sample_weight: array-like of shape (n_samples,), default=None
52
- Sample weights.
53
- multioutput: {"raw_values", "uniform_average"} or array-like of shape (n_outputs,), default="uniform_average"
54
- Defines aggregating of multiple output values. Array-like value defines weights used to average errors.
55
-
56
- "raw_values" : Returns a full set of errors in case of multioutput input.
57
-
58
- "uniform_average" : Errors of all outputs are averaged with uniform weight.
59
-
60
  Returns:
61
- mase : mean absolute scaled error.
62
- If multioutput is "raw_values", then mean absolute percentage error is returned for each output separately. If multioutput is "uniform_average" or an ndarray of weights, then the weighted average of all output errors is returned.
63
- MASE output is non-negative floating point. The best value is 0.0.
64
  Examples:
65
-
66
- >>> mase_metric = evaluate.load("mase")
67
- >>> predictions = [2.5, 0.0, 2, 8, 1.25]
68
- >>> references = [3, -0.5, 2, 7, 2]
69
- >>> training = [5, 0.5, 4, 6, 3, 5, 2]
70
- >>> results = mase_metric.compute(predictions=predictions, references=references, training=training)
71
- >>> print(results)
72
- {'mase': 0.18333333333333335}
73
-
74
- If you're using multi-dimensional lists, then set the config as follows :
75
-
76
- >>> mase_metric = evaluate.load("mase", "multilist")
77
- >>> predictions = [[0, 2], [-1, 2], [8, -5]]
78
- >>> references = [[0.5, 1], [-1, 1], [7, -6]]
79
- >>> training = [[0.5, 1], [-1, 1], [7, -6]]
80
- >>> results = mase_metric.compute(predictions=predictions, references=references, training=training)
81
  >>> print(results)
82
- {'mase': 0.18181818181818182}
83
- >>> results = mase_metric.compute(predictions=predictions, references=references, training=training, multioutput='raw_values')
84
- >>> print(results)
85
- {'mase': array([0.10526316, 0.28571429])}
86
- >>> results = mase_metric.compute(predictions=predictions, references=references, training=training, multioutput=[0.3, 0.7])
87
- >>> print(results)
88
- {'mase': 0.21935483870967742}
89
  """
90
 
91
 
92
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
93
- class Mase(evaluate.Metric):
94
  def _info(self):
95
  return evaluate.MetricInfo(
96
  description=_DESCRIPTION,
97
  citation=_CITATION,
98
  inputs_description=_KWARGS_DESCRIPTION,
99
- features=datasets.Features(self._get_feature_types()),
100
- reference_urls=["https://otexts.com/fpp3/accuracy.html#scaled-errors"],
 
 
 
 
 
 
 
 
 
 
 
 
101
  )
102
 
103
- def _get_feature_types(self):
104
- if self.config_name == "multilist":
105
- return {
106
- "predictions": datasets.Sequence(datasets.Value("float")),
107
- "references": datasets.Sequence(datasets.Value("float")),
108
- }
109
- else:
110
- return {
111
- "predictions": datasets.Value("float"),
112
- "references": datasets.Value("float"),
 
 
 
 
 
113
  }
114
-
115
- def _compute(
116
- self,
117
- predictions,
118
- references,
119
- training,
120
- periodicity=1,
121
- sample_weight=None,
122
- multioutput="uniform_average",
123
- ):
124
-
125
- y_pred_naive = training[:-periodicity]
126
- mae_naive = mean_absolute_error(training[periodicity:], y_pred_naive, multioutput=multioutput)
127
-
128
- mae_score = mean_absolute_error(
129
- references,
130
- predictions,
131
- sample_weight=sample_weight,
132
- multioutput=multioutput,
133
- )
134
-
135
- epsilon = np.finfo(np.float64).eps
136
- mase_score = mae_score / np.maximum(mae_naive, epsilon)
137
-
138
- return {"mase": mase_score}
 
14
  """ANLS - Average Normalized Levenshtein Similarity"""
15
 
16
  import datasets
 
 
 
17
  import evaluate
18
 
19
+ from compute_score import compute_score
20
+
21
 
22
  _CITATION = """\
23
  @article{,
 
33
  """
34
 
35
  _DESCRIPTION = """\
36
+ ANLS refer to the average normalized Levenshtein similarity.
37
  """
38
 
39
 
40
  _KWARGS_DESCRIPTION = """
41
+ Computes Average Normalized Levenshtein Similarity (ANLS).
42
  Args:
43
+ predictions: List of question-answers dictionaries with the following key-values:
44
+ - 'id': id of the question-answer pair as given in the references (see below)
45
+ - 'prediction_text': the text of the answer
46
+ references: List of question-answers dictionaries with the following key-values:
47
+ - 'id': id of the question-answer pair (see above),
48
+ - 'answers': a Dict in the SQuAD dataset format
49
+ {
50
+ 'text': list of possible texts for the answer, as a list of strings
51
+ 'answer_start': list of start positions for the answer, as a list of ints
52
+ }
53
+ Note that answer_start values are not taken into account to compute the metric.
 
 
 
 
 
 
54
  Returns:
55
+ 'anls': The ANLS score of predicted tokens versus the gold answer
 
 
56
  Examples:
57
+ >>> predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}]
58
+ >>> references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
59
+ >>> anls_metric = evaluate.load("anls")
60
+ >>> results = anls_metric.compute(predictions=predictions, references=references)
 
 
 
 
 
 
 
 
 
 
 
 
61
  >>> print(results)
62
+ {'anls_score': 100.0}
 
 
 
 
 
 
63
  """
64
 
65
 
66
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Anls(evaluate.Metric):
    # Metric wrapper: declares the input schema and forwards scoring to
    # `compute_score`. Documented with comments rather than a class docstring
    # because the decorator above assembles the class documentation.

    def _info(self):
        """Return the metric metadata together with the schema of its inputs."""
        prediction_schema = {
            "id": datasets.Value("string"),
            "prediction_text": datasets.Value("string"),
        }
        answers_schema = datasets.features.Sequence(
            {
                "text": datasets.Value("string"),
                "answer_start": datasets.Value("int32"),
            }
        )
        reference_schema = {
            "id": datasets.Value("string"),
            "answers": answers_schema,
        }
        input_features = datasets.Features(
            {
                "predictions": prediction_schema,
                "references": reference_schema,
            }
        )
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=input_features,
        )

    def _compute(self, predictions, references):
        """Reshape the inputs into the nested layout `compute_score` reads and score them.

        Args:
            predictions: items with an "id" and a "prediction_text" entry.
            references: items with an "id" and an "answers" entry whose "text"
                field lists the accepted answer strings.

        Returns:
            Whatever `compute_score` returns for the reshaped inputs.
        """
        # Map question id -> predicted text; a later duplicate id overrides an earlier one.
        text_by_id = {}
        for item in predictions:
            text_by_id[item["id"]] = item["prediction_text"]

        # One question entry per reference, keeping only the answer texts.
        questions = []
        for reference in references:
            accepted = [{"text": text} for text in reference["answers"]["text"]]
            questions.append({"answers": accepted, "id": reference["id"]})

        # Single article holding a single paragraph holding every question.
        nested = [{"paragraphs": [{"qas": questions}]}]
        return compute_score(dataset=nested, predictions=text_by_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
compute_score.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ from collections import Counter
3
+ from Levenshtein import ratio
4
+
5
+
6
def _levenshtein_distance(first, second):
    """Return the unit-cost edit distance (insert, delete, substitute) between two strings."""
    if first == second:
        return 0
    # Iterate over the longer string so the rows kept in memory are the short ones.
    if len(first) < len(second):
        first, second = second, first
    if not second:
        return len(first)

    previous_row = list(range(len(second) + 1))
    for row_index, first_char in enumerate(first, start=1):
        current_row = [row_index]
        for col_index, second_char in enumerate(second, start=1):
            deletion = previous_row[col_index] + 1
            insertion = current_row[col_index - 1] + 1
            substitution = previous_row[col_index - 1] + (first_char != second_char)
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row
    return previous_row[-1]


def _normalize_answer(text):
    """Lower-case the text and collapse every run of whitespace to a single space."""
    return " ".join(text.lower().split())


def anls_compute(prediction, ground_truth, threshold=0.5):
    """Return the normalized Levenshtein similarity of a prediction to its reference(s).

    For each reference the normalized distance is
    ``NL = edit_distance(prediction, reference) / max(len(prediction), len(reference))``
    and the similarity is ``1 - NL`` when ``NL < threshold``, otherwise ``0``.
    Strings are compared case-insensitively with whitespace collapsed.

    Args:
        prediction: predicted answer text.
        ground_truth: one reference string, or an iterable of reference
            strings; with several references the best similarity is returned.
        threshold: normalized-distance cut-off at or above which a prediction
            is treated as wrong and scores 0. Defaults to 0.5.

    Returns:
        A float in [0.0, 1.0]; 0.0 when no reference is given.
    """
    # Accept a bare string as well as a collection of accepted answers.
    if isinstance(ground_truth, str):
        candidates = [ground_truth]
    else:
        candidates = list(ground_truth)

    normalized_prediction = _normalize_answer(prediction)
    best_similarity = 0.0
    for candidate in candidates:
        normalized_truth = _normalize_answer(candidate)
        longest = max(len(normalized_prediction), len(normalized_truth))
        if longest == 0:
            # Two empty strings are identical; avoid dividing by zero.
            normalized_distance = 0.0
        else:
            # The distance is computed locally with plain unit-cost edits.
            # NOTE(review): `Levenshtein.ratio` appears to use a different
            # normalization, so it is not used here; confirm against its docs.
            distance = _levenshtein_distance(normalized_prediction, normalized_truth)
            normalized_distance = distance / longest
        if normalized_distance < threshold:
            best_similarity = max(best_similarity, 1.0 - normalized_distance)
    return best_similarity
17
+
18
+
19
def compute_score(dataset, predictions):
    """Average the per-question ANLS score over every question in ``dataset``.

    Args:
        dataset: iterable of articles; each article has a "paragraphs" list,
            each paragraph a "qas" list, and each question an "id" plus an
            "answers" list of ``{"text": ...}`` dictionaries.
        predictions: mapping from question id to the predicted answer text.

    Returns:
        ``{"anls_score": value}`` where ``value`` is the mean of the best
        per-question scores, expressed as a percentage (0.0 to 100.0) to match
        the metric's documented example output. Questions without a prediction
        count as 0; an empty dataset yields 0.0.
    """
    score_sum = 0.0
    total = 0
    for article in dataset:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                total += 1
                if qa["id"] not in predictions:
                    # Still counted in `total`, so a missing answer lowers the average.
                    message = "Unanswered question " + qa["id"] + " will receive score 0."
                    print(message, file=sys.stderr)
                    continue
                prediction = predictions[qa["id"]]
                # Score against each accepted answer separately and keep the
                # best one; a question with no accepted answers scores 0.
                score_sum += max(
                    (
                        anls_compute(prediction=prediction, ground_truth=answer["text"])
                        for answer in qa["answers"]
                    ),
                    default=0.0,
                )

    # Guard against an empty dataset instead of dividing by zero.
    anls_score = 100.0 * score_sum / total if total else 0.0
    return {"anls_score": anls_score}
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2
+ git+https://github.com/maxbachmann/python-Levenshtein.git