adding main description; skeleton of code
- anls.py +54 -85
- compute_score.py +38 -0
- requirements.txt +2 -0
anls.py
CHANGED
@@ -14,11 +14,10 @@
 """ANLS - Average Normalized Levenshtein Similarity"""

 import datasets
-import numpy as np
-from sklearn.metrics import mean_absolute_error
-
 import evaluate

+from compute_score import compute_score
+

 _CITATION = """\
 @article{,
@@ -34,105 +33,75 @@ _CITATION = """\
 """

 _DESCRIPTION = """\
-ANLS refer to the average normalized Levenshtein similarity
+ANLS refers to the average normalized Levenshtein similarity.
 """


 _KWARGS_DESCRIPTION = """
+Computes Average Normalized Levenshtein Similarity (ANLS).
 Args:
-    Defines aggregating of multiple output values. Array-like value defines weights used to average errors.
-        "raw_values" : Returns a full set of errors in case of multioutput input.
-        "uniform_average" : Errors of all outputs are averaged with uniform weight.
+    predictions: List of question-answers dictionaries with the following key-values:
+        - 'id': id of the question-answer pair as given in the references (see below)
+        - 'prediction_text': the text of the answer
+    references: List of question-answers dictionaries with the following key-values:
+        - 'id': id of the question-answer pair (see above),
+        - 'answers': a Dict in the SQuAD dataset format
+            {
+                'text': list of possible texts for the answer, as a list of strings
+                'answer_start': list of start positions for the answer, as a list of ints
+            }
+            Note that answer_start values are not taken into account to compute the metric.
 Returns:
-    If multioutput is "raw_values", then mean absolute percentage error is returned for each output separately. If multioutput is "uniform_average" or an ndarray of weights, then the weighted average of all output errors is returned.
-    MASE output is non-negative floating point. The best value is 0.0.
+    'anls_score': the ANLS score of the predicted answers versus the gold answers
 Examples:
-    >>> training = [5, 0.5, 4, 6, 3, 5, 2]
-    >>> results = mase_metric.compute(predictions=predictions, references=references, training=training)
-    >>> print(results)
-    {'mase': 0.18333333333333335}
-    If you're using multi-dimensional lists, then set the config as follows :
-    >>> mase_metric = evaluate.load("mase", "multilist")
-    >>> predictions = [[0, 2], [-1, 2], [8, -5]]
-    >>> references = [[0.5, 1], [-1, 1], [7, -6]]
-    >>> training = [[0.5, 1], [-1, 1], [7, -6]]
-    >>> results = mase_metric.compute(predictions=predictions, references=references, training=training)
+    >>> predictions = [{'prediction_text': '1976', 'id': '56e10a3be3433e1400422b22'}]
+    >>> references = [{'answers': {'answer_start': [97], 'text': ['1976']}, 'id': '56e10a3be3433e1400422b22'}]
+    >>> anls_metric = evaluate.load("anls")
+    >>> results = anls_metric.compute(predictions=predictions, references=references)
     >>> print(results)
-    >>> results = mase_metric.compute(predictions=predictions, references=references, training=training, multioutput='raw_values')
-    >>> print(results)
-    {'mase': array([0.10526316, 0.28571429])}
-    >>> results = mase_metric.compute(predictions=predictions, references=references, training=training, multioutput=[0.3, 0.7])
-    >>> print(results)
-    {'mase': 0.21935483870967742}
+    {'anls_score': 100.0}
 """


 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class
+class Anls(evaluate.Metric):
     def _info(self):
         return evaluate.MetricInfo(
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            features=datasets.Features(
+            features=datasets.Features(
+                {
+                    "predictions": {"id": datasets.Value("string"), "prediction_text": datasets.Value("string")},
+                    "references": {
+                        "id": datasets.Value("string"),
+                        "answers": datasets.features.Sequence(
+                            {
+                                "text": datasets.Value("string"),
+                                "answer_start": datasets.Value("int32"),
+                            }
+                        ),
+                    },
+                }
+            )
         )

-    def
+    def _compute(self, predictions, references):
+        prediction_dict = {prediction["id"]: prediction["prediction_text"] for prediction in predictions}
+        dataset = [
+            {
+                "paragraphs": [
+                    {
+                        "qas": [
+                            {
+                                "answers": [{"text": answer_text} for answer_text in ref["answers"]["text"]],
+                                "id": ref["id"],
+                            }
+                            for ref in references
+                        ]
+                    }
+                ]
             }
-        predictions,
-        references,
-        training,
-        periodicity=1,
-        sample_weight=None,
-        multioutput="uniform_average",
-    ):
-        y_pred_naive = training[:-periodicity]
-        mae_naive = mean_absolute_error(training[periodicity:], y_pred_naive, multioutput=multioutput)
-        mae_score = mean_absolute_error(
-            references,
-            predictions,
-            sample_weight=sample_weight,
-            multioutput=multioutput,
-        )
-        epsilon = np.finfo(np.float64).eps
-        mase_score = mae_score / np.maximum(mae_naive, epsilon)
-        return {"mase": mase_score}
+        ]
+        score = compute_score(dataset=dataset, predictions=prediction_dict)
+        return score
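For a quick check of the new interface, here is a minimal usage sketch mirroring the docstring example above. It assumes the metric script can be loaded under the name "anls" (for example from this repo or a local path); that name is an assumption, not something this commit sets up.

import evaluate

# Hypothetical load of this metric script; the identifier "anls" is an assumption.
anls_metric = evaluate.load("anls")
predictions = [{"prediction_text": "1976", "id": "56e10a3be3433e1400422b22"}]
references = [{"answers": {"answer_start": [97], "text": ["1976"]}, "id": "56e10a3be3433e1400422b22"}]
results = anls_metric.compute(predictions=predictions, references=references)
print(results)  # per the docstring example: {'anls_score': 100.0}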
compute_score.py
ADDED
@@ -0,0 +1,38 @@
+import sys
+
+from Levenshtein import ratio
+
+
+def anls_compute(prediction, ground_truth):
+    # Normalized Levenshtein similarity between the predicted answer and one gold answer, in [0, 1].
+    return ratio(prediction, ground_truth)
+
+
+def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
+    # A prediction is scored against the gold answer it matches best.
+    return max(metric_fn(prediction, ground_truth) for ground_truth in ground_truths)
+
+
+def compute_score(dataset, predictions):
+    anls_score = total = 0
+    for article in dataset:
+        for paragraph in article["paragraphs"]:
+            for qa in paragraph["qas"]:
+                total += 1
+                if qa["id"] not in predictions:
+                    message = "Unanswered question " + qa["id"] + " will receive score 0."
+                    print(message, file=sys.stderr)
+                    continue
+                ground_truths = list(map(lambda x: x["text"], qa["answers"]))
+                prediction = predictions[qa["id"]]
+                anls_score += metric_max_over_ground_truths(anls_compute, prediction, ground_truths)
+
+    # Average over all questions and scale to a percentage, matching the docstring example.
+    anls_score = 100.0 * anls_score / total
+    return {"anls_score": anls_score}
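The per-question score above uses the raw normalized similarity. Benchmark-style ANLS (e.g. ST-VQA/DocVQA) additionally applies a threshold so that answers that are too dissimilar score zero. That variant is not part of this commit; a hedged sketch of it, reusing Levenshtein.ratio as the similarity, could look like this:

from Levenshtein import ratio


def thresholded_anls(prediction, ground_truths, tau=0.5):
    # Illustrative sketch, not part of the commit: take the best normalized
    # similarity over all gold answers and zero out matches at or below the
    # threshold tau (0.5 is the commonly used value).
    best = max(ratio(prediction, gt) for gt in ground_truths)
    return best if best > tau else 0.0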
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
+git+https://github.com/maxbachmann/python-Levenshtein.git