writing README.md
Files changed:
- README.md +89 -5
- app.py +1 -1
- compute_score.py +7 -5
- requirements.txt +2 -2
README.md
CHANGED
@@ -8,11 +8,95 @@ sdk_version: 3.17.0
app_file: app.py
pinned: false
---
tags:
- evaluate
- metric
description: >-
  This metric wraps the official scoring script for version 1 of the Average Normalized Levenshtein Similarity (ANLS).

---

# Metric Card for ANLS

## Metric description
This metric wraps the official scoring script for version 1 of the Average Normalized Levenshtein Similarity (ANLS).

The ANLS smoothly captures OCR mistakes by applying a slight penalization to responses that were correctly intended but badly recognized. It also makes use of a threshold of 0.5: the per-answer output is the normalized Levenshtein similarity itself when that similarity is equal to or greater than 0.5, and 0 otherwise. The point of this threshold is to determine whether the answer has been correctly selected but not properly recognized, or whether, on the contrary, the output is a wrong text selected from the options and given as an answer.

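A minimal sketch of this thresholding rule (illustrative only, not the module's code; `similarity` stands for the normalized Levenshtein similarity, i.e. 1 minus the normalized edit distance):

```python
def thresholded_score(similarity, threshold=0.5):
    # Keep the similarity as the score when the prediction is close enough to
    # a ground truth answer; otherwise give no credit at all.
    return similarity if similarity >= threshold else 0.0

print(thresholded_score(0.9))  # 0.9 -> correct answer with minor OCR noise
print(thresholded_score(0.4))  # 0.0 -> counted as a wrong answer
```
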
More formally, the ANLS between the network output and the ground truth answers is given by equation 1, where N is the total number of questions, M is the number of ground truth (GT) answers per question, a<sub>ij</sub> are the ground truth answers (with i = {0, ..., N} and j = {0, ..., M}), and o<sub>qi</sub> is the network's answer to the i<sup>th</sup> question q<sub>i</sub>.

![ANLS equation](https://rrc.cvc.uab.es/files/ANLS.png)

Reference: [Evaluation Metric](https://rrc.cvc.uab.es/?ch=11&com=tasks)

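In case the image does not render, equation 1 can be transcribed into LaTeX roughly as follows (a reconstruction from the description above, not an official rendering; NL denotes the normalized Levenshtein distance and τ = 0.5 the threshold):

```latex
\mathrm{ANLS} = \frac{1}{N} \sum_{i=0}^{N} \left( \max_{j} \, s(a_{ij}, o_{q_i}) \right)
\qquad \text{where} \qquad
s(a_{ij}, o_{q_i}) =
\begin{cases}
1 - \mathrm{NL}(a_{ij}, o_{q_i}) & \text{if } \mathrm{NL}(a_{ij}, o_{q_i}) < \tau \\
0 & \text{if } \mathrm{NL}(a_{ij}, o_{q_i}) \geq \tau
\end{cases}
```
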
## How to use
The metric takes two lists of question-answer dictionaries as inputs: one with the predictions of the model, and the other with the references to compare them against.

_predictions_: list of question-answer dictionaries with the following key-value pairs:

- 'question_id': the id of the question-answer pair, as given in the references (see below)
- 'prediction_text': the text of the answer

_references_: list of question-answer dictionaries with the following key-value pairs:

- 'question_id': the id of the question-answer pair (see above)
- 'answers': the possible texts for the answer, as a list of strings

```python
from evaluate import load
anls_metric = load("anls")
results = anls_metric.compute(predictions=predictions, references=references)
```
## Output values

This metric outputs a dictionary with a single key, 'anls_score', whose value ranges between 0.0 and 1.0:

```
{'anls_score': 1.0}
```

## Examples

```python
from evaluate import load
anls_metric = load("anls")
predictions = [{'question_id': '10285', 'prediction_text': 'Denver Broncos'},
               {'question_id': '18601', 'prediction_text': '12/15/89'},
               {'question_id': '16734', 'prediction_text': 'Dear dr. Lobo'}]

references = [{"answers": ["Denver Broncos", "Denver R. Broncos"], 'question_id': '10285'},
              {'answers': ['12/15/88'], 'question_id': '18601'},
              {'answers': ['Dear Dr. Lobo', 'Dr. Lobo'], 'question_id': '16734'}]
results = anls_metric.compute(predictions=predictions, references=references)
results
{'anls_score': 1.0}
```


## Limitations and bias
This metric works only with datasets that have the same format as specified above.

## Considerations / Assumptions
As specified on the website [Tasks - Document Visual Question Answering](https://rrc.cvc.uab.es/?ch=17&com=tasks); a short sketch after this list shows how the first two assumptions translate into string comparisons:

- Answers are not case sensitive
- Answers are space sensitive
- Answers, or the tokens comprising answers, are not limited to a fixed-size dictionary; they can be any word or token present in the document.

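A minimal sketch of how the first two assumptions show up in the comparison, using the same `Levenshtein.ratio` call as `compute_score.py` below (the helper name is illustrative, not part of the module's API):

```python
from Levenshtein import ratio  # provided by the python-Levenshtein package

def answer_similarity(prediction, answer):
    # Case is ignored by lowercasing both strings before comparison,
    # but whitespace is kept, so spacing differences still reduce similarity.
    return ratio(prediction.lower(), answer.lower())

print(answer_similarity("DENVER BRONCOS", "Denver Broncos"))   # 1.0   -> case ignored
print(answer_similarity("Denver  Broncos", "Denver Broncos"))  # < 1.0 -> extra space penalized
```
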
## Citation

@article{levenshtein1966,
  title   = {Binary codes capable of correcting deletions, insertions, and reversals},
  journal = {Soviet physics doklady},
  volume  = {10},
  number  = {8},
  pages   = {707--710},
  year    = {1966},
  url     = {https://nymity.ch/sybilhunting/pdf/Levenshtein1966a.pdf},
  author  = {V. I. Levenshtein},
}

## Further References

- [The Stanford Question Answering Dataset: Background, Challenges, Progress (blog post)](https://rajpurkar.github.io/mlx/qa-and-squad/)
- [Hugging Face Course -- Question Answering](https://huggingface.co/course/chapter7/7)
app.py
CHANGED
@@ -2,5 +2,5 @@ import evaluate
 from evaluate.utils import launch_gradio_widget


-module = evaluate.load("
+module = evaluate.load("anls")
 launch_gradio_widget(module)
compute_score.py
CHANGED
@@ -4,24 +4,26 @@ from Levenshtein import ratio
 def compute_score(predictions, ground_truths):
     theta = 0.5
     anls_score = 0
+    total = 0
     for qid, prediction in predictions.items():
         max_value = 0
         if qid in ground_truths:
             for x in ground_truths[qid]:
-
+                total += 1
+                nl = ratio(prediction.lower(), x.lower())
                 if nl < theta:
                     score = 1 - nl
                     if score > max_value:
                         max_value = score
         anls_score += max_value

-    return anls_score
+    return anls_score / total


 if __name__ == "__main__":
-    predictions = [{'question_id': '10285', 'prediction_text': 'Denver
-                   {'question_id': '18601', 'prediction_text': '12
-                   {'question_id': '16734', 'prediction_text': '
+    predictions = [{'question_id': '10285', 'prediction_text': 'Denver R.'},
+                   {'question_id': '18601', 'prediction_text': '12'},
+                   {'question_id': '16734', 'prediction_text': 'dear'}]

     references = [{"answers": ["Denver Broncos", "Denver R. Broncos"], 'question_id': '10285'},
                   {'answers': ['12/15/88'], 'question_id': '18601'},
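Note that `compute_score()` iterates over plain dictionaries (`predictions.items()`, `ground_truths[qid]`), whereas the README examples pass lists of dictionaries. A hedged sketch of the kind of reshaping the wrapping metric presumably performs before calling it (variable names are illustrative, not the module's actual code):

```python
# Hypothetical glue code: reshape README-style lists of dicts into the
# {question_id: ...} mappings that compute_score() expects.
predictions = [{'question_id': '10285', 'prediction_text': 'Denver R.'},
               {'question_id': '18601', 'prediction_text': '12'}]
references = [{'answers': ['Denver Broncos', 'Denver R. Broncos'], 'question_id': '10285'},
              {'answers': ['12/15/88'], 'question_id': '18601'}]

pred_map = {p['question_id']: p['prediction_text'] for p in predictions}  # qid -> predicted text
gt_map = {r['question_id']: r['answers'] for r in references}             # qid -> list of answers

# compute_score(pred_map, gt_map) then returns a single float between 0 and 1.
```
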
requirements.txt
CHANGED
@@ -1,2 +1,2 @@
-evaluate
-python-Levenshtein
+git+https://github.com/huggingface/evaluate
+git+https://github.com/maxbachmann/python-Levenshtein.git