Commit bac1189 · update info
Parent(s): fff21b1

textgen_evaluator.py  CHANGED  (+41 -10)
@@ -13,6 +13,25 @@ _CITATION = """\
     url = "https://www.aclweb.org/anthology/W04-1013",
     pages = "74--81",
 }
+\
+@INPROCEEDINGS{Papineni02bleu:a,
+    author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
+    title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
+    booktitle = {},
+    year = {2002},
+    pages = {311--318}
+}
+@inproceedings{lin-och-2004-orange,
+    title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
+    author = "Lin, Chin-Yew and
+      Och, Franz Josef",
+    booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
+    month = "aug 23{--}aug 27",
+    year = "2004",
+    address = "Geneva, Switzerland",
+    publisher = "COLING",
+    url = "https://www.aclweb.org/anthology/C04-1072",
+    pages = "501--507",
 """
 
 _DESCRIPTION = """\
@@ -24,31 +43,43 @@ Note that ROUGE is case insensitive, meaning that upper case letters are treated
 
 This metrics is a wrapper around Google Research reimplementation of ROUGE:
 https://github.com/google-research/google-research/tree/master/rouge
+
+BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another.
+Quality is considered to be the correspondence between a machine's output and that of a human: "the closer a machine translation is to a professional human translation, the better it is" -
+this is the central idea behind BLEU. BLEU was one of the first metrics to claim a high correlation with human judgements of quality, and remains one of the most popular automated and inexpensive metrics.
+
+Scores are calculated for individual translated segments (generally sentences) by comparing them with a set of good quality reference translations.
+Those scores are then averaged over the whole corpus to reach an estimate of the translation's overall quality.
+Neither intelligibility nor grammatical correctness is taken into account.
+
 """
 
 _KWARGS_DESCRIPTION = """
-Calculates average rouge scores for a list of hypotheses and references
+Calculates average rouge and bleu scores for a list of hypotheses and references
 Args:
     predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
     references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
-    rouge_types: A list of rouge types to calculate.
-        Valid names:
-        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
-        `"rougeL"`: Longest common subsequence based scoring.
-        `"rougeLSum"`: rougeLsum splits text using `"\n"`.
-        See details in https://github.com/huggingface/datasets/issues/617
-    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
-    use_aggregator: Return aggregates if this is set to True
+
 Returns:
+    ROUGE:{
     rouge1: rouge_1 (precision, recall, f1),
     rouge2: rouge_2 (precision, recall, f1),
     rougeL: rouge_l (precision, recall, f1),
     rougeLsum: rouge_lsum (precision, recall, f1)
+    },
+    BLEU:{
+    'bleu': bleu score,
+    'precisions': geometric mean of n-gram precisions,
+    'brevity_penalty': brevity penalty,
+    'length_ratio': ratio of lengths,
+    'translation_length': translation_length,
+    'reference_length': reference_length
+    }
 """
 
-class
+class TextGenEvaluator(evaluate.Metric):
     def _info(self):
         return evaluate.MetricInfo(
             description=_DESCRIPTION,
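The hunks above only show the updated docstrings and the class header, so for orientation here is a minimal usage sketch. It assumes the script is loaded from a local copy of textgen_evaluator.py via evaluate.load, and that the returned dictionary follows the ROUGE/BLEU layout documented in _KWARGS_DESCRIPTION; neither detail is confirmed by the code visible in this commit.

# Hypothetical usage sketch; the load path and the output layout are assumptions.
import evaluate

# Load the evaluator from a local copy of the script shown in this commit.
metric = evaluate.load("./textgen_evaluator.py")

predictions = ["the cat sat on the mat"]
references = ["the cat is on the mat"]

results = metric.compute(predictions=predictions, references=references)

# Per _KWARGS_DESCRIPTION, expect a ROUGE block (rouge1/rouge2/rougeL/rougeLsum)
# and a BLEU block (bleu, precisions, brevity_penalty, length_ratio,
# translation_length, reference_length).
print(results)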
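The BLEU block in the Returns section lists brevity_penalty and length_ratio alongside the n-gram precisions. As background for reading that output, the standard brevity penalty from Papineni et al. (2002) is written out below as a small illustrative helper; it is not code from this module.

import math

def brevity_penalty(translation_length: int, reference_length: int) -> float:
    # BLEU penalizes candidate translations that are shorter than their
    # references; candidates at least as long as the reference get no penalty.
    if translation_length >= reference_length:
        return 1.0
    return math.exp(1.0 - reference_length / translation_length)

# length_ratio in the BLEU output is conventionally translation_length / reference_length.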
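The second hunk ends at _info, so the _compute method that actually produces these numbers is not visible here. The sketch below shows one plausible shape for it, delegating to the stock rouge and bleu modules from the evaluate library and nesting their results to match the documented return value; this is an assumption about the implementation, not the author's code.

# Sketch only: _compute for this Space is not shown in the diff.
import evaluate


def compute_textgen_scores(predictions, references):
    # Delegate to the stock evaluate modules for the two metric families.
    rouge = evaluate.load("rouge")
    bleu = evaluate.load("bleu")

    rouge_scores = rouge.compute(predictions=predictions, references=references)
    bleu_scores = bleu.compute(predictions=predictions, references=references)

    # Nest the results to mirror the ROUGE/BLEU schema in _KWARGS_DESCRIPTION.
    return {"ROUGE": rouge_scores, "BLEU": bleu_scores}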
|