HalteroXHunter committed
Commit bac1189 · 1 Parent(s): fff21b1

update info

Files changed (1)
  1. textgen_evaluator.py +41 -10
textgen_evaluator.py CHANGED
@@ -13,6 +13,25 @@ _CITATION = """\
     url = "https://www.aclweb.org/anthology/W04-1013",
     pages = "74--81",
 }
+\
+@INPROCEEDINGS{Papineni02bleu:a,
+    author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
+    title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
+    booktitle = {},
+    year = {2002},
+    pages = {311--318}
+}
+@inproceedings{lin-och-2004-orange,
+    title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
+    author = "Lin, Chin-Yew and
+      Och, Franz Josef",
+    booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
+    month = "aug 23{--}aug 27",
+    year = "2004",
+    address = "Geneva, Switzerland",
+    publisher = "COLING",
+    url = "https://www.aclweb.org/anthology/C04-1072",
+    pages = "501--507",
 """
 
 _DESCRIPTION = """\
@@ -24,31 +43,43 @@ Note that ROUGE is case insensitive, meaning that upper case letters are treated
 
 This metrics is a wrapper around Google Research reimplementation of ROUGE:
 https://github.com/google-research/google-research/tree/master/rouge
+
+BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another.
+Quality is considered to be the correspondence between a machine's output and that of a human: "the closer a machine translation is to a professional human translation, the better it is"
+this is the central idea behind BLEU. BLEU was one of the first metrics to claim a high correlation with human judgements of quality, and remains one of the most popular automated and inexpensive metrics.
+
+Scores are calculated for individual translated segments—generally sentences—by comparing them with a set of good quality reference translations.
+Those scores are then averaged over the whole corpus to reach an estimate of the translation's overall quality.
+Neither intelligibility nor grammatical correctness are not taken into account.
+
 """
 
 _KWARGS_DESCRIPTION = """
-Calculates average rouge scores for a list of hypotheses and references
+Calculates average rouge and bleu scores for a list of hypotheses and references
 Args:
     predictions: list of predictions to score. Each prediction
         should be a string with tokens separated by spaces.
     references: list of reference for each prediction. Each
         reference should be a string with tokens separated by spaces.
-    rouge_types: A list of rouge types to calculate.
-        Valid names:
-        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
-        `"rougeL"`: Longest common subsequence based scoring.
-        `"rougeLSum"`: rougeLsum splits text using `"\n"`.
-        See details in https://github.com/huggingface/datasets/issues/617
-    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
-    use_aggregator: Return aggregates if this is set to True
+
 Returns:
+    ROUGE:{
     rouge1: rouge_1 (precision, recall, f1),
     rouge2: rouge_2 (precision, recall, f1),
     rougeL: rouge_l (precision, recall, f1),
     rougeLsum: rouge_lsum (precision, recall, f1)
+    },
+    BLEU:{
+        'bleu': bleu score,
+        'precisions': geometric mean of n-gram precisions,
+        'brevity_penalty': brevity penalty,
+        'length_ratio': ratio of lengths,
+        'translation_length': translation_length,
+        'reference_length': reference_length
+    }
 """
 
-class TextGenEvaluatorTest(evaluate.Metric):
+class TextGenEvaluator(evaluate.Metric):
     def _info(self):
         return evaluate.MetricInfo(
             description=_DESCRIPTION,
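
For readers unfamiliar with the BLEU fields named in the new Returns block, here is a minimal, self-contained sketch of how those quantities fit together: per-order modified n-gram precisions, their geometric mean, and the brevity penalty. This is an illustrative re-derivation under simplifying assumptions (one reference per prediction; the names bleu_sketch and ngram_counts are invented here), not the code this module actually calls.

# Minimal sketch of the BLEU quantities listed in the Returns block above.
# Illustrative only; not the implementation used by textgen_evaluator.py.
import math
from collections import Counter

def ngram_counts(tokens, n):
    # Count all n-grams of order n in a token list.
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

def bleu_sketch(predictions, references, max_order=4):
    # predictions: list of token lists; references: one token list per prediction.
    matches = [0] * max_order
    possible = [0] * max_order
    pred_len = ref_len = 0
    for pred, ref in zip(predictions, references):
        pred_len += len(pred)
        ref_len += len(ref)
        for n in range(1, max_order + 1):
            pred_ngrams = ngram_counts(pred, n)
            ref_ngrams = ngram_counts(ref, n)
            # Clipped (modified) matches: a predicted n-gram counts at most as
            # often as it appears in the reference.
            matches[n - 1] += sum(min(c, ref_ngrams[ng]) for ng, c in pred_ngrams.items())
            possible[n - 1] += max(len(pred) - n + 1, 0)
    # 'precisions' are the per-order modified precisions; 'bleu' is their
    # geometric mean multiplied by the brevity penalty.
    precisions = [m / p if p > 0 else 0.0 for m, p in zip(matches, possible)]
    geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_order) if min(precisions) > 0 else 0.0
    length_ratio = pred_len / ref_len
    brevity_penalty = 1.0 if length_ratio > 1.0 else math.exp(1.0 - 1.0 / length_ratio)
    return {
        "bleu": geo_mean * brevity_penalty,
        "precisions": precisions,
        "brevity_penalty": brevity_penalty,
        "length_ratio": length_ratio,
        "translation_length": pred_len,
        "reference_length": ref_len,
    }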
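
And a hedged usage sketch of the combined metric itself. The path passed to evaluate.load is an assumption (loading this file as a local metric script); the input format and the shape of the result follow the docstring shown in the diff.

# Hypothetical usage; the local script path given to evaluate.load() is an
# assumption, not something this commit specifies.
import evaluate

metric = evaluate.load("textgen_evaluator.py")

predictions = ["the cat sat on the mat"]
references = ["the cat is sitting on the mat"]

results = metric.compute(predictions=predictions, references=references)
# Per the docstring: ROUGE entries (rouge1/rouge2/rougeL/rougeLsum with
# precision/recall/f1) alongside the BLEU fields (bleu, precisions,
# brevity_penalty, length_ratio, translation_length, reference_length).
print(results)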