import datasets
import evaluate
import numpy as np

_CITATION = """\
@inproceedings{lin-2004-rouge,
    title = "{ROUGE}: A Package for Automatic Evaluation of Summaries",
    author = "Lin, Chin-Yew",
    booktitle = "Text Summarization Branches Out",
    month = jul,
    year = "2004",
    address = "Barcelona, Spain",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/W04-1013",
    pages = "74--81",
}
@INPROCEEDINGS{Papineni02bleu:a,
    author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
    title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
    booktitle = {},
    year = {2002},
    pages = {311--318}
}
@inproceedings{lin-och-2004-orange,
    title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
    author = "Lin, Chin-Yew and Och, Franz Josef",
    booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
    month = "aug 23{--}aug 27",
    year = "2004",
    address = "Geneva, Switzerland",
    publisher = "COLING",
    url = "https://www.aclweb.org/anthology/C04-1072",
    pages = "501--507",
}
@inproceedings{bert-score,
    title = {BERTScore: Evaluating Text Generation with BERT},
    author = {Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi},
    booktitle = {International Conference on Learning Representations},
    year = {2020},
    url = {https://openreview.net/forum?id=SkeHuCVFDr}
}
@inproceedings{bleurt,
    title = {BLEURT: Learning Robust Metrics for Text Generation},
    author = {Thibault Sellam and Dipanjan Das and Ankur P. Parikh},
    booktitle = {ACL},
    year = {2020},
    url = {https://arxiv.org/abs/2004.04696}
}
"""

_DESCRIPTION = """\
ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for
evaluating automatic summarization and machine translation software in natural language processing.
The metrics compare an automatically produced summary or translation against a reference (or a set of references) produced by humans.
Note that ROUGE is case insensitive, meaning that uppercase letters are treated the same way as lowercase letters.
This metric is a wrapper around the Google Research reimplementation of ROUGE:
https://github.com/google-research/google-research/tree/master/rouge

BLEU (Bilingual Evaluation Understudy) is an algorithm for evaluating the quality of text which has been machine-translated from one natural language to another.
Quality is considered to be the correspondence between a machine's output and that of a human: "the closer a machine translation is to a professional human translation, the better it is".
This is the central idea behind BLEU. BLEU was one of the first metrics to claim a high correlation with human judgements of quality, and remains one of the most popular automated and inexpensive metrics.
Scores are calculated for individual translated segments (generally sentences) by comparing them with a set of good quality reference translations.
Those scores are then averaged over the whole corpus to reach an estimate of the translation's overall quality.
Neither intelligibility nor grammatical correctness is taken into account.

EXACT MATCH returns the rate at which the input predicted strings exactly match their references, ignoring any strings passed in via the regexes_to_ignore list.

BERTScore leverages the pre-trained contextual embeddings from BERT and matches words in candidate and reference
sentences by cosine similarity.
It has been shown to correlate with human judgment on sentence-level and system-level evaluation.
Moreover, BERTScore computes precision, recall, and F1 measure, which can be useful for evaluating different language
generation tasks.
See the project's README at https://github.com/Tiiiger/bert_score#readme for more information.

BLEURT is a learned evaluation metric for Natural Language Generation. It is built using multiple phases of transfer learning, starting from a pretrained BERT model (Devlin et al. 2018)
and then employing another pre-training phase using synthetic data. Finally, it is trained on WMT human annotations. You may run BLEURT out-of-the-box or fine-tune
it for your specific application (the latter is expected to perform better).
See the project's README at https://github.com/google-research/bleurt#readme for more information.

ChrF and ChrF++ are two MT evaluation metrics. They both use the F-score statistic for character n-gram matches,
and ChrF++ additionally includes word n-grams, which correlates more strongly with direct assessment. We use the implementation
that is already present in sacrebleu.
"""

_KWARGS_DESCRIPTION = """
Computes corpus-level ROUGE, BLEU, exact match, BERTScore, BLEURT, and ChrF scores for a list of hypotheses and references.
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of references, one for each prediction. Each
        reference should be a string with tokens separated by spaces.
Returns:
    ROUGE: {
        rouge1: rouge_1 (precision, recall, f1),
        rouge2: rouge_2 (precision, recall, f1),
        rougeL: rouge_l (precision, recall, f1),
        rougeLsum: rouge_lsum (precision, recall, f1)
    },
    BLEU: {
        'bleu': bleu score,
        'precisions': geometric mean of n-gram precisions,
        'brevity_penalty': brevity penalty,
        'length_ratio': ratio of lengths,
        'translation_length': translation_length,
        'reference_length': reference_length
    },
    EXACT_MATCH: {
        "exact_match": exact match rate. Possible values are between 0.0 and 1.0, inclusive.
    },
    BERT_SCORE: {
        "precision": precision, averaged over all predictions and rounded to 4 digits,
        "recall": recall, averaged over all predictions and rounded to 4 digits,
        "f1": F1 score, averaged over all predictions and rounded to 4 digits,
        "hashcode": hashcode of the bert_score library
    },
    BLEURT: {
        "scores": mean BLEURT score over all predictions, rounded to 4 digits
    },
    CHRF: {
        'score' (float): the chrF (chrF++) score,
        'char_order' (int): the character n-gram order,
        'word_order' (int): the word n-gram order. If it equals 2, the metric is referred to as chrF++,
        'beta' (int): determines the importance of recall w.r.t. precision
    }
"""
class GenerationEvaluator(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            codebase_urls=[
                "https://github.com/google-research/google-research/tree/master/rouge"
            ],
            reference_urls=[
                "https://en.wikipedia.org/wiki/ROUGE_(metric)",
                "https://github.com/google-research/google-research/tree/master/rouge",
            ],
        )

    def _compute(self, predictions, references):
        # ROUGE: n-gram and longest-common-subsequence overlap.
        rouge_score = evaluate.load("rouge")
        rouge_results = rouge_score.compute(
            predictions=predictions, references=references
        )

        # BLEU: corpus-level n-gram precision with a brevity penalty.
        bleu_score = evaluate.load("bleu")
        bleu_results = bleu_score.compute(
            predictions=predictions, references=references
        )

        # Exact match: fraction of predictions identical to their reference.
        exact_match_score = evaluate.load("exact_match")
        exact_match_results = exact_match_score.compute(
            predictions=predictions, references=references
        )

        # BERTScore returns per-example precision/recall/F1; average and round
        # them to report corpus-level values.
        bert_score = evaluate.load("bertscore")
        bert_score_results = bert_score.compute(
            predictions=predictions, references=references, lang="en"
        )
        mean_precision = np.mean(bert_score_results["precision"])
        mean_recall = np.mean(bert_score_results["recall"])
        mean_f1 = np.mean(bert_score_results["f1"])
        bert_score_results["precision"] = round(mean_precision, 4)
        bert_score_results["recall"] = round(mean_recall, 4)
        bert_score_results["f1"] = round(mean_f1, 4)

        # BLEURT returns one score per example; report the rounded mean.
        bleurt_score = evaluate.load("bleurt", module_type="metric")
        bleurt_results = bleurt_score.compute(
            predictions=predictions, references=references
        )
        mean_bleurt_score = np.mean(bleurt_results["scores"])
        bleurt_results["scores"] = round(mean_bleurt_score, 4)

        # ChrF: character n-gram F-score (sacrebleu implementation).
        chrf = evaluate.load("chrf")
        chrf_results = chrf.compute(predictions=predictions, references=references)

        return {
            "ROUGE": rouge_results,
            "BLEU": bleu_results,
            "EXACT_MATCH": exact_match_results,
            "BERT_SCORE": bert_score_results,
            "BLEURT": bleurt_results,
            "CHRF": chrf_results,
        }
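
# Minimal usage sketch (not part of the original module): it assumes the optional
# dependencies of the wrapped metrics (rouge_score, bert_score, the bleurt
# package, sacrebleu) are installed, and the example inputs below are made up.
if __name__ == "__main__":
    evaluator = GenerationEvaluator()
    results = evaluator.compute(
        predictions=["hello there general kenobi", "foo bar foobar"],
        references=["hello there general kenobi", "foo bar foobar"],
    )
    print(results)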