Ronald Cardenas Acosta committed on
Commit 49bb9a0 · 1 Parent(s): 5e66e89
Files changed (2)
  1. app.py +6 -0
  2. myperplexity.py +195 -0
app.py ADDED
@@ -0,0 +1,6 @@
+ import evaluate
+ from evaluate.utils import launch_gradio_widget
+
+
+ module = evaluate.load("myperplexity", module_type="measurement")
+ launch_gradio_widget(module)
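Note: beyond launching the Gradio widget above, the module can be exercised directly from Python. The sketch below is illustrative only; it assumes it is run from this repository's root so that evaluate.load can resolve myperplexity, and that the hard-coded CACHE_DIR in myperplexity.py (a cluster-specific path) exists or has been edited for the local machine.

# Illustrative sketch (not part of the commit): calling the measurement without the widget.
import evaluate

module = evaluate.load("myperplexity", module_type="measurement")
results = module.compute(
    model_id="gpt2",                      # any causal LM checkpoint
    predictions=["lorem ipsum", "Happy Birthday!"],
    batch_size=2,
    add_start_token=True,
)
print(results["perplexities"])     # one perplexity score per input text
print(results["mean_perplexity"])  # mean over all input texts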
myperplexity.py ADDED
@@ -0,0 +1,195 @@
+ # Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Perplexity Metric."""
+
+ import datasets
+ import numpy as np
+ import torch
+ from torch.nn import CrossEntropyLoss
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ import evaluate
+ from evaluate import logging
+
+ CACHE_DIR = "/gfs/team/nlp/users/rcardena/tools/huggingface"
+
+ _CITATION = """\
+
+ """
+
+ _DESCRIPTION = """
+ Perplexity (PPL) is one of the most common metrics for evaluating language models.
+ It is defined as the exponentiated average negative log-likelihood of a sequence, calculated with exponent base `e`.
+
+ For more information, see https://huggingface.co/docs/transformers/perplexity
+ """
+
+ _KWARGS_DESCRIPTION = """
+ Args:
+     model_id (str): model used for calculating Perplexity
+         NOTE: Perplexity can only be calculated for causal language models.
+         This includes models such as gpt2, causal variations of bert,
+         causal versions of t5, and more (the full list can be found
+         in the AutoModelForCausalLM documentation here:
+         https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForCausalLM )
+
+     predictions (list of str): input text, each separate text snippet
+         is one list entry.
+     batch_size (int): the batch size to run texts through the model. Defaults to 16.
+     add_start_token (bool): whether to add the start token to the texts,
+         so the perplexity can include the probability of the first word. Defaults to True.
+     device (str): device to run on, defaults to 'cuda' when available
+ Returns:
+     perplexity: dictionary containing the perplexity scores for the texts
+         in the input list, as well as the mean perplexity. If one of the input texts is
+         longer than the max input length of the model, then it is truncated to the
+         max length for the perplexity computation.
+ Examples:
+     Example 1:
+         >>> perplexity = evaluate.load("myperplexity", module_type="measurement")
+         >>> input_texts = ["lorem ipsum", "Happy Birthday!", "Bienvenue"]
+         >>> results = perplexity.compute(model_id='gpt2',
+         ...                              add_start_token=False,
+         ...                              predictions=input_texts) # doctest:+ELLIPSIS
+         >>> print(list(results.keys()))
+         ['perplexities', 'mean_perplexity']
+         >>> print(round(results["mean_perplexity"], 0))
+         647.0
+         >>> print(round(results["perplexities"][0], 0))
+         32.0
+
+     Example 2:
+         >>> from datasets import load_dataset
+         >>> perplexity = evaluate.load("myperplexity", module_type="measurement")
+         >>> input_texts = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")["text"][:10] # doctest: +SKIP
+         >>> input_texts = [s for s in input_texts if s != '']
+         >>> results = perplexity.compute(model_id='gpt2',
+         ...                              predictions=input_texts)
+         >>> print(list(results.keys()))
+         ['perplexities', 'mean_perplexity']
+         >>> print(round(results["mean_perplexity"], 2)) # doctest: +SKIP
+         576.76
+         >>> print(round(results["perplexities"][0], 2)) # doctest: +SKIP
+         889.28
+ """
+
+
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+ class MyPerplexity(evaluate.Metric):
+     def _info(self):
+         return evaluate.MetricInfo(
+             module_type="measurement",
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             inputs_description=_KWARGS_DESCRIPTION,
+             features=datasets.Features(
+                 {
+                     "predictions": datasets.Value("string"),
+                 }
+             ),
+             reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
+         )
+
+     def _compute(self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None):
+
+         if device is not None:
+             assert device in ["gpu", "cpu", "cuda"], "device should be either cpu, cuda, or gpu."
+             if device == "gpu":
+                 device = "cuda"
+         else:
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         model = AutoModelForCausalLM.from_pretrained(model_id, cache_dir=CACHE_DIR)
+         model = model.to(device)
+
+         tokenizer = AutoTokenizer.from_pretrained(
+             model_id,
+             cache_dir=CACHE_DIR,
+             use_fast="cnn_dailymail" not in model_id,  # use the slow tokenizer for model ids containing "cnn_dailymail"
+         )
+
+         # if batch_size > 1 (which generally leads to padding being required), and
+         # if there is not an already assigned pad_token, assign an existing
+         # special token to also be the padding token
+         if tokenizer.pad_token is None and batch_size > 1:
+             existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
+             # check that the model already has at least one special token defined
+             assert (
+                 len(existing_special_tokens) > 0
+             ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
+             # assign one of the special tokens to also be the pad token
+             tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
+
+         if add_start_token:
+             # leave room for <BOS> token to be added:
+             assert (
+                 tokenizer.bos_token is not None
+             ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
+             max_tokenized_len = model.config.max_length - 1
+         else:
+             max_tokenized_len = model.config.max_length
+
+         encodings = tokenizer(
+             predictions,
+             add_special_tokens=False,
+             padding=True,
+             truncation=True,
+             max_length=max_tokenized_len,
+             return_tensors="pt",
+             return_attention_mask=True,
+         ).to(device)
+
+         encoded_texts = encodings["input_ids"]
+         attn_masks = encodings["attention_mask"]
+
+         # check that each input is long enough:
+         if add_start_token:
+             assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
+         else:
+             assert torch.all(
+                 torch.ge(attn_masks.sum(1), 2)
+             ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."
+
+         ppls = []
+         loss_fct = CrossEntropyLoss(reduction="none")
+
+         for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
+             end_index = min(start_index + batch_size, len(encoded_texts))
+             encoded_batch = encoded_texts[start_index:end_index]
+             attn_mask = attn_masks[start_index:end_index]
+
+             if add_start_token:
+                 bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
+                 encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
+                 attn_mask = torch.cat(
+                     [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_mask], dim=1
+                 )
+
+             labels = encoded_batch
+
+             with torch.no_grad():
+                 out_logits = model(encoded_batch, attention_mask=attn_mask).logits
+
+             shift_logits = out_logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             shift_attention_mask_batch = attn_mask[..., 1:].contiguous()
+
+             perplexity_batch = torch.exp(
+                 (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
+                 / shift_attention_mask_batch.sum(1)
+             )
+
+             ppls += perplexity_batch.tolist()
+
+         return {"perplexities": ppls, "mean_perplexity": np.mean(ppls)}
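For reference, a minimal sketch of what the perplexity_batch expression above computes for a single text, using made-up token probabilities instead of real model output: the masked sum of per-token cross-entropy losses divided by the number of unmasked target tokens is the average negative log-likelihood, and torch.exp of that is the perplexity described in _DESCRIPTION.

# Illustrative sketch with assumed toy numbers (not model output):
# ppl = exp( mean over target tokens of -log p(token | preceding tokens) )
import math

token_probs = [0.25, 0.10, 0.50, 0.05]      # assumed next-token probabilities for one text
nlls = [-math.log(p) for p in token_probs]  # per-token negative log-likelihoods
ppl = math.exp(sum(nlls) / len(nlls))       # exponentiated average NLL, base e
print(ppl)                                  # ~6.32 for these toy numbers
# This mirrors (loss * mask).sum(1) / mask.sum(1) followed by torch.exp in _compute.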