Create translate.py
Browse files- translate.py +487 -0
translate.py
ADDED
@@ -0,0 +1,487 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ctranslate2
|
2 |
+
import functools
|
3 |
+
|
4 |
+
try:
|
5 |
+
from transformers import AutoTokenizer
|
6 |
+
autotokenizer_ok = True
|
7 |
+
except ImportError:
|
8 |
+
AutoTokenizer = object
|
9 |
+
autotokenizer_ok = False
|
10 |
+
|
11 |
+
try:
|
12 |
+
from typing import Literal
|
13 |
+
except ImportError:
|
14 |
+
from typing_extensions import Literal
|
15 |
+
|
16 |
+
from typing import Any, Union, List
|
17 |
+
import os
|
18 |
+
|
19 |
+
from hf_hub_ctranslate2.util import utils as _utils
|
20 |
+
|
21 |
+
|
22 |
+
class CTranslate2ModelfromHuggingfaceHub:
    """CTranslate2 compatibility base class for Translator and Generator.

    Subclasses must assign ``self.ctranslate_class`` *before* calling
    ``__init__`` (it is instantiated here) and must implement ``_forward``
    and ``tokenize_decode``. ``generate`` then chains
    ``tokenize_encode`` -> ``_forward`` -> ``tokenize_decode``.
    """

    def __init__(
        self,
        model_name_or_path: str,
        device: Literal["cpu", "cuda"] = "cuda",
        device_index=0,
        compute_type: Literal["int8_float16", "int8"] = "int8_float16",
        tokenizer: "Union[AutoTokenizer, None]" = None,
        hub_kwargs: Union[dict, None] = None,
        **kwargs: Any,
    ):
        """Resolve the model directory, build the model and select a tokenizer.

        Args:
            model_name_or_path (str): Local directory, or an identifier that
                is handed to ``_utils._download_model``.
            device (Literal[cpu, cuda], optional): Defaults to "cuda".
            device_index (int, optional): Defaults to 0.
            compute_type (Literal[int8_float16, int8], optional): Defaults
                to "int8_float16".
            tokenizer (Union[AutoTokenizer, None], optional): Tokenizer to
                use. When None, one is loaded from the model directory if
                it contains a ``tokenizer.json``.
            hub_kwargs (dict, optional): Extra arguments passed to
                ``_utils._download_model``. The caller's dict is never
                modified. Defaults to None (treated as an empty dict).
            **kwargs (Any, optional): Forwarded to ``self.ctranslate_class``.

        Raises:
            ValueError: If no tokenizer was given and none can be loaded
                from the model directory.
        """
        # adaptions from https://github.com/guillaumekln/faster-whisper
        if os.path.isdir(model_name_or_path):
            model_path = model_name_or_path
        else:
            # Work on a private copy: the fallback below writes into the
            # dict, which must not leak into the caller's object (or into a
            # shared default) and affect later constructions.
            hub_kwargs = dict(hub_kwargs) if hub_kwargs else {}
            try:
                model_path = _utils._download_model(
                    model_name_or_path,
                    hub_kwargs=hub_kwargs,
                    local_files_only=True,
                )
            except Exception:
                # NOTE(review): both attempts pass local_files_only=True, so
                # the model is presumably only ever resolved from the local
                # cache - confirm against `_download_model`.
                hub_kwargs["local_files_only"] = True
                model_path = _utils._download_model(
                    model_name_or_path,
                    hub_kwargs=hub_kwargs,
                    local_files_only=True,
                )

        self.model = self.ctranslate_class(
            model_path,
            device=device,
            device_index=device_index,
            compute_type=compute_type,
            **kwargs,
        )

        if tokenizer is not None:
            self.tokenizer = tokenizer
            return

        if "tokenizer.json" not in os.listdir(model_path):
            raise ValueError(
                "no suitable Tokenizer found. "
                "Please set one via tokenizer=AutoTokenizer.from_pretrained(..) arg."
            )
        if not autotokenizer_ok:
            raise ValueError(
                "`pip install transformers` missing to load AutoTokenizer."
            )
        # `use_fast` is the documented keyword of AutoTokenizer.from_pretrained.
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

    def _forward(self, *args: Any, **kwds: Any) -> Any:
        """Run the wrapped model; must be provided by subclasses."""
        raise NotImplementedError

    def tokenize_encode(self, text, *args, **kwargs):
        """Turn each input string into its list of token strings.

        Extra positional and keyword arguments are accepted but not used.
        """
        return [
            self.tokenizer.convert_ids_to_tokens(self.tokenizer.encode(p))
            for p in text
        ]

    def tokenize_decode(self, tokens_out, *args, **kwargs):
        """Turn model output back into text; must be provided by subclasses."""
        raise NotImplementedError

    def generate(
        self,
        text: Union[str, List[str]],
        encode_kwargs=None,
        decode_kwargs=None,
        *forward_args,
        **forward_kwds: Any,
    ):
        """Tokenize ``text``, run the model and decode the result.

        Args:
            text (Union[str, List[str]]): One input string or a batch.
            encode_kwargs (dict, optional): Keyword arguments for
                ``tokenize_encode``. Defaults to None (no extra arguments).
            decode_kwargs (dict, optional): Keyword arguments for
                ``tokenize_decode``. Defaults to None (no extra arguments).
            *forward_args: Extra positional arguments for ``_forward``.
            **forward_kwds (Any): Extra keyword arguments for ``_forward``.

        Returns:
            The first element of the decoded output when ``text`` was a
            single string, otherwise the decoded output as returned by
            ``tokenize_decode``.
        """
        single_input = isinstance(text, str)
        batch = [text] if single_input else text

        token_list = self.tokenize_encode(batch, **(encode_kwargs or {}))
        tokens_out = self._forward(token_list, *forward_args, **forward_kwds)
        texts_out = self.tokenize_decode(tokens_out, **(decode_kwargs or {}))

        return texts_out[0] if single_input else texts_out
|
101 |
+
|
102 |
+
|
103 |
+
class TranslatorCT2fromHfHub(CTranslate2ModelfromHuggingfaceHub):
    """Text-to-text wrapper around a ``ctranslate2.Translator`` model."""

    def __init__(
        self,
        model_name_or_path: str,
        device: Literal["cpu", "cuda"] = "cuda",
        device_index=0,
        compute_type: Literal["int8_float16", "int8"] = "int8_float16",
        tokenizer: Union[AutoTokenizer, None] = None,
        hub_kwargs: Union[dict, None] = None,
        **kwargs: Any,
    ):
        """for ctranslate2.Translator models

        Args:
            model_name_or_path (str): Local model directory or hub identifier.
            device (Literal[cpu, cuda], optional): Defaults to "cuda".
            device_index (int, optional): Defaults to 0.
            compute_type (Literal[int8_float16, int8], optional): Defaults
                to "int8_float16".
            tokenizer (Union[AutoTokenizer, None], optional): Tokenizer to
                use; resolved by the base class when None. Defaults to None.
            hub_kwargs (dict, optional): Extra download arguments. Defaults
                to None (treated as an empty dict).
            **kwargs (Any, optional): Any additional arguments, forwarded to
                ``ctranslate2.Translator``.
        """
        self.ctranslate_class = ctranslate2.Translator
        super().__init__(
            model_name_or_path,
            device,
            device_index,
            compute_type,
            tokenizer,
            # Hand over a fresh dict so nothing downstream can modify the
            # caller's object.
            dict(hub_kwargs) if hub_kwargs else {},
            **kwargs,
        )

    def _forward(self, *args, **kwds):
        """Delegate to ``ctranslate2.Translator.translate_batch``."""
        return self.model.translate_batch(*args, **kwds)

    def tokenize_decode(self, tokens_out, *args, **kwargs):
        """Decode the first hypothesis of every result into a string.

        Extra arguments are forwarded to ``tokenizer.decode``.
        """
        return [
            self.tokenizer.decode(
                self.tokenizer.convert_tokens_to_ids(result.hypotheses[0]),
                *args,
                **kwargs,
            )
            for result in tokens_out
        ]

    def generate(
        self,
        text: Union[str, List[str]],
        encode_tok_kwargs=None,
        decode_tok_kwargs=None,
        *forward_args,
        **forward_kwds: Any,
    ):
        """Translate one string or a batch of strings.

        ``forward_args`` / ``forward_kwds`` are passed on to
        ``translate_batch``. The keyword names and defaults below are the
        ones listed by the original author; consult the CTranslate2
        documentation for their meaning.

        Args:
            text (Union[str, List[str]]): Input texts
            encode_tok_kwargs (dict, optional): additional kwargs for tokenizer
            decode_tok_kwargs (dict, optional): additional kwargs for tokenizer
            max_batch_size (int, optional): Batch size. Defaults to 0.
            batch_type (str, optional): Defaults to "examples".
            asynchronous (bool, optional): Only False supported. Defaults to False.
            beam_size (int, optional): Defaults to 2.
            patience (float, optional): Defaults to 1.
            num_hypotheses (int, optional): Defaults to 1.
            length_penalty (float, optional): Defaults to 1.
            coverage_penalty (float, optional): Defaults to 0.
            repetition_penalty (float, optional): Defaults to 1.
            no_repeat_ngram_size (int, optional): Defaults to 0.
            disable_unk (bool, optional): Defaults to False.
            suppress_sequences (Optional[List[List[str]]], optional):
                Defaults to None.
            end_token (Optional[Union[str, List[str], List[int]]], optional):
                Defaults to None.
            return_end_token (bool, optional): Defaults to False.
            prefix_bias_beta (float, optional): Defaults to 0.
            max_input_length (int, optional): Defaults to 1024.
            max_decoding_length (int, optional): Defaults to 256.
            min_decoding_length (int, optional): Defaults to 1.
            use_vmap (bool, optional): Defaults to False.
            return_scores (bool, optional): Defaults to False.
            return_attention (bool, optional): Defaults to False.
            return_alternatives (bool, optional): Defaults to False.
            min_alternative_expansion_prob (float, optional): Defaults to 0.
            sampling_topk (int, optional): Defaults to 1.
            sampling_temperature (float, optional): Defaults to 1.
            replace_unknowns (bool, optional): Defaults to False.
            callback (_type_, optional): Defaults to None.

        Returns:
            Union[str, List[str]]: text as output, if list, same len as input
        """
        # The two dicts are passed positionally: mixing keyword
        # `encode_kwargs=` with `*forward_args` would bind the first extra
        # positional argument to `encode_kwargs` as well and raise TypeError.
        return super().generate(
            text,
            encode_tok_kwargs or {},
            decode_tok_kwargs or {},
            *forward_args,
            **forward_kwds,
        )
|
200 |
+
|
201 |
+
|
202 |
+
class MultiLingualTranslatorCT2fromHfHub(CTranslate2ModelfromHuggingfaceHub):
    """``ctranslate2.Translator`` wrapper with per-sample language codes.

    The tokenizer must expose a writable ``src_lang`` attribute and a
    ``lang_code_to_token`` mapping; both are used below.
    """

    def __init__(
        self,
        model_name_or_path: str,
        device: Literal["cpu", "cuda"] = "cuda",
        device_index=0,
        compute_type: Literal["int8_float16", "int8"] = "int8_float16",
        tokenizer: Union[AutoTokenizer, None] = None,
        hub_kwargs: Union[dict, None] = None,
        **kwargs: Any,
    ):
        """for ctranslate2.Translator models

        Args:
            model_name_or_path (str): Local model directory or hub identifier.
            device (Literal[cpu, cuda], optional): Defaults to "cuda".
            device_index (int, optional): Defaults to 0.
            compute_type (Literal[int8_float16, int8], optional): Defaults
                to "int8_float16".
            tokenizer (Union[AutoTokenizer, None], optional): Tokenizer to
                use; resolved by the base class when None. Defaults to None.
            hub_kwargs (dict, optional): Extra download arguments. Defaults
                to None (treated as an empty dict).
            **kwargs (Any, optional): Any additional arguments, forwarded to
                ``ctranslate2.Translator``.
        """
        self.ctranslate_class = ctranslate2.Translator
        super().__init__(
            model_name_or_path,
            device,
            device_index,
            compute_type,
            tokenizer,
            # Hand over a fresh dict so nothing downstream can modify the
            # caller's object.
            dict(hub_kwargs) if hub_kwargs else {},
            **kwargs,
        )

    def _forward(self, *args, **kwds):
        """Translate a batch, prefixing every target with its language token.

        ``tgt_lang`` is removed from ``kwds`` and converted into
        ``target_prefix``; everything else goes to ``translate_batch``.
        """
        target_prefix = [
            [self.tokenizer.lang_code_to_token[lng]] for lng in kwds.pop("tgt_lang")
        ]
        # target_prefix=[['__de__'], ['__fr__']]
        return self.model.translate_batch(*args, **kwds, target_prefix=target_prefix)

    def tokenize_encode(self, text, *args, **kwargs):
        """Tokenize each text after setting the tokenizer's source language.

        Requires a ``src_lang`` keyword holding one language code per text.
        """
        src_lang = kwargs.pop("src_lang")
        tokens = []
        for sample, src_language in zip(text, src_lang):
            # The tokenizer reads the source language from this attribute.
            self.tokenizer.src_lang = src_language
            tokens.append(
                self.tokenizer.convert_ids_to_tokens(self.tokenizer.encode(sample))
            )
        return tokens

    def tokenize_decode(self, tokens_out, *args, **kwargs):
        """Decode the first hypothesis of every result into a string.

        The leading token of each hypothesis is skipped - presumably the
        language token injected through ``target_prefix``.
        """
        return [
            self.tokenizer.decode(
                self.tokenizer.convert_tokens_to_ids(result.hypotheses[0][1:]),
                *args,
                **kwargs,
            )
            for result in tokens_out
        ]

    def generate(
        self,
        text: Union[str, List[str]],
        src_lang: Union[str, List[str]],
        tgt_lang: Union[str, List[str]],
        *forward_args,
        **forward_kwds: Any,
    ):
        """Translate one string or a batch of strings between languages.

        A language given as a single string applies to every input text.
        ``forward_args`` / ``forward_kwds`` are passed on to
        ``translate_batch``. The keyword names and defaults below are the
        ones listed by the original author; consult the CTranslate2
        documentation for their meaning.

        Args:
            text (Union[str, List[str]]): Input texts
            src_lang (Union[str, List[str]]): source language of the Input texts
            tgt_lang (Union[str, List[str]]): target language for outputs
            max_batch_size (int, optional): Batch size. Defaults to 0.
            batch_type (str, optional): Defaults to "examples".
            asynchronous (bool, optional): Only False supported. Defaults to False.
            beam_size (int, optional): Defaults to 2.
            patience (float, optional): Defaults to 1.
            num_hypotheses (int, optional): Defaults to 1.
            length_penalty (float, optional): Defaults to 1.
            coverage_penalty (float, optional): Defaults to 0.
            repetition_penalty (float, optional): Defaults to 1.
            no_repeat_ngram_size (int, optional): Defaults to 0.
            disable_unk (bool, optional): Defaults to False.
            suppress_sequences (Optional[List[List[str]]], optional):
                Defaults to None.
            end_token (Optional[Union[str, List[str], List[int]]], optional):
                Defaults to None.
            return_end_token (bool, optional): Defaults to False.
            prefix_bias_beta (float, optional): Defaults to 0.
            max_input_length (int, optional): Defaults to 1024.
            max_decoding_length (int, optional): Defaults to 256.
            min_decoding_length (int, optional): Defaults to 1.
            use_vmap (bool, optional): Defaults to False.
            return_scores (bool, optional): Defaults to False.
            return_attention (bool, optional): Defaults to False.
            return_alternatives (bool, optional): Defaults to False.
            min_alternative_expansion_prob (float, optional): Defaults to 0.
            sampling_topk (int, optional): Defaults to 1.
            sampling_temperature (float, optional): Defaults to 1.
            replace_unknowns (bool, optional): Defaults to False.
            callback (_type_, optional): Defaults to None.

        Returns:
            Union[str, List[str]]: text as output, if list, same len as input

        Raises:
            ValueError: If the numbers of texts, source languages and
                target languages differ.
        """
        # Normalise to batches first: comparing len() of plain strings would
        # compare character counts, and iterating a language string would
        # yield single characters instead of language codes.
        single_input = isinstance(text, str)
        texts = [text] if single_input else text
        if isinstance(src_lang, str):
            src_lang = [src_lang] * len(texts)
        if isinstance(tgt_lang, str):
            tgt_lang = [tgt_lang] * len(texts)

        if not len(texts) == len(src_lang) == len(tgt_lang):
            raise ValueError(
                f"unequal len: text={len(texts)} src_lang={len(src_lang)} tgt_lang={len(tgt_lang)}"
            )

        forward_kwds["tgt_lang"] = tgt_lang
        # Encode/decode kwargs are passed positionally: mixing keyword
        # `encode_kwargs=` with `*forward_args` would raise TypeError as soon
        # as an extra positional argument is supplied.
        texts_out = super().generate(
            texts, {"src_lang": src_lang}, {}, *forward_args, **forward_kwds
        )
        return texts_out[0] if single_input else texts_out
|
314 |
+
|
315 |
+
|
316 |
+
class EncoderCT2fromHfHub(CTranslate2ModelfromHuggingfaceHub):
    """Wrapper around a ``ctranslate2.Encoder`` model.

    ``generate`` returns the encoder outputs converted with
    ``torch.as_tensor`` (device "cuda") or ``numpy.asarray`` (otherwise).
    """

    def __init__(
        self,
        model_name_or_path: str,
        device: Literal["cpu", "cuda"] = "cuda",
        device_index=0,
        compute_type: Literal["int8_float16", "int8"] = "int8_float16",
        tokenizer: Union[AutoTokenizer, None] = None,
        hub_kwargs: Union[dict, None] = None,
        **kwargs: Any,
    ):
        """for ctranslate2.Encoder models

        Args:
            model_name_or_path (str): Local model directory or hub identifier.
            device (Literal[cpu, cuda], optional): Defaults to "cuda".
            device_index (int, optional): Defaults to 0.
            compute_type (Literal[int8_float16, int8], optional): Defaults
                to "int8_float16".
            tokenizer (Union[AutoTokenizer, None], optional): Tokenizer to
                use; resolved by the base class when None. Defaults to None.
            hub_kwargs (dict, optional): Extra download arguments. Defaults
                to None (treated as an empty dict).
            **kwargs (Any, optional): Any additional arguments, forwarded to
                ``ctranslate2.Encoder``.

        Raises:
            ValueError: If the array library needed for ``device`` (torch
                for "cuda", numpy otherwise) is not installed.
        """
        self.ctranslate_class = ctranslate2.Encoder
        super().__init__(
            model_name_or_path,
            device,
            device_index,
            compute_type,
            tokenizer,
            # Hand over a fresh dict so nothing downstream can modify the
            # caller's object.
            dict(hub_kwargs) if hub_kwargs else {},
            **kwargs,
        )
        self.device = device
        if device == "cuda":
            try:
                import torch
            except ImportError as err:
                raise ValueError(
                    "decoding storageview on CUDA of encoder requires torch"
                ) from err
            self.tensor_decode_method = functools.partial(
                torch.as_tensor, device=device
            )
            self.input_dtype = torch.int32
        else:
            try:
                import numpy as np
            except ImportError as err:
                raise ValueError(
                    "decoding storageview on CPU of encoder requires numpy"
                ) from err
            self.tensor_decode_method = np.asarray

    def _forward(self, features, *args, **kwds):
        """Encode ``features["input_ids"]`` and collect the outputs in a dict.

        Returns:
            dict: ``pooler_output`` and ``last_hidden_state`` converted with
            ``self.tensor_decode_method``, plus the untouched
            ``features["attention_mask"]``.
        """
        encoded = self.model.forward_batch(features["input_ids"], *args, **kwds)
        return dict(
            pooler_output=self.tensor_decode_method(encoded.pooler_output),
            last_hidden_state=self.tensor_decode_method(encoded.last_hidden_state),
            attention_mask=features["attention_mask"],
        )

    def tokenize_encode(self, text, *args, **kwargs):
        """Call the tokenizer on ``text``; extra arguments are not used."""
        return self.tokenizer(text)

    def tokenize_decode(self, tokens_out, *args, **kwargs):
        """Return the encoder outputs unchanged; there is no text to decode."""
        return tokens_out

    def generate(
        self,
        text: Union[str, List[str]],
        encode_tok_kwargs=None,
        decode_tok_kwargs=None,
        *forward_args,
        **forward_kwds: Any,
    ):
        """Encode one string or a batch of strings.

        Args:
            text (Union[str, List[str]]): Input texts. A single string is
                treated as a batch of one.
            encode_tok_kwargs (dict, optional): additional kwargs for tokenizer
            decode_tok_kwargs (dict, optional): additional kwargs for tokenizer
            *forward_args: Extra positional arguments for ``forward_batch``.
            **forward_kwds (Any): Extra keyword arguments for ``forward_batch``.

        Returns:
            dict: The mapping produced by ``_forward`` (keys
            ``pooler_output``, ``last_hidden_state``, ``attention_mask``).
        """
        # Always hand a list to the base class: for a plain string it would
        # otherwise try to take element [0] of the returned dict and fail
        # with KeyError.
        batch = [text] if isinstance(text, str) else text
        # The two dicts are passed positionally: mixing keyword
        # `encode_kwargs=` with `*forward_args` would raise TypeError as soon
        # as an extra positional argument is supplied.
        return super().generate(
            batch,
            encode_tok_kwargs or {},
            decode_tok_kwargs or {},
            *forward_args,
            **forward_kwds,
        )
|
399 |
+
|
400 |
+
|
401 |
+
class GeneratorCT2fromHfHub(CTranslate2ModelfromHuggingfaceHub):
    """Text generation wrapper around a ``ctranslate2.Generator`` model."""

    def __init__(
        self,
        model_name_or_path: str,
        device: Literal["cpu", "cuda"] = "cuda",
        device_index=0,
        compute_type: Literal["int8_float16", "int8"] = "int8_float16",
        tokenizer: Union[AutoTokenizer, None] = None,
        hub_kwargs: Union[dict, None] = None,
        **kwargs: Any,
    ):
        """for ctranslate2.Generator models

        Args:
            model_name_or_path (str): Local model directory or hub identifier.
            device (Literal[cpu, cuda], optional): Defaults to "cuda".
            device_index (int, optional): Defaults to 0.
            compute_type (Literal[int8_float16, int8], optional): Defaults
                to "int8_float16".
            tokenizer (Union[AutoTokenizer, None], optional): Tokenizer to
                use; resolved by the base class when None. Defaults to None.
            hub_kwargs (dict, optional): Extra download arguments. Defaults
                to None (treated as an empty dict).
            **kwargs (Any, optional): Any additional arguments, forwarded to
                ``ctranslate2.Generator``.
        """
        self.ctranslate_class = ctranslate2.Generator
        super().__init__(
            model_name_or_path,
            device,
            device_index,
            compute_type,
            tokenizer,
            # Hand over a fresh dict so nothing downstream can modify the
            # caller's object.
            dict(hub_kwargs) if hub_kwargs else {},
            **kwargs,
        )

    def _forward(self, *args, **kwds):
        """Delegate to ``ctranslate2.Generator.generate_batch``."""
        return self.model.generate_batch(*args, **kwds)

    def tokenize_decode(self, tokens_out, *args, **kwargs):
        """Decode the first id sequence of every result into a string.

        Extra arguments are forwarded to ``tokenizer.decode``.
        """
        return [
            self.tokenizer.decode(result.sequences_ids[0], *args, **kwargs)
            for result in tokens_out
        ]

    def generate(
        self,
        text: Union[str, List[str]],
        encode_tok_kwargs=None,
        decode_tok_kwargs=None,
        *forward_args,
        **forward_kwds: Any,
    ):
        """Generate a continuation for one prompt or a batch of prompts.

        ``forward_args`` / ``forward_kwds`` are passed on to
        ``generate_batch``. The keyword names and defaults below are the
        ones listed by the original author; consult the CTranslate2
        documentation for their meaning.

        Args:
            text (str | List[str]): Input texts
            encode_tok_kwargs (dict, optional): additional kwargs for tokenizer
            decode_tok_kwargs (dict, optional): additional kwargs for tokenizer
            max_batch_size (int, optional): Defaults to 0.
            batch_type (str, optional): Defaults to 'examples'.
            asynchronous (bool, optional): Defaults to False.
            beam_size (int, optional): Defaults to 1.
            patience (float, optional): Defaults to 1.
            num_hypotheses (int, optional): Defaults to 1.
            length_penalty (float, optional): Defaults to 1.
            repetition_penalty (float, optional): Defaults to 1.
            no_repeat_ngram_size (int, optional): Defaults to 0.
            disable_unk (bool, optional): Defaults to False.
            suppress_sequences (Optional[List[List[str]]], optional):
                Defaults to None.
            end_token (Optional[Union[str, List[str], List[int]]], optional):
                Defaults to None.
            return_end_token (bool, optional): Defaults to False.
            max_length (int, optional): Defaults to 512.
            min_length (int, optional): Defaults to 0.
            include_prompt_in_result (bool, optional): Defaults to True.
            return_scores (bool, optional): Defaults to False.
            return_alternatives (bool, optional): Defaults to False.
            min_alternative_expansion_prob (float, optional): Defaults to 0.
            sampling_topk (int, optional): Defaults to 1.
            sampling_temperature (float, optional): Defaults to 1.

        Returns:
            str | List[str]: text as output, if list, same len as input
        """
        # The two dicts are passed positionally: mixing keyword
        # `encode_kwargs=` with `*forward_args` would bind the first extra
        # positional argument to `encode_kwargs` as well and raise TypeError.
        return super().generate(
            text,
            encode_tok_kwargs or {},
            decode_tok_kwargs or {},
            *forward_args,
            **forward_kwds,
        )
|