tianxie-sf
committed on
Commit
•
dfa2bda
1
Parent(s):
ec8c14c
add pad_token
Browse files
- tokenization_xgen.py +11 -5
tokenization_xgen.py
CHANGED
@@ -25,7 +25,7 @@ MAX_MODEL_INPUT_SIZES = {
|
|
25 |
}
|
26 |
|
27 |
|
28 |
-
def tiktoken_tokenizer(base="gpt2", add_special=True):
|
29 |
if not add_special:
|
30 |
return tiktoken.get_encoding(base)
|
31 |
|
@@ -83,6 +83,9 @@ def tiktoken_tokenizer(base="gpt2", add_special=True):
|
|
83 |
special_tokens[sp] = idx
|
84 |
idx += 1
|
85 |
|
|
|
|
|
|
|
86 |
# In production, load the arguments directly instead of accessing private attributes
|
87 |
# See openai_public.py for examples of arguments for specific encodings
|
88 |
enc = tiktoken.Encoding(
|
@@ -116,15 +119,15 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
116 |
add_special_tokens=True,
|
117 |
**kwargs,
|
118 |
):
|
119 |
-
|
120 |
super().__init__(
|
121 |
-
pad_token=
|
122 |
add_eos_token=add_eos_token,
|
123 |
add_special_tokens=add_special_tokens,
|
124 |
**kwargs,
|
125 |
)
|
126 |
self.add_eos_token = add_eos_token
|
127 |
-
self.encoder = tiktoken_tokenizer(base="gpt2", add_special=add_special_tokens)
|
128 |
|
129 |
@property
|
130 |
def vocab_size(self):
|
@@ -142,6 +145,9 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
142 |
|
143 |
def _convert_token_to_id(self, token):
|
144 |
"""Converts a token (str) in an id using the vocab."""
|
|
|
|
|
|
|
145 |
return token
|
146 |
|
147 |
def _convert_id_to_token(self, index):
|
@@ -216,4 +222,4 @@ class XgenTokenizer(PreTrainedTokenizer):
|
|
216 |
if token_ids_1 is not None:
|
217 |
output += [1] * len(token_ids_1 + eos_token_id)
|
218 |
|
219 |
-
return output
|
|
|
25 |
}
|
26 |
|
27 |
|
28 |
+
def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
|
29 |
if not add_special:
|
30 |
return tiktoken.get_encoding(base)
|
31 |
|
|
|
83 |
special_tokens[sp] = idx
|
84 |
idx += 1
|
85 |
|
86 |
+
if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
|
87 |
+
special_tokens[pad_token] = idx
|
88 |
+
idx += 1
|
89 |
# In production, load the arguments directly instead of accessing private attributes
|
90 |
# See openai_public.py for examples of arguments for specific encodings
|
91 |
enc = tiktoken.Encoding(
|
|
|
119 |
add_special_tokens=True,
|
120 |
**kwargs,
|
121 |
):
|
122 |
+
pad_token_added = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
123 |
super().__init__(
|
124 |
+
pad_token=pad_token_added,
|
125 |
add_eos_token=add_eos_token,
|
126 |
add_special_tokens=add_special_tokens,
|
127 |
**kwargs,
|
128 |
)
|
129 |
self.add_eos_token = add_eos_token
|
130 |
+
self.encoder = tiktoken_tokenizer(base="gpt2", pad_token=pad_token, add_special=add_special_tokens)
|
131 |
|
132 |
@property
|
133 |
def vocab_size(self):
|
|
|
145 |
|
146 |
def _convert_token_to_id(self, token):
|
147 |
"""Converts a token (str) in an id using the vocab."""
|
148 |
+
if isinstance(token, str):
|
149 |
+
ids = self._tokenize(token)
|
150 |
+
return ids[0]
|
151 |
return token
|
152 |
|
153 |
def _convert_id_to_token(self, index):
|
|
|
222 |
if token_ids_1 is not None:
|
223 |
output += [1] * len(token_ids_1 + eos_token_id)
|
224 |
|
225 |
+
return output
|