Update adapt_tokenizer.py
adapt_tokenizer.py CHANGED (+4 -5)
@@ -1,9 +1,8 @@
-from typing import Union
-from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
-Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+from typing import Any
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
 NUM_SENTINEL_TOKENS: int = 100
 
-def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
+def adapt_tokenizer_for_denoising(tokenizer: PreTrainedTokenizerBase) -> None:
     """Adds sentinel tokens and padding token (if missing).
 
     Expands the tokenizer vocabulary to include sentinel tokens
@@ -34,7 +33,7 @@ class AutoTokenizerForMOD(AutoTokenizer):
     """
 
     @classmethod
-    def from_pretrained(cls, *args, **kwargs):
+    def from_pretrained(cls, *args: Any, **kwargs: Any) -> PreTrainedTokenizerBase:
         """See `AutoTokenizer.from_pretrained` docstring."""
         tokenizer = super().from_pretrained(*args, **kwargs)
         adapt_tokenizer_for_denoising(tokenizer)
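For context, a minimal usage sketch of the API touched by this change (not part of the commit): it assumes the file is importable as adapt_tokenizer, and 'EleutherAI/gpt-neox-20b' is used purely as an illustrative base tokenizer name.

from transformers import AutoTokenizer

from adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising

# Option 1: the MOD wrapper adapts the tokenizer while loading it.
tokenizer = AutoTokenizerForMOD.from_pretrained('EleutherAI/gpt-neox-20b')

# Option 2: adapt an already-loaded tokenizer in place; per the docstring this
# adds sentinel tokens (NUM_SENTINEL_TOKENS of them) and a padding token if missing.
base = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
adapt_tokenizer_for_denoising(base)

print(len(base))  # vocabulary size now includes the added sentinel tokens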