from transformers import PretrainedConfig


class TransformerLMConfig(PretrainedConfig):
    """Configuration for a Transformer language model.

    Stores the model hyperparameters (vocabulary sizes, embedding, attention,
    and MLP dimensions, dropout rates, and decoding flags) as a Hugging Face
    `PretrainedConfig`, so the config can be serialized with `save_pretrained`
    and restored with `from_pretrained`.
    """

    model_type = "transformerlm"

    def __init__(
        self,
        vocab_size: int = 30000,
        output_vocab_size: int = 30000,
        share_embeddings: bool = False,
        logits_via_embedding: bool = False,
        emb_dim: int = 512,
        num_heads: int = 8,
        num_layers: int = 6,
        qkv_dim: int = 512,
        mlp_dim: int = 2048,
        max_len: int = 2048,
        dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.1,
        deterministic: bool = False,
        decode: bool = False,
        bos_token_id: int = 50256,
        eos_token_id: int = 50256,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.output_vocab_size = output_vocab_size
        self.share_embeddings = share_embeddings
        self.logits_via_embedding = logits_via_embedding
        self.emb_dim = emb_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.qkv_dim = qkv_dim
        self.mlp_dim = mlp_dim
        self.max_len = max_len
        self.dropout_rate = dropout_rate
        self.attention_dropout_rate = attention_dropout_rate
        self.deterministic = deterministic
        self.decode = decode
        # Both special-token ids default to 50256, GPT-2's <|endoftext|> token.
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
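
# A minimal usage sketch (illustrative only, not part of the original module):
# round-trips the config through `PretrainedConfig`'s built-in JSON
# serialization. The `/tmp` path and the overridden hyperparameter values are
# arbitrary assumptions chosen for the demo.
if __name__ == "__main__":
    config = TransformerLMConfig(num_layers=4, emb_dim=256, qkv_dim=256, mlp_dim=1024)
    config.save_pretrained("/tmp/transformerlm-config")  # writes config.json
    reloaded = TransformerLMConfig.from_pretrained("/tmp/transformerlm-config")
    assert reloaded.num_layers == 4
    assert reloaded.model_type == "transformerlm"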