from transformers import PretrainedConfig


class TransformerLMConfig(PretrainedConfig):
    """Configuration for a decoder-only Transformer language model."""

    model_type = "transformerlm"

    def __init__(
        self,
        vocab_size: int = 30000,
        output_vocab_size: int = 30000,
        share_embeddings: bool = False,
        logits_via_embedding: bool = False,
        emb_dim: int = 512,
        num_heads: int = 8,
        num_layers: int = 6,
        qkv_dim: int = 512,
        mlp_dim: int = 2048,
        max_len: int = 2048,
        dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.1,
        deterministic: bool = False,
        decode: bool = False,
        bos_token_id=50256,
        eos_token_id=50256,
        **kwargs,
    ):
        # Vocabulary and embedding options.
        self.vocab_size = vocab_size
        self.output_vocab_size = output_vocab_size
        self.share_embeddings = share_embeddings
        self.logits_via_embedding = logits_via_embedding
        # Model dimensions.
        self.emb_dim = emb_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.qkv_dim = qkv_dim
        self.mlp_dim = mlp_dim
        self.max_len = max_len
        # Regularization and runtime flags.
        self.dropout_rate = dropout_rate
        self.attention_dropout_rate = attention_dropout_rate
        self.deterministic = deterministic
        self.decode = decode
        # Special token ids (also passed to PretrainedConfig below).
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        super().__init__(
            bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs
        )
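

# Minimal usage sketch (an assumption, not part of the original file):
# instantiate the config, round-trip it through the standard PretrainedConfig
# save/load API, and check a field. The directory name is hypothetical.
if __name__ == "__main__":
    config = TransformerLMConfig(num_layers=12, emb_dim=768, num_heads=12)
    config.save_pretrained("./transformerlm-config")
    reloaded = TransformerLMConfig.from_pretrained("./transformerlm-config")
    assert reloaded.num_layers == 12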