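"""Wrap HuggingFace's GPT-2 (``GPT2Config``/``GPT2LMHeadModel`` from the
``transformers`` library) as a fairseq language model, registered under the
``hf_gpt2`` architecture family.
"""
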
import logging
from typing import Dict, List, Optional

import torch
from fairseq.models import (
    FairseqIncrementalDecoder,
    FairseqLanguageModel,
    register_model,
    register_model_architecture,
)

logger = logging.getLogger(__name__)


DEFAULT_MAX_TARGET_POSITIONS = 1024

@register_model("hf_gpt2")
class HuggingFaceGPT2LanguageModel(FairseqLanguageModel):
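    """Wraps HuggingFace's ``GPT2LMHeadModel`` so it can be trained and
    evaluated like any other fairseq language model.

    A sketch of a typical invocation, assuming data prepared for fairseq's
    ``language_modeling`` task (flags are illustrative, not prescriptive)::

        fairseq-train data-bin/wikitext-103 --task language_modeling --arch hf_gpt2
    """
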
    def __init__(self, decoder):
        super().__init__(decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument('--embed-dim', type=int, metavar='N',
                            help='embedding dimension')
        parser.add_argument('--num-attention-heads', type=int, metavar='N',
                            help='number of attention heads')
        parser.add_argument('--num-layers', type=int, metavar='N',
                            help='number of layers')
        parser.add_argument('--dropout', type=float, metavar='D',
                            help='dropout probability for all fully connected layers '
                                 'in the embeddings, encoder, and pooler')
        parser.add_argument('--attention-dropout', type=float, metavar='D',
                            help='dropout probability for attention weights')

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        default_architecture(args)
        return cls(HuggingFaceGPT2Decoder(args, task))


class HuggingFaceGPT2Decoder(FairseqIncrementalDecoder):
    def __init__(self, args, task):
        try:
            from transformers import GPT2Config, GPT2LMHeadModel
        except ImportError:
            raise ImportError(
                "\n\nPlease install huggingface/transformers with:"
                "\n\n  pip install transformers"
            )

        super().__init__(task.target_dictionary)

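        # fairseq reserves position 0 for padding (the corresponding row of the
        # position embedding is zeroed below), so real tokens occupy positions
        # 1..max_target_positions and the position table needs one extra slot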
        config = GPT2Config(
            vocab_size=len(task.target_dictionary),
            n_positions=args.max_target_positions + 1,
            n_ctx=args.max_target_positions,
            n_embd=args.embed_dim,
            n_layer=args.num_layers,
            n_head=args.num_attention_heads,
            resid_pdrop=args.dropout,
            embd_pdrop=args.dropout,
            attn_pdrop=args.attention_dropout,
            layer_norm_epsilon=1e-6,
        )
        self.model = GPT2LMHeadModel(config)

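        # zero the embeddings of the padding symbol and of position 0 (which is
        # reserved for padding) so that padding contributes no signal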
        self.pad_idx = task.target_dictionary.pad()
        self.model.transformer.wte.weight.data[self.pad_idx].zero_()
        self.model.transformer.wpe.weight.data[0].zero_()

    def forward(
        self,
        prev_output_tokens,
        src_lengths=None,
        incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None,
        encoder_out=None,
    ):
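        # extract hidden states, then project them through GPT-2's LM head;
        # fairseq expects the decoder to return a tuple with the logits first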
        features = self.extract_features(prev_output_tokens, incremental_state)
        lm_logits = self.model.lm_head(features)
        return (lm_logits,)

    def extract_features(
        self,
        prev_output_tokens,
        incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None,
    ):
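        # during incremental decoding, reuse the key/value states cached by the
        # previous call instead of recomputing the whole prefix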
        if incremental_state:
            past = self.get_incremental_state(incremental_state, "past")
        else:
            past = None

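        # don't attend to padding symbols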
        attention_mask = prev_output_tokens.ne(self.pad_idx).int()

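        # give real tokens positions 1..T and padding position 0, matching the
        # zeroed position-0 embedding set up in __init__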
        position_ids = attention_mask * (
            torch.arange(1, 1 + prev_output_tokens.size(1))
            .to(prev_output_tokens)
            .repeat(prev_output_tokens.size(0), 1)
        )

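        # note: newer versions of transformers renamed the ``past`` argument to
        # ``past_key_values``; this wrapper uses the older API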
        outputs = self.model.transformer(
            input_ids=prev_output_tokens,
            past=past,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
        last_hidden_states = outputs[0]

        if incremental_state:
            self.set_incremental_state(incremental_state, "past", outputs[1])

        return last_hidden_states

    def max_positions(self):
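        # position 0 is reserved for padding, hence the -1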
        return self.model.config.n_positions - 1


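# The defaults below correspond to the original GPT-2 "small" configuration
# (roughly 124M parameters).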
@register_model_architecture("hf_gpt2", "hf_gpt2")
def default_architecture(args):
    if getattr(args, "max_target_positions", None) is None:
        args.max_target_positions = getattr(
            args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
        )
    args.embed_dim = getattr(args, "embed_dim", 768)
    args.num_attention_heads = getattr(args, "num_attention_heads", 12)
    args.num_layers = getattr(args, "num_layers", 12)
    args.dropout = getattr(args, "dropout", 0.1)
    args.attention_dropout = getattr(args, "attention_dropout", 0.1)


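# GPT-2 "medium" (roughly 355M parameters).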
@register_model_architecture("hf_gpt2", "hf_gpt2_medium")
def hf_gpt2_medium(args):
    args.embed_dim = getattr(args, "embed_dim", 1024)
    args.num_attention_heads = getattr(args, "num_attention_heads", 16)
    args.num_layers = getattr(args, "num_layers", 24)
    default_architecture(args)


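# GPT-2 "large" (roughly 774M parameters).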
@register_model_architecture("hf_gpt2", "hf_gpt2_large")
def hf_gpt2_large(args):
    args.embed_dim = getattr(args, "embed_dim", 1280)
    args.num_attention_heads = getattr(args, "num_attention_heads", 20)
    args.num_layers = getattr(args, "num_layers", 36)
    default_architecture(args)


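# GPT-2 "XL" (roughly 1.5B parameters).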
@register_model_architecture("hf_gpt2", "hf_gpt2_xl")
def hf_gpt2_xl(args):
    args.embed_dim = getattr(args, "embed_dim", 1600)
    args.num_attention_heads = getattr(args, "num_attention_heads", 25)
    args.num_layers = getattr(args, "num_layers", 48)
    default_architecture(args)