Automodel support

Browse files

Files changed (11) hide show

README.md +91 -3
bert_layers.py +986 -0
bert_padding.py +153 -0
blockdiag_linear.py +73 -0
blockdiag_multiply.py +82 -0
config.json +46 -2
configuration_bert.py +75 -0
generation_config.json +6 -0
hyena_utils.py +259 -0
monarch_mixer_sequence_mixer.py +156 -0
structured_linear.py +70 -0

README.md CHANGED Viewed

@@ -3,13 +3,101 @@ license: apache-2.0
 language:
 - en
 pipeline_tag: text-classification
 ---
 # Monarch Mixer-BERT
-The 80M checkpoint for M2-BERT-base from the paper [Monarch Mixer: A Simple Sub-Quadratic GEMM-Based Architecture](https://arxiv.org/abs/2310.12109).
-This model has been pretrained with sequence length 32768, and it has been fine-tuned for retrieval.
-This model was trained by Dan Fu, Jon Saad-Falcon, and Simran Arora.
 Check out our [GitHub](https://github.com/HazyResearch/m2/tree/main) for instructions on how to download and fine-tune it!

 language:
 - en
 pipeline_tag: text-classification
+inference: false
 ---
 # Monarch Mixer-BERT
+An 80M checkpoint of M2-BERT, pretrained with sequence length 32768 and fine-tuned for long-context retrieval.
+Check out the paper [Monarch Mixer: A Simple Sub-Quadratic GEMM-Based Architecture](https://arxiv.org/abs/2310.12109) and our [blog post]() on retrieval for more on how we trained this model for long sequences.
+This model was trained by Jon Saad-Falcon, Dan Fu, and Simran Arora.
 Check out our [GitHub](https://github.com/HazyResearch/m2/tree/main) for instructions on how to download and fine-tune it!
+## How to use
+You can load this model using Hugging Face `AutoModelForSequenceClassification`:
+```python
+from transformers import AutoModelForSequenceClassification
+model = AutoModelForSequenceClassification.from_pretrained(
+  "togethercomputer/m2-bert-80M-32k-retrieval",
+  trust_remote_code=True
+)
+```
+You should expect to see a large error message about unused parameters for FlashFFTConv.
+If you'd like to load the model with FlashFFTConv, you can check out our [GitHub](https://github.com/HazyResearch/m2/tree/main).
+This model generates embeddings for retrieval. The embeddings have a dimensionality of 768:
+```python
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+max_seq_length = 32768
+testing_string = "Every morning, I make a cup of coffee to start my day."
+model = AutoModelForSequenceClassification.from_pretrained(
+  "togethercomputer/m2-bert-80M-32k-retrieval",
+  trust_remote_code=True
+)
+tokenizer = AutoTokenizer.from_pretrained(
+  "bert-base-uncased",
+  model_max_length=max_seq_length
+)
+input_ids = tokenizer(
+  [testing_string],
+  return_tensors="pt",
+  padding="max_length",
+  return_token_type_ids=False,
+  truncation=True,
+  max_length=max_seq_length
+)
+outputs = model(**input_ids)
+embeddings = outputs['sentence_embedding']
+```
+You can also get embeddings from this model using the Together API as follows (you can find your API key [here](https://api.together.xyz/settings/api-keys)):
+```python
+import os
+import requests
+def generate_together_embeddings(text: str, model_api_string: str, api_key: str):
+    url = "https://api.together.xyz/api/v1/embeddings"
+    headers = {
+        "accept": "application/json",
+        "content-type": "application/json",
+        "Authorization": f"Bearer {api_key}"
+    }
+    session = requests.Session()
+    response = session.post(
+        url,
+        headers=headers,
+        json={
+            "input": text,
+            "model": model_api_string
+        }
+    )
+    if response.status_code != 200:
+        raise ValueError(f"Request failed with status code {response.status_code}: {response.text}")
+    return response.json()['data'][0]['embedding']
+print(generate_together_embeddings(
+  'Hello world',
+  'togethercomputer/m2-bert-80M-32k-retrieval',
+  os.environ['TOGETHER_API_KEY'])[:10]
+)
+```
+## Citation
+If you use this model, or otherwise found our work valuable, you can cite us as follows:
+```
+@inproceedings{fu2023monarch,
+  title={Monarch Mixer: A Simple Sub-Quadratic GEMM-Based Architecture},
+  author={Fu, Daniel Y and Arora, Simran and Grogan, Jessica and Johnson, Isys and Eyuboglu, Sabri and Thomas, Armin W and Spector, Benjamin and Poli, Michael and Rudra, Atri and R{\'e}, Christopher},
+  booktitle={Advances in Neural Information Processing Systems},
+  year={2023}
+}
+```

bert_layers.py ADDED Viewed

	@@ -0,0 +1,986 @@

+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2022, Tri Dao.
+# Copyright (c) 2023, MosaicML.
+# Copyright (c) 2023, Dan Fu and Simran Arora.
+import copy
+import logging
+import math
+import os
+import sys
+import warnings
+from typing import List, Optional, Tuple, Union
+from functools import partial
+# Add folder root to path to allow us to use relative imports regardless of what directory the script is run from
+# sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+from .bert_padding import (index_first_axis,
+                           index_put_first_axis, pad_input,
+                           unpad_input, unpad_input_only)
+import torch
+import torch.nn as nn
+from einops import rearrange
+from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (MaskedLMOutput,
+                                           SequenceClassifierOutput)
+from transformers.models.bert.modeling_bert import BertPreTrainedModel
+from .blockdiag_linear import BlockdiagLinear
+from .monarch_mixer_sequence_mixer import MonarchMixerSequenceMixing
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# NOTE: importing this module sets these global PyTorch backend flags, allowing
+# TF32 for CUDA matmuls and for cuDNN.
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+class BertEmbeddings(nn.Module):
+    """Construct the embeddings for words, ignoring position.
+    There are no positional embeddings since we use ALiBi and token_type
+    embeddings.
+    This module is modeled after the Hugging Face BERT's
+    :class:`~transformers.model.bert.modeling_bert.BertEmbeddings`, but is
+    modified as part of Mosaic BERT's ALiBi implementation. The key change is
+    that position embeddings are removed. Position information instead comes
+    from attention biases that scale linearly with the position distance
+    between query and key tokens.
+    This module ignores the `position_ids` input to the `forward` method.
+    """
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size,
+                                            config.hidden_size,
+                                            padding_idx=config.pad_token_id)
+        # ALiBi doesn't use position embeddings
+        if config.use_positional_encodings:
+            self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.use_positional_encodings = config.use_positional_encodings
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
+                                                  config.hidden_size)
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model
+        # variable name and be able to load any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size,
+                                      eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        if config.use_positional_encodings:
+            self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.register_buffer('token_type_ids',
+                             torch.zeros(config.max_position_embeddings,
+                                         dtype=torch.long),
+                             persistent=False)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+        return_position_encodings: bool = False,
+    ) -> torch.Tensor:
+        if (input_ids is not None) == (inputs_embeds is not None):
+            raise ValueError('Must specify either input_ids or input_embeds!')
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            assert inputs_embeds is not None  # just for type checking
+            input_shape = inputs_embeds.size()[:-1]
+        seq_length = input_shape[1]
+        if position_ids is None:
+            if self.use_positional_encodings:
+                position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+        # Setting the token_type_ids to the registered buffer in constructor
+        # where it is all zeros, which usually occurs when it's auto-generated;
+        # registered buffer helps users when tracing the model without passing
+        # token_type_ids, solves issue #5664
+        if token_type_ids is None:
+            if hasattr(self, 'token_type_ids'):
+                assert isinstance(self.token_type_ids, torch.LongTensor)
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(
+                    input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded  # type: ignore
+            else:
+                token_type_ids = torch.zeros(input_shape,  # type: ignore
+                                             dtype=torch.long,
+                                             device=self.word_embeddings.device) # type: ignore  # yapf: disable
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.use_positional_encodings:
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        if return_position_encodings:
+            return embeddings, position_embeddings
+        else:
+            return embeddings
+class BertMLP(nn.Module):
+    """Applies the FFN at the end of each BERT layer."""
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        if self.config.use_monarch_mlp:
+            linear_cls = partial(BlockdiagLinear, nblocks=self.config.monarch_mlp_nblocks)
+        else:
+            linear_cls = nn.Linear
+        self.gated_layers = linear_cls(config.hidden_size,
+                                        config.intermediate_size,
+                                        bias=False)
+        self.act = nn.GELU(approximate='none')
+        self.wo = linear_cls(config.intermediate_size, config.hidden_size)
+        self.layernorm = nn.LayerNorm(config.hidden_size,
+                                      eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Compute new hidden states from current hidden states.
+        Args:
+            hidden_states (torch.Tensor): The (unpadded) hidden states from
+                the attention layer [nnz, dim].
+        """
+        residual_connection = hidden_states
+        hidden_states = self.gated_layers(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.wo(hidden_states)
+        hidden_states = self.layernorm(hidden_states + residual_connection)
+        return hidden_states
+class BertGatedLinearUnitMLP(nn.Module):
+    """Applies the FFN at the end of each BERT layer with a Gated Linear Unit"""
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.is_padded = True
+        if self.config.use_monarch_mlp:
+            linear_cls = partial(BlockdiagLinear, nblocks=self.config.monarch_mlp_nblocks)
+        else:
+            linear_cls = nn.Linear
+        self.gated_layers = linear_cls(
+            config.hidden_size,
+            config.intermediate_size * 2,
+            bias=False
+        )
+        self.act = nn.GELU(approximate='none')
+        self.wo = linear_cls(config.intermediate_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.layernorm = nn.LayerNorm(config.hidden_size,
+                                      eps=config.layer_norm_eps)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Compute new hidden states from current hidden states.
+        Args:
+            hidden_states (torch.Tensor): The (unpadded) hidden states from
+                the attention layer [nnz, dim].
+        """
+        residual_connection = hidden_states
+        # compute the activation
+        hidden_states = self.gated_layers(hidden_states)
+        if self.is_padded:
+            gated = hidden_states[:, :, :self.config.intermediate_size]
+            non_gated = hidden_states[:, :, self.config.intermediate_size:]
+        else:
+            gated = hidden_states[:, :self.config.intermediate_size]
+            non_gated = hidden_states[:, self.config.intermediate_size:]
+        hidden_states = self.act(gated) * non_gated
+        hidden_states = self.dropout(hidden_states)
+        # multiply by the second matrix
+        hidden_states = self.wo(hidden_states)
+        # add the residual connection and post-LN
+        hidden_states = self.layernorm(hidden_states + residual_connection)
+        return hidden_states
+class BertLayer(nn.Module):
+    """BERT layer, which includes Sequence Mixing (e.g. Hyena) and State Mixing (e.g. MLP)."""
+    def __init__(self, config):
+        super(BertLayer, self).__init__()
+        mm_cls = MonarchMixerSequenceMixing
+        self.attention = mm_cls(
+            config.hidden_size,
+            l_max=config.long_conv_l_max,
+            hyena_kernel_lr=config.long_conv_kernel_learning_rate,
+            bidirectional=config.bidirectional,
+            hyena_lr_pos_emb=config.hyena_lr_pos_emb,
+            hyena_w=config.hyena_w,
+            hyena_w_mod=config.hyena_w_mod,
+            hyena_wd=config.hyena_wd,
+            hyena_emb_dim=config.hyena_emb_dim,
+            hyena_filter_dropout=config.hyena_filter_dropout,
+            hyena_filter_order=config.hyena_filter_order,
+            residual_long_conv=config.residual_long_conv,
+            hyena_training_additions=config.hyena_training_additions,
+        )
+        if config.use_glu_mlp:
+            self.mlp = BertGatedLinearUnitMLP(config)
+        else:
+            self.mlp = BertMLP(config)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        seqlen: int,
+        subset_idx: Optional[torch.Tensor] = None,
+        indices: Optional[torch.Tensor] = None,
+        attn_mask: Optional[torch.Tensor] = None,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Forward pass for a BERT layer, including both attention and MLP.
+        Args:
+            hidden_states: (total_nnz, dim)
+            cu_seqlens: (batch + 1,)
+            seqlen: int
+            subset_idx: () set of indices whose values we care about at the end of the layer
+                        (e.g., the masked tokens, if this is the final layer).
+            indices: None or (total_nnz,)
+            attn_mask: None or (batch, max_seqlen_in_batch)
+            bias: None or (batch, heads, max_seqlen_in_batch, max_seqlen_in_batch)
+        """
+        attention_output = self.attention(hidden_states)
+        if type(attention_output) == tuple:
+            attention_output, _ = attention_output
+        layer_output = self.mlp(attention_output)
+        return layer_output
+class BertEncoder(nn.Module):
+    """A stack of BERT layers providing the backbone of BERT.
+    Compared to the analogous Hugging Face BERT module, this module handles unpadding to reduce unnecessary computation
+    at padded tokens, and pre-computes attention biases to implement ALiBi.
+    """
+    def __init__(self, config):
+        super().__init__()
+        layer = BertLayer(config)
+        self.layer = nn.ModuleList(
+            [copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
+        self.num_attention_heads = config.num_attention_heads
+    def rebuild_alibi_tensor(self,
+                             size: int,
+                             device: Optional[Union[torch.device, str]] = None):
+        # Alibi
+        # Following https://github.com/ofirpress/attention_with_linear_biases/issues/5 (Implementation 1)
+        # In the causal case, you can exploit the fact that softmax is invariant to a uniform translation
+        # of the logits, which makes the math work out *after* applying causal masking. If no causal masking
+        # will be applied, it is necessary to construct the diagonal mask.
+        n_heads = self.num_attention_heads
+        def _get_alibi_head_slopes(n_heads: int) -> List[float]:
+            def get_slopes_power_of_2(n_heads: int) -> List[float]:
+                start = (2**(-2**-(math.log2(n_heads) - 3)))
+                ratio = start
+                return [start * ratio**i for i in range(n_heads)]
+            # In the paper, they only train models that have 2^a heads for some a. This function
+            # has some good properties that only occur when the input is a power of 2. To
+            # maintain that even when the number of heads is not a power of 2, we use a
+            # workaround.
+            if math.log2(n_heads).is_integer():
+                return get_slopes_power_of_2(n_heads)
+            closest_power_of_2 = 2**math.floor(math.log2(n_heads))
+            slopes_a = get_slopes_power_of_2(closest_power_of_2)
+            slopes_b = _get_alibi_head_slopes(2 * closest_power_of_2)
+            slopes_b = slopes_b[0::2][:n_heads - closest_power_of_2]
+            return slopes_a + slopes_b
+        context_position = torch.arange(size, device=device)[:, None]
+        memory_position = torch.arange(size, device=device)[None, :]
+        relative_position = torch.abs(memory_position - context_position)
+        # [n_heads, max_token_length, max_token_length]
+        relative_position = relative_position.unsqueeze(0).expand(
+            n_heads, -1, -1)
+        slopes = torch.Tensor(_get_alibi_head_slopes(n_heads)).to(device)
+        alibi = slopes.unsqueeze(1).unsqueeze(1) * -relative_position
+        # [1, n_heads, max_token_length, max_token_length]
+        alibi = alibi.unsqueeze(0)
+        assert alibi.shape == torch.Size([1, n_heads, size, size])
+        self._current_alibi_size = size
+        self.alibi = alibi
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        output_all_encoded_layers: Optional[bool] = True,
+        subset_mask: Optional[torch.Tensor] = None,
+        position_encodings: Optional[torch.Tensor] = None,
+    ) -> List[torch.Tensor]:
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+        extended_attention_mask = extended_attention_mask.to(
+            dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        attention_mask_bool = attention_mask.bool()
+        batch, seqlen = hidden_states.shape[:2]
+        cu_seqlens = None
+        indices = None
+        alibi_attn_mask = None
+        all_encoder_layers = []
+        for layer_module in self.layer:
+            hidden_states = layer_module(hidden_states,
+                cu_seqlens,
+                seqlen,
+                None,
+                indices,
+                attn_mask=attention_mask,
+                bias=alibi_attn_mask
+            )
+            if position_encodings is not None:
+                hidden_states = hidden_states + position_encodings
+            if output_all_encoded_layers:
+                all_encoder_layers.append(hidden_states)
+        if subset_mask is not None:
+            hidden_states = hidden_states[subset_mask]
+        if not output_all_encoded_layers:
+            all_encoder_layers.append(hidden_states)
+        return all_encoder_layers
+class BertPooler(nn.Module):
+    def __init__(self, config):
+        super(BertPooler, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+        self.pool_all = config.pool_all
+    def forward(self,
+                hidden_states: torch.Tensor,
+                pool: Optional[bool] = True,
+                mask= None) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        if not self.pool_all:
+            first_token_tensor = hidden_states[:, 0] if pool else hidden_states
+            pooled_output = self.dense(first_token_tensor)
+            pooled_output = self.activation(pooled_output)
+        else:
+            # mean pool everything that isn't masked out
+            denom = torch.sum(mask, dim=1, keepdim=True)
+            mean_tensor = torch.sum((hidden_states) * mask.unsqueeze(-1), dim = 1) / denom
+            pooled_output = self.dense(mean_tensor)
+            pooled_output = self.activation(pooled_output)
+        return pooled_output
+class BertPredictionHeadTransform(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = torch.nn.LayerNorm(config.hidden_size, eps=1e-12)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+class BertModel(BertPreTrainedModel):
+    """Overall BERT model: embeddings, encoder stack and optional pooler.
+    Args:
+        config: a BertConfig class instance with the configuration to build a new model
+        add_pooling_layer: whether to build the `BertPooler`; when False, the pooled output is None.
+    Inputs:
+        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
+            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
+        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
+            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
+            a `sentence B` token (see BERT paper for more details). Defaults to all zeros.
+        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
+            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences. Defaults to all ones.
+        `position_ids`: optional position ids, forwarded to the embeddings.
+        `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `False`.
+        `masked_tokens_mask`: optional boolean mask; when given, only the selected positions are
+            returned as the sequence output (see the masked branch in `forward`).
+    Outputs: Tuple of (encoded_layers, pooled_output)
+        `encoded_layers`: controlled by `output_all_encoded_layers` argument:
+            - `output_all_encoded_layers=True`: outputs a list with the hidden states at the end
+                of each layer,
+            - `output_all_encoded_layers=False`: outputs only the hidden states of the last layer
+                (restricted to the masked tokens when `masked_tokens_mask` is given),
+        `pooled_output`: the output of the pooler, or None when the model was built with
+            `add_pooling_layer=False`.
+    Example usage:
+    NOTE(review): this example is inherited from the original BERT implementation; the
+    `modeling.BertConfig` call below may not match this repository's config class. Confirm
+    before relying on it.
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+    config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+    model = BertModel(config=config)
+    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+    def __init__(self, config, add_pooling_layer=True):
+        super(BertModel, self).__init__(config)
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config)
+        self.pooler = BertPooler(config) if add_pooling_layer else None
+        self.post_init()
+    def get_input_embeddings(self):
+        # Return the word embedding module.
+        return self.embeddings.word_embeddings
+    def set_input_embeddings(self, value):
+        # Replace the word embedding module.
+        self.embeddings.word_embeddings = value
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        token_type_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        output_all_encoded_layers: Optional[bool] = False,
+        masked_tokens_mask: Optional[torch.Tensor] = None,
+        **kwargs
+    ) -> Tuple[Union[List[torch.Tensor], torch.Tensor], Optional[torch.Tensor]]:
+        # Returns (encoder_outputs, pooled_output); extra **kwargs are accepted and ignored.
+        # Default to "attend to everything" and "all tokens are segment 0".
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+        embedding_output = self.embeddings(
+            input_ids,
+            token_type_ids,
+            position_ids
+        )
+        # Always None here, so the encoder never adds position encodings between layers.
+        position_encodings = None
+        subset_mask = []
+        first_col_mask = []
+        if masked_tokens_mask is None:
+            subset_mask = None
+        else:
+            # Keep the masked tokens plus the first column (first token of every sequence).
+            first_col_mask = torch.zeros_like(masked_tokens_mask)
+            first_col_mask[:, 0] = True
+            subset_mask = masked_tokens_mask | first_col_mask
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask,
+            output_all_encoded_layers=output_all_encoded_layers,
+            subset_mask=subset_mask,
+            position_encodings=position_encodings)
+        if masked_tokens_mask is None:
+            sequence_output = encoder_outputs[-1]
+            pooled_output = self.pooler(
+                sequence_output, mask = attention_mask) if self.pooler is not None else None
+        else:
+            # TD [2022-03-01]: the indexing here is very tricky.
+            # NOTE(review): these index masks are built from the non-padded positions
+            # (`attention_mask_bool`), while BertEncoder.forward in this file indexes the
+            # hidden states with `subset_mask` directly. The two presumably agree only when
+            # no selected position is padding; confirm. The encoder also applies
+            # `subset_mask` only when `output_all_encoded_layers` is falsy.
+            attention_mask_bool = attention_mask.bool()
+            subset_idx = subset_mask[attention_mask_bool]  # type: ignore
+            sequence_output = encoder_outputs[-1][
+                masked_tokens_mask[attention_mask_bool][subset_idx]]
+            if self.pooler is not None:
+                pool_input = encoder_outputs[-1][
+                    first_col_mask[attention_mask_bool][subset_idx]]
+                # pool=False: `pool_input` already holds the first-token states.
+                # NOTE(review): with `pool_all` set, the pooler would combine this input with
+                # the full `attention_mask`; verify the shapes are compatible in that case.
+                pooled_output = self.pooler(pool_input, pool=False, mask = attention_mask)
+            else:
+                pooled_output = None
+        if not output_all_encoded_layers:
+            encoder_outputs = sequence_output
+        if self.pooler is not None:
+            return encoder_outputs, pooled_output
+        return encoder_outputs, None
+###################
+# Bert Heads
+###################
+class BertLMPredictionHead(nn.Module):
+    def __init__(self, config, bert_model_embedding_weights):
+        super().__init__()
+        self.transform = BertPredictionHeadTransform(config)
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
+                                 bert_model_embedding_weights.size(0))
+        self.decoder.weight = bert_model_embedding_weights
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+class BertOnlyMLMHead(nn.Module):
+    def __init__(self, config, bert_model_embedding_weights):
+        super().__init__()
+        self.predictions = BertLMPredictionHead(config,
+                                                bert_model_embedding_weights)
+    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+class BertOnlyNSPHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+    def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return seq_relationship_score
+#######################
+# Construct Bert model
+#######################
+class BertForMaskedLM(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        if config.is_decoder:
+            warnings.warn(
+                'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for '
+                'bi-directional self-attention.')
+        self.bert = BertModel(config, add_pooling_layer=False)
+        self.cls = BertOnlyMLMHead(config,
+                                   self.bert.embeddings.word_embeddings.weight)
+        # Initialize weights and apply final processing
+        self.post_init()
+    @classmethod
+    def from_composer(cls,
+                      pretrained_checkpoint,
+                      state_dict=None,
+                      cache_dir=None,
+                      from_tf=False,
+                      config=None,
+                      *inputs,
+                      **kwargs):
+        """Load from pre-trained."""
+        model = cls(config, *inputs, **kwargs)
+        if from_tf:
+            raise ValueError(
+                'TensorFlow is not supported.')
+        state_dict = torch.load(pretrained_checkpoint)
+        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
+        consume_prefix_in_state_dict_if_present(state_dict, prefix='model.')
+        missing_keys, unexpected_keys = model.load_state_dict(state_dict,
+                                                              strict=False)
+        if len(missing_keys) > 0:
+            logger.warning(
+                f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}"
+            )
+        if len(unexpected_keys) > 0:
+            logger.warning(
+                f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}"
+            )
+        return model
+    def get_output_embeddings(self):
+        """Return the decoder module of the masked-LM head."""
+        return self.cls.predictions.decoder
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
+        # labels should be a `torch.LongTensor` of shape
+        # `(batch_size, sequence_length)`. These are used for computing the
+        #  masked language modeling loss.
+        #
+        # Indices should be in `[-100, 0, ..., config.vocab_size]` (see
+        # `input_ids` docstring) Tokens with indices set to `-100` are ignored
+        # (masked), the loss is only computed for the tokens with labels in `[0,
+        # ..., config.vocab_size]`
+        #
+        # Prediction scores are only computed for masked tokens and the (bs,
+        # seqlen) dimensions are flattened
+        if (input_ids is not None) == (inputs_embeds is not None):
+            raise ValueError('Must specify either input_ids or input_embeds!')
+        if labels is None:
+            masked_tokens_mask = None
+        else:
+            masked_tokens_mask = labels > 0
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.bert(
+                input_ids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+                position_ids=position_ids,
+                head_mask=head_mask,
+                inputs_embeds=inputs_embeds,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                masked_tokens_mask=masked_tokens_mask,
+            )
+        if torch.isnan(outputs[0]).any():
+            print("NaNs in outputs.")
+            raise ValueError()
+        #print("MLM Outputs")
+        #print(outputs[0].shape)
+        pooled_output = outputs[0]
+        last_hidden_state_formatted = outputs[0][:,0,:].view(-1, self.config.hidden_size)
+        return {"sentence_embedding": last_hidden_state_formatted}
+    def prepare_inputs_for_generation(self, input_ids: torch.Tensor,
+                                      attention_mask: torch.Tensor,
+                                      **model_kwargs):
+        input_shape = input_ids.shape
+        effective_batch_size = input_shape[0]
+        #  add a dummy token
+        if self.config.pad_token_id is None:
+            raise ValueError('The PAD token should be defined for generation')
+        attention_mask = torch.cat([
+            attention_mask,
+            attention_mask.new_zeros((attention_mask.shape[0], 1))
+        ], dim=-1)
+        dummy_token = torch.full((effective_batch_size, 1),
+                                 self.config.pad_token_id,
+                                 dtype=torch.long,
+                                 device=input_ids.device)
+        input_ids = torch.cat([input_ids, dummy_token], dim=1)
+        return {'input_ids': input_ids, 'attention_mask': attention_mask}
+class BertForSequenceClassification(BertPreTrainedModel):
+    """Bert Model transformer with a sequence classification/regression head.
+    This head is just a linear layer on top of the pooled output. Used for,
+    e.g., GLUE tasks.
+    """
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+        self.bert = BertModel(config)
+        classifier_dropout = (config.classifier_dropout
+                              if config.classifier_dropout is not None else
+                              config.hidden_dropout_prob)
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # Initialize weights and apply final processing
+        self.post_init()
+    @classmethod
+    def from_composer(cls,
+                      pretrained_checkpoint,
+                      state_dict=None,
+                      cache_dir=None,
+                      from_tf=False,
+                      config=None,
+                      *inputs,
+                      **kwargs):
+        """Load from pre-trained."""
+        model = cls(config, *inputs, **kwargs)
+        if from_tf:
+            raise ValueError(
+                'TensorFlow is not supported.')
+        state_dict = torch.load(pretrained_checkpoint)
+        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
+        consume_prefix_in_state_dict_if_present(state_dict, prefix='model.')
+        missing_keys, unexpected_keys = model.load_state_dict(state_dict,
+                                                              strict=False)
+        if len(missing_keys) > 0:
+            logger.warning(
+                f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}"
+            )
+        if len(unexpected_keys) > 0:
+            logger.warning(
+                f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}"
+            )
+        return model
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
+        # labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+        # Labels for computing the sequence classification/regression loss.
+        # Indices should be in `[0, ..., config.num_labels - 1]`.
+        # If `config.num_labels == 1` a regression loss is computed
+        # (mean-square loss). If `config.num_labels > 1` a classification loss
+        # is computed (cross-entropy).
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        loss = None
+        if labels is not None:
+            # Compute loss
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = 'regression'
+                elif self.num_labels > 1 and (labels.dtype == torch.long or
+                                              labels.dtype == torch.int):
+                    self.config.problem_type = 'single_label_classification'
+                else:
+                    self.config.problem_type = 'multi_label_classification'
+            if self.config.problem_type == 'regression':
+                loss_fct = nn.MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == 'single_label_classification':
+                loss_fct = nn.CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels),
+                                labels.view(-1))
+            elif self.config.problem_type == 'multi_label_classification':
+                loss_fct = nn.BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=None,
+            attentions=None,
+        )
+class BertForTextEncoding(BertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        if config.is_decoder:
+            warnings.warn(
+                'If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for '
+                'bi-directional self-attention.')
+        self.bert = BertModel(config, add_pooling_layer=True)
+        # Initialize weights and apply final processing
+        self.post_init()
+    @classmethod
+    def from_composer(cls,
+                      pretrained_checkpoint,
+                      state_dict=None,
+                      cache_dir=None,
+                      from_tf=False,
+                      config=None,
+                      *inputs,
+                      **kwargs):
+        """Load from pre-trained."""
+        model = cls(config, *inputs, **kwargs)
+        if from_tf:
+            raise ValueError(
+                'TensorFlow is not supported.')
+        state_dict = torch.load(pretrained_checkpoint)
+        # If the state_dict was saved after wrapping with `composer.HuggingFaceModel`, it takes on the `model` prefix
+        consume_prefix_in_state_dict_if_present(state_dict, prefix='model.')
+        missing_keys, unexpected_keys = model.load_state_dict(state_dict,
+                                                              strict=False)
+        if len(missing_keys) > 0:
+            logger.warning(
+                f"Found these missing keys in the checkpoint: {', '.join(missing_keys)}"
+            )
+        if len(unexpected_keys) > 0:
+            logger.warning(
+                f"Found these unexpected keys in the checkpoint: {', '.join(unexpected_keys)}"
+            )
+        return model
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
+        if (input_ids is not None) == (inputs_embeds is not None):
+            raise ValueError('Must specify either input_ids or input_embeds!')
+        if labels is None:
+            masked_tokens_mask = None
+        else:
+            masked_tokens_mask = labels > 0
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.bert(
+                input_ids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+                position_ids=position_ids,
+                head_mask=head_mask,
+                inputs_embeds=inputs_embeds,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                masked_tokens_mask=masked_tokens_mask,
+            )
+        pooled_output = outputs[1]
+        return {"sentence_embedding": pooled_output}
+    def prepare_inputs_for_generation(self, input_ids: torch.Tensor,
+                                      attention_mask: torch.Tensor,
+                                      **model_kwargs):
+        input_shape = input_ids.shape
+        effective_batch_size = input_shape[0]
+        #  add a dummy token
+        if self.config.pad_token_id is None:
+            raise ValueError('The PAD token should be defined for generation')
+        attention_mask = torch.cat([
+            attention_mask,
+            attention_mask.new_zeros((attention_mask.shape[0], 1))
+        ], dim=-1)
+        dummy_token = torch.full((effective_batch_size, 1),
+                                 self.config.pad_token_id,
+                                 dtype=torch.long,
+                                 device=input_ids.device)
+        input_ids = torch.cat([input_ids, dummy_token], dim=1)
+        return {'input_ids': input_ids, 'attention_mask': attention_mask}

bert_padding.py ADDED Viewed

	@@ -0,0 +1,153 @@

+# Adapted from https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/bert_padding.py
+# Adapted from https://github.com/mlcommons/training_results_v1.1/blob/main/NVIDIA/benchmarks/bert/implementations/pytorch/padding.py
+"""
+Functions for padding and unpadding
+"""
+from typing import Tuple, cast
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+class IndexFirstAxis(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, input: torch.Tensor,
+                indices: torch.Tensor) -> torch.Tensor:
+        """Get just the values of `input` which are at `indices`.
+        Arguments:
+            ctx: the autograd context object
+            input: (b, ...) 2+ dimensional tensor
+            indices: (num_idx) 1D tensor
+        """
+        ctx.save_for_backward(indices)
+        assert input.ndim >= 2
+        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[
+            1:]
+        second_dim = other_shape.numel(
+        )  # product of sizes of all but first dimension
+        # TD [2022-03-04] For some reason torch.gather is a bit faster than indexing.
+        return torch.gather(
+            rearrange(input, 'b ... -> b (...)'),  # (b, ...) -> (b, second_dim)
+            0,
+            repeat(indices, 'z -> z d',
+                   d=second_dim)  # (indices,) -> (indices, second_dim)
+        ).reshape(-1, *other_shape)  # (num_idx, ...)
+    @staticmethod
+    def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor, None]:
+        indices, = ctx.saved_tensors
+        assert grad_output.ndim >= 2
+        other_shape = grad_output.shape[1:]
+        grad_output = rearrange(grad_output, 'b ... -> b (...)')
+        grad_input = torch.zeros([ctx.first_axis_dim, grad_output.shape[1]],
+                                 device=grad_output.device,
+                                 dtype=grad_output.dtype)
+        # TD [2022-03-04] For some reason torch.scatter is a bit faster than indexing.
+        # grad_input[indices] = grad_output
+        grad_input.scatter_(0,
+                            repeat(indices, 'z -> z d', d=grad_output.shape[1]),
+                            grad_output)
+        return grad_input.reshape(ctx.first_axis_dim, *other_shape), None
+index_first_axis = IndexFirstAxis.apply
+class IndexPutFirstAxis(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, values: torch.Tensor, indices: torch.Tensor,
+                first_axis_dim) -> torch.Tensor:
+        ctx.save_for_backward(indices)
+        assert indices.ndim == 1
+        assert values.ndim >= 2
+        output = torch.zeros(first_axis_dim,
+                             *values.shape[1:],
+                             device=values.device,
+                             dtype=values.dtype)
+        output[indices] = values
+        return output
+    @staticmethod
+    def backward(ctx,
+                 grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
+        indices, = ctx.saved_tensors
+        grad_values = grad_output[indices]
+        return grad_values, None, None
+index_put_first_axis = IndexPutFirstAxis.apply
+def unpad_input(
+    hidden_states: torch.Tensor,
+    attention_mask: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
+    """Remove padding from input sequences.
+    Arguments:
+        hidden_states: (batch, seqlen, ...)
+        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
+    Returns:
+        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
+        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
+        max_seqlen_in_batch: int
+    """
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = int(seqlens_in_batch.max().item())
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32),
+                       (1, 0))
+    # TD [2022-03-04] We don't want to index with a bool mask, because Pytorch will expand the
+    # bool mask, then call nonzero to get the indices, then index with those. The indices is @dim
+    # times larger than it needs to be, wasting memory. It's faster and more memory-efficient to
+    # index with integer indices. Moreover, torch's index is a bit slower than it needs to be,
+    # so we write custom forward and backward to make it a bit faster.
+    hidden_states = cast(
+        torch.Tensor,
+        index_first_axis(rearrange(hidden_states, 'b s ... -> (b s) ...'),
+                         indices))
+    return hidden_states, indices, cu_seqlens, max_seqlen_in_batch
+def unpad_input_only(
+    hidden_states: torch.Tensor,
+    attention_mask: torch.Tensor,
+) -> torch.Tensor:
+    """Like unpad_input, but only return the unpadded first tensor.
+    Save a small amount of overhead.
+    Arguments:
+        hidden_states: (batch, seqlen, ...)
+        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
+    Returns:
+        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
+    """
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    rearranged = rearrange(hidden_states, 'b s ... -> (b s) ...')
+    return index_first_axis(rearranged, indices)  # type: ignore
+def pad_input(hidden_states: torch.Tensor, indices: torch.Tensor, batch: int,
+              seqlen: int) -> torch.Tensor:
+    """Add padding to sequences.
+    Arguments:
+        hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
+        indices: (total_nnz)
+    Returns:
+        hidden_states: (batch, seqlen, ...)
+    """
+    output = index_put_first_axis(hidden_states, indices, batch * seqlen)
+    return rearrange(output, '(b s) ... -> b s ...', b=batch)  # type: ignore

blockdiag_linear.py ADDED Viewed

	@@ -0,0 +1,73 @@

+# Adapted from https://github.com/HazyResearch/fly/tree/master/src/models/layers
+import math
+import torch
+import torch.nn as nn
+from einops import rearrange
+from .structured_linear import StructuredLinear
+from .blockdiag_multiply import blockdiag_multiply
+class BlockdiagLinear(StructuredLinear):
+    def __init__(self, *args, nblocks=4, shuffle=False, **kwargs):
+        """shuffle: apply channel_shuffle operation before the matmul as in ShuffleNet
+        """
+        super().__init__(*args, **kwargs)
+        in_blksz = int(math.ceil(self.in_features / nblocks))
+        out_blksz = int(math.ceil(self.out_features / nblocks))
+        self.in_features_extended = in_blksz * nblocks
+        self.out_features_extended = out_blksz * nblocks
+        self.shuffle = shuffle
+        self.weight = nn.Parameter(torch.empty(nblocks, out_blksz, in_blksz))
+        self.reset_parameters()
+    def set_weights_from_dense_init(self, dense_init_fn_):
+        dense_weight = torch.empty(self.out_features_extended, self.in_features_extended,
+                                   device=self.weight.device, dtype=self.weight.dtype)
+        dense_init_fn_(dense_weight)
+        # Scale by sqrt because the weight is sparse
+        scaling = math.sqrt(dense_weight.numel() / self.weight.numel())
+        dense_weight *= scaling
+        with torch.no_grad():
+            nblocks = self.weight.shape[0]
+            self.weight.copy_(rearrange(dense_weight, '(b o) (b1 i) -> b b1 o i',
+                                        b=nblocks, b1=nblocks)[0])
+    @property
+    def saving(self):
+        return self.weight.numel() / (self.in_features * self.out_features)
+    def forward_matmul(self, x):
+        x = self.preprocess(x)
+        if self.shuffle:
+            x = rearrange(x, '... (group c_per_group) -> ... (c_per_group group)',
+                          group=self.weight.shape[0])  # group=nblocks
+        output = blockdiag_multiply(x, self.weight)
+        return self.postprocess(output)
+class BlockdiagSparsityConfig:
+    def __init__(self, nblocks, block=32, global_size=0):
+        """shuffle: apply channel_shuffle operation before the matmul as in ShuffleNet
+        """
+        self.nblocks = nblocks
+        self.block = block
+        self.global_size = global_size
+    def make_layout(self, out_features, in_features):
+        assert out_features % self.block == 0 and in_features % self.block == 0
+        assert out_features % self.nblocks == 0 and in_features % self.nblocks == 0
+        layout = torch.block_diag(*[torch.ones(out_features // self.nblocks,
+                                               in_features // self.nblocks,
+                                               dtype=torch.int32)] * self.nblocks)
+        if self.global_size > 0:
+            layout[:self.global_size] = 1
+            layout[:, :self.global_size] = 1
+        # Convert from (out_features, in_features) mask to
+        # (out_features // block, in_features // block) mask
+        layout = rearrange(layout, '(p blksz) (r blksz1) -> p r (blksz blksz1)',
+                           blksz=self.block, blksz1=self.block)
+        return (layout > 0).any(dim=-1).int()

blockdiag_multiply.py ADDED Viewed

	@@ -0,0 +1,82 @@

+# Adapted from https://github.com/HazyResearch/fly/tree/master/src/models/layers
+import numpy as np
+import torch
+from torch.nn import functional as F
+from einops import rearrange
+def blockdiag_weight_to_dense_weight(weight):
+    """
+    Argumments:
+        weight: (nblocks, out / nblocks, in / blocks)
+    Return:
+        dense_weight: (out / in)
+    """
+    return torch.block_diag(*torch.unbind(weight, dim=0))
+def blockdiag_multiply_reference(x, weight):
+    """
+    This implementation is slow but more likely to be correct.
+    Arguments:
+        x: (..., n)
+        weight: (nblocks, q, n / nblocks)
+    Outputs:
+        out: (..., nblocks * q)
+    """
+    n = x.shape[-1]
+    nblocks, q, p = weight.shape
+    assert nblocks * p == n
+    x_reshaped = rearrange(x, '... (nblocks p) -> ... nblocks p', nblocks=nblocks)
+    return rearrange(torch.einsum('...kp, kqp -> ...kq', x_reshaped, weight),
+                     '... nblocks q -> ... (nblocks q)')
+class BlockdiagMultiply(torch.autograd.Function):
+    """This is a faster implementation, with careful memory copies for the fastest
+    bmm performance.
+    The backward pass is also written manually with careful memory copies.
+    Arguments:
+        x: (..., n)
+        weight: (nblocks, q, n / nblocks)
+    Outputs:
+        out: (..., nblocks * q)
+    """
+    @staticmethod
+    @torch.cuda.amp.custom_fwd(cast_inputs=torch.bfloat16)
+    def forward(ctx, x, weight):
+        ctx.save_for_backward(x, weight)
+        batch_shape, n = x.shape[:-1], x.shape[-1]
+        batch_dim = np.prod(batch_shape)
+        nblocks, q, p = weight.shape
+        assert nblocks * p == n
+        x_reshaped = x.reshape(batch_dim, nblocks, p).transpose(0, 1)
+        out = torch.empty(batch_dim, nblocks, q, device=x.device, dtype=x.dtype).transpose(0, 1)
+        out = torch.bmm(x_reshaped, weight.transpose(-1, -2), out=out).transpose(0, 1)
+        return out.reshape(*batch_shape, nblocks * q)
+    @staticmethod
+    @torch.cuda.amp.custom_bwd
+    def backward(ctx, dout):
+        x, weight = ctx.saved_tensors
+        batch_shape, n = x.shape[:-1], x.shape[-1]
+        batch_dim = np.prod(batch_shape)
+        nblocks, q, p = weight.shape
+        assert nblocks * p == n
+        dx, dweight = None, None
+        dout_reshaped = dout.reshape(batch_dim, nblocks, q).transpose(0, 1)
+        if ctx.needs_input_grad[0]:
+            dx = torch.empty(batch_dim, nblocks, p, device=x.device, dtype=x.dtype)
+            dx = torch.bmm(dout_reshaped, weight.conj(),
+                           out=dx.transpose(0, 1)).transpose(0, 1).reshape(*batch_shape, n)
+        if ctx.needs_input_grad[1]:
+            x_reshaped = x.reshape(batch_dim, nblocks, p).transpose(0, 1)
+            dweight = torch.bmm(dout_reshaped.transpose(-1, -2), x_reshaped.conj())
+        return dx, dweight
+blockdiag_multiply = BlockdiagMultiply.apply

config.json CHANGED Viewed

@@ -1,4 +1,48 @@
 {
-    "model_type": "m2_bert"
 }

 {
+    "_name_or_path": "togethercomputer/m2-bert-80M-32k-retrieval",
+    "alibi_starting_size": 32768,
+    "architectures": [
+        "BertForSequenceClassification"
+    ],
+    "attention_probs_dropout_prob": 0.0,
+    "bidirectional": true,
+    "auto_map": {
+        "AutoConfig": "configuration_bert.BertConfig",
+        "AutoModelForSequenceClassification": "bert_layers.BertForTextEncoding",
+        "AutoTokenizer": "bert-base-uncased"
+    },
+    "classifier_dropout": null,
+    "gradient_checkpointing": false,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 768,
+    "hyena_emb_dim": 5,
+    "hyena_filter_dropout": 0.2,
+    "hyena_filter_order": 128,
+    "hyena_lr_pos_emb": 1e-05,
+    "hyena_training_additions": false,
+    "hyena_w": 10,
+    "hyena_w_mod": 1,
+    "hyena_wd": 0.1,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-12,
+    "long_conv_kernel_learning_rate": 0.001,
+    "long_conv_l_max": 32768,
+    "max_position_embeddings": 32768,
+    "model_type": "bert",
+    "monarch_mlp_nblocks": 4,
+    "num_attention_heads": 12,
+    "num_hidden_layers": 12,
+    "pad_token_id": 0,
+    "pool_all": false,
+    "position_embedding_type": "absolute",
+    "residual_long_conv": true,
+    "transformers_version": "4.28.1",
+    "type_vocab_size": 2,
+    "use_cache": true,
+    "use_glu_mlp": true,
+    "use_monarch_mlp": true,
+    "use_positional_encodings": true,
+    "vocab_size": 30528
 }

configuration_bert.py ADDED Viewed

	@@ -0,0 +1,75 @@

+from transformers import BertConfig
+class BertConfig(BertConfig):
+    def __init__(
+        self,
+        alibi_starting_size: int = 512,
+        attention_probs_dropout_prob: float = 0.0,
+        # mlp
+        use_glu_mlp: bool = True,
+        use_monarch_mlp: bool = False,
+        monarch_mlp_nblocks: int = 4,
+        # position
+        use_positional_encodings: bool = False,
+        max_position_embeddings: int = 512,
+        # architecture selection
+        residual_long_conv: bool = False,
+        # hyena and long conv hyperparameters
+        bidirectional: bool = True,
+        hyena_w_mod: int = 1,
+        hyena_filter_dropout: float = 0.2,
+        hyena_filter_order: int = 64,
+        hyena_training_additions: bool = False,
+        # efficiency
+        use_flash_mm: bool = False,
+        # average pooling instead of CLS token
+        pool_all: bool = False,
+        **kwargs,
+    ):
+        """Configuration class for MosaicBert.
+        Args:
+            alibi_starting_size (int): Use `alibi_starting_size` to determine how large of an alibi tensor to
+                create when initializing the model. You should be able to ignore this parameter in most cases.
+                Defaults to 512.
+            attention_probs_dropout_prob (float): By default, turn off attention dropout in Mosaic BERT.
+                Defaults to 0.0.
+        """
+        super().__init__(
+            attention_probs_dropout_prob=attention_probs_dropout_prob, **kwargs)
+        self.alibi_starting_size = alibi_starting_size
+        # mlp
+        self.use_glu_mlp = use_glu_mlp
+        self.use_monarch_mlp = use_monarch_mlp
+        self.monarch_mlp_nblocks = monarch_mlp_nblocks
+        # positional encodings
+        self.use_positional_encodings = use_positional_encodings
+        self.max_position_embeddings = max_position_embeddings
+        # architecture
+        self.residual_long_conv = residual_long_conv
+        # hyena and long conv hyperparameters
+        self.bidirectional = bidirectional
+        self.hyena_w_mod = hyena_w_mod
+        self.hyena_filter_dropout = hyena_filter_dropout
+        self.hyena_filter_order = hyena_filter_order
+        self.hyena_training_additions = hyena_training_additions
+        # efficiency
+        self.use_flash_mm = use_flash_mm
+        # average pooling instead of CLS token
+        self.pool_all = pool_all

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+    "_from_model_config": true,
+    "transformers_version": "4.28.1",
+    "use_cache": false,
+    "eos_token_id": [0, 50278]
+  }

hyena_utils.py ADDED Viewed

	@@ -0,0 +1,259 @@

+# Copyright (c) 2023, Dan Fu and Simran Arora.
+# Adapted from https://github.com/HazyResearch/safari/blob/main/src/models/sequence/hyena.py
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+import opt_einsum as oe
+contract = oe.contract
+""" Utils for the training loop. Copied from https://github.com/HazyResearch/transformers/blob/master/src/utils/utils.py """
+class OptimModule(nn.Module):
+    """ Interface for Module that allows registering buffers/parameters with configurable optimizer hyperparameters """
+    def register(self, name, tensor, lr=None, wd=0.0):
+        """Register a tensor with a configurable learning rate and 0 weight decay"""
+        if lr == 0.0:
+            self.register_buffer(name, tensor)
+        else:
+            self.register_parameter(name, nn.Parameter(tensor))
+            optim = {}
+            if lr is not None: optim["lr"] = lr
+            if wd is not None: optim["weight_decay"] = wd
+            setattr(getattr(self, name), "_optim", optim)
def fftconv_ref(u, k, D, dropout_mask, gelu=True, k_rev=None):
    """Reference FFT long convolution with a skip term.

    The signal ``u`` (shape ``B H L`` per the original note) and the kernel
    ``k`` are transformed with an FFT of size ``2 * L``, multiplied in the
    frequency domain, and the first ``L`` output positions are kept. ``u * D``
    is then added as a skip connection.

    Args:
        u: input signal; its last dimension is the sequence length.
        k: convolution kernel; the computation runs in ``k``'s dtype.
        D: skip weight, multiplied elementwise with ``u``.
        dropout_mask: optional mask, rearranged ``"b H -> b H 1"`` and
            multiplied into the result; ``None`` disables it.
        gelu: apply GELU to the result before masking.
        k_rev: optional second kernel whose conjugated spectrum is added to
            the spectrum of ``k``.

    Returns:
        Tensor with the dtype of ``u``.
    """
    length = u.shape[-1]
    n_fft = 2 * length

    # The 1/n scaling is applied to the kernel spectrum; irfft runs with norm="forward".
    kernel_freq = torch.fft.rfft(k, n=n_fft) / n_fft
    if k_rev is not None:
        kernel_freq = kernel_freq + (torch.fft.rfft(k_rev, n=n_fft) / n_fft).conj()
    signal_freq = torch.fft.rfft(u.to(dtype=k.dtype), n=n_fft)

    if u.dim() > 3:
        kernel_freq = kernel_freq.unsqueeze(1)

    conv = torch.fft.irfft(signal_freq * kernel_freq, n=n_fft, norm="forward")[..., :length]
    result = conv + u * D
    if gelu:
        result = F.gelu(result)
    if dropout_mask is not None:
        result = result * rearrange(dropout_mask, "b H -> b H 1")
    return result.to(dtype=u.dtype)
@torch.jit.script
def mul_sum(q, y):
    """Multiply ``q`` and ``y`` elementwise, then sum over dimension 1."""
    product = q * y
    return product.sum(dim=1)
class Sin(nn.Module):
    """Sine activation ``sin(w_mod * freq * x)`` with a per-feature frequency.

    Args:
        dim: number of features; ``freq`` has shape ``(1, dim)``.
        w: initial value of every entry of ``freq``.
        w_mod: fixed multiplier applied on top of ``freq``.
        train_freq: when true, ``freq`` is an ``nn.Parameter``; otherwise it
            is a fixed (non-trainable) tensor.
    """

    def __init__(self, dim, w=10, w_mod=1, train_freq=True):
        super().__init__()
        init_freq = w * torch.ones(1, dim)
        if train_freq:
            self.freq = nn.Parameter(init_freq)
        else:
            # A plain tensor attribute is ignored by .to()/.cuda()/.half(),
            # which leads to device/dtype mismatches in forward(). A buffer
            # follows the module; persistent=False keeps state_dict keys
            # exactly as they were (no "freq" entry for the frozen case).
            self.register_buffer("freq", init_freq, persistent=False)
        self.w_mod = w_mod

    def forward(self, x):
        """Return ``sin(w_mod * freq * x)``; ``freq`` broadcasts against ``x``."""
        return torch.sin(self.w_mod * self.freq * x)
class PositionalEmbedding(OptimModule):
    def __init__(self, emb_dim: int, seq_len: int, lr_pos_emb: float = 1e-5, **kwargs):
        """Complex exponential positional embeddings for Hyena filters.

        Builds ``z`` of shape ``(1, seq_len, 1 + 2 * bands)`` with
        ``bands = (emb_dim - 1) // 2``: a normalized time feature followed by
        the real and imaginary parts of ``bands`` complex exponentials.

        Args:
            emb_dim: requested embedding width; must be at least 1. With
                ``emb_dim == 1`` there are no bands and ``z`` holds only the
                time feature.
            seq_len: number of positions to precompute.
            lr_pos_emb: learning rate recorded for ``z``; ``0.0`` makes ``z``
                a buffer instead of a parameter.
            **kwargs: accepted and ignored.

        Raises:
            ValueError: if ``emb_dim`` is smaller than 1.
        """
        super().__init__()
        if emb_dim < 1:
            raise ValueError(f"emb_dim must be >= 1, got {emb_dim}")

        self.seq_len = seq_len
        # The time embedding fed to the filters is normalized so that t_f = 1
        t = torch.linspace(0, 1, self.seq_len)[None, :, None]  # 1, L, 1

        # Always bound: previously `bands` was only assigned when emb_dim > 1,
        # so emb_dim == 1 crashed with UnboundLocalError further down.
        bands = (emb_dim - 1) // 2

        # To compute the right embeddings we use the "proper" linspace
        t_rescaled = torch.linspace(0, seq_len - 1, seq_len)[None, :, None]
        w = 2 * math.pi * t_rescaled / seq_len  # 1, L, 1

        f = torch.linspace(1e-4, bands - 1, bands)[None, None]  # 1, 1, bands
        z = torch.exp(-1j * f * w)
        z = torch.cat([t, z.real, z.imag], dim=-1)
        self.register("z", z, lr=lr_pos_emb)
        self.register("t", t, lr=0.0)  # lr=0.0 -> stored as a buffer

    def forward(self, L):
        """Return ``(z, t)`` truncated to the first ``L`` positions."""
        return self.z[:, :L], self.t[:, :L]
class ExponentialModulation(OptimModule):
    """Scale a filter by a per-channel exponential decay envelope over time.

    ``deltas`` holds ``d_model`` decay rates spaced linearly between
    ``log(target) / slow_decay_pct`` and ``log(target) / fast_decay_pct``.
    It is registered with learning rate ``modulation_lr`` (a buffer when that
    is ``0.0``). Extra keyword arguments are accepted and ignored.
    """

    def __init__(
        self,
        d_model,
        fast_decay_pct=0.3,
        slow_decay_pct=1.5,
        target=1e-2,
        modulation_lr=0.0,
        shift: float = 0.0,
        **kwargs,
    ):
        super().__init__()
        self.shift = shift
        log_target = math.log(target)
        fastest = log_target / fast_decay_pct
        slowest = log_target / slow_decay_pct
        rates = torch.linspace(slowest, fastest, d_model)
        self.register("deltas", rates[None, None], lr=modulation_lr)

    def forward(self, t, x):
        """Return ``x * (exp(-t * |deltas|) + shift)``."""
        envelope = torch.exp(-t * self.deltas.abs()) + self.shift
        return x * envelope
class HyenaFilter(OptimModule):
    """Implicit long-convolution filter: an MLP over positional embeddings
    produces the kernel, which is applied to the input with an FFT convolution.
    """

    def __init__(
        self,
        d_model,
        emb_dim=3,  # dim of input to MLP, augments with positional encoding
        order=16,  # width of the implicit MLP
        seq_len=1024,
        lr=1e-3,
        lr_pos_emb=1e-5,
        dropout=0.0,
        w=1,  # frequency of periodic activations
        w_mod=1,  # non-learnable modification of w
        wd=0,  # weight decay of kernel parameters
        bias=True,
        num_inner_mlps=2,
        linear_mixer=False,
        modulate: bool = True,
        normalized=False,
        bidirectional=False,
        **kwargs,
    ):
        """
        Implicit long filter with modulation.
        Args:
            d_model: number of channels in the input
            emb_dim: dimension of the positional encoding (`emb_dim` - 1) // 2 is the number of bands
            order: width of the FFN
            seq_len: number of positions precomputed by the positional embedding
            lr: learning rate recorded on the forward filter's tensors
            lr_pos_emb: learning rate recorded on the positional embedding
            dropout: rate of `self.dropout` (the module is created but not applied here)
            w: initial frequency of the sine activation
            w_mod: fixed multiplier of the sine activation frequency
            wd: weight decay recorded on the forward filter's tensors
            bias: when false, the skip term is multiplied by zero in `forward`
            num_inner_mlps: number of inner linear layers inside filter MLP
            linear_mixer: when not `False`, the forward filter is a single bias-free linear layer
            modulate: apply the exponential modulation to generated filters
            normalized: L1-normalize generated filters along their last dimension
            bidirectional: also build and use a reverse filter
            **kwargs: forwarded to `ExponentialModulation`
        Note:
            filter_dropout is not implemented
        """
        super().__init__()
        self.d_model = d_model
        self.emb_dim = emb_dim
        self.seq_len = seq_len
        self.modulate = modulate
        self.use_bias = bias
        self.bidirectional = bidirectional
        self.bias = nn.Parameter(torch.randn(self.d_model))
        self.dropout = nn.Dropout(dropout)
        # One activation instance is shared by every layer of both MLPs.
        act = Sin(dim=order, w=w, w_mod=w_mod)
        assert (
            emb_dim % 2 != 0 and emb_dim >= 3
        ), "emb_dim must be odd and greater or equal to 3 (time, sine and cosine)"
        self.pos_emb = PositionalEmbedding(emb_dim, seq_len, lr_pos_emb)
        # uses a variable number of inner linear layers
        if linear_mixer is False:
            self.implicit_filter = self._build_mlp(emb_dim, order, d_model, num_inner_mlps, act)
        else:
            self.implicit_filter = nn.Sequential(
                nn.Linear(emb_dim, d_model, bias=False),
            )
        if self.bidirectional:
            self.implicit_filter_rev = self._build_mlp(emb_dim, order, d_model, num_inner_mlps, act)
        self.modulation = ExponentialModulation(d_model, **kwargs)
        self.normalized = normalized
        # NOTE(review): only the forward filter's tensors are tagged with
        # optimizer settings; `implicit_filter_rev` is left untagged exactly
        # as before -- confirm whether that is intended.
        for c in self.implicit_filter.children():
            for name, _ in c.state_dict().items():
                optim = {"weight_decay": wd, "lr": lr}
                setattr(getattr(c, name), "_optim", optim)

    @staticmethod
    def _build_mlp(emb_dim, order, d_model, num_inner_mlps, act):
        """Build the filter MLP: input layer, inner layers, bias-free output layer."""
        layers = [nn.Linear(emb_dim, order), act]
        for _ in range(num_inner_mlps):
            layers.append(nn.Linear(order, order))
            layers.append(act)
        layers.append(nn.Linear(order, d_model, bias=False))
        return nn.Sequential(*layers)

    def _generate(self, net, L):
        """Evaluate `net` on the first `L` positions, then modulate/normalize."""
        z, t = self.pos_emb(L)
        h = net(z)
        if self.modulate:
            h = self.modulation(t, h)
        if self.normalized:
            h = h / torch.norm(h, dim=-1, p=1, keepdim=True)
        return h

    @staticmethod
    def _to_channel_first(k):
        """Turn a generated `(1, L, d)` filter into the `(d, L)` kernel layout."""
        k = k[0] if isinstance(k, tuple) else k
        return k.transpose(-1, -2)[0]

    def filter(self, L, *args, **kwargs):
        """Generate the forward filter for `L` positions (extra arguments are ignored)."""
        return self._generate(self.implicit_filter, L)

    def filter_rev(self, L, *args, **kwargs):
        """Generate the reverse filter for `L` positions (extra arguments are ignored)."""
        return self._generate(self.implicit_filter_rev, L)

    def forward(self, x, L, k_fwd=None, k_rev=None, bias=None, *args, **kwargs):
        """Convolve `x` with the filter and add the skip term `x * bias`.

        Args:
            x: input whose last dimension is the sequence length and whose
                second-to-last dimension is the channel dimension.
            L: sequence length used to generate missing kernels.
            k_fwd: forward kernel laid out as `(d_model, L)`; generated when `None`.
            k_rev: reverse kernel in the same layout; generated when `None`
                and the filter is bidirectional.
            bias: skip weight broadcastable against `x`; defaults to
                `self.bias` aligned with the channel dimension.

        Returns:
            Tensor with the shape and dtype of `x`.
        """
        if k_fwd is None:
            # Generated filters come out as (1, L, d); the convolution takes
            # its FFT over the last axis, so the kernel must be (d, L).
            k_fwd = self._to_channel_first(self.filter(L))
        if self.bidirectional and k_rev is None:
            # Generated independently of k_fwd: a caller that passes only
            # k_fwd must not end up calling .flip() on None.
            k_rev = self._to_channel_first(self.filter_rev(L))
        # Ensure compatibility with filters that return a tuple
        k_fwd = k_fwd[0] if isinstance(k_fwd, tuple) else k_fwd
        if bias is None:
            # (d,) -> (d, 1) so the skip weight broadcasts over the sequence
            # axis instead of being matched against it.
            bias = self.bias[:, None]
        bias = bias if self.use_bias else 0 * bias
        if self.bidirectional:
            k_rev = k_rev[0] if isinstance(k_rev, tuple) else k_rev
            k = F.pad(k_fwd, (0, L)) + F.pad(k_rev.flip(-1), (L, 0))
        else:
            k = k_fwd
        y = fftconv_ref(
            x,
            k,
            bias,
            dropout_mask=None,
            gelu=False,
        )
        return y.to(dtype=x.dtype)

monarch_mixer_sequence_mixer.py ADDED Viewed

	@@ -0,0 +1,156 @@

+# Copyright (c) 2023, Dan Fu and Simran Arora.
+# Adapted from https://github.com/HazyResearch/safari/blob/main/src/models/sequence/hyena.py
+import torch.nn as nn
+from einops import rearrange
+import opt_einsum as oe
+contract = oe.contract
+from .hyena_utils import HyenaFilter
class MonarchMixerSequenceMixing(nn.Module):
    """Gated long-convolution sequence mixer.

    The input is projected to three streams, passed through a depthwise short
    convolution, gated, convolved with a ``HyenaFilter`` kernel, gated again
    and projected back. Optionally a second ``HyenaFilter`` is applied to the
    unprojected input and added as a residual.
    """

    def __init__(
        self,
        d_model,
        l_max=128,
        dropout=0.0,
        hyena_kernel_lr=None,
        bidirectional=False,
        hyena_lr_pos_emb=1e-5,
        hyena_w=10,
        hyena_w_mod=1,
        hyena_wd=0.1,
        hyena_emb_dim=3,
        hyena_filter_dropout=0.0,
        hyena_filter_order=16,
        residual_long_conv=False,
        hyena_training_additions=False,
    ):
        super().__init__()
        self.d_model = d_model
        self.l_max = l_max
        self.kernel_lr = hyena_kernel_lr
        self.channels = 1
        self.bidirectional = bidirectional
        self.residual_long_conv = residual_long_conv
        self.NUM_PROJECTIONS = 3

        # Configuration summary printed at construction time.
        print('-- Bidirectional:', self.bidirectional)
        print("-- Using Long Conv Residual:", self.residual_long_conv)
        print('-- Hyena w:', hyena_w)
        print('-- Hyena w mod:', hyena_w_mod)
        print(f"-- Hyena filter order: {hyena_filter_order}")
        print(f"-- Hyena filter dropout: {hyena_filter_dropout}")
        print(f"-- Hyena filter wd: {hyena_wd}")
        print(f"-- Hyena filter emb dim: {hyena_emb_dim}")
        print(f"-- Hyena filter lr: {hyena_kernel_lr}")
        print(f"-- Hyena filter lr pos emb: {hyena_lr_pos_emb}")

        # Both long-convolution filters share one configuration.
        filter_options = dict(
            order=hyena_filter_order,
            seq_len=self.l_max,
            dropout=hyena_filter_dropout,
            bidirectional=self.bidirectional,
            lr=hyena_kernel_lr,
            lr_pos_emb=hyena_lr_pos_emb,
            w=hyena_w,  # frequency of periodic activations
            w_mod=hyena_w_mod,
            wd=hyena_wd,  # weight decay of kernel parameters
            emb_dim=hyena_emb_dim,
        )
        self.filter_fn = HyenaFilter(self.d_model, **filter_options)
        if self.residual_long_conv:
            self.filter_fn2 = HyenaFilter(self.d_model, **filter_options)

        # Input / output projections.
        self.in_linear = nn.Linear(d_model, 3 * d_model)
        self.out_linear = nn.Linear(d_model, d_model)

        self.hyena_training_additions = hyena_training_additions
        if self.hyena_training_additions:
            self.act = nn.Identity()
            self.drop = nn.Dropout(dropout)
            self.layernorm = nn.LayerNorm(d_model)

        # Depthwise short convolution over all projected channels.
        conv_channels = self.d_model * self.NUM_PROJECTIONS
        self.short_filter = nn.Conv1d(
            in_channels=conv_channels,
            out_channels=conv_channels,
            kernel_size=3,
            groups=conv_channels,
            padding=2,
        )

    def _kernels(self, filter_module, L, device):
        """Return ``(k, k_rev)`` from ``filter_module`` in ``d l`` layout.

        ``k_rev`` is ``None`` unless the mixer is bidirectional.
        """
        fwd = rearrange(filter_module.filter(L, device=device), "c l d -> c d l")[0]  # `c` is always 1 by default
        if not self.bidirectional:
            return fwd, None
        rev = rearrange(filter_module.filter_rev(L, device=device), "c l d -> c d l")[0]  # `c` is always 1 by default
        return fwd, rev

    def forward(self, u, **kwargs):
        """Mix ``u`` (``B L H``) along the sequence axis.

        Returns:
            ``(y, None)`` where ``y`` is the output of ``out_linear``.
        """
        if self.hyena_training_additions:
            u = self.layernorm(u)

        L = u.size(-2)
        residual_input = u

        # in projection, then channel-first layout for the convolutions
        projected = rearrange(self.in_linear(u), "b l d -> b d l")

        # short filter; the padded output is cut back to the first L positions
        mixed = self.short_filter(projected)[..., :L]
        x1, x2, v = mixed.split(self.d_model, dim=1)

        v = v * x1
        if self.hyena_training_additions:
            v = self.drop(v)

        k, k_rev = self._kernels(self.filter_fn, L, projected.device)
        y = self.filter_fn(
            v, L, k_fwd=k, k_rev=k_rev, bias=self.filter_fn.bias[None, :, None]
        )

        if self.residual_long_conv:
            k2, k2_rev = self._kernels(self.filter_fn2, L, projected.device)
            yu = self.filter_fn2(
                residual_input.transpose(-1, -2),
                L,
                k_fwd=k2,
                k_rev=k2_rev,
                bias=self.filter_fn2.bias[None, :, None],
            )

        # post gating
        y = y * x2
        if self.residual_long_conv:
            y = y + yu

        y = y.transpose(-1, -2)
        if self.hyena_training_additions:
            y = self.drop(self.act(y))
        return self.out_linear(y), None

structured_linear.py ADDED Viewed

	@@ -0,0 +1,70 @@

+# Adapted from https://github.com/HazyResearch/fly/tree/master/src/models/layers
+import math
+from functools import partial
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import init
class StructuredLinear(nn.Module):
    """Base class for linear layers whose weight has a structured form.

    Subclasses provide ``forward_matmul`` and ``set_weights_from_dense_init``
    (and ``weight`` for ``convert_to_dense_weight``); this class supplies the
    bias, input padding / output cropping helpers and the forward pass.
    """

    def __init__(self, in_features, out_features, bias=True, device=None, dtype=None):
        """Subclasses should call reset_parameters
        """
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # Subclasses may override {in,out}_features_extended
        if not hasattr(self, 'in_features_extended'):
            self.in_features_extended = in_features
        if not hasattr(self, 'out_features_extended'):
            self.out_features_extended = out_features
        bias_param = None
        if bias:
            bias_param = nn.Parameter(torch.zeros(out_features, device=device, dtype=dtype))
        self.register_parameter('bias', bias_param)

    def reset_parameters(self) -> None:
        """Re-initialize the weights (Kaiming uniform, ``a=sqrt(5)``) and the bias."""
        dense_init = partial(init.kaiming_uniform_, a=math.sqrt(5))
        self.set_weights_from_dense_init(dense_init_fn_=dense_init)
        self.reset_parameters_bias()

    def set_weights_from_dense_init(self, dense_init_fn_):
        """Initialize the structured weight from a dense initializer (abstract)."""
        raise NotImplementedError

    def reset_parameters_bias(self):
        """Draw the bias uniformly from ``[-bound, bound]``, ``bound = 1/sqrt(len(bias))``."""
        if self.bias is None:
            return
        fan_in = self.bias.shape[-1]
        bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
        init.uniform_(self.bias, -bound, bound)

    @property
    def saving(self):
        """Abstract property; raises ``NotImplementedError`` here."""
        raise NotImplementedError

    def convert_to_dense_weight(self):
        """Return the transpose of ``forward_matmul`` applied to an identity matrix."""
        identity = torch.eye(
            self.in_features, device=self.weight.device, dtype=self.weight.dtype
        )
        return self.forward_matmul(identity).T

    def preprocess(self, x):
        """Zero-pad the last dimension of ``x`` up to ``in_features_extended``."""
        missing = self.in_features_extended - x.shape[-1]
        return F.pad(x, (0, missing)) if missing > 0 else x

    def postprocess(self, output):
        """Crop the last dimension of ``output`` down to ``out_features``."""
        if output.shape[-1] > self.out_features:
            return output[..., :self.out_features]
        return output

    def forward_matmul(self, x):
        """Apply the structured weight to ``x`` (abstract)."""
        raise NotImplementedError

    def forward(self, x):
        """Return ``forward_matmul(x)`` plus the bias, when there is one."""
        output = self.forward_matmul(x)
        if self.bias is None:
            return output
        # Convert bias to output.dtype in case of AMP, otherwise bias and activation will be in FP32
        return output + self.bias.to(dtype=output.dtype)