purunfer22 committed
Commit
eca0280
1 Parent(s): c7f670a

Changes in modelling_RW.py to handle past_key_values for faster model generation


The current code does not pass past_key_values through the forward passes used for fast token generation, which results in a lot of recompute. The modelling_RW.py I am uploading handles this the way the Hugging Face transformers package's generation/utils.py expects. All of the changes revolve around threading past_key_values through everywhere. I think this applies to all Falcon models. These are the changes, specifically:
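For context, a minimal usage sketch of what this enables (the checkpoint name is only an example; use_cache=True is what makes generate() feed past_key_values back into every forward pass):

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "tiiuae/falcon-7b"  # example checkpoint that ships a modelling_RW.py
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

inputs = tokenizer("The quick brown fox", return_tensors="pt")
# With past_key_values threaded through, each decoding step only runs attention
# for the newly generated token; the cached keys/values of the prefix are reused.
output_ids = model.generate(**inputs, max_new_tokens=50, use_cache=True)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))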

1) RotaryEmbedding forward method
Accept past_seq_length in the forward pass and apply the rotary embedding according to the absolute position of each query token ---- if/else condition added (lines 100-103)
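A standalone sketch of the idea, not the class itself, assuming cos/sin tables that already cover past_seq_length + q_len positions with shape [1, seq_len, head_dim]:

import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary(q, k, cos, sin, past_seq_length=None):
    # cos/sin cover positions [0, past_seq_length + q_len); when a cache exists,
    # slice them so the new tokens are rotated at their absolute positions
    # instead of starting again from position 0.
    if past_seq_length is not None:
        cos, sin = cos[:, past_seq_length:, :], sin[:, past_seq_length:, :]
    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)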

2) _make_causal_mask function
Build the mask the way F.scaled_dot_product_attention interprets a boolean attention_mask: True means a position takes part in attention, False means it is masked out. For example, if the attention_mask is [[True, False], [True, True]], the first token attends to the first token and not to the second. The original function used the opposite convention (True marked the positions to be masked out), so the past_key_values positions are now all set to True and the inequality is reversed for the same reason. ---- (line 114 inequality, line 117 attention mask set to True)
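A quick standalone check of that convention with toy shapes, showing that the lower-triangular all-True mask reproduces is_causal=True:

import torch
import torch.nn.functional as F

q = k = v = torch.randn(1, 1, 2, 4)          # [batch, heads, seq_len=2, head_dim=4]
mask = torch.tensor([[True, False],          # row i: which key positions query i may attend to
                     [True, True]])
out_masked = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
out_causal = F.scaled_dot_product_attention(q, k, v, is_causal=True)
assert torch.allclose(out_masked, out_causal)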

3) Attention forward method
a) The past key/value length is passed into the rotary function ---- if/else branch added (lines 271-277)
b) The past key is concatenated with the current key after permuting it to match the current key's shape ---- (lines 280-284)
c) To keep key_layer consistent with the expected cache layout, (batch_size * num_heads, head_dim, seq_length), another permutation is done before building "present" for the output ---- (lines 289-293); a shape sketch of (b) and (c) follows
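A standalone shape sketch of (b) and (c) with toy sizes (2 standing in for batch_size * num_kv, 64 for head_dim):

import torch

past_key  = torch.randn(2, 64, 5)   # cached layout:  [batch*heads, head_dim, past_length]
key_layer = torch.randn(2, 1, 64)   # current step:   [batch*heads, q_length, head_dim]

# b) permute the cached key into the current layout before concatenating on the sequence axis
key_layer = torch.cat((past_key.permute(0, 2, 1), key_layer), dim=1)   # [2, 6, 64]

# c) permute back so the cache returned in "present" keeps the [batch*heads, head_dim, kv_length] layout
present_key = key_layer.permute(0, 2, 1)
print(key_layer.shape, present_key.shape)   # torch.Size([2, 6, 64]) torch.Size([2, 64, 6])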

4) RWModel prepare_attn_mask
Removed the src_length > 1 condition for building the causal mask, so a mask covering the cached positions is also built during single-token decode steps (line 554).
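To see why the guard had to go, here is a standalone replica of the patched mask logic run for a decode step (target_length 1, five cached tokens; batch expansion omitted):

import torch

def make_causal_mask(target_length, past_key_values_length):
    mask = torch.empty((target_length, target_length + past_key_values_length), dtype=torch.bool)
    seq_ids = torch.arange(target_length)
    mask[:, past_key_values_length:] = seq_ids[:, None] >= seq_ids[None, :]
    if past_key_values_length > 0:
        mask[:, :past_key_values_length] = True
    return mask[None, None, :, :]

# Decode step with a cache: src_length == 1 but the mask still has to span the cache.
print(make_causal_mask(1, 5))   # shape [1, 1, 1, 6], all True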

5) RWForCausalLM prepare_inputs_for_generation
Read past_key_values from the kwargs passed in by the Hugging Face generate method and do not call the _convert_to_rw_cache method (lines 740-748).
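Condensed from the patch, the method now behaves like this sketch (the commented-out _convert_to_rw_cache branch is left out for brevity):

def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **kwargs) -> dict:
    # generate() hands the cache back in via kwargs["past_key_values"];
    # it is forwarded untouched (no _convert_to_rw_cache round-trip).
    past_key_values = kwargs.get("past_key_values", None)
    if past_key_values:
        # with a cache, only the newest token needs to go through the model
        input_ids = input_ids[:, -1].unsqueeze(-1)
    return {
        "input_ids": input_ids,
        "past_key_values": past_key_values,
        "use_cache": kwargs.get("use_cache"),
        "attention_mask": attention_mask,
    }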

Files changed (1)
  1. modelling_RW.py +72 -36
modelling_RW.py CHANGED
@@ -11,7 +11,9 @@ import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
 from torch.nn import functional as F
-
+import pdb
+import os
+import pickle
 from transformers.modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
@@ -87,10 +89,19 @@ class RotaryEmbedding(torch.nn.Module):
 
         return self.cos_cached, self.sin_cached
 
-    def forward(self, q, k):
-        batch, seq_len, head_dim = q.shape
+    def forward(self, q, k, past_seq_length=None):
+        if past_seq_length == None :
+            batch, seq_len, head_dim = q.shape
+        else :
+            # print("past_seq_length", past_seq_length)
+            batch, input_seq_len, head_dim = q.shape
+            seq_len = past_seq_length + input_seq_len
         cos, sin = self.cos_sin(seq_len, q.device, q.dtype)
-        return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
+        if past_seq_length != None :
+            return (q * cos[:, past_seq_length:, :]) + (rotate_half(q) * sin[:, past_seq_length:, :]), (k * cos[:, past_seq_length:, :]) + (rotate_half(k) * sin[:, past_seq_length:, :])
+        else :
+            return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
+
 
 
 def _make_causal_mask(
@@ -100,10 +111,10 @@ def _make_causal_mask(
     mask = torch.empty((target_length, target_length + past_key_values_length), dtype=torch.bool, device=device)
     # ONNX doesn't support `torch.Tensor.triu` properly, thus we use this workaround
     seq_ids = torch.arange(target_length, device=device)
-    mask[:, past_key_values_length:] = seq_ids[:, None] < seq_ids[None, :]
+    mask[:, past_key_values_length:] = seq_ids[:, None] >= seq_ids[None, :]
 
     if past_key_values_length > 0:
-        mask[:, :past_key_values_length] = False
+        mask[:, :past_key_values_length] = True
 
     expanded_mask = mask[None, None, :, :].expand(batch_size, 1, target_length, target_length + past_key_values_length)
     return expanded_mask
@@ -150,6 +161,10 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
     out = residual + out
     return out
 
+def dump_value(name, tensor) :
+    with open("/home/purushottam/inspect_falcon/{}".format(name), "wb") as f :
+        pickle.dump(tensor, f)
+
 
 class Attention(nn.Module):
     def __init__(self, config: RWConfig):
@@ -239,9 +254,8 @@ class Attention(nn.Module):
         use_cache: bool = False,
         output_attentions: bool = False,
     ):
+
         fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, 3 x hidden_size]
-
-        # 3 x [batch_size, seq_length, num_heads, head_dim]
         (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
 
         batch_size, q_length, _, _ = query_layer.shape
@@ -254,20 +268,27 @@ class Attention(nn.Module):
         )
         value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_kv, q_length, self.head_dim)
 
-        query_layer, key_layer = self.maybe_rotary(query_layer, key_layer)
+        if layer_past is not None :
+            past_key, past_value = layer_past
+            past_kv_length = past_key.shape[2]
+            query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length)
+        else :
+            query_layer, key_layer = self.maybe_rotary(query_layer, key_layer)
+
+
 
         if layer_past is not None:
             past_key, past_value = layer_past
-            # concatenate along seq_length dimension:
-            #  - key: [batch_size * self.num_heads, head_dim, kv_length]
-            #  - value: [batch_size * self.num_heads, kv_length, head_dim]
+            past_key = past_key.permute(0, 2, 1)
             key_layer = torch.cat((past_key, key_layer), dim=1)
             value_layer = torch.cat((past_value, value_layer), dim=1)
+
 
         _, kv_length, _ = key_layer.shape
 
         if use_cache is True:
-            present = (key_layer, value_layer)
+            key_layer_permute = key_layer.permute(0, 2, 1)
+            present = (key_layer_permute, value_layer)
         else:
             present = None
 
@@ -275,10 +296,16 @@ class Attention(nn.Module):
         query_layer_ = query_layer.reshape(batch_size, self.num_heads, -1, self.head_dim)
         key_layer_ = key_layer.reshape(batch_size, self.num_kv, -1, self.head_dim)
        value_layer_ = value_layer.reshape(batch_size, self.num_kv, -1, self.head_dim)
+
 
-        attn_output = F.scaled_dot_product_attention(
-            query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=True
-        )
+        if attention_mask is not None :
+            attn_output = F.scaled_dot_product_attention(
+                query_layer_, key_layer_, value_layer_, attention_mask, 0.0, is_causal=False
+            )
+        else :
+            attn_output = F.scaled_dot_product_attention(
+                query_layer_, key_layer_, value_layer_, None, 0.0, is_causal=True
+            )
 
         x = attn_output.view(batch_size, self.num_heads, q_length, self.head_dim)
         x = x.permute(0, 2, 1, 3)
@@ -475,8 +502,8 @@ class RWPreTrainedModel(PreTrainedModel):
     def _convert_to_rw_cache(
         past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]]
     ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
-        batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape
-        batch_size_times_num_heads = batch_size * num_heads
+        batch_size, seq_length, head_dim = past_key_value[0][0].shape
+        batch_size_times_num_heads = batch_size
         # key: [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length]
         # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim]
         return tuple(
@@ -488,6 +515,7 @@ class RWPreTrainedModel(PreTrainedModel):
         )
 
 
+
 class RWModel(RWPreTrainedModel):
     def __init__(self, config: RWConfig):
         super().__init__(config)
@@ -522,10 +550,11 @@ class RWModel(RWPreTrainedModel):
         device = attention_mask.device
         _, src_length = input_shape
 
-        if src_length > 1:
-            combined_attention_mask = _make_causal_mask(
-                input_shape, device=device, past_key_values_length=past_key_values_length
-            )
+
+        # if src_length > 1:
+        combined_attention_mask = _make_causal_mask(
+            input_shape, device=device, past_key_values_length=past_key_values_length
+        )
 
         # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length]
         expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length)
@@ -560,7 +589,7 @@ class RWModel(RWPreTrainedModel):
         )
         if len(deprecated_arguments) > 0:
             raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
-
+        # pdb.set_trace()
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -616,6 +645,7 @@ class RWModel(RWPreTrainedModel):
             input_shape=(batch_size, seq_length),
            past_key_values_length=past_key_values_length,
         )
+        # print("causal_mask", causal_mask)
 
         for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
 
@@ -646,16 +676,18 @@ class RWModel(RWPreTrainedModel):
             )
         else:
             outputs = block(
-                hidden_states,
-                layer_past=layer_past,
-                attention_mask=causal_mask,
-                head_mask=head_mask[i],
-                use_cache=use_cache,
-                output_attentions=output_attentions,
-                alibi=alibi,
-            )
+                hidden_states,
+                layer_past=layer_past,
+                attention_mask=causal_mask,
+                head_mask=head_mask[i],
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                alibi=alibi,
+            )
+
 
         hidden_states = outputs[0]
+
         if use_cache is True:
             presents = presents + (outputs[1],)
 
@@ -704,16 +736,20 @@ class RWForCausalLM(RWPreTrainedModel):
         **kwargs,
     ) -> dict:
         # only last token for input_ids if past is not None
-        if past:
+        # only last token for input_ids if past is not None
+        if kwargs.get("past_key_values", None) :
            input_ids = input_ids[:, -1].unsqueeze(-1)
-
+            past_key_values = kwargs["past_key_values"]
            # the cache may be in the stardard format (e.g. in contrastive search), convert to our's format if needed
-            if past[0][0].shape[0] == input_ids.shape[0]:
-                past = self._convert_to_rw_cache(past)
+            # if kwargs["past_key_values"][0][0].shape[0] == input_ids.shape[0]:
+            #     past_key_values = self._convert_to_rw_cache(kwargs["past_key_values"])
+            #     past_key_values = kwargs["past_key_values"]
+        else :
+            past_key_values = None
 
         return {
            "input_ids": input_ids,
-            "past_key_values": past,
+            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "attention_mask": attention_mask,
        }