jupyterjazz committed
Commit eefe43c
1 Parent(s): 6cc0f51
poc

Signed-off-by: jupyterjazz <[email protected]>
- embedding.py +1 -2
- mha.py +5 -3
- mlp.py +2 -2
- modeling_lora.py +33 -35
- modeling_xlm_roberta.py +1 -1
embedding.py
CHANGED
@@ -47,7 +47,6 @@ class XLMRobertaEmbeddings(nn.Module):
             token_type_ids: (batch, seqlen)
         """
         batch_size, seqlen = input_ids.shape
-        print('input shape', input_ids.shape)
         embeddings = self.word_embeddings(input_ids, task='sts')
         if self.max_position_embeddings > 0:
             if position_ids is None:
@@ -58,6 +57,6 @@ class XLMRobertaEmbeddings(nn.Module):
         if self.type_vocab_size > 0:
             if token_type_ids is None:
                 token_type_ids = torch.zeros(seqlen, dtype=torch.long, device=input_ids.device)
-            token_type_embeddings = self.token_type_embeddings(token_type_ids)
+            token_type_embeddings = self.token_type_embeddings(token_type_ids, task='sts')
             embeddings = embeddings + token_type_embeddings
         return embeddings
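The hard-coded task='sts' arguments above only take effect because Embedding.forward is monkey-patched further down (in modeling_lora.py) to look the ids up against a task-adapted weight table. As a rough, self-contained sketch of that lookup, with assumed adapter shapes and scaling (not the commit's exact lora_forward math):

import torch
from torch.nn import functional as F

vocab_size, dim, rank, num_tasks = 100, 8, 2, 4
weight = torch.randn(vocab_size, dim)                # base embedding table
lora_A = torch.zeros(num_tasks, rank, vocab_size)    # per-task adapters (assumed layout)
lora_B = torch.randn(num_tasks, dim, rank)

task_idx = 2                                         # e.g. 'sts' in an assumed adaptation map
adapted = weight + (lora_B[task_idx] @ lora_A[task_idx]).T   # task-adapted table
input_ids = torch.tensor([[1, 5, 7]])
embeddings = F.embedding(input_ids, adapted)         # what the patched forward effectively does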
mha.py
CHANGED
@@ -341,6 +341,7 @@ class LinearResidual(nn.Linear):
     """Wrap nn.Linear to return the residual as well. For compatibility with FusedDense."""
 
     def forward(self, input: torch.Tensor, task=None) -> torch.Tensor:
+        print('aq vafshe ar modis?')
         return super().forward(input, task=task), input
 
 
@@ -450,7 +451,7 @@ class MHA(nn.Module):
 
         if fused_bias_fc and FusedDense is None:
             raise ImportError("fused_dense is not installed")
-
+
         linear_cls = nn.Linear if not fused_bias_fc else FusedDense
         linear_resid_cls = (
             LinearResidual if not fused_bias_fc else partial(FusedDense, return_residual=True)
@@ -647,7 +648,8 @@ class MHA(nn.Module):
             if not self.return_residual:
                 qkv = self.Wqkv(x)
             else:
-                qkv, x = self.Wqkv(x, task='
+                qkv, x = self.Wqkv(x, task='query', residual=True)
+
             if self.dwconv:
                 qkv = rearrange(
                     self.dwconv_qkv(rearrange(qkv, "b s d -> b d s"))[..., :-2], "b d s -> b s d"
@@ -732,5 +734,5 @@ class MHA(nn.Module):
                 context = self._update_kvcache_attention(q, kv, inference_params)
             else:
                 context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params)
-        out = self.out_proj(rearrange(context, "... h d -> ... (h d)"))
+        out = self.out_proj(rearrange(context, "... h d -> ... (h d)"), task='passage')
         return out if not self.return_residual else (out, x)
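For context, the `qkv, x = self.Wqkv(x, task='query', residual=True)` pattern relies on the projection handing its input back so the caller can reuse it for the residual connection. A minimal sketch of that return-residual idea (illustrative only; the real LinearResidual defers to the patched, task-aware nn.Linear forward):

import torch
from torch import nn

class LinearResidualSketch(nn.Linear):
    """Return (output, input) so the caller keeps the input for the residual path."""

    def forward(self, input: torch.Tensor, task=None):
        # `task` is accepted for call-site compatibility; here it is ignored,
        # whereas the patched forward would use it to pick a LoRA adapter.
        return super().forward(input), input

proj = LinearResidualSketch(8, 3 * 8)        # e.g. a fused QKV projection
qkv, x = proj(torch.randn(2, 4, 8), task='query')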
mlp.py
CHANGED
@@ -48,9 +48,9 @@ class Mlp(nn.Module):
         self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)
 
     def forward(self, x):
-        y = self.fc1(x)
+        y = self.fc1(x, task='clustering')
         y = self.activation(y)
-        y = self.fc2(y)
+        y = self.fc2(y, task='sts')
         return y if not self.return_residual else (y, x)
 
 
modeling_lora.py
CHANGED
@@ -9,6 +9,7 @@ import torch
 import torch.nn.utils.parametrize as parametrize
 from torch import nn
 from torch.nn import Parameter
+from torch.nn import functional as F
 from transformers import PretrainedConfig
 
 from .modeling_xlm_roberta import XLMRobertaFlashConfig, XLMRobertaModel, XLMRobertaPreTrainedModel
@@ -98,8 +99,7 @@ class LoRAParametrization(nn.Module):
         # to mimic the original implementation: A @ dropout(x), we do (A * dropout(ones)) @ x
         return A * self.lora_dropout(self.lora_dropout_mask)
 
-    def lora_forward(self, X, current_task):
-        print('lora input shape', X.shape)
+    def lora_forward(self, X, current_task):
         return (
             X
             + torch.matmul(
@@ -114,10 +114,7 @@ class LoRAParametrization(nn.Module):
         )
 
     def forward(self, X):
-
-        out = self.forward_fn(X)
-        print(out.shape)
-        return out
+        return X
 
     @property
     def current_task(self):
@@ -195,13 +192,20 @@ class LoRAParametrization(nn.Module):
                 alpha=alpha,
             ),
         )
-        original_forward = layer.forward
 
-        def new_forward(self, input, task):
-
-
-
-
+        def new_forward(self, input, task, residual=False):
+            task_idx = adaptation_map[task] if task else None
+            if task_idx:
+                weights = self.parametrizations.weight[0].lora_forward(self.weight, current_task=task_idx)
+            else:
+                weights = self.weight
+
+            out = F.linear(input, weights, self.bias)
+
+            print('lin', task_idx, input.shape, out.shape)
+            if residual:
+                return out, input
+            return out
 
         layer.forward = new_forward.__get__(layer, layer.__class__)
 
@@ -217,20 +221,20 @@ class LoRAParametrization(nn.Module):
                 alpha=alpha,
             ),
         )
-        original_forward = layer.forward
 
         def new_forward(self, input, task):
-            print('input here', input, input.shape)
-            print('func', original_forward)
-            # original_forward['parametrizations'] = None
-            # print('funcc', original_forward.__dict__)
-            output = original_forward(input)
-            print(output.shape, 'output shape')
             task_idx = adaptation_map[task] if task else None
             if task_idx:
-
-
-
+                weights = self.parametrizations.weight[0].lora_forward(self.weight, current_task=task_idx)
+            else:
+                weights = self.weight
+
+            out = F.embedding(
+                input, weights, self.padding_idx, self.max_norm,
+                self.norm_type, self.scale_grad_by_freq, self.sparse)
+
+            print('emb', task_idx, input.shape, out.shape)
+            return out
 
         layer.forward = new_forward.__get__(layer, layer.__class__)
 
@@ -278,13 +282,7 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
         self._task_idx = None
         # By default, disable LoRA until it's specified which adapter/task to use
         self.current_task = None
-
-        if name == 'roberta.encoder.layers.22.mixer.Wqkv.parametrizations.weight.0.lora_A':
-            print('A0', param[0])
-            print('A1', param[1])
-        if name == 'roberta.encoder.layers.22.mixer.Wqkv.parametrizations.weight.0.lora_B':
-            print('B0', param[0])
-            print('B1', param[1])
+
 
     @property
     def main_params_trainable(self):
@@ -364,12 +362,12 @@ class XLMRobertaLoRA(XLMRobertaPreTrainedModel):
                 f"Alternatively, set `task` to `None` if you want to disable LoRA."
             )
             task_idx = self._adaptation_map[task_name] if task_name else None
-            if self._task_idx != task_idx:
-                # In this case, we need to update the LoRAs everywhere
-                self._task_idx = task_idx
-                self.apply(
-                    partial(LoRAParametrization.select_task_for_layer, task_idx=task_idx)
-                )
+            # if self._task_idx != task_idx:
+            #     # In this case, we need to update the LoRAs everywhere
+            #     self._task_idx = task_idx
+            #     self.apply(
+            #         partial(LoRAParametrization.select_task_for_layer, task_idx=task_idx)
+            #     )
 
     def forward(self, *args, task: Union[str, None] = LORA_NO_UPDATE, **kwargs):
         if task != LORA_NO_UPDATE:
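Put together, the patched new_forward boils down to: map the task string to an adapter index, materialize a task-specific weight via lora_forward, and call the functional op with it, optionally returning the input as a residual. A self-contained sketch of that flow, with assumed names, shapes, adaptation map, and scaling (the commit's lora_forward internals are not fully shown above):

import torch
from torch import nn
from torch.nn import functional as F

# Assumed mapping from task name to adapter index (illustrative, not the commit's map).
adaptation_map = {'query': 0, 'passage': 1, 'sts': 2, 'clustering': 3}

class TaskRoutedLinear(nn.Linear):
    """Illustrative stand-in for a LoRA-parametrized nn.Linear with a patched forward."""

    def __init__(self, in_features, out_features, num_tasks=4, rank=4, scaling=1.0, **kwargs):
        super().__init__(in_features, out_features, **kwargs)
        # One (A, B) adapter pair per task, stacked along the first dimension.
        self.lora_A = nn.Parameter(torch.zeros(num_tasks, rank, in_features))
        self.lora_B = nn.Parameter(torch.randn(num_tasks, out_features, rank) * 0.02)
        self.scaling = scaling

    def lora_forward(self, weight, current_task):
        # Task-adapted weight: W + scaling * B[t] @ A[t]
        return weight + self.scaling * (self.lora_B[current_task] @ self.lora_A[current_task])

    def forward(self, input, task=None, residual=False):
        # Per-call routing, mirroring the patched new_forward above.
        task_idx = adaptation_map[task] if task is not None else None
        weights = self.lora_forward(self.weight, task_idx) if task_idx is not None else self.weight
        out = F.linear(input, weights, self.bias)
        return (out, input) if residual else out

layer = TaskRoutedLinear(16, 16)
y, x = layer(torch.randn(2, 16), task='query', residual=True)

Routing per call, rather than switching the whole model to one task via self.apply(select_task_for_layer) (now commented out above), is what lets different sub-modules in the same forward pass use different adapters, as the hard-coded 'sts', 'query', 'passage', and 'clustering' labels in this proof of concept illustrate.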
modeling_xlm_roberta.py
CHANGED
@@ -313,7 +313,7 @@ class XLMRobertaPooler(nn.Module):
         # We "pool" the model by simply taking the hidden state corresponding
         # to the first token.
         first_token_tensor = hidden_states[:, 0] if pool else hidden_states
-        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.dense(first_token_tensor, task='passage')
         pooled_output = self.activation(pooled_output)
         return pooled_output
 