feat: selective activation checkpointing (#16), opened by Markus28

Files changed:
- configuration_bert.py (+21 -1)
- modeling_bert.py (+12 -8)
configuration_bert.py (CHANGED)

@@ -55,6 +55,10 @@ class JinaBertConfig(PretrainedConfig):
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
         window_size (`tuple`, *optional*, defaults to `(-1, -1)`): If not the default, use local attention
+        activation_checkpoint_lvl (`int`, *optional*, defaults to `100`): How many layers to activation-checkpoint.
+            If larger than 0, the MLP activation checkpointing level is expected to be 0 for the first
+            `activation_checkpoint_lvl` layers. The activation checkpointing will only come into effect
+            after `model.gradient_checkpointing_enable()` is called.
     """

     model_type = "bert"

@@ -86,6 +90,7 @@ class JinaBertConfig(PretrainedConfig):
         emb_pooler=None,
         classifier_dropout=None,
         num_loras=5,
+        activation_checkpoint_lvl=100,
         **kwargs,
     ):
         assert 'position_embedding_type' not in kwargs

@@ -95,6 +100,20 @@ class JinaBertConfig(PretrainedConfig):
         if mlp_type == 'fused_mlp' and hidden_act not in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]:
             raise ValueError('Fused MLP only supports approximate gelu')

+        if mlp_checkpoint_lvl != 0 and mlp_type != 'fused_mlp':
+            raise ValueError('MLP checkpointing only available for `fused_mlp`')
+
+        if activation_checkpoint_lvl > 0 and isinstance(mlp_checkpoint_lvl, int) and mlp_checkpoint_lvl > 0:
+            raise ValueError('Trying to use layer-wise activation checkpointing and MLP-checkpointing '
+                             'in every layer simultaneously. Either only use one of the techniques, '
+                             'or specify layer-wise MLP checkpointing.')
+        elif activation_checkpoint_lvl > 0 and isinstance(mlp_checkpoint_lvl, (list, tuple)):
+            for layer_idx, mlp_lvl in enumerate(mlp_checkpoint_lvl):
+                if layer_idx < activation_checkpoint_lvl and mlp_lvl > 0:
+                    raise ValueError(f'Layer {layer_idx} is being checkpointed as a whole and its MLP '
+                                     f'is being checkpointed. Either remove MLP checkpointing for this layer '
+                                     f'or reduce the `activation_checkpoint_lvl` appropriately.')
+
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers

@@ -118,4 +137,5 @@ class JinaBertConfig(PretrainedConfig):
         self.use_qk_norm = use_qk_norm
         self.emb_pooler = emb_pooler
         self.classifier_dropout = classifier_dropout
-        self.num_loras = num_loras
+        self.num_loras = num_loras
+        self.activation_checkpoint_lvl = activation_checkpoint_lvl
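The new validation ties `activation_checkpoint_lvl` to the existing `mlp_checkpoint_lvl` option: a layer that is checkpointed as a whole must not also checkpoint its MLP. Below is a condensed, standalone sketch of that constraint; the `check` helper and the example values are illustrative only and not part of the PR.

```python
# Illustrative only: mirrors the constraint enforced in JinaBertConfig.__init__ above.
def check(activation_checkpoint_lvl, mlp_checkpoint_lvl):
    # A single int applies MLP checkpointing to every layer, which clashes with
    # whole-layer checkpointing of the first `activation_checkpoint_lvl` layers.
    if activation_checkpoint_lvl > 0 and isinstance(mlp_checkpoint_lvl, int) and mlp_checkpoint_lvl > 0:
        return "rejected: both techniques applied to the same layers"
    # A per-layer list is fine as long as its first `activation_checkpoint_lvl` entries are 0.
    if activation_checkpoint_lvl > 0 and isinstance(mlp_checkpoint_lvl, (list, tuple)):
        for layer_idx, mlp_lvl in enumerate(mlp_checkpoint_lvl):
            if layer_idx < activation_checkpoint_lvl and mlp_lvl > 0:
                return f"rejected: layer {layer_idx} is double-checkpointed"
    return "accepted"

print(check(4, 1))                   # rejected: scalar MLP checkpointing in every layer
print(check(4, [1] + [0] * 11))      # rejected: layer 0 is double-checkpointed
print(check(4, [0] * 4 + [1] * 8))   # accepted: MLP checkpointing only from layer 4 on
```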
modeling_bert.py (CHANGED)

@@ -180,13 +180,17 @@ class BertEncoder(nn.Module):
             [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
         )
         self._grad_checkpointing = False
+        self._num_checkpointed_layers = config.activation_checkpoint_lvl

     @property
     def gradient_checkpointing(self):
         return self._grad_checkpointing

     @gradient_checkpointing.setter
-    def gradient_checkpointing(self, value):
+    def gradient_checkpointing(self, value: bool):
+        if value and self._num_checkpointed_layers <= 0:
+            raise ValueError('Trying to use activation checkpointing, but `activation_checkpoint_lvl` '
+                             'is set to zero.')
         self._grad_checkpointing = value

     def forward(self, hidden_states, key_padding_mask=None, subset_mask=None):

@@ -198,8 +202,8 @@ class BertEncoder(nn.Module):
             mixer_kwargs = (
                 {"key_padding_mask": key_padding_mask.bool()} if key_padding_mask is not None else None
             )
-            for layer in self.layers:
-                if self._grad_checkpointing:
+            for idx, layer in enumerate(self.layers):
+                if self._grad_checkpointing and idx < self._num_checkpointed_layers:
                     hidden_states = torch.utils.checkpoint.checkpoint(
                         layer,
                         hidden_states,

@@ -217,8 +221,8 @@ class BertEncoder(nn.Module):
             )
             mixer_kwargs = {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen_in_batch}
             if subset_mask is None:
-                for layer in self.layers:
-                    if self._grad_checkpointing:
+                for idx, layer in enumerate(self.layers):
+                    if self._grad_checkpointing and idx < self._num_checkpointed_layers:
                         hidden_states = torch.utils.checkpoint.checkpoint(
                             layer,
                             hidden_states,

@@ -229,8 +233,8 @@ class BertEncoder(nn.Module):
                         hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
                 hidden_states = pad_input(hidden_states, indices, batch, seqlen)
             else:
-                for layer in self.layers[:-1]:
-                    if self._grad_checkpointing:
+                for idx, layer in enumerate(self.layers[:-1]):
+                    if self._grad_checkpointing and idx < self._num_checkpointed_layers:
                         hidden_states = torch.utils.checkpoint.checkpoint(
                             layer,
                             hidden_states,

@@ -264,7 +268,7 @@ class BertEncoder(nn.Module):
                     "cu_seqlens_k": cu_seqlens,
                     "max_seqlen_k": max_seqlen_in_batch,
                 }
-                if self._grad_checkpointing:
+                if self._grad_checkpointing and len(self.layers) <= self._num_checkpointed_layers:
                     torch.utils.checkpoint.checkpoint(
                         self.layers[-1],
                         hidden_states_subset,