Set activation_checkpoint_lvl to 100 by default
configuration_bert.py (changed, +4 -3)
@@ -55,9 +55,10 @@ class JinaBertConfig(PretrainedConfig):
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
         window_size (`tuple`, *optional*, defaults to `(-1, -1)`): If not the default, use local attention
-        activation_checkpoint_lvl (`int`, *optional*, defaults to `
+        activation_checkpoint_lvl (`int`, *optional*, defaults to `100`): How many layers to activation-checkpoint.
             If larger than 0, the MLP activation checkpointing level is expected to be 0 for the first
-            `activation_checkpoint_lvl` layers.
+            `activation_checkpoint_lvl` layers. The activation checkpointing will only come into effect
+            after `model.gradient_checkpointing_enable()` is called.
     """
 
     model_type = "bert"
@@ -89,7 +90,7 @@ class JinaBertConfig(PretrainedConfig):
         emb_pooler=None,
         classifier_dropout=None,
         num_loras=5,
-        activation_checkpoint_lvl=
+        activation_checkpoint_lvl=100,
         **kwargs,
     ):
         assert 'position_embedding_type' not in kwargs