Set activation_checkpoint_lvl to 100 by default
configuration_bert.py (changed, +4 -3)
@@ -55,9 +55,10 @@ class JinaBertConfig(PretrainedConfig):
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
         window_size (`tuple`, *optional*, defaults to `(-1, -1)`): If not the default, use local attention
-        activation_checkpoint_lvl (`int`, *optional*, defaults to `
+        activation_checkpoint_lvl (`int`, *optional*, defaults to `100`): How many layers to activation-checkpoint.
             If larger than 0, the MLP activation checkpointing level is expected to be 0 for the first
-            `activation_checkpoint_lvl` layers.
+            `activation_checkpoint_lvl` layers. The activation checkpointing will only come into effect
+            after `model.gradient_checkpointing_enable()` is called.
     """
 
     model_type = "bert"
@@ -89,7 +90,7 @@ class JinaBertConfig(PretrainedConfig):
         emb_pooler=None,
         classifier_dropout=None,
         num_loras=5,
-        activation_checkpoint_lvl=
+        activation_checkpoint_lvl=100,
         **kwargs,
     ):
         assert 'position_embedding_type' not in kwargs