Add files using upload-large-folder tool
- config.json +6 -4
- config_molmo.py +6 -5
- model-00007-of-00007.safetensors +2 -2
- model.safetensors.index.json +3 -2
- modeling_molmo.py +34 -30
- preprocessing_molmo.py +1 -3
config.json
CHANGED
@@ -1,11 +1,11 @@
 {
-    "_name_or_path": "/data/chris/hf/7b-v3",
     "architectures": [
-        "MOLMoForCausalLM"
+        "MolmoForCausalLM"
     ],
+    "attention_layer_norm": false,
     "auto_map": {
         "AutoConfig": "config_molmo.MolmoConfig",
-        "AutoModelForCausalLM": "modeling_molmo.MOLMoForCausalLM"
+        "AutoModelForCausalLM": "modeling_molmo.MolmoForCausalLM"
     },
     "clip_qkv": null,
     "embedding_size": 152064,
@@ -13,8 +13,10 @@
     "initializer_range": 0.02,
     "intermediate_size": 37888,
     "layer_norm_eps": 1e-06,
+    "layer_norm_type": "rms",
     "max_position_embeddings": 4096,
     "model_type": "molmo",
+    "norm_after": false,
     "num_attention_heads": 28,
     "num_hidden_layers": 28,
     "num_key_value_heads": 4,
@@ -27,4 +29,4 @@
     "use_position_ids": true,
     "vocab_size": 152064,
     "weight_tying": false
-}
+}
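The added keys (`attention_layer_norm`, `layer_norm_type`, `norm_after`) surface architecture switches that the remote code reads at load time. A minimal sketch of inspecting them, assuming a local checkout of this repository at ./molmo (the path is a placeholder):

from transformers import AutoConfig

# trust_remote_code is required because auto_map points at config_molmo.MolmoConfig
config = AutoConfig.from_pretrained("./molmo", trust_remote_code=True)
print(config.layer_norm_type, config.norm_after, config.attention_layer_norm)
# expected with this config.json: rms False False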
config_molmo.py
CHANGED
@@ -26,6 +26,9 @@ class MolmoConfig(PretrainedConfig):
         weight_tying: bool = False,
         use_position_ids: bool=True,
         tie_word_embeddings: bool=True,
+        attention_layer_norm: bool=False,
+        norm_after: bool = False,
+        layer_norm_type: str="rms",
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -38,18 +41,16 @@ class MolmoConfig(PretrainedConfig):
         self.layer_norm_eps = layer_norm_eps
         self.weight_tying = weight_tying
         self.use_position_ids = use_position_ids
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
+        self.attention_layer_norm = attention_layer_norm
         self.num_key_value_heads = num_key_value_heads
         self.initializer_range = initializer_range
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.clip_qkv = clip_qkv
         self.qkv_bias = qkv_bias
+        self.norm_after = norm_after
         self.tie_word_embeddings = tie_word_embeddings
+        self.layer_norm_type = layer_norm_type
 
         super().__init__(
             tie_word_embeddings=tie_word_embeddings,
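Because the three new keyword arguments default to the same values written into config.json, configs that omit them still deserialize. A minimal sketch, assuming config_molmo.py is on the import path:

from config_molmo import MolmoConfig

# Omitted keys fall back to the new defaults
# (attention_layer_norm=False, norm_after=False, layer_norm_type="rms").
cfg = MolmoConfig()
assert cfg.layer_norm_type == "rms" and cfg.norm_after is False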
model-00007-of-00007.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2c84ff3f7adcfdf9eec4247291ca1fcad02cf7005c84801f31223711df54846a
+size 3799846968
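Only the LFS pointer changes here; after downloading, the shard can be checked against the new oid. A minimal verification sketch (the path is assumed to be the downloaded shard):

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Stream the file so the 3.8 GB shard is never held in memory at once.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Should print 2c84ff3f7adcfdf9eec4247291ca1fcad02cf7005c84801f31223711df54846a
print(sha256_of("model-00007-of-00007.safetensors"))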
model.safetensors.index.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size":
+    "total_size": 32084101120
   },
   "weight_map": {
     "model.transformer.blocks.0.att_proj.bias": "model-00001-of-00007.safetensors",
@@ -586,6 +586,7 @@
     "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.bias": "model-00007-of-00007.safetensors",
     "model.vision_backbone.image_vit.transformer.resblocks.9.feed_forward.w2.weight": "model-00007-of-00007.safetensors",
     "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.bias": "model-00007-of-00007.safetensors",
-    "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.weight": "model-00007-of-00007.safetensors"
+    "model.vision_backbone.image_vit.transformer.resblocks.9.ffn_norm.weight": "model-00007-of-00007.safetensors",
+    "model.vision_backbone.pad_embed": "model-00007-of-00007.safetensors"
   }
 }
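The index now records the shard that holds the newly listed `model.vision_backbone.pad_embed` tensor; loaders resolve tensors to shards through `weight_map`. A minimal lookup sketch, run from the repo directory:

import json

with open("model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])                         # 32084101120
print(index["weight_map"]["model.vision_backbone.pad_embed"])  # model-00007-of-00007.safetensors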
modeling_molmo.py
CHANGED
@@ -77,7 +77,7 @@ def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: b
     x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)
 
 
-class
+class MolmoConfigurationError(Exception):
     pass
 
 
@@ -189,7 +189,7 @@ class RotaryEmbedding(nn.Module):
         return q_.type_as(q), k_.type_as(k)
 
 
-class OLMoBlock(nn.Module):
+class MolmoBlock(nn.Module):
     """
     A base class for transformer block implementations.
     """
@@ -420,17 +420,17 @@ class OLMoBlock(nn.Module):
     @classmethod
     def build(cls, layer_id: int, config: MolmoConfig, cache: BufferCache):
         if config.block_type == "sequential":
-            return
+            return MolmoSequentialBlock(layer_id, config, cache)
         elif config.block_type == "llama":
             return OLMoLlamaBlock(layer_id, config, cache)
         else:
             raise NotImplementedError(f"Unknown block type: '{config.block_type}'")
 
 
-class OLMoLlamaBlock(OLMoBlock):
+class OLMoLlamaBlock(MolmoBlock):
     """
     This is a transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
-    (plus another skip connection). This block is similar to `
+    (plus another skip connection). This block is similar to `MolmoSequentialBlock`
     but some operations have slightly different implementations to imitate the
     behavior of Llama.
     """
@@ -598,7 +598,7 @@ class OLMoLlamaBlock(OLMoBlock):
         return x, cache
 
 
-class
+class MolmoSequentialBlock(MolmoBlock):
     """
     This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))``
     (plus another skip connection).
@@ -825,7 +825,6 @@ class VisionBackboneConfig:
 class FullMolmoConfig:
     d_model: int = 768
     n_heads: int = 12
-    head_dim: int = 64
     n_kv_heads: Optional[int] = None
     qkv_bias: bool = False
     clip_qkv: Optional[float] = None
@@ -908,7 +907,7 @@ class FullMolmoConfig:
         if self.n_kv_heads == n_kv_heads_should_be:
             return n_kv_heads_should_be
         else:
-            raise
+            raise MolmoConfigurationError(
                 "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
             )
 
@@ -1897,7 +1896,7 @@ class LayerNorm(LayerNormBase):
         return F.layer_norm(x, self.normalized_shape, weight=self.weight, bias=self.bias, eps=self.eps)
 
 
-class MOLMo(nn.Module):
+class Molmo(nn.Module):
     def __init__(self, config: FullMolmoConfig, init_params: bool = True):
         super().__init__()
         self.config = config
@@ -1906,7 +1905,7 @@ class MOLMo(nn.Module):
         # Validate config.
         if self.config.embedding_size is not None and self.config.embedding_size != self.config.vocab_size:
             if self.config.embedding_size < self.config.vocab_size:
-                raise
+                raise MolmoConfigurationError("embedding size should be at least as big as vocab size")
             elif self.config.embedding_size % 128 != 0:
                 import warnings
 
@@ -1939,7 +1938,7 @@ class MOLMo(nn.Module):
             )
         )
 
-        blocks = [
+        blocks = [MolmoBlock.build(i, config, self.__cache) for i in range(config.n_layers)]
         if self.config.block_group_size > 1:
             raise NotImplementedError()
         else:
@@ -2018,16 +2017,20 @@ class MOLMo(nn.Module):
             which input IDs are masked. A `1` value in the mask means that
             the corresponding input ID should *not* be ignored. A `0` means
             that the corresponding input ID is masked.
+
             This has the same meaning as the `attention_mask` in HuggingFace's `transformers`
             library.
         :param attention_bias: A tensor of shape `(batch_size, 1, seq_len, seq_len)`,
             `(1, 1, seq_len, seq_len)`, or `(seq_len, seq_len)`. This is used
            to introduce causal or other biases.
+
             If the tensor is a bool or byte tensor, a `True` or `1` at `attention_bias[:, :, i, j]`
             indicates that the i-th element in the sequence is allowed to attend to the j-th
             element in the sequence.
+
             If the tensor is a float tensor, it will just be added to the attention
             scores before the softmax.
+
             The default is causal, which corresponds to a lower-diagonal byte matrix of ones.
         :param response_mask: A tensor of shape `(batch_size, seq_len)` that indicates
             the response mask. A `1` value in the mask means that the corresponding token
@@ -2258,20 +2261,24 @@ class MOLMo(nn.Module):
         return ModelOutput(logits=logits, attn_key_values=attn_key_values, hidden_states=tuple(all_hidden_states) if output_hidden_states else None)  # type: ignore[arg-type]
 
 
-class MOLMoForCausalLM(PreTrainedModel):
+class MolmoForCausalLM(PreTrainedModel):
     config_class = MolmoConfig
     base_model_prefix = "model"
-    _no_split_modules = ["
+    _no_split_modules = ["MolmoBlock"]
 
-    def __init__(self, config: MolmoConfig, model: Optional[
+    def __init__(self, config: MolmoConfig, model: Optional[Molmo] = None, init_params: bool = False):
         super().__init__(config)
 
         if not model:
             full_config = FullMolmoConfig(
+                attention_layer_norm=config.attention_layer_norm,
+                image_padding_embed="pad_and_partial_pad",
+                image_pooling_2d="attention-meanq",
                 rope_impl="llama",
                 vocab_size=config.vocab_size,
                 max_sequence_length=config.max_position_embeddings,
                 qkv_bias=config.qkv_bias,
+                norm_after=config.norm_after,
                 embedding_size=config.embedding_size,
                 attention_type="sdpa",
                 embedding_dropout=0,
@@ -2287,9 +2294,9 @@ class MOLMoForCausalLM(PreTrainedModel):
                 additional_vocab_size=128,
                 n_heads=config.num_attention_heads,
                 n_kv_heads=config.num_key_value_heads,
-                rope_theta=
-                layer_norm_eps=
-                layer_norm_type=
+                rope_theta=config.rope_theta,
+                layer_norm_eps=config.layer_norm_eps,
+                layer_norm_type=config.layer_norm_type,
                 pad_tokenizer=True,
                 vit_layers=[-2, -9],
                 vision_backbone=VisionBackboneConfig(
@@ -2312,7 +2319,7 @@ class MOLMoForCausalLM(PreTrainedModel):
                     initializer_range=0.02,
                 )
             )
-            self.model =
+            self.model = Molmo(full_config, init_params=init_params)
         else:
             self.model = model
 
@@ -2345,7 +2352,7 @@ class MOLMoForCausalLM(PreTrainedModel):
            use_cache = self.config.use_cache
 
         if output_attentions:
-            raise ValueError("output_attentions is not yet supported in
+            raise ValueError("output_attentions is not yet supported in Molmo")
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -2524,16 +2531,6 @@ class MOLMoForCausalLM(PreTrainedModel):
            model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens
         return model_kwargs
 
-    # TODO: these are required to make the implementation complete.
-    # def resize_position_embeddings(self, new_num_position_embeddings: int):
-    #     pass
-    #
-    # def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]:
-    #     pass
-    #
-    # def _reorder_cache(self, past_key_values, beam_idx):
-    #     pass
-
     def get_input_embeddings(self) -> torch.nn.Module:
         return self.model.transformer.wte
 
@@ -2555,11 +2552,13 @@ class MOLMoForCausalLM(PreTrainedModel):
     def tie_weights(self):
         """
         This function is intentionally left as a no-op.
+
         Weight tying is handled as follows:
         - When the model is initialized, the `ff_out` layer is conditionally defined based on the `weight_tying` configuration.
           See: `if not config.weight_tying: self.transformer.update(...)` in `olmo/model.py`.
         - When computing logits, the `wte` weights are used directly if `weight_tying` is enabled.
          See: `if self.config.weight_tying: logits = F.linear(x, self.transformer.wte.weight, None)` in the `forward` method.
+
         Therefore, there is no need to explicitly tie the weights in this function.
         """
         pass
@@ -2569,7 +2568,9 @@ class MOLMoForCausalLM(PreTrainedModel):
     ) -> torch.nn.Embedding:
         """
         Resizes input token embeddings matrix of the model if `new_num_tokens != config.embedding_size`.
+
         Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
+
         Arguments:
             new_num_tokens (`int`, *optional*):
                 The new number of tokens in the embedding matrix. Increasing the size will add newly initialized
@@ -2578,12 +2579,15 @@ class MOLMoForCausalLM(PreTrainedModel):
             pad_to_multiple_of (`int`, *optional*):
                 If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to
                 `None` will just pad the embedding to a multiple of `pad_to_multiple_of`.
+
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more
                 details about this, or help on choosing the correct value for resizing, refer to this guide:
                 https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc
+
         Return:
             `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
+
         Note:
             This method differs from the base class implementation by resizing the `embedding_size` attribute of the
             model configuration instead of the `vocab_size`. It also includes a warning if the resized `embedding_size`
@@ -2614,4 +2618,4 @@ class MOLMoForCausalLM(PreTrainedModel):
 
 
 # Always register for multi-modal features
-AutoModelForCausalLM.register(MolmoConfig, MOLMoForCausalLM)
+AutoModelForCausalLM.register(MolmoConfig, MolmoForCausalLM)
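With the classes renamed to `Molmo`/`MolmoForCausalLM` and registered via `AutoModelForCausalLM.register`, the model loads through the standard auto classes. A minimal sketch, assuming a local checkout of this repository at ./molmo (placeholder path; any repo id containing these files works):

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "./molmo",                # placeholder: local checkout of this repository
    trust_remote_code=True,   # resolves modeling_molmo.MolmoForCausalLM via auto_map
    torch_dtype=torch.bfloat16,
)
print(type(model).__name__)   # MolmoForCausalLM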
preprocessing_molmo.py
CHANGED
@@ -2,9 +2,7 @@
 Processor class for Molmo.
 """
 
-from typing import
-
-from transformers.utils.constants import OPENAI_CLIP_STD, OPENAI_CLIP_MEAN
+from typing import Optional
 
 try:
     from typing import Unpack