Commit bc7df4b by Ubuntu · 1 parent: 7172b5f
.gitattributes CHANGED
@@ -1,35 +1,38 @@
  *.7z filter=lfs diff=lfs merge=lfs -text
  *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
  *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
  *.ftz filter=lfs diff=lfs merge=lfs -text
  *.gz filter=lfs diff=lfs merge=lfs -text
  *.h5 filter=lfs diff=lfs merge=lfs -text
  *.joblib filter=lfs diff=lfs merge=lfs -text
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
  *.model filter=lfs diff=lfs merge=lfs -text
  *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
  *.onnx filter=lfs diff=lfs merge=lfs -text
  *.ot filter=lfs diff=lfs merge=lfs -text
  *.parquet filter=lfs diff=lfs merge=lfs -text
  *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
  *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
  saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
+ *.tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.db* filter=lfs diff=lfs merge=lfs -text
+ *.ark* filter=lfs diff=lfs merge=lfs -text
+ **/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
+ **/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
+ **/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.gguf* filter=lfs diff=lfs merge=lfs -text
+ *.ggml filter=lfs diff=lfs merge=lfs -text
+ *.llamafile* filter=lfs diff=lfs merge=lfs -text
+ *.pt2 filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,51 @@
+ ---
+ frameworks:
+ - Pytorch
+ license: other
+ tasks:
+ - image-text-to-text
+ language:
+ - cn
+ - en
+ ---
+
+ # GLM-Edge-V-2B
+
+ Quick inference code:
+
+ ```python
+ import torch
+ from PIL import Image
+ from transformers import (
+     AutoTokenizer,
+     AutoImageProcessor,
+     AutoModelForCausalLM,
+ )
+
+ url = "img.png"
+ messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "describe this image"}]}]
+ image = Image.open(url)
+
+ model_dir = "THUDM/glm-edge-v-2b"
+
+ processor = AutoImageProcessor.from_pretrained(model_dir, trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_dir,
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+     trust_remote_code=True,
+ )
+
+ inputs = tokenizer.apply_chat_template(
+     messages, add_generation_prompt=True, return_dict=True, tokenize=True, return_tensors="pt"
+ ).to(next(model.parameters()).device)
+
+ generate_kwargs = {
+     **inputs,
+     "pixel_values": torch.tensor(processor(image).pixel_values).to(next(model.parameters()).device),
+ }
+ output = model.generate(**generate_kwargs, max_new_tokens=100)
+ print(tokenizer.decode(output[0][len(inputs["input_ids"][0]) :]))
+
+ ```
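The snippet above decodes the answer only after generation finishes. For interactive use, a token-by-token streaming variant can be built on `transformers.TextIteratorStreamer`; the sketch below is an untested adaptation that assumes the `model`, `tokenizer`, and `generate_kwargs` objects from the README example are already in scope.

```python
from threading import Thread

from transformers import TextIteratorStreamer

# Stream decoded text as it is generated instead of waiting for the full output.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so run it in a background thread and consume the streamer here.
thread = Thread(
    target=model.generate,
    kwargs={**generate_kwargs, "max_new_tokens": 100, "streamer": streamer},
)
thread.start()
for new_text in streamer:
    print(new_text, end="", flush=True)
thread.join()
```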
config.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "architectures": [
+     "GlmForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_glm.GlmConfig",
+     "AutoModel": "modeling_glm.GlmModel",
+     "AutoModelForCausalLM": "modeling_glm.GlmForCausalLM",
+     "AutoModelForSequenceClassification": "modeling_glm.GlmForSequenceClassification"
+   },
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "boi_token_id": 59256,
+   "eoi_token_id": 59257,
+   "eos_token_id": [
+     59246,
+     59253,
+     59255
+   ],
+   "head_dim": 128,
+   "hidden_act": "silu",
+   "hidden_size": 2048,
+   "initializer_range": 0.02,
+   "intermediate_size": 6144,
+   "max_position_embeddings": 4096,
+   "model_type": "glm",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 28,
+   "num_key_value_heads": 4,
+   "pad_token_id": 59246,
+   "partial_rotary_factor": 1.0,
+   "rms_norm_eps": 1e-05,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.47.0.dev0",
+   "use_cache": true,
+   "vision_config": {
+     "hidden_size": 1152,
+     "image_size": 672,
+     "intermediate_size": 4304,
+     "model_type": "siglip_vision_model",
+     "num_attention_heads": 16,
+     "num_hidden_layers": 27,
+     "patch_size": 14,
+     "torch_dtype": "bfloat16"
+   },
+   "vocab_size": 59264
+ }
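These values can be sanity-checked without downloading the 4 GB weight file by loading only the configuration. A minimal sketch, assuming the repository id from the README:

```python
from transformers import AutoConfig

# Resolves configuration_glm.GlmConfig through the auto_map entry; no weights are fetched.
config = AutoConfig.from_pretrained("THUDM/glm-edge-v-2b", trust_remote_code=True)

print(config.hidden_size)          # 2048
print(config.num_key_value_heads)  # 4 (grouped-query attention: 16 query heads share 4 KV heads)
print(config.vision_config)        # SigLIP vision-tower settings, kept here as a plain dict
```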
configuration_glm.py ADDED
@@ -0,0 +1,152 @@
+ # coding=utf-8
+ # Copyright 2024 The GLM & ZhipuAI team and HuggingFace Inc. team. All rights reserved.
+ #
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from transformers.configuration_utils import PretrainedConfig
+
+
+ class GlmConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`GlmModel`]. It is used to instantiate a Glm
+     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a similar configuration to that of the Glm-4-9b-chat.
+     e.g. [THUDM/glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat)
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+     Args:
+         vocab_size (`int`, *optional*, defaults to 151552):
+             Vocabulary size of the Glm model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`GlmModel`]
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 13696):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 40):
+             Number of hidden layers in the Transformer decoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         num_key_value_heads (`int`, *optional*, defaults to 2):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+             `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details checkout [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+             `num_attention_heads`.
+         head_dim (`int`, *optional*, defaults to 128):
+             The attention head dimension.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The legacy activation function. It is overwritten by the `hidden_activation`.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         max_position_embeddings (`int`, *optional*, defaults to 131072):
+             The maximum sequence length that this model might ever be used with.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1.5625e-07):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether to tie weight embeddings
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         pad_token_id (`int`, *optional*, defaults to 151329):
+             Padding token id.
+         eos_token_id (`int` | `list`, *optional*, defaults to `[151329, 151336, 151338]`):
+             End of stream token id.
+         bos_token_id (`int`, *optional*):
+             Beginning of stream token id.
+         attention_bias (`bool`, *optional*, defaults to `False`):
+             Whether to use a bias in the query, key, value and output projection layers during self-attention.
+         boi_token_id (`int`, *optional*, defaults to 151339):
+             Beginning of image token id.
+         eoi_token_id (`int` | `list`, *optional*, defaults to `[151339, 151346, 151348]`):
+             End of image token id.
+         partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+             The partial rotary factor.
+         vision_config (`VisionConfig`, *optional*, defaults to `None`):
+             The vision configuration object.
+     ```python
+     >>> from transformers import GlmModel, GlmConfig
+     >>> # Initializing a Glm glm-4-9b-chat style configuration
+     >>> configuration = GlmConfig()
+     >>> # Initializing a model from the glm-4-9b-chat style configuration
+     >>> model = GlmModel(configuration)
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "glm"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=65024,
+         hidden_size=4096,
+         intermediate_size=13696,
+         num_hidden_layers=28,
+         head_dim=128,
+         num_attention_heads=32,
+         max_position_embeddings=2048,
+         attention_dropout=0.0,
+         rms_norm_eps=1e-5,
+         attention_bias=False,
+         num_key_value_heads=1,
+         rope_theta=10000.0,
+         hidden_act="silu",
+         initializer_range=0.02,
+         use_cache=True,
+         tie_word_embeddings=False,
+         pad_token_id=59246,
+         bos_token_id=None,
+         eos_token_id=[59246, 59253, 59255],
+         boi_token_id=59256,
+         eoi_token_id=59257,
+         vision_config=None,
+         partial_rotary_factor=0.5,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.head_dim = head_dim
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.attention_bias = attention_bias
+         self.attention_dropout = attention_dropout
+         self.partial_rotary_factor = partial_rotary_factor
+         self.boi_token_id = boi_token_id
+         self.eoi_token_id = eoi_token_id
+         self.vision_config = vision_config
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+
+ __all__ = ["GlmConfig"]
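Note that the constructor defaults above still describe larger GLM-4 variants; the values actually used by this checkpoint come from `config.json`. A minimal offline sketch, assuming `configuration_glm.py` has been downloaded into the working directory:

```python
from configuration_glm import GlmConfig

# Reproduce the GLM-Edge-V-2B text-backbone settings from config.json.
config = GlmConfig(
    vocab_size=59264,
    hidden_size=2048,
    intermediate_size=6144,
    num_hidden_layers=28,
    num_attention_heads=16,
    num_key_value_heads=4,
    head_dim=128,
    max_position_embeddings=4096,
    partial_rotary_factor=1.0,
    tie_word_embeddings=True,
)
# "glm", and 4 query heads per key/value head (grouped-query attention).
print(config.model_type, config.num_attention_heads // config.num_key_value_heads)
```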
generation_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "_from_model_config": true,
+   "eos_token_id": [
+     59246,
+     59253,
+     59255
+   ],
+   "pad_token_id": 59246,
+   "transformers_version": "4.47.0.dev0"
+ }
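These defaults are picked up automatically by `model.generate()`; individual fields can still be overridden per call. A small sketch, assuming the `model` and `generate_kwargs` objects from the README example:

```python
# Per-call arguments take precedence over generation_config.json.
output = model.generate(
    **generate_kwargs,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.8,
    eos_token_id=[59246, 59253, 59255],  # same stop tokens as above, listed explicitly
)
```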
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3ba45bb26f4f23091df003d5b24c2bc9d9c8003f88c3370caad3974eb67fe254
+ size 4149810656
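The diff stores only the Git LFS pointer, not the ~4.1 GB weight file itself. One way to fetch the real file and verify it against the pointer's `oid sha256` is via `huggingface_hub` (a sketch):

```python
import hashlib

from huggingface_hub import hf_hub_download

# Download (or reuse from cache) the actual safetensors file behind the LFS pointer.
path = hf_hub_download("THUDM/glm-edge-v-2b", "model.safetensors")

# Compare against the pointer's oid sha256 to confirm the download is intact.
expected = "3ba45bb26f4f23091df003d5b24c2bc9d9c8003f88c3370caad3974eb67fe254"
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)
assert digest.hexdigest() == expected
```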
modeling_glm.py ADDED
@@ -0,0 +1,1327 @@
1
+ import math
2
+ from typing import List, Optional, Tuple, Union, Dict, Any
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from transformers.activations import ACT2FN
8
+ from transformers.cache_utils import Cache, DynamicCache, StaticCache
9
+ from transformers.generation import GenerationMixin
10
+ from transformers.generation.utils import ModelOutput
11
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
12
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
13
+ from transformers.modeling_outputs import (
14
+ BaseModelOutputWithPast,
15
+ CausalLMOutputWithPast,
16
+ SequenceClassifierOutputWithPast,
17
+ )
18
+ from transformers.modeling_utils import PreTrainedModel
19
+ from transformers.utils import (
20
+ add_start_docstrings,
21
+ add_start_docstrings_to_model_forward,
22
+ is_flash_attn_greater_or_equal_2_10,
23
+ logging,
24
+ replace_return_docstrings,
25
+ )
26
+ from transformers import __version__ as transformers_version
27
+
28
+ from .siglip import VisionModel
29
+ from .configuration_glm import GlmConfig
30
+
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+ _CHECKPOINT_FOR_DOC = "THUDM/glm-edge-v-2b"
35
+ _CONFIG_FOR_DOC = "GlmConfig"
36
+
37
+
38
+ class GlmRMSNorm(nn.Module):
39
+ def __init__(self, hidden_size, eps=1e-6):
40
+ """
41
+ GlmRMSNorm is equivalent to T5LayerNorm
42
+ """
43
+ super().__init__()
44
+ self.weight = nn.Parameter(torch.ones(hidden_size))
45
+ self.variance_epsilon = eps
46
+
47
+ def forward(self, hidden_states):
48
+ input_dtype = hidden_states.dtype
49
+ hidden_states = hidden_states.to(torch.float32)
50
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
51
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
52
+ return self.weight * hidden_states.to(input_dtype)
53
+
54
+ def extra_repr(self):
55
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
56
+
57
+
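`GlmRMSNorm` above normalizes each hidden vector by its root-mean-square rather than by mean and variance. A tiny numerical check (a sketch; it assumes the `GlmRMSNorm` definition above is already available in the current session, since this file's relative `siglip` import keeps it from being imported as a standalone script):

```python
import torch

norm = GlmRMSNorm(hidden_size=8, eps=1e-6)
x = torch.randn(2, 3, 8)

# Reference formula: x / sqrt(mean(x**2) + eps), scaled by a learned per-channel weight (ones at init).
ref = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
assert torch.allclose(norm(x), ref, atol=1e-6)
```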
58
+ class GlmRotaryEmbedding(nn.Module):
59
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
60
+ super().__init__()
61
+
62
+ self.dim = dim
63
+ self.max_position_embeddings = max_position_embeddings
64
+ self.base = base
65
+
66
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim))
67
+ self.register_buffer("inv_freq", tensor=inv_freq, persistent=False)
68
+
69
+ @torch.no_grad()
70
+ def forward(self, x, position_ids, seq_len=None):
71
+ # x: [bs, num_attention_heads, seq_len, head_size]
72
+ self.inv_freq.to(x.device)
73
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
74
+ position_ids_expanded = position_ids[:, None, :].float()
75
+ # Force float32 since bfloat16 loses precision on long contexts
76
+ # See https://github.com/huggingface/transformers/pull/29285
77
+ device_type = x.device.type
78
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
79
+ with torch.autocast(device_type=device_type, enabled=False):
80
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
81
+ emb = torch.cat((freqs, freqs), dim=-1)
82
+ cos = emb.cos()
83
+ sin = emb.sin()
84
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
85
+
86
+
87
+ class GlmMLP(nn.Module):
88
+ def __init__(self, config):
89
+ super().__init__()
90
+
91
+ self.config = config
92
+ self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
93
+ self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
94
+
95
+ self.activation_fn = ACT2FN[config.hidden_act]
96
+
97
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
98
+ up_states = self.gate_up_proj(hidden_states)
99
+
100
+ gate, up_states = up_states.chunk(2, dim=-1)
101
+ up_states = up_states * self.activation_fn(gate)
102
+
103
+ return self.down_proj(up_states)
104
+
105
+
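`GlmMLP` fuses the gate and up projections into a single `gate_up_proj` and splits the result in half before SiLU gating. A small instantiation sketch with toy sizes (the real checkpoint uses 2048/6144); only the three config fields the class reads are supplied via a stand-in namespace, and the class itself is assumed to be defined in the current session:

```python
import torch
from types import SimpleNamespace

# Stand-in config exposing just the attributes GlmMLP reads.
cfg = SimpleNamespace(hidden_size=32, intermediate_size=64, hidden_act="silu")

mlp = GlmMLP(cfg)
x = torch.randn(1, 4, cfg.hidden_size)
assert mlp(x).shape == (1, 4, cfg.hidden_size)
assert mlp.gate_up_proj.out_features == 2 * cfg.intermediate_size  # fused gate + up projection
```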
106
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
107
+ """
108
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
109
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
110
+ """
111
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
112
+ if n_rep == 1:
113
+ return hidden_states
114
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
115
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
116
+
117
+
118
+ def rotate_half(x):
119
+ """Rotates half the hidden dims of the input."""
120
+ x1 = x[..., 0::2]
121
+ x2 = x[..., 1::2]
122
+ return torch.stack((-x2, x1), dim=-1).flatten(-2)
123
+
124
+
125
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, partial_rotary_factor=0.5):
126
+ """Applies Rotary Position Embedding to the query and key tensors.
127
+
128
+ Args:
129
+ q (`torch.Tensor`): The query tensor.
130
+ k (`torch.Tensor`): The key tensor.
131
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
132
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
133
+ position_ids (`torch.Tensor`, *optional*):
134
+ Deprecated and unused.
135
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
136
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
137
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
138
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
139
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
140
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
141
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
142
+ partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor by which the rotary embedding.
143
+ Returns:
144
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
145
+ """
146
+ cos = cos.unsqueeze(unsqueeze_dim)
147
+ sin = sin.unsqueeze(unsqueeze_dim)
148
+
149
+ # Interleave them instead of usual shape
150
+ cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
151
+ sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)
152
+
153
+ rotary_dim = int(q.shape[-1] * partial_rotary_factor)
154
+ q, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
155
+ k, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
156
+
157
+ # Apply rotary embeddings to the rotary portion
158
+ q = (q * cos[..., :rotary_dim]) + (rotate_half(q) * sin[..., :rotary_dim])
159
+ k = (k * cos[..., :rotary_dim]) + (rotate_half(k) * sin[..., :rotary_dim])
160
+
161
+ # Concatenate back the rotary and non-rotary portions
162
+ q_embed = torch.cat([q, q_pass], dim=-1)
163
+ k_embed = torch.cat([k, k_pass], dim=-1)
164
+
165
+ return q_embed, k_embed
166
+
167
+
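`apply_rotary_pos_emb` rotates only the first `head_dim * partial_rotary_factor` channels of each head and passes the remaining channels through unchanged. A toy shape check (a sketch; it assumes `GlmRotaryEmbedding`, `rotate_half`, and `apply_rotary_pos_emb` from above are available in the session):

```python
import torch

bsz, heads, seq, head_dim = 1, 2, 5, 16
partial = 0.5  # rotate the first 8 of 16 channels

q = torch.randn(bsz, heads, seq, head_dim)
k = torch.randn(bsz, heads, seq, head_dim)
position_ids = torch.arange(seq)[None]

# The rotary table is built only for the rotated slice: dim = head_dim * partial_rotary_factor.
rope = GlmRotaryEmbedding(dim=int(head_dim * partial))
cos, sin = rope(q, position_ids)  # each: [bsz, seq, 8]

q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, partial_rotary_factor=partial)
assert q_rot.shape == q.shape
# The non-rotary tail is passed through untouched.
assert torch.equal(q_rot[..., int(head_dim * partial):], q[..., int(head_dim * partial):])
```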
168
+ class GlmAttention(nn.Module):
169
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
170
+
171
+ def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None):
172
+ super().__init__()
173
+ self.config = config
174
+ self.layer_idx = layer_idx
175
+ if layer_idx is None:
176
+ logger.warning_once(
177
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
178
+ "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
179
+ "when creating this class."
180
+ )
181
+
182
+ self.attention_dropout = config.attention_dropout
183
+ self.hidden_size = config.hidden_size
184
+ self.num_heads = config.num_attention_heads
185
+ self.head_dim = self.hidden_size // self.num_heads
186
+ self.num_key_value_heads = config.num_key_value_heads
187
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
188
+ self.is_causal = True
189
+ self.scaling = 1 / math.sqrt(self.head_dim)
190
+ self.partial_rotary_factor = config.partial_rotary_factor
191
+
192
+ if (self.head_dim * self.num_heads) != self.hidden_size:
193
+ raise ValueError(
194
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
195
+ f" and `num_heads`: {self.num_heads})."
196
+ )
197
+
198
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
199
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
200
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
201
+ self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
202
+
203
+ def forward(
204
+ self,
205
+ hidden_states: torch.Tensor,
206
+ attention_mask: Optional[torch.Tensor] = None,
207
+ position_ids: Optional[torch.LongTensor] = None,
208
+ past_key_value: Optional[Cache] = None,
209
+ output_attentions: bool = False,
210
+ use_cache: bool = False,
211
+ cache_position: Optional[torch.LongTensor] = None,
212
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
213
+ **kwargs,
214
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
215
+ bsz, q_len, _ = hidden_states.size()
216
+
217
+ query_states = self.q_proj(hidden_states)
218
+ key_states = self.k_proj(hidden_states)
219
+ value_states = self.v_proj(hidden_states)
220
+
221
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
222
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
223
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
224
+
225
+ cos, sin = position_embeddings
226
+
227
+ query_states, key_states = apply_rotary_pos_emb(
228
+ query_states, key_states, cos, sin, partial_rotary_factor=self.partial_rotary_factor
229
+ )
230
+ if past_key_value is not None:
231
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
232
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
233
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
234
+
235
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
236
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
237
+
238
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
239
+
240
+ if attention_mask is not None: # no matter the length, we just slice it
241
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
242
+ attn_weights = attn_weights + causal_mask
243
+
244
+ # upcast attention to fp32
245
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
246
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
247
+ attn_output = torch.matmul(attn_weights, value_states)
248
+
249
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
250
+ raise ValueError(
251
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
252
+ f" {attn_output.size()}"
253
+ )
254
+
255
+ attn_output = attn_output.transpose(1, 2).contiguous()
256
+
257
+ attn_output = attn_output.view(bsz, q_len, -1)
258
+ attn_output = self.o_proj(attn_output)
259
+
260
+ if not output_attentions:
261
+ attn_weights = None
262
+
263
+ return attn_output, attn_weights, past_key_value
264
+
265
+
266
+ class GlmFlashAttention2(GlmAttention):
267
+ """
268
+ Glm flash attention module. This module inherits from `GlmAttention` as the weights of the module stays
269
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
270
+ flash attention and deal with padding tokens in case the input contains any of them.
271
+ """
272
+
273
+ def __init__(self, *args, **kwargs):
274
+ super().__init__(*args, **kwargs)
275
+
276
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
277
+ # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
278
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
279
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
280
+
281
+ def forward(
282
+ self,
283
+ hidden_states: torch.Tensor,
284
+ attention_mask: Optional[torch.LongTensor] = None,
285
+ position_ids: Optional[torch.LongTensor] = None,
286
+ past_key_value: Optional[Cache] = None,
287
+ output_attentions: bool = False,
288
+ use_cache: bool = False,
289
+ cache_position: Optional[torch.LongTensor] = None,
290
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
291
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
292
+ output_attentions = False
293
+
294
+ bsz, q_len, _ = hidden_states.size()
295
+
296
+ query_states = self.q_proj(hidden_states)
297
+ key_states = self.k_proj(hidden_states)
298
+ value_states = self.v_proj(hidden_states)
299
+
300
+ # Flash attention requires the input to have the shape
301
+ # batch_size x seq_length x head_dim x hidden_dim
302
+ # therefore we just need to keep the original shape
303
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
304
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
305
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
306
+
307
+ cos, sin = position_embeddings
308
+ query_states, key_states = apply_rotary_pos_emb(
309
+ query_states, key_states, cos, sin, partial_rotary_factor=self.partial_rotary_factor
310
+ )
311
+
312
+ if past_key_value is not None:
313
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
314
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
315
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
316
+
317
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
318
+ # to be able to avoid many of these transpose/reshape/view.
319
+ query_states = query_states.transpose(1, 2)
320
+ key_states = key_states.transpose(1, 2)
321
+ value_states = value_states.transpose(1, 2)
322
+
323
+ dropout_rate = self.attention_dropout if self.training else 0.0
324
+
325
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
326
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
327
+ # cast them back in the correct dtype just to be sure everything works as expected.
328
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
329
+ # in fp32. (GlmRMSNorm handles it correctly)
330
+
331
+ input_dtype = query_states.dtype
332
+ if input_dtype == torch.float32:
333
+ if torch.is_autocast_enabled():
334
+ target_dtype = torch.get_autocast_gpu_dtype()
335
+ # Handle the case where the model is quantized
336
+ elif hasattr(self.config, "_pre_quantization_dtype"):
337
+ target_dtype = self.config._pre_quantization_dtype
338
+ else:
339
+ target_dtype = self.q_proj.weight.dtype
340
+
341
+ logger.warning_once(
342
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
343
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
344
+ f" {target_dtype}."
345
+ )
346
+
347
+ query_states = query_states.to(target_dtype)
348
+ key_states = key_states.to(target_dtype)
349
+ value_states = value_states.to(target_dtype)
350
+
351
+ if attention_mask is not None and len(attention_mask.shape) == 4:
352
+ if attention_mask.shape[1] == attention_mask.shape[2] == 1:
353
+ attention_mask = attention_mask.reshape(attention_mask.shape[0], -1)
354
+ else:
355
+ raise ValueError(
356
+ "Get seqlens from a non-causal based full 4D attn mask is not expected. Maybe need to pass in `force_flash_attention` in `get_masks`."
357
+ ) # TODO
358
+
359
+ attn_output = _flash_attention_forward(
360
+ query_states,
361
+ key_states,
362
+ value_states,
363
+ attention_mask,
364
+ q_len,
365
+ position_ids=position_ids,
366
+ dropout=dropout_rate,
367
+ softmax_scale=self.scaling,
368
+ sliding_window=getattr(self, "sliding_window", None),
369
+ use_top_left_mask=self._flash_attn_uses_top_left_mask,
370
+ is_causal=self.is_causal,
371
+ )
372
+
373
+ attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
374
+ attn_output = self.o_proj(attn_output)
375
+
376
+ if not output_attentions:
377
+ attn_weights = None
378
+
379
+ return attn_output, attn_weights, past_key_value
380
+
381
+
382
+ class GlmSdpaAttention(GlmAttention):
383
+ """
384
+ Glm attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
385
+ `GlmAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
386
+ SDPA API.
387
+ """
388
+
389
+ # Adapted from GlmAttention.forward
390
+ def forward(
391
+ self,
392
+ hidden_states: torch.Tensor,
393
+ attention_mask: Optional[torch.Tensor] = None,
394
+ position_ids: Optional[torch.LongTensor] = None,
395
+ past_key_value: Optional[Cache] = None,
396
+ output_attentions: bool = False,
397
+ use_cache: bool = False,
398
+ cache_position: Optional[torch.LongTensor] = None,
399
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45
400
+ **kwargs,
401
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
402
+ if output_attentions:
403
+ # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
404
+ logger.warning_once(
405
+ "GlmModel is using GlmSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
406
+ 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
407
+ )
408
+ return super().forward(
409
+ hidden_states=hidden_states,
410
+ attention_mask=attention_mask,
411
+ position_ids=position_ids,
412
+ past_key_value=past_key_value,
413
+ output_attentions=output_attentions,
414
+ use_cache=use_cache,
415
+ cache_position=cache_position,
416
+ position_embeddings=position_embeddings,
417
+ )
418
+
419
+ bsz, q_len, _ = hidden_states.size()
420
+
421
+ query_states = self.q_proj(hidden_states)
422
+ key_states = self.k_proj(hidden_states)
423
+ value_states = self.v_proj(hidden_states)
424
+
425
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
426
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
427
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
428
+
429
+ cos, sin = position_embeddings
430
+ query_states, key_states = apply_rotary_pos_emb(
431
+ query_states, key_states, cos, sin, partial_rotary_factor=self.partial_rotary_factor
432
+ )
433
+
434
+ if past_key_value is not None:
435
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
436
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
437
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
438
+
439
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
440
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
441
+
442
+ causal_mask = attention_mask
443
+ if attention_mask is not None:
444
+ causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
445
+
446
+ # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
447
+ # Reference: https://github.com/pytorch/pytorch/issues/112577.
448
+ if query_states.device.type == "cuda" and causal_mask is not None:
449
+ query_states = query_states.contiguous()
450
+ key_states = key_states.contiguous()
451
+ value_states = value_states.contiguous()
452
+
453
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
454
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
455
+ is_causal = True if causal_mask is None and q_len > 1 else False
456
+
457
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
458
+ query_states,
459
+ key_states,
460
+ value_states,
461
+ attn_mask=causal_mask,
462
+ dropout_p=self.attention_dropout if self.training else 0.0,
463
+ is_causal=is_causal,
464
+ scale=self.scaling,
465
+ )
466
+
467
+ attn_output = attn_output.transpose(1, 2).contiguous()
468
+ attn_output = attn_output.view(bsz, q_len, -1)
469
+
470
+ attn_output = self.o_proj(attn_output)
471
+
472
+ return attn_output, None, past_key_value
473
+
474
+
475
+ GLM_ATTENTION_CLASSES = {
476
+ "eager": GlmAttention,
477
+ "flash_attention_2": GlmFlashAttention2,
478
+ "sdpa": GlmSdpaAttention,
479
+ }
480
+
481
+
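Which entry of `GLM_ATTENTION_CLASSES` gets used is driven by `config._attn_implementation`, which is set from the `attn_implementation` argument at load time rather than from `config.json` (the comment in `GlmDecoderLayer.__init__` below makes the same point). A loading sketch:

```python
import torch
from transformers import AutoModelForCausalLM

# Pick the attention backend explicitly; "sdpa" is a safe default, while
# "flash_attention_2" additionally requires the flash-attn package and a supported GPU.
model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-edge-v-2b",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="sdpa",
)
```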
482
+ class GlmDecoderLayer(nn.Module):
483
+ def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None):
484
+ super().__init__()
485
+ self.hidden_size = config.hidden_size
486
+
487
+ # attn_implementation will not work in config.json, the correct way to use it is to pass it as
488
+ # a keyword argument to the model, e.g. ..from_pretrained(..., attn_implementation="flash_attention_2")
489
+ self.self_attn = GLM_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
490
+
491
+ self.mlp = GlmMLP(config)
492
+ self.input_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
493
+ self.post_attention_layernorm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
494
+
495
+ def forward(
496
+ self,
497
+ hidden_states: torch.Tensor,
498
+ attention_mask: Optional[torch.Tensor] = None,
499
+ position_ids: Optional[torch.LongTensor] = None,
500
+ past_key_value: Optional[Cache] = None,
501
+ output_attentions: Optional[bool] = False,
502
+ use_cache: Optional[bool] = False,
503
+ cache_position: Optional[torch.LongTensor] = None,
504
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
505
+ **kwargs,
506
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
507
+ """
508
+ Args:
509
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
510
+ attention_mask (`torch.FloatTensor`, *optional*):
511
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
512
+ query_sequence_length, key_sequence_length)` if default attention is used.
513
+ output_attentions (`bool`, *optional*):
514
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
515
+ returned tensors for more detail.
516
+ use_cache (`bool`, *optional*):
517
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
518
+ (see `past_key_values`).
519
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
520
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
521
+ Indices depicting the position of the input sequence tokens in the sequence
522
+ position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
523
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
524
+ with `head_dim` being the embedding dimension of each attention head.
525
+ kwargs (`dict`, *optional*):
526
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
527
+ into the model
528
+ """
529
+ residual = hidden_states
530
+
531
+ hidden_states = self.input_layernorm(hidden_states)
532
+
533
+ # Self Attention
534
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
535
+ hidden_states=hidden_states,
536
+ attention_mask=attention_mask,
537
+ position_ids=position_ids,
538
+ past_key_value=past_key_value,
539
+ output_attentions=output_attentions,
540
+ use_cache=use_cache,
541
+ cache_position=cache_position,
542
+ position_embeddings=position_embeddings,
543
+ **kwargs,
544
+ )
545
+ hidden_states = residual + hidden_states
546
+
547
+ # Fully Connected
548
+ residual = hidden_states
549
+ hidden_states = self.post_attention_layernorm(hidden_states)
550
+ hidden_states = self.mlp(hidden_states)
551
+ hidden_states = residual + hidden_states
552
+
553
+ outputs = (hidden_states,)
554
+
555
+ if output_attentions:
556
+ outputs += (self_attn_weights,)
557
+
558
+ if use_cache:
559
+ outputs += (present_key_value,)
560
+
561
+ return outputs
562
+
563
+
564
+ GLM_START_DOCSTRING = r"""
565
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
566
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
567
+ etc.)
568
+
569
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
570
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
571
+ and behavior.
572
+
573
+ Parameters:
574
+ config ([`GlmConfig`]):
575
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
576
+ load the weights associated with the model, only the configuration. Check out the
577
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
578
+ """
579
+
580
+
581
+ @add_start_docstrings(
582
+ "The bare Glm Model outputting raw hidden-states without any specific head on top.",
583
+ GLM_START_DOCSTRING,
584
+ )
585
+ class GlmPreTrainedModel(PreTrainedModel):
586
+ config_class = GlmConfig
587
+ base_model_prefix = "model"
588
+ supports_gradient_checkpointing = True
589
+ _no_split_modules = ["GlmDecoderLayer"]
590
+ _skip_keys_device_placement = ["past_key_values"]
591
+ _supports_flash_attn_2 = True
592
+ _supports_sdpa = True
593
+ _supports_cache_class = True
594
+ _supports_quantized_cache = True
595
+ _supports_static_cache = True
596
+
597
+ def _init_weights(self, module):
598
+ std = self.config.initializer_range
599
+ if isinstance(module, nn.Linear):
600
+ module.weight.data.normal_(mean=0.0, std=std)
601
+ if module.bias is not None:
602
+ module.bias.data.zero_()
603
+ elif isinstance(module, nn.Embedding):
604
+ module.weight.data.normal_(mean=0.0, std=std)
605
+ if module.padding_idx is not None:
606
+ module.weight.data[module.padding_idx].zero_()
607
+
608
+
609
+ def is_empty(images_list: Optional[List[List[torch.Tensor]]]):
610
+ if images_list is None or len(images_list) == 0:
611
+ return True
612
+ for image_list in images_list:
613
+ if image_list is not None:
614
+ return False
615
+ return True
616
+
617
+
618
+ GLM_INPUTS_DOCSTRING = r"""
619
+ Args:
620
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
621
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
622
+ it.
623
+
624
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
625
+ [`PreTrainedTokenizer.__call__`] for details.
626
+
627
+ [What are input IDs?](../glossary#input-ids)
628
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
629
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
630
+
631
+ - 1 for tokens that are **not masked**,
632
+ - 0 for tokens that are **masked**.
633
+
634
+ [What are attention masks?](../glossary#attention-mask)
635
+
636
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
637
+ [`PreTrainedTokenizer.__call__`] for details.
638
+
639
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
640
+ `past_key_values`).
641
+
642
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
643
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
644
+ information on the default strategy.
645
+
646
+ - 1 indicates the head is **not masked**,
647
+ - 0 indicates the head is **masked**.
648
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
649
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
650
+ config.n_positions - 1]`.
651
+
652
+ [What are position IDs?](../glossary#position-ids)
653
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
654
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
655
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
656
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
657
+
658
+ Two formats are allowed:
659
+ - a [`~cache_utils.Cache`] instance, see our
660
+ [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
661
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
662
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
663
+ cache format.
664
+
665
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
666
+ legacy cache format will be returned.
667
+
668
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
669
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
670
+ of shape `(batch_size, sequence_length)`.
671
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
672
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
673
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
674
+ model's internal embedding lookup matrix.
675
+ use_cache (`bool`, *optional*):
676
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
677
+ `past_key_values`).
678
+ output_attentions (`bool`, *optional*):
679
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
680
+ tensors for more detail.
681
+ output_hidden_states (`bool`, *optional*):
682
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
683
+ more detail.
684
+ return_dict (`bool`, *optional*):
685
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
686
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
687
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
688
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
689
+ the complete sequence length.
690
+ """
691
+
692
+
693
+ @add_start_docstrings(
694
+ "The bare Glm Model outputting raw hidden-states without any specific head on top.",
695
+ GLM_START_DOCSTRING,
696
+ )
697
+ class GlmModel(GlmPreTrainedModel):
698
+ """
699
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`GlmDecoderLayer`]
700
+
701
+ Args:
702
+ config: GlmConfig
703
+ """
704
+
705
+ def __init__(self, config: GlmConfig):
706
+ super().__init__(config)
707
+ self.padding_idx = config.pad_token_id
708
+ self.vocab_size = config.vocab_size
709
+
710
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
711
+ self.layers = nn.ModuleList(
712
+ [GlmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
713
+ )
714
+ self.norm = GlmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
715
+ self.partial_rotary_factor = config.partial_rotary_factor
716
+ self.rotary_emb = GlmRotaryEmbedding(
717
+ dim=config.head_dim * self.partial_rotary_factor,
718
+ max_position_embeddings=config.max_position_embeddings,
719
+ base=config.rope_theta,
720
+ )
721
+ self.gradient_checkpointing = False
722
+
723
+ # Vision model
724
+ self.vision = VisionModel(config)
725
+
726
+ # Initialize weights and apply final processing
727
+ self.post_init()
728
+
729
+ def get_input_embeddings(self):
730
+ return self.embed_tokens
731
+
732
+ def set_input_embeddings(self, value):
733
+ self.embed_tokens = value
734
+
735
+ @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING)
736
+ def forward(
737
+ self,
738
+ input_ids: torch.LongTensor = None,
739
+ images: torch.Tensor = None,
740
+ attention_mask: Optional[torch.Tensor] = None,
741
+ position_ids: Optional[torch.LongTensor] = None,
742
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
743
+ inputs_embeds: Optional[torch.FloatTensor] = None,
744
+ use_cache: Optional[bool] = None,
745
+ output_attentions: Optional[bool] = None,
746
+ output_hidden_states: Optional[bool] = None,
747
+ return_dict: Optional[bool] = None,
748
+ cache_position: Optional[torch.LongTensor] = None,
749
+ **kwargs,
750
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
751
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
752
+ output_hidden_states = (
753
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
754
+ )
755
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
756
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
757
+
758
+ if (input_ids is None) ^ (inputs_embeds is not None):
759
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
760
+
761
+ if not past_key_values:
762
+ # not allow for inputs_embeds, because we want to process image feature
763
+ assert input_ids is not None and inputs_embeds is None, f"{input_ids} {inputs_embeds}"
764
+ inputs_embeds = self.embed_tokens(input_ids)
765
+ new_input_embeds = []
766
+ multi_flags = [True if self.config.boi_token_id in input_id.tolist() else False for input_id in input_ids]
767
+ images_features = None
768
+ if not is_empty(images):
769
+ images_features = self.vision(images).to(inputs_embeds.dtype)
770
+ image_count = 0
771
+ for i in range(len(input_ids)):
772
+ input_id = input_ids[i].tolist()
773
+ if multi_flags[i]:
774
+ boi_token_pos = input_id.index(self.config.boi_token_id)
775
+ assert boi_token_pos >= 0, "begin_of_image not found!"
776
+ num_image_padding_tokens = input_id.count(self.config.boi_token_id)
777
+ assert (
778
+ num_image_padding_tokens == images_features[image_count].shape[0]
779
+ ), f"Wrong image padding token number: {num_image_padding_tokens}"
780
+ new_input_embeds.append(
781
+ torch.cat(
782
+ (
783
+ inputs_embeds[i, :boi_token_pos],
784
+ images_features[image_count].to(inputs_embeds.device),
785
+ inputs_embeds[i, boi_token_pos + num_image_padding_tokens :],
786
+ )
787
+ )
788
+ )
789
+ image_count += 1
790
+ else:
791
+ new_input_embeds.append(inputs_embeds[i])
792
+ inputs_embeds = torch.stack(new_input_embeds, dim=0)
793
+
794
+ if self.gradient_checkpointing and self.training and use_cache:
795
+ logger.warning_once(
796
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
797
+ )
798
+ use_cache = False
799
+
800
+ if inputs_embeds is None:
801
+ if past_key_values:
802
+ inputs_embeds = self.embed_tokens(input_ids[:, -1:])
803
+ else:
804
+ inputs_embeds = self.embed_tokens(input_ids)
805
+
806
+ # kept for BC (non `Cache` `past_key_values` inputs)
807
+ return_legacy_cache = False
808
+ if use_cache and not isinstance(past_key_values, Cache):
809
+ return_legacy_cache = True
810
+ if past_key_values is None:
811
+ past_key_values = DynamicCache()
812
+ else:
813
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
814
+ logger.warning_once(
815
+ "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
816
+ "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
817
+ "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
818
+ )
819
+
820
+ if cache_position is None:
821
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
822
+ cache_position = torch.arange(
823
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
824
+ )
825
+ if position_ids is None:
826
+ position_ids = cache_position.unsqueeze(0)
827
+
828
+ causal_mask = self._update_causal_mask(
829
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
830
+ )
831
+ hidden_states = inputs_embeds
832
+
833
+ # create position embeddings to be shared across the decoder layers
834
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
835
+
836
+ # decoder layers
837
+ all_hidden_states = () if output_hidden_states else None
838
+ all_self_attns = () if output_attentions else None
839
+ next_decoder_cache = None
840
+
841
+ for decoder_layer in self.layers:
842
+ if output_hidden_states:
843
+ all_hidden_states += (hidden_states,)
844
+
845
+ if self.gradient_checkpointing and self.training:
846
+ layer_outputs = self._gradient_checkpointing_func(
847
+ decoder_layer.__call__,
848
+ hidden_states,
849
+ causal_mask,
850
+ position_ids,
851
+ past_key_values,
852
+ output_attentions,
853
+ use_cache,
854
+ cache_position,
855
+ position_embeddings,
856
+ )
857
+ else:
858
+ layer_outputs = decoder_layer(
859
+ hidden_states,
860
+ attention_mask=causal_mask,
861
+ position_ids=position_ids,
862
+ past_key_value=past_key_values,
863
+ output_attentions=output_attentions,
864
+ use_cache=use_cache,
865
+ cache_position=cache_position,
866
+ position_embeddings=position_embeddings,
867
+ **kwargs,
868
+ )
869
+
870
+ hidden_states = layer_outputs[0]
871
+
872
+ if use_cache:
873
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
874
+
875
+ if output_attentions:
876
+ all_self_attns += (layer_outputs[1],)
877
+
878
+ hidden_states = self.norm(hidden_states)
879
+
880
+ # add hidden states from the last decoder layer
881
+ if output_hidden_states:
882
+ all_hidden_states += (hidden_states,)
883
+
884
+ next_cache = next_decoder_cache if use_cache else None
885
+ if return_legacy_cache:
886
+ next_cache = next_cache.to_legacy_cache()
887
+
888
+ if not return_dict:
889
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
890
+ return BaseModelOutputWithPast(
891
+ last_hidden_state=hidden_states,
892
+ past_key_values=next_cache,
893
+ hidden_states=all_hidden_states,
894
+ attentions=all_self_attns,
895
+ )
896
+
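The image handling at the top of `forward` replaces each run of `boi_token_id` placeholder tokens with the projected vision features for that image, so the decoder sees a sequence of the same length whose placeholder slots now carry image embeddings. A minimal standalone sketch of that splice, with dummy tensors and a made-up placeholder id and hidden size:

```python
import torch

# Hypothetical values for illustration only.
boi_token_id = 59256          # assumed placeholder id (see special tokens below)
hidden, num_image_tokens = 8, 4

input_ids = torch.tensor([[1, 2, boi_token_id, boi_token_id, boi_token_id, boi_token_id, 3]])
inputs_embeds = torch.randn(1, input_ids.shape[1], hidden)
image_features = torch.randn(num_image_tokens, hidden)   # vision adapter output for one image

ids = input_ids[0].tolist()
boi_pos = ids.index(boi_token_id)                         # first placeholder position
assert ids.count(boi_token_id) == image_features.shape[0]

spliced = torch.cat(
    (inputs_embeds[0, :boi_pos],                          # text before the image
     image_features,                                      # vision features replace the placeholders
     inputs_embeds[0, boi_pos + num_image_tokens:]),      # text after the image
    dim=0,
)
print(spliced.shape)  # torch.Size([7, 8]) -- same length, placeholder slots now hold image features
```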
897
+ def _update_causal_mask(
898
+ self,
899
+ attention_mask: torch.Tensor,
900
+ input_tensor: torch.Tensor,
901
+ cache_position: torch.Tensor,
902
+ past_key_values: Cache,
903
+ output_attentions: bool,
904
+ ):
905
+ if self.config._attn_implementation == "flash_attention_2":
906
+ if attention_mask is not None and 0.0 in attention_mask:
907
+ return attention_mask
908
+ return None
909
+
910
+ # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
911
+ # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
912
+ # to infer the attention mask.
913
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
914
+ using_static_cache = isinstance(past_key_values, StaticCache)
915
+
916
+ # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
917
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
918
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
919
+ attention_mask,
920
+ inputs_embeds=input_tensor,
921
+ past_key_values_length=past_seen_tokens,
922
+ is_training=self.training,
923
+ ):
924
+ return None
925
+
926
+ dtype, device = input_tensor.dtype, input_tensor.device
927
+ sequence_length = input_tensor.shape[1]
928
+ if using_static_cache:
929
+ target_length = past_key_values.get_max_cache_shape()
930
+ else:
931
+ target_length = (
932
+ attention_mask.shape[-1]
933
+ if isinstance(attention_mask, torch.Tensor)
934
+ else past_seen_tokens + sequence_length + 1
935
+ )
936
+
937
+ # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
938
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
939
+ attention_mask,
940
+ sequence_length=sequence_length,
941
+ target_length=target_length,
942
+ dtype=dtype,
943
+ device=device,
944
+ cache_position=cache_position,
945
+ batch_size=input_tensor.shape[0],
946
+ )
947
+
948
+ if (
949
+ self.config._attn_implementation == "sdpa"
950
+ and attention_mask is not None
951
+ and attention_mask.device.type == "cuda"
952
+ and not output_attentions
953
+ ):
954
+ # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
955
+ # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
956
+ # Details: https://github.com/pytorch/pytorch/issues/110213
957
+ min_dtype = torch.finfo(dtype).min
958
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
959
+
960
+ return causal_mask
961
+
962
+ @staticmethod
963
+ def _prepare_4d_causal_attention_mask_with_cache_position(
964
+ attention_mask: torch.Tensor,
965
+ sequence_length: int,
966
+ target_length: int,
967
+ dtype: torch.dtype,
968
+ device: torch.device,
969
+ cache_position: torch.Tensor,
970
+ batch_size: int,
971
+ **kwargs,
972
+ ):
973
+ """
974
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
975
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
976
+
977
+ Args:
978
+ attention_mask (`torch.Tensor`):
979
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
980
+ `(batch_size, 1, query_length, key_value_length)`.
981
+ sequence_length (`int`):
982
+ The sequence length being processed.
983
+ target_length (`int`):
984
+ The target length: when generating with static cache, the mask should be as long as the static cache,
985
+ to account for the 0 padding, the part of the cache that is not filled yet.
986
+ dtype (`torch.dtype`):
987
+ The dtype to use for the 4D attention mask.
988
+ device (`torch.device`):
989
+ The device to place the 4D attention mask on.
990
+ cache_position (`torch.Tensor`):
991
+ Indices depicting the position of the input sequence tokens in the sequence.
992
+ batch_size (`int`):
993
+ Batch size.
994
+ """
995
+ if attention_mask is not None and attention_mask.dim() == 4:
996
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
997
+ causal_mask = attention_mask
998
+ else:
999
+ min_dtype = torch.finfo(dtype).min
1000
+ causal_mask = torch.full(
1001
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
1002
+ )
1003
+ if sequence_length != 1:
1004
+ causal_mask = torch.triu(causal_mask, diagonal=1)
1005
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
1006
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
1007
+ if attention_mask is not None:
1008
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
1009
+ mask_length = attention_mask.shape[-1]
1010
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
1011
+ padding_mask = padding_mask == 0
1012
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
1013
+ padding_mask, min_dtype
1014
+ )
1015
+
1016
+ return causal_mask
1017
+
1018
+
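`_prepare_4d_causal_attention_mask_with_cache_position` expands a 2D padding mask into an additive `(batch, 1, query, key)` mask whose blocked positions hold the dtype minimum. A toy standalone reproduction of the same construction (no cache, left padding, tiny sizes):

```python
import torch

batch_size, seq_len, dtype = 1, 4, torch.float32
attention_mask = torch.tensor([[0, 1, 1, 1]])             # left-padded 2D mask
cache_position = torch.arange(seq_len)                    # no past tokens
min_dtype = torch.finfo(dtype).min

causal = torch.full((seq_len, seq_len), min_dtype, dtype=dtype)
causal = torch.triu(causal, diagonal=1)                   # mask future positions
causal *= torch.arange(seq_len) > cache_position.reshape(-1, 1)
causal = causal[None, None, :, :].expand(batch_size, 1, -1, -1).clone()

# Fold the padding mask in: positions that are causally visible but padded get dtype-min too.
padding = (causal[:, :, :, :seq_len] + attention_mask[:, None, None, :]) == 0
causal[:, :, :, :seq_len] = causal[:, :, :, :seq_len].masked_fill(padding, min_dtype)
print(causal[0, 0])  # 0 where attention is allowed, dtype-min where it is blocked
```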
1019
+ class GlmForCausalLM(GlmPreTrainedModel, GenerationMixin):
1020
+ _tied_weights_keys = ["lm_head.weight"]
1021
+
1022
+ def __init__(self, config: GlmConfig):
1023
+ super().__init__(config)
1024
+ self.model = GlmModel(config)
1025
+ self.vocab_size = config.vocab_size
1026
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1027
+
1028
+ # Initialize weights and apply final processing
1029
+ self.post_init()
1030
+
1031
+ def get_input_embeddings(self):
1032
+ return self.model.embed_tokens
1033
+
1034
+ def set_input_embeddings(self, value):
1035
+ self.model.embed_tokens = value
1036
+
1037
+ def get_output_embeddings(self):
1038
+ return self.lm_head
1039
+
1040
+ def set_output_embeddings(self, new_embeddings):
1041
+ self.lm_head = new_embeddings
1042
+
1043
+ def set_decoder(self, decoder):
1044
+ self.model = decoder
1045
+
1046
+ def get_decoder(self):
1047
+ return self.model
1048
+
1049
+ def _update_model_kwargs_for_generation(
1050
+ self,
1051
+ outputs: ModelOutput,
1052
+ model_kwargs: Dict[str, Any],
1053
+ is_encoder_decoder: bool = False,
1054
+ standardize_cache_format: bool = False,
1055
+ ) -> Dict[str, Any]:
1056
+ # update past_key_values
1057
+ if int(transformers_version.split(".")[1]) >= 44:
1058
+ assert not standardize_cache_format
1059
+ _, cache = self._extract_past_from_model_output(outputs)
1060
+ model_kwargs["past_key_values"] = cache
1061
+ else:
1062
+ cache = self._extract_past_from_model_output(outputs, standardize_cache_format=standardize_cache_format)
+ model_kwargs["past_key_values"] = cache
1063
+
1064
+ # update attention mask
1065
+ if "attention_mask" in model_kwargs:
1066
+ attention_mask = model_kwargs["attention_mask"]
1067
+ model_kwargs["attention_mask"] = torch.cat(
1068
+ [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
1069
+ )
1070
+
1071
+ # update position ids
1072
+ if "position_ids" in model_kwargs:
1073
+ position_ids = model_kwargs["position_ids"]
1074
+ new_position_id = position_ids[..., -1:].clone()
1075
+ new_position_id += 1
1076
+ model_kwargs["position_ids"] = torch.cat([position_ids, new_position_id], dim=-1)
1077
+
1078
+ model_kwargs["is_first_forward"] = False
1079
+ return model_kwargs
1080
+
1081
+ def _create_position_ids_from_attention_mask(self, attention_mask):
1082
+ # Initialize a tensor of the same shape as attention_mask to hold position IDs
1083
+ position_ids = torch.zeros_like(attention_mask, dtype=torch.long, device=attention_mask.device)
1084
+ # Iterate over the batch
1085
+ for i, mask in enumerate(attention_mask):
1086
+ # Find the positions where the mask is 1
1087
+ positions = torch.nonzero(mask, as_tuple=False).squeeze(1).to(attention_mask.device)
1088
+ # Assign position IDs to those positions
1089
+ position_ids[i, positions] = torch.arange(start=0, end=positions.size(0), dtype=torch.long).to(
1090
+ attention_mask.device
1091
+ )
1092
+ return position_ids
1093
+
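`_create_position_ids_from_attention_mask` gives consecutive positions 0, 1, 2, ... to the non-padding tokens of each row, which keeps rotary positions stable under left padding. A standalone sketch of the same logic and the output it produces:

```python
import torch

def position_ids_from_mask(attention_mask: torch.Tensor) -> torch.Tensor:
    # Same idea as the method above: non-pad tokens get 0, 1, 2, ...; pad tokens stay 0.
    position_ids = torch.zeros_like(attention_mask, dtype=torch.long)
    for i, mask in enumerate(attention_mask):
        positions = torch.nonzero(mask, as_tuple=False).squeeze(1)
        position_ids[i, positions] = torch.arange(positions.size(0), dtype=torch.long)
    return position_ids

mask = torch.tensor([[0, 0, 1, 1, 1],
                     [1, 1, 1, 1, 1]])
print(position_ids_from_mask(mask))
# tensor([[0, 0, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
```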
1094
+ def prepare_inputs_for_generation(
1095
+ self,
1096
+ input_ids: torch.LongTensor,
1097
+ pixel_values: Optional[torch.Tensor] = torch.zeros([1, 1, 1, 3, 672, 672]),
1098
+ past_key_values: Optional[torch.Tensor] = None,
1099
+ attention_mask: Optional[torch.Tensor] = None,
1100
+ position_ids: Optional[torch.Tensor] = None,
1101
+ use_cache: Optional[bool] = None,
1102
+ is_first_forward: bool = True,
1103
+ **kwargs,
1104
+ ) -> dict:
1105
+ if position_ids is None:
1106
+ if attention_mask is None:
1107
+ # Position ids can only be derived from an attention mask; fail fast when neither is provided
1108
+ raise ValueError("Cannot create position ids when attention mask is None")
1109
+ else:
1110
+ position_ids = self._create_position_ids_from_attention_mask(attention_mask)
1111
+ if not is_first_forward:
1112
+ if past_key_values is not None:
1113
+ position_ids = position_ids[..., -1:]
1114
+ input_ids = input_ids[:, -1:]
1115
+ return {
1116
+ "input_ids": input_ids,
1117
+ "pixel_values": pixel_values,
1118
+ "past_key_values": past_key_values,
1119
+ "position_ids": position_ids,
1120
+ "attention_mask": attention_mask,
1121
+ "return_last_logit": True,
1122
+ "use_cache": use_cache,
1123
+ }
1124
+
1125
+ @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING)
1126
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1127
+ def forward(
1128
+ self,
1129
+ input_ids: torch.LongTensor = None,
1130
+ pixel_values: torch.Tensor = torch.zeros([1, 1, 1, 3, 672, 672]),
1131
+ attention_mask: Optional[torch.Tensor] = None,
1132
+ position_ids: Optional[torch.LongTensor] = None,
1133
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1134
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1135
+ labels: Optional[torch.LongTensor] = None,
1136
+ use_cache: Optional[bool] = None,
1137
+ output_attentions: Optional[bool] = None,
1138
+ output_hidden_states: Optional[bool] = None,
1139
+ return_dict: Optional[bool] = None,
1140
+ cache_position: Optional[torch.LongTensor] = None,
1141
+ num_logits_to_keep: int = 0,
1142
+ **loss_kwargs,
1143
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1144
+ r"""
1145
+ Args:
1146
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1147
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1148
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1149
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1150
+
1151
+ num_logits_to_keep (`int`, *optional*):
1152
+ Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
1153
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
1154
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
1155
+
1156
+ Returns:
1157
+
1158
+ Example:
1159
+
1160
+ ```python
1161
+ >>> from transformers import AutoTokenizer, GlmForCausalLM
1162
+
1163
+ >>> model = GlmForCausalLM.from_pretrained("THUDM/glm-4v-9b")
1164
+ >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4v-9b")
1165
+
1166
+ >>> prompt = "What is your favorite condiment?"
1167
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1168
+
1169
+ >>> # Generate
1170
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1171
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1172
+ "What is your favorite condiment?"
1173
+ ```"""
1174
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1175
+ output_hidden_states = (
1176
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1177
+ )
1178
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1179
+ batch_size, num_concurrent_media, num_tiles, num_channels, height, width = pixel_values.shape
1180
+ pixel_values = pixel_values.reshape(batch_size * num_concurrent_media * num_tiles, num_channels, height, width)
1181
+
1182
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
1183
+ outputs = self.model(
1184
+ input_ids=input_ids,
1185
+ images=pixel_values,
1186
+ attention_mask=attention_mask,
1187
+ position_ids=position_ids,
1188
+ past_key_values=past_key_values,
1189
+ inputs_embeds=inputs_embeds,
1190
+ use_cache=use_cache,
1191
+ output_attentions=output_attentions,
1192
+ output_hidden_states=output_hidden_states,
1193
+ return_dict=return_dict,
1194
+ cache_position=cache_position,
1195
+ )
1196
+
1197
+ hidden_states = outputs[0]
1198
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1199
+ logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
1200
+
1201
+ loss = None
1202
+ if labels is not None:
1203
+ loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
1204
+
1205
+ if not return_dict:
1206
+ output = (logits,) + outputs[1:]
1207
+ return (loss,) + output if loss is not None else output
1208
+
1209
+ return CausalLMOutputWithPast(
1210
+ loss=loss,
1211
+ logits=logits,
1212
+ past_key_values=outputs.past_key_values,
1213
+ hidden_states=outputs.hidden_states,
1214
+ attentions=outputs.attentions,
1215
+ )
1216
+
1217
+
1218
+ @add_start_docstrings(
1219
+ """
1220
+ The Glm Model transformer with a sequence classification head on top (linear layer).
1221
+
1222
+ [`GlmForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1223
+ (e.g. GPT-2) do.
1224
+
1225
+ Since it does classification on the last token, it requires to know the position of the last token. If a
1226
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1227
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1228
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1229
+ each row of the batch).
1230
+ """,
1231
+ GLM_START_DOCSTRING,
1232
+ )
1233
+ class GlmForSequenceClassification(GlmPreTrainedModel):
1234
+ def __init__(self, config: GlmConfig):
1235
+ super().__init__(config)
1236
+ self.num_labels = config.num_labels
1237
+ self.model = GlmModel(config)
1238
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1239
+
1240
+ # Initialize weights and apply final processing
1241
+ self.post_init()
1242
+
1243
+ def get_input_embeddings(self):
1244
+ return self.model.embed_tokens
1245
+
1246
+ def set_input_embeddings(self, value):
1247
+ self.model.embed_tokens = value
1248
+
1249
+ @add_start_docstrings_to_model_forward(GLM_INPUTS_DOCSTRING)
1250
+ def forward(
1251
+ self,
1252
+ input_ids: Optional[torch.LongTensor] = None,
1253
+ attention_mask: Optional[torch.Tensor] = None,
1254
+ position_ids: Optional[torch.LongTensor] = None,
1255
+ past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1256
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1257
+ labels: Optional[torch.LongTensor] = None,
1258
+ use_cache: Optional[bool] = None,
1259
+ output_attentions: Optional[bool] = None,
1260
+ output_hidden_states: Optional[bool] = None,
1261
+ return_dict: Optional[bool] = None,
1262
+ ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1263
+ r"""
1264
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1265
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1266
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1267
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1268
+ """
1269
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1270
+
1271
+ transformer_outputs = self.model(
1272
+ input_ids,
1273
+ attention_mask=attention_mask,
1274
+ position_ids=position_ids,
1275
+ past_key_values=past_key_values,
1276
+ inputs_embeds=inputs_embeds,
1277
+ use_cache=use_cache,
1278
+ output_attentions=output_attentions,
1279
+ output_hidden_states=output_hidden_states,
1280
+ return_dict=return_dict,
1281
+ )
1282
+ hidden_states = transformer_outputs[0]
1283
+ logits = self.score(hidden_states)
1284
+
1285
+ if input_ids is not None:
1286
+ batch_size = input_ids.shape[0]
1287
+ else:
1288
+ batch_size = inputs_embeds.shape[0]
1289
+
1290
+ if self.config.pad_token_id is None and batch_size != 1:
1291
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1292
+ if self.config.pad_token_id is None:
1293
+ sequence_lengths = -1
1294
+ else:
1295
+ if input_ids is not None:
1296
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1297
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1298
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
1299
+ sequence_lengths = sequence_lengths.to(logits.device)
1300
+ else:
1301
+ sequence_lengths = -1
1302
+
1303
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1304
+
1305
+ loss = None
1306
+ if labels is not None:
1307
+ loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)
1308
+
1309
+ if not return_dict:
1310
+ output = (pooled_logits,) + transformer_outputs[1:]
1311
+ return ((loss,) + output) if loss is not None else output
1312
+
1313
+ return SequenceClassifierOutputWithPast(
1314
+ loss=loss,
1315
+ logits=pooled_logits,
1316
+ past_key_values=transformer_outputs.past_key_values,
1317
+ hidden_states=transformer_outputs.hidden_states,
1318
+ attentions=transformer_outputs.attentions,
1319
+ )
1320
+
1321
+
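The pooling in `GlmForSequenceClassification.forward` classifies each sequence from the logits of its last non-padding token, using an argmax-plus-modulo trick instead of reverse indexing for ONNX compatibility (this assumes right padding). A toy reproduction with an assumed pad id:

```python
import torch

pad_token_id = 59246                                      # assumed pad id for illustration
input_ids = torch.tensor([[11, 12, 13, pad_token_id, pad_token_id],
                          [21, 22, 23, 24, 25]])          # second row has no padding

# argmax finds the first pad token; -1 steps back to the last real token;
# the modulo maps the no-pad case (argmax == 0 -> -1) to the final position.
sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
sequence_lengths = sequence_lengths % input_ids.shape[-1]
print(sequence_lengths)                                   # tensor([2, 4])

logits = torch.randn(2, 5, 3)                             # (batch, seq, num_labels)
pooled = logits[torch.arange(2), sequence_lengths]        # one row of logits per sequence
print(pooled.shape)                                       # torch.Size([2, 3])
```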
1322
+ __all__ = [
1323
+ "GlmPreTrainedModel",
1324
+ "GlmModel",
1325
+ "GlmForCausalLM",
1326
+ "GlmForSequenceClassification",
1327
+ ]
preprocessor_config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_pad": true,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "MllamaImageProcessor",
13
+ "image_std": [
14
+ 0.5,
15
+ 0.5,
16
+ 0.5
17
+ ],
18
+ "max_image_tiles": 1,
19
+ "resample": 3,
20
+ "rescale_factor": 0.00392156862745098,
21
+ "size": {
22
+ "height": 672,
23
+ "width": 672
24
+ }
25
+ }
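These settings describe a conventional pipeline: convert to RGB, resize to 672x672 with bicubic resampling (`resample: 3`), rescale by 1/255, and normalize with mean/std 0.5, with single-tile padding. A rough torchvision equivalent of the listed parameters (a sketch, not the actual `MllamaImageProcessor` code path, which also handles tiling and padding):

```python
import torch
from PIL import Image
from torchvision import transforms

preprocess = transforms.Compose([
    transforms.Lambda(lambda img: img.convert("RGB")),                                   # do_convert_rgb
    transforms.Resize((672, 672), interpolation=transforms.InterpolationMode.BICUBIC),   # resample=3
    transforms.ToTensor(),                                                               # rescale_factor = 1/255
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

image = Image.new("RGB", (800, 600))                       # stand-in for a real image
pixel_values = preprocess(image)                           # (3, 672, 672), values in [-1, 1]
# The causal LM forward expects (batch, num_media, num_tiles, 3, 672, 672):
pixel_values = pixel_values[None, None, None]
print(pixel_values.shape)                                  # torch.Size([1, 1, 1, 3, 672, 672])
```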
siglip.py ADDED
@@ -0,0 +1,67 @@
1
+ import torch
2
+ from torch import nn
3
+ from transformers import SiglipVisionModel, SiglipVisionConfig
4
+
5
+ # 384 / 14 = 27.43 is not an integer, so the implementation floors the patch grid to 27 per side: the position embedding then has 27**2 = 729 entries, covering 27 * 14 = 378 pixels.
6
+
7
+ class SiglipEncoder(nn.Module):
8
+ def __init__(self, vision_config):
9
+ super(SiglipEncoder, self).__init__()
10
+
11
+ config = SiglipVisionConfig(**vision_config)
12
+ self.model = SiglipVisionModel(config)
13
+
14
+ def forward(self, images):
15
+ outputs = self.model(images).last_hidden_state
16
+ return outputs
17
+
18
+
19
+ class GLU(nn.Module):
20
+ def __init__(self, args, in_features):
21
+ super().__init__()
22
+ self.linear_proj = nn.Linear(in_features, args.hidden_size, bias=False)
23
+ self.norm1 = nn.LayerNorm(args.hidden_size)
24
+ self.act1 = nn.GELU()
25
+ self.act2 = nn.functional.silu
26
+ self.dense_h_to_4h = nn.Linear(args.hidden_size, args.intermediate_size, bias=False)
27
+ self.gate_proj = nn.Linear(args.hidden_size, args.intermediate_size, bias=False)
28
+ self.dense_4h_to_h = nn.Linear(args.intermediate_size, args.hidden_size, bias=False)
29
+
30
+ def forward(self, x):
31
+ x = self.linear_proj(x)
32
+ x = self.act1(self.norm1(x))
33
+ x = self.act2(self.gate_proj(x)) * self.dense_h_to_4h(x)
34
+ x = self.dense_4h_to_h(x)
35
+ return x
36
+
37
+
38
+ class Adapter(nn.Module):
39
+ def __init__(self, eva_hidden_size, args):
40
+ super().__init__()
41
+ self.boi = nn.Parameter(torch.ones(1, 1, args.hidden_size).float())
42
+ self.eoi = nn.Parameter(torch.ones(1, 1, args.hidden_size).float())
43
+ self.conv = nn.Conv2d(in_channels=eva_hidden_size, out_channels=args.hidden_size, kernel_size=2, stride=2)
44
+ self.linear_proj = GLU(args, args.hidden_size)
45
+
46
+ def forward(self, image_emb):
47
+ b, s, e = image_emb.shape # (batch, num_patches, vit_hidden_size)
48
+ grid_size = int(s**0.5)
49
+ image_emb = image_emb.view(b, grid_size, grid_size, e).permute(0,3,1,2) # (b, vit_hidden_size, grid, grid)
50
+ image_emb = self.conv(image_emb) # stride-2 conv halves the grid: (b, hidden_size, grid/2, grid/2)
51
+ image_emb = image_emb.flatten(2).transpose(1, 2) # (b, (grid/2)**2, hidden_size)
52
+ image_emb = self.linear_proj(image_emb) # GLU projection, still (b, (grid/2)**2, hidden_size)
53
+ image_emb = torch.cat([self.boi.repeat(len(image_emb), 1, 1), image_emb, self.eoi.repeat(len(image_emb), 1, 1)], dim=1)
54
+ return image_emb
55
+
56
+
57
+ class VisionModel(torch.nn.Module):
58
+ def __init__(self, config):
59
+ super().__init__()
60
+ self.dtype = config.torch_dtype
61
+ self.vit = SiglipEncoder(config.vision_config)
62
+ self.adapter = Adapter(config.vision_config['hidden_size'], config)
63
+
64
+ def forward(self, image):
65
+ image = image.to(self.dtype)
66
+ vit_output = self.vit(image)
67
+ return self.adapter(vit_output).to(self.dtype)
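Assuming a patch size of 14, a 672x672 input gives a 48x48 patch grid; the Adapter's stride-2 conv halves it to 24x24 = 576 tokens, and the learned boi/eoi vectors bring the sequence to 578, matching the 578 image placeholders in the chat template. A shape-only sketch that exercises the `Adapter` above with dummy sizes (the real hidden/intermediate sizes come from the model config):

```python
import torch
from types import SimpleNamespace
from siglip import Adapter                                # the class defined above in this file

# Dummy language-model config; real values come from config.json.
args = SimpleNamespace(hidden_size=64, intermediate_size=128)
vit_hidden_size = 32

adapter = Adapter(vit_hidden_size, args)
patch_grid = 672 // 14                                    # 48 patches per side (patch size assumed)
image_emb = torch.randn(1, patch_grid * patch_grid, vit_hidden_size)  # fake ViT output: (1, 2304, 32)

out = adapter(image_emb)
print(out.shape)                                          # torch.Size([1, 578, 64]) -> 24*24 + boi + eoi
```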
special_tokens_map.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "[MASK]",
5
+ "[gMASK]",
6
+ "[sMASK]",
7
+ "<sop>",
8
+ "<eop>",
9
+ "<|system|>",
10
+ "<|user|>",
11
+ "<|assistant|>",
12
+ "<|observation|>",
13
+ "<|begin_of_image|>",
14
+ "<|end_of_image|>",
15
+ "<|begin_of_video|>",
16
+ "<|end_of_video|>"
17
+ ],
18
+ "eos_token": {
19
+ "content": "<|endoftext|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<|endoftext|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ }
32
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,143 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "59246": {
4
+ "content": "<|endoftext|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "59247": {
12
+ "content": "[MASK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "59248": {
20
+ "content": "[gMASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "59249": {
28
+ "content": "[sMASK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "59250": {
36
+ "content": "<sop>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "59251": {
44
+ "content": "<eop>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "59252": {
52
+ "content": "<|system|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "59253": {
60
+ "content": "<|user|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "59254": {
68
+ "content": "<|assistant|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "59255": {
76
+ "content": "<|observation|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "59256": {
84
+ "content": "<|begin_of_image|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "59257": {
92
+ "content": "<|end_of_image|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "59258": {
100
+ "content": "<|reserved_special_token_1|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "59259": {
108
+ "content": "<|reserved_special_token_2|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ }
115
+ },
116
+ "additional_special_tokens": [
117
+ "<|endoftext|>",
118
+ "[MASK]",
119
+ "[gMASK]",
120
+ "[sMASK]",
121
+ "<sop>",
122
+ "<eop>",
123
+ "<|system|>",
124
+ "<|user|>",
125
+ "<|assistant|>",
126
+ "<|observation|>",
127
+ "<|begin_of_image|>"
128
+ ],
129
+ "chat_template": "{% for item in messages %}{% if item['role'] != 'system' %}<|{{ item['role'] }}|>\n{% for content in item['content'] %}{% if content['type'] == 'image' %}{% for _ in range(578) %}<|begin_of_image|>{% endfor %}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>\n{% endif %}",
130
+ "clean_up_tokenization_spaces": false,
131
+ "do_lower_case": false,
132
+ "eos_token": "<|user|>",
133
+ "image_size": 672,
134
+ "model_input_names": [
135
+ "input_ids",
136
+ "attention_mask"
137
+ ],
138
+ "model_max_length": 8192,
139
+ "pad_token": "<|endoftext|>",
140
+ "padding_side": "left",
141
+ "remove_space": false,
142
+ "tokenizer_class": "PreTrainedTokenizerFast"
143
+ }
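The `chat_template` expands each image content item into 578 `<|begin_of_image|>` placeholder tokens, which the model later replaces with vision features, and appends `<|assistant|>` when `add_generation_prompt` is set. A hedged sketch of rendering it (the repo id is illustrative; `trust_remote_code=True` may be needed for this repo's custom code):

```python
from transformers import AutoTokenizer

# Repo id assumed for illustration.
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4v-9b", trust_remote_code=True)

messages = [
    {"role": "user", "content": [
        {"type": "image"},                                # becomes 578 <|begin_of_image|> placeholders
        {"type": "text", "text": "Describe this image."},
    ]},
]

prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt.count("<|begin_of_image|>"))                 # 578
```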